diff options
Diffstat (limited to 'usr/src/uts/common')
482 files changed, 144151 insertions, 3253 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index e46c461c54..16d89ee737 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -21,10 +21,10 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2011, 2014 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # Copyright 2015 Nexenta Systems, Inc. All rights reserved. +# Copyright 2016 Joyent, Inc. # Copyright 2016 Garrett D'Amore <garrett@damore.org> # @@ -282,6 +282,7 @@ GENUNIX_OBJS += \ rctl.o \ rctlsys.o \ readlink.o \ + refhash.o \ refstr.o \ rename.o \ resolvepath.o \ @@ -303,6 +304,7 @@ GENUNIX_OBJS += \ seg_map.o \ seg_vn.o \ seg_spt.o \ + seg_umap.o \ semaphore.o \ sendfile.o \ session.o \ @@ -428,6 +430,8 @@ PROFILE_OBJS += profile.o SYSTRACE_OBJS += systrace.o +LX_SYSTRACE_OBJS += lx_systrace.o + LOCKSTAT_OBJS += lockstat.o FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o @@ -492,6 +496,10 @@ PTSL_OBJS += tty_pts.o PTM_OBJS += ptm.o +LX_PTM_OBJS += lx_ptm.o + +LX_NETLINK_OBJS += lx_netlink.o + MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ mii_marvell.o mii_realtek.o mii_other.o @@ -549,6 +557,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \ sctp_misc.o IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o +IP_COMM_OBJS = inet_hash.o IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ @@ -564,7 +573,8 @@ IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ $(IP_TCP_OBJS) \ $(IP_UDP_OBJS) \ $(IP_SCTP_OBJS) \ - $(IP_ILB_OBJS) + $(IP_ILB_OBJS) \ + $(IP_COMM_OBJS) IP6_OBJS += ip6ddi.o @@ -582,6 +592,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o +DATAFILT_OBJS += datafilt.o + 
SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o SPPPTUN_OBJS += sppptun.o sppptun_mod.o @@ -679,6 +691,15 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ VNIC_OBJS += vnic_ctl.o vnic_dev.o +OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \ + overlay_prop.o overlay_target.o + +OVERLAY_VXLAN_OBJS += overlay_vxlan.o + +VND_OBJS += vnd.o frameio.o + +GSQUEUE_OBJS += gsqueue.o + SIMNET_OBJS += simnet.o IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o @@ -997,6 +1018,8 @@ SIGNALFD_OBJS += signalfd.o I8042_OBJS += i8042.o +INOTIFY_OBJS += inotify.o + KB8042_OBJS += \ at_keyprocess.o \ kb8042.o \ @@ -1071,6 +1094,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o ZCONS_OBJS += zcons.o +ZFD_OBJS += zfd.o + NV_SATA_OBJS += nv_sata.o SI3124_OBJS += si3124.o @@ -1124,8 +1149,7 @@ DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \ sdev_ptsops.o sdev_zvolops.o sdev_comm.o \ sdev_profile.o sdev_ncache.o sdev_netops.o \ - sdev_ipnetops.o \ - sdev_vtops.o + sdev_ipnetops.o sdev_vtops.o sdev_plugin.o CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \ ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o @@ -1142,8 +1166,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o +HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ @@ -1295,8 +1324,8 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ pc_vfsops.o pc_vnops.o -PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ - prvfsops.o prvnops.o +PROC_OBJS += prargv.o prcontrol.o prioctl.o prsubr.o \ + prusrio.o prvfsops.o prvnops.o 
MNTFS_OBJS += mntvfsops.o mntvnops.o @@ -1438,6 +1467,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1884,7 +1914,7 @@ ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o MXFE_OBJS += mxfe.o -MPTSAS_OBJS += mptsas.o mptsas_hash.o mptsas_impl.o mptsas_init.o \ +MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o \ mptsas_raid.o mptsas_smhba.o SFE_OBJS += sfe.o sfe_util.o @@ -2005,6 +2035,15 @@ IXGBE_OBJS = ixgbe_82598.o ixgbe_82599.o ixgbe_api.o \ ixgbe_tx.o ixgbe_x540.o ixgbe_mbx.o # +# Intel 40GbE PCIe NIC driver module +# + +# illumos-written ones. +I40E_OBJS = i40e_main.o i40e_osdep.o i40e_intr.o i40e_transceiver.o \ + i40e_stats.o i40e_gld.o +# Intel-written ones. +I40E_INTC_OBJS = i40e_adminq.o i40e_common.o i40e_hmc.o i40e_lan_hmc.o \ + i40e_nvm.o # Solarflare 1/10/40GbE NIC driver module # # NB: The illumos specific sources are listed first, with the @@ -2103,6 +2142,11 @@ MEGA_SAS_OBJS = megaraid_sas.o MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o # +# DR_SAS module +# +DR_SAS_OBJS = dr_sas.o + +# # CPQARY3 module # CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \ @@ -2141,6 +2185,20 @@ NULLDRIVER_OBJS = nulldriver.o TPM_OBJS = tpm.o tpm_hcall.o # +# USB Fast ethernet drivers +# +USBGEM_OBJS = usbgem.o +AXF_OBJS = axf_usbgem.o +UDMF_OBJS = udmf_usbgem.o +URF_OBJS = urf_usbgem.o +UPF_OBJS = upf_usbgem.o + +# +# NFP objects +# +NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o + +# # BNXE objects # BNXE_OBJS += bnxe_cfg.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index fdbe9717f3..a80ec6293f 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -23,6 +23,7 @@ # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2016 Garrett D'Amore <garrett@damore.org> # Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
+# Copyright 2016 Joyent, Inc. # Copyright 2013 Saso Kiselkov. All rights reserved. # @@ -96,6 +97,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/avl/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/inet/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/ucode/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -244,10 +249,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -759,6 +772,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -987,6 +1004,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/net80211/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nfp/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1006,6 +1027,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s $(COMPILE.s) -o $@ $< +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1142,6 +1171,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c + 
$(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1154,6 +1187,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1278,6 +1315,30 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/usb/usba10/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/usbgem/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/axf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/udf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/udmf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/upf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/urf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vuidmice/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1358,6 +1419,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ixgbe/core/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/i40e/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/i40e/core/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ntxn/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1462,9 +1531,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/idspace/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioif/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) + # # krtld must refer to its own bzero/bcopy until the kernel is fully linked # @@ -1533,6 +1607,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c 
$(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1656,6 +1734,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/acl/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/avl/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(COMMONBASE)/inet/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(COMMONBASE)/ucode/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1791,9 +1872,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2139,6 +2226,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2310,6 +2400,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/mwl/mwl_fw/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/net80211/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nfp/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2325,6 +2418,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.s $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/npi/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/plugins/%.c + @($(LHEAD) 
$(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/pci-ide/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2532,6 +2631,21 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usb/usba/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usb/usba10/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usbgem/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/axf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/udmf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/upf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/urf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/vuidmice/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2592,6 +2706,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ixgbe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ixgbe/core/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/i40e/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ntxn/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2682,6 +2799,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/nexus/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/pcs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/refhash/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2775,3 +2895,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/skd/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/fsreparse/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(COMMONBASE)/idspace/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c new file mode 100644 index 0000000000..c55fc6d95f --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c @@ -0,0 +1,3152 @@ +/* + * CDDL HEADER START + * + * The contents of this 
file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * See the big theory statement in ../sys/lx_autofs.h + */ + +#include <fs/fs_subr.h> +#include <sys/stat.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/dirent.h> +#include <sys/fs/fifonode.h> +#include <sys/modctl.h> +#include <sys/mount.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/conf.h> +#include <sys/sdt.h> + +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> + +#include <sys/dnlc.h> +#include <nfs/rnode.h> +#include <nfs/rnode4.h> +#include <sys/lx_autofs_impl.h> +#include <sys/lx_types.h> + +/* + * External functions + */ +extern uintptr_t space_fetch(char *key); +extern int space_store(char *key, uintptr_t ptr); +extern int umount2_engine(vfs_t *, int, cred_t *, int); + +/* + * Globals + */ +static vfsops_t *lx_autofs_vfsops; +static vnodeops_t *lx_autofs_vn_ops = NULL; +static int lx_autofs_fstype; +static major_t lx_autofs_major; +static minor_t lx_autofs_minor = 0; +static dev_info_t *lx_autofs_dip = NULL; + +#define LX_AUTOFS_DEV_VERSION_MAJOR 1 +#define 
LX_AUTOFS_DEV_VERSION_MINOR 0 + +/* The Linux autofs superblock magic number */ +#define LX_AUTOFS_SB_MAGIC 0x0187 + +/* Linux autofs mount types */ +#define LX_AUTOFS_TYPE_INDIRECT 1 +#define LX_AUTOFS_TYPE_DIRECT 2 +#define LX_AUTOFS_TYPE_OFFSET 4 + +/* Structure passed for autofs dev ioctls */ +typedef struct lx_autofs_dv_ioctl { + uint32_t lad_ver_major; + uint32_t lad_ver_minor; + uint32_t lad_size; + uint32_t lad_ioctlfd; + uint32_t lad_arg1; + uint32_t lad_arg2; + char lad_path[0]; +} lx_autofs_dv_ioctl_t; + +/* + * Support functions + * + * lx_autofs_strfree() frees strlen(str) + 1 bytes, which is the size that + * lx_autofs_strdup() allocates for its copy. + */ +static void +lx_autofs_strfree(char *str) +{ + kmem_free(str, strlen(str) + 1); +} +/* Return a KM_SLEEP allocated copy of str, including its terminating NUL. */ +static char * +lx_autofs_strdup(char *str) +{ + int n = strlen(str); + char *ptr = kmem_alloc(n + 1, KM_SLEEP); + bcopy(str, ptr, n + 1); + return (ptr); +} +/* + * Parse str as a base 10 integer and store it in *val. Returns 0 on + * success, or -1 if str is NULL, ddi_strtol() fails, or the result does + * not fit in an int. + */ +static int +lx_autofs_str_to_int(char *str, int *val) +{ + long res; + + if (str == NULL) + return (-1); + + if ((ddi_strtol(str, NULL, 10, &res) != 0) || + (res < INT_MIN) || (res > INT_MAX)) + return (-1); + + *val = res; + return (0); +} +/* + * Set up lp as a list of stack_elem_t. The push and pop routines that + * follow treat the head of this list as the top of a stack. + * NOTE(review): the "ls_" prefix looks like a typo for "lx_"; callers are + * not visible here, so the name is left alone. + */ +static void +ls_autofs_stack_init(list_t *lp) +{ + list_create(lp, + sizeof (stack_elem_t), offsetof(stack_elem_t, se_list)); +} +/* Destroy a stack. The stack must already be empty. */ +static void +lx_autofs_stack_fini(list_t *lp) +{ + ASSERT(list_head(lp) == NULL); + list_destroy(lp); +} +/* Push the triple (ptr1, ptr2, ptr3) onto the head of the stack. */ +static void +lx_autofs_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3) +{ + stack_elem_t *se; + + se = kmem_alloc(sizeof (*se), KM_SLEEP); + se->se_ptr1 = ptr1; + se->se_ptr2 = ptr2; + se->se_ptr3 = ptr3; + list_insert_head(lp, se); +} +/* + * Pop the element at the head of the stack. Returns -1 if the stack is + * empty. Otherwise the stored pointers are copied out through whichever + * of ptr1, ptr2 and ptr3 are non-NULL, the element is freed, and 0 is + * returned. + */ +static int +lx_autofs_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3) +{ + stack_elem_t *se; + + if ((se = list_head(lp)) == NULL) + return (-1); + list_remove(lp, se); + if (ptr1 != NULL) + *ptr1 = se->se_ptr1; + if (ptr2 != NULL) + *ptr2 = se->se_ptr2; + if (ptr3 != NULL) + *ptr3 = se->se_ptr3; + kmem_free(se, sizeof (*se)); + return (0); +} +/* Return the vnode of the fifonode that vp's fifonode names as fn_dest. */ +static vnode_t * +lx_autofs_fifo_peer_vp(vnode_t *vp) +{ + fifonode_t *fnp = VTOF(vp); + 
fifonode_t *fn_dest = fnp->fn_dest; + return (FTOV(fn_dest)); +} +/* + * Return the autofs vnode that shadows the underlying vnode uvp. If one is + * already hashed it is returned with an additional hold and uvp is + * released; otherwise a new vnode is allocated, hashed and returned. + */ +static vnode_t * +lx_autofs_vn_alloc(vfs_t *vfsp, vnode_t *uvp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *vp, *vp_old; + + /* Allocate a new vnode structure in case we need it. */ + vp = vn_alloc(KM_SLEEP); + vn_setops(vp, lx_autofs_vn_ops); + VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev); + vp->v_data = uvp; + ASSERT(vp->v_count == 1); + + /* + * Take a hold on the vfs structure. This is how unmount will + * determine if there are any active vnodes in the file system. + */ + VFS_HOLD(vfsp); + + /* + * Check if we already have a vnode allocated for this underlying + * vnode_t. + */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) { + + /* + * Didn't find an existing node. + * Add this node to the hash and return. + */ + VERIFY(mod_hash_insert(data->lav_vn_hash, + (mod_hash_key_t)uvp, + (mod_hash_val_t)vp) == 0); + mutex_exit(&data->lav_lock); + return (vp); + } + + /* Get a hold on the existing vnode; ours is freed below. */ + VN_HOLD(vp_old); + mutex_exit(&data->lav_lock); + + /* + * Release uvp and the vfs hold taken above, then free the new + * vnode we allocated. + */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); + + return (vp_old); +} +/* + * Free an autofs vnode whose count has dropped to zero. This is entered + * with both lav_lock and the vnode's v_lock held, and both are dropped + * here before the vnode is freed. + */ +static void +lx_autofs_vn_free(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *uvp = vp->v_data; + vnode_t *vp_tmp; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + ASSERT(MUTEX_HELD((&vp->v_lock))); + + ASSERT(vp->v_count == 0); + + /* We're about to free this vnode so take it out of the hash. */ + (void) mod_hash_remove(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp); + + /* + * No one else can look up this vnode any more so there's no need + * to hold locks. + */ + mutex_exit(&data->lav_lock); + mutex_exit(&vp->v_lock); + + /* Release the underlying vnode. 
*/ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); +} +/* + * Allocate an automounter request for the name nm, or return the request + * that is already outstanding for that name with its reference count + * bumped; *is_dup tells the caller which of the two happened. expire + * selects an expire packet rather than a missing (lookup) packet. + * Returns NULL if nm does not fit in the packet's name field, or if an + * expire is requested when the protocol is not v5. + * NOTE(review): the paths that discard the pre-allocated request + * kmem_free() it without mutex_destroy()/cv_destroy(); confirm that this + * is intended. + */ +static lx_autofs_automnt_req_t * +lx_autofs_la_alloc(lx_autofs_vfs_t *data, boolean_t *is_dup, boolean_t expire, + char *nm) +{ + lx_autofs_automnt_req_t *laar, *laar_dup; + + /* Pre-allocate a new automounter request before grabbing locks. */ + laar = kmem_zalloc(sizeof (*laar), KM_SLEEP); + mutex_init(&laar->laar_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&laar->laar_cv, NULL, CV_DEFAULT, NULL); + laar->laar_ref = 1; + + if (data->lav_min_proto == 5) { + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS5; + + if (data->lav_mnttype == LXAMT_INDIR) { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_INDIR; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_INDIR; + } + } else { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_DIRECT; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_DIRECT; + } + } + laar->laar_pkt_size = sizeof (lx_autofs_v5_pkt_t); + + laar->laar_pkt.lap_v5.lap_dev = data->lav_dev; + laar->laar_pkt.lap_v5.lap_ino = data->lav_ino; + /* + * Note that we're currently not filling in the other v5 pkt + * fields (pid, uid, etc.) since they don't appear to be used + * by the automounter. We can fill those in later if it proves + * necessary. + */ + + /* + * For indirect mounts the token expected by the automounter is + * the name of the directory entry to look up (not the entire + * path that is being accessed.) For direct mounts the Linux + * kernel passes a dummy name, so this is just as good. 
+ */ + laar->laar_pkt.lap_v5.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v5.lap_name_len > + (sizeof (laar->laar_pkt.lap_v5.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs automnt req: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v5.lap_name, nm, + sizeof (laar->laar_pkt.lap_v5.lap_name)); + + } else if (expire) { + zcmn_err(getzoneid(), CE_WARN, + "unsupported expire protocol request: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + + } else { + ASSERT(expire == B_FALSE); + + /* Older protocol pkt (really v2) */ + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS2; + laar->laar_pkt.lap_type = LX_AUTOFS_PTYPE_MISSING; + laar->laar_pkt_size = sizeof (lx_autofs_v2_pkt_t); + + /* + * The token expected by the Linux automounter is the name of + * the directory entry to look up (and not the entire + * path that is being accessed). + */ + laar->laar_pkt.lap_v2.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v2.lap_name_len > + (sizeof (laar->laar_pkt.lap_v2.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs lookup: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v2.lap_name, nm, + sizeof (laar->laar_pkt.lap_v2.lap_name)); + } + + /* Assign a unique id for this request. */ + laar->laar_pkt.lap_id = id_alloc(data->lav_ids); + + /* Check for an outstanding request for this path. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_path_hash, + (mod_hash_key_t)nm, (mod_hash_val_t *)&laar_dup) == 0) { + /* + * There's already an outstanding request for this + * path so we don't need a new one. Give back the id + * and the request we pre-allocated. + */ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); + laar = laar_dup; + + /* Bump the ref count on the old request. */ + atomic_add_int(&laar->laar_ref, 1); + + *is_dup = 1; + } else { + /* Add it to the hashes. 
*/ + VERIFY(mod_hash_insert(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)laar) == 0); + VERIFY(mod_hash_insert(data->lav_path_hash, + (mod_hash_key_t)lx_autofs_strdup(nm), + (mod_hash_val_t)laar) == 0); + + *is_dup = 0; + } + mutex_exit(&data->lav_lock); + + return (laar); +} +/* + * Look up the outstanding request with the given id. On success the + * request is returned with its reference count incremented; NULL is + * returned if no request with that id is in the id hash. + */ +static lx_autofs_automnt_req_t * +lx_autofs_la_find(lx_autofs_vfs_t *data, int id) +{ + lx_autofs_automnt_req_t *laar; + + /* Check for an outstanding request for this id. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&laar) != 0) { + mutex_exit(&data->lav_lock); + return (NULL); + } + atomic_add_int(&laar->laar_ref, 1); + mutex_exit(&data->lav_lock); + return (laar); +} +/* + * Remove a request from both hashes, mark it complete and wake every + * thread waiting on it. The reference count is not changed here. + */ +static void +lx_autofs_la_complete(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* Remove this request from the hashes so no one can look it up. */ + mutex_enter(&data->lav_lock); + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* Mark this request as complete and wake up anyone waiting on it. 
*/ + mutex_enter(&laar->laar_lock); + laar->laar_complete = 1; + cv_broadcast(&laar->laar_cv); + mutex_exit(&laar->laar_lock); +} +/* + * Drop one reference on a request. When the last reference goes away the + * request's id and memory are freed. The hashes are not touched here. + */ +static void +lx_autofs_la_release(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + ASSERT(!MUTEX_HELD(&laar->laar_lock)); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) + return; + ASSERT(laar->laar_ref == 0); + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} +/* + * Drop the caller's reference on a request it has given up waiting for. + * If that was the last reference the request is removed from both hashes + * and freed. + */ +static void +lx_autofs_la_abort(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* + * This is a little tricky. We're aborting the wait for this + * request. So if anyone else is waiting for this request we + * can't free it, but if no one else is waiting for the request + * we should free it. + */ + mutex_enter(&data->lav_lock); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) { + mutex_exit(&data->lav_lock); + return; + } + ASSERT(laar->laar_ref == 0); + + /* Remove this request from the hashes so no one can look it up. */ + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* It's ok to free this now because the ref count was zero. 
*/ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} +/* + * Locate both ends of the automounter's fifo. pgrp identifies the + * automounter process and fd is that process's descriptor for the write + * end. On success 0 is returned and *fpp_wr and *fpp_rd point at the + * write and read ends, each with its f_count bumped; -1 is returned if + * the process or either end of the fifo cannot be found. + */ +static int +lx_autofs_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_wr, *ufp_rd = NULL; + file_t *fp_wr, *fp_rd = NULL; + vnode_t *vp_wr, *vp_rd; + int i; + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of its zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and it's going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the process's open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Sanity check fifo write fd. The value comes from the caller, so + * reject negative descriptors as well as ones past the end of the + * table before using fd as an index. + */ + if (fd < 0 || fd >= fip->fi_nfiles) { + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* Get a pointer to the write fifo. */ + UF_ENTER(ufp_wr, fip, fd); + if (((fp_wr = ufp_wr->uf_file) == NULL) || + ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) { + /* Invalid fifo fd. */ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + vp_rd = lx_autofs_fifo_peer_vp(fp_wr->f_vnode); + for (i = 0; i < fip->fi_nfiles; i++) { + if (i == fd) + continue; + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. 
*/ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * We need to drop fi_lock before we can try to acquire f_tlock. + * The good news is that the file pointers are protected because + * we're still holding uf_lock. + */ + mutex_exit(&fip->fi_lock); + + /* + * Here we bump the open counts on the fifos. The reason + * that we do this is because when we go to write to the + * fifo we want to ensure that they are actually open (and + * not in the process of being closed) without having to + * stop the automounter. (If the write end of the fifo + * were closed and we tried to write to it we would panic. + * If the read end of the fifo was closed and we tried to + * write to the other end, the process that invoked the + * lookup operation would get an unexpected SIGPIPE.) + */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + ASSERT(fp_wr->f_count >= 2); + mutex_exit(&fp_wr->f_tlock); + + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + ASSERT(fp_rd->f_count >= 2); + mutex_exit(&fp_rd->f_tlock); + + /* Release all our locks. */ + UF_EXIT(ufp_wr); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + /* Return the file pointers. */ + *fpp_rd = fp_rd; + *fpp_wr = fp_wr; + return (0); +} +/* + * mod_hash_walk() callback used by lx_autofs_fifo_close(): store the key + * of the first entry visited in the int that arg points to and stop the + * walk. + */ +static uint_t +/*ARGSUSED*/ +lx_autofs_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + int *id = (int *)arg; + /* Return the key and terminate the walk. */ + *id = (uintptr_t)key; + return (MH_WALK_TERMINATE); +} +/* + * Close our references to both ends of the automounter fifo, then mark + * every request still in the id hash as complete so that its waiters + * are woken. + */ +static void +lx_autofs_fifo_close(lx_autofs_vfs_t *data) +{ + /* + * Close the fifo to prevent any future requests from + * getting sent to the automounter. 
+ */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr != NULL) { + (void) closef(data->lav_fifo_wr); + data->lav_fifo_wr = NULL; + } + if (data->lav_fifo_rd != NULL) { + (void) closef(data->lav_fifo_rd); + data->lav_fifo_rd = NULL; + } + mutex_exit(&data->lav_lock); + + /* + * Wake up any threads currently waiting for the automounter. + * Note that it's possible for multiple threads to have entered + * this function and to be doing the work below simultaneously. + */ + for (;;) { + lx_autofs_automnt_req_t *laar; + int id; + + /* Look up the first entry in the hash. */ + id = -1; + mod_hash_walk(data->lav_id_hash, + lx_autofs_fifo_close_cb, &id); + if (id == -1) { + /* No more ids in the hash. */ + break; + } + if ((laar = lx_autofs_la_find(data, id)) == NULL) { + /* Someone else beat us to it. */ + continue; + } + + /* Mark the request as complete and release it. */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + } +} +/* + * Check whether the automounter process (lav_pgrp) still has the read + * end of the fifo open. Called with lav_lock held. Returns 0 if it + * does, or -1 if the fifo has been shut down, the process cannot be + * locked, or no descriptor for the read end is found. + */ +static int +lx_autofs_fifo_verify_rd(lx_autofs_vfs_t *data) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_rd = NULL; + file_t *fp_rd = NULL; + vnode_t *vp_rd; + int i; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + + /* Check if we've already been shut down. */ + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + return (-1); + } + vp_rd = lx_autofs_fifo_peer_vp(data->lav_fifo_wr->f_vnode); + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of its zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and it's going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(data->lav_pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the process's open file descriptors. 
*/ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. */ + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Seems the automounter still has the read end of the fifo + * open, we're done here. Release all our locks and exit. + */ + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + return (0); +} + +static int +lx_autofs_fifo_write(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laarp) +{ + struct uio uio; + struct iovec iov; + file_t *fp_wr, *fp_rd; + int error; + + /* + * The catch here is we need to make sure _we_ don't close + * the the fifo while writing to it. (Another thread could come + * along and realize the automounter process is gone and close + * the fifo. To do this we bump the open count before we + * write to the fifo. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + fp_wr = data->lav_fifo_wr; + fp_rd = data->lav_fifo_rd; + + /* Bump the open count on the write fifo. */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + mutex_exit(&fp_wr->f_tlock); + + /* Bump the open count on the read fifo. 
*/ + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + mutex_exit(&fp_rd->f_tlock); + + mutex_exit(&data->lav_lock); + + iov.iov_base = (caddr_t)&laarp->laar_pkt; + iov.iov_len = laarp->laar_pkt_size; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_loffset = 0; + uio.uio_segflg = (short)UIO_SYSSPACE; + uio.uio_resid = laarp->laar_pkt_size; + uio.uio_llimit = 0; + uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK; + + error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL); + (void) closef(fp_wr); + (void) closef(fp_rd); + + /* + * After every write we verify that the automounter still has + * these files open. + */ + mutex_enter(&data->lav_lock); + if (lx_autofs_fifo_verify_rd(data) != 0) { + /* + * Something happened to the automounter. + * Close down the communication pipe we setup. + */ + mutex_exit(&data->lav_lock); + lx_autofs_fifo_close(data); + if (error != 0) + return (error); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + return (error); +} + +static int +lx_autofs_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack) +{ + struct iovec iov; + struct uio uio; + dirent64_t *dp, *dbuf; + vnode_t *vp; + size_t dlen, dbuflen; + int eof, error, ndirents = 64; + char *nm; + + dlen = ndirents * (sizeof (*dbuf)); + dbuf = kmem_alloc(dlen, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_loffset = 0; + uio.uio_llimit = MAXOFFSET_T; + + eof = 0; + error = 0; + while (!error && !eof) { + uio.uio_resid = dlen; + iov.iov_base = (char *)dbuf; + iov.iov_len = dlen; + + (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL); + if (VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0) != 0) { + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + kmem_free(dbuf, dlen); + return (-1); + } + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + + if ((dbuflen = dlen - uio.uio_resid) == 0) { + /* We're done. 
*/ + break; + } + + for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen); + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) { + + nm = dp->d_name; + + if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0) + continue; + + if (VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + kmem_free(dbuf, dlen); + return (-1); + } + if (vp->v_type == VDIR) { + if (dir_stack != NULL) { + lx_autofs_stack_push(dir_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } else { + if (file_stack != NULL) { + lx_autofs_stack_push(file_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } + } + } + kmem_free(dbuf, dlen); + return (0); +} + +/* + * Delete the entry named path from the directory dvp. A non-directory is + * simply removed. A directory has its whole subtree removed first, using + * explicit stacks instead of recursion. Nothing is returned: on the first + * failure the walk stops, every vnode hold and name string still owned here + * is released, and whatever has not been removed yet is left in place. + */ +static void +lx_autofs_bs_destroy(vnode_t *dvp, char *path) +{ + list_t search_stack; + list_t dir_stack; + list_t file_stack; + vnode_t *pdvp, *vp; + char *dpath, *fpath; + int ret; + + if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + /* A directory entry with this name doesn't actually exist. */ + return; + } + + /* + * v_type is an enumeration, not a bit mask, so compare it for + * equality (as lx_autofs_bs_readdir does) rather than masking it. + */ + if (vp->v_type != VDIR) { + /* Easy, the directory entry is a file so delete it. */ + VN_RELE(vp); + (void) VOP_REMOVE(dvp, path, kcred, NULL, 0); + return; + } + + /* + * The directory entry is a subdirectory, now we have a bit more + * work to do. (We'll have to recurse into the sub directory.) + * It would have been much easier to do this recursively but kernel + * stacks are notoriously small. + */ + ls_autofs_stack_init(&search_stack); + ls_autofs_stack_init(&dir_stack); + ls_autofs_stack_init(&file_stack); + + /* Save our newfound subdirectory into a list. */ + lx_autofs_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp, + lx_autofs_strdup(path)); + + /* Do a recursive depth first search into the subdirectories. */ + while (lx_autofs_stack_pop(&search_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the subdirectories in this directory. 
*/ + if (lx_autofs_bs_readdir(dvp, &search_stack, NULL) != 0) { + /* + * This entry was already popped, so the cleanup at + * exit will not find it on any stack; drop its hold + * and name here, as the loop below does. + */ + VN_RELE(dvp); + lx_autofs_strfree(dpath); + goto exit; + } + + /* Save the current directory on a separate stack. */ + lx_autofs_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp, + dpath); + } + + /* + * Now dir_stack contains a list of directories, the deepest paths + * are at the top of the list. So let's go through and process them. + */ + while (lx_autofs_stack_pop(&dir_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the files in this directory. */ + if (lx_autofs_bs_readdir(dvp, NULL, &file_stack) != 0) { + VN_RELE(dvp); + lx_autofs_strfree(dpath); + goto exit; + } + + /* Delete all the files in this directory. */ + while (lx_autofs_stack_pop(&file_stack, + NULL, (caddr_t *)&vp, &fpath) == 0) { + VN_RELE(vp); + ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0); + lx_autofs_strfree(fpath); + if (ret != 0) { + /* The directory was popped too; release it. */ + VN_RELE(dvp); + lx_autofs_strfree(dpath); + goto exit; + } + } + + /* Delete this directory. */ + VN_RELE(dvp); + ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0); + lx_autofs_strfree(dpath); + if (ret != 0) + goto exit; + } + +exit: + while ( + (lx_autofs_stack_pop(&search_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&dir_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&file_stack, NULL, (caddr_t *)&vp, + &path) == 0)) { + VN_RELE(vp); + lx_autofs_strfree(path); + } + lx_autofs_stack_fini(&search_stack); + lx_autofs_stack_fini(&dir_stack); + lx_autofs_stack_fini(&file_stack); +} + +static vnode_t * +lx_autofs_bs_create(vnode_t *dvp, char *bs_name) +{ + vnode_t *vp; + vattr_t vattr; + + /* + * After looking at the mkdir syscall path it seems we don't need + * to initialize all of the vattr_t structure. 
+ */ + bzero(&vattr, sizeof (vattr)); + vattr.va_type = VDIR; + vattr.va_mode = 0755; /* u+rwx,og=rx */ + vattr.va_mask = AT_TYPE|AT_MODE; + + if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred, NULL, 0, NULL) != 0) + return (NULL); + return (vp); +} + +static int +lx_autofs_automounter_call(vnode_t *dvp, char *nm) +{ + lx_autofs_automnt_req_t *laar; + lx_autofs_vfs_t *data; + int error; + boolean_t is_dup; + + /* Get a pointer to the vfs mount data. */ + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* The automounter only supports queries in the root directory. */ + if (dvp != data->lav_root) + return (ENOENT); + + /* + * Check if the current process is in the automounters process + * group. (If it is, the current process is either the autmounter + * itself or one of it's forked child processes.) If so, don't + * redirect this call back into the automounter because we'll + * hang. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter request structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_FALSE, + nm)) == NULL) + return (ENOENT); + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if ((!is_dup) && + ((error = lx_autofs_fifo_write(data, laar)) != 0)) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. + */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (error); + } + + /* Wait for someone to signal us that this request has completed. 
*/ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this call. */ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + return (EINTR); + } + } + mutex_exit(&laar->laar_lock); + + if (laar->laar_result == LXACR_READY) { + /* + * Mount succeeded, keep track for future expire calls. + * + * See vfs lav_vn_hash. Is this something we could use for + * iterating mounts under this autofs? Used by + * lx_autofs_vn_alloc + */ + lx_autofs_mntent_t *mp; + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, mp->lxafme_len); + + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } + + lx_autofs_la_release(data, laar); + + return (0); +} + +/* + * Same preliminary checks as in lx_autofs_unmount. + */ +static boolean_t +lx_autofs_may_unmount(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (B_FALSE); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (B_FALSE); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + + /* Check for any remaining holds on the root vnode. 
*/ + if (data->lav_root->v_count > 1) + return (B_FALSE); + + return (B_TRUE); +} + +static vfs_t * +lx_autofs_get_mountvfs(char *fs_mntpt, int *cnt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + int fsmplen; + int acnt = 0; + + fsmplen = strlen(fs_mntpt); + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + *cnt = 0; + return (NULL); + } + + do { + /* Skip mounts we shouldn't show. */ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt; + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) == 0 && + (mntpt[fsmplen] == '\0' || mntpt[fsmplen] == '/')) { + /* + * We'll return the first one we find but don't + * return a mount that is actually autofs (i.e. + * autofs direct or offset mount). + */ + if (vfsp->vfs_op == lx_autofs_vfsops) { + acnt++; + } else if (fnd_vfs == NULL) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + } + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + *cnt = acnt; + return (fnd_vfs); +} + +/* + * Unmount all autofs offset mounts below the given path. + */ +static boolean_t +lx_autofs_umount_offset(char *fs_mntpt, struct cred *cr) +{ + struct vfs *vfsp; + struct vfs *vfslist; + boolean_t busy = B_FALSE; + int fsmplen = strlen(fs_mntpt); + +restart: + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (B_FALSE); + } + + do { + char *mntpt; + lx_autofs_vfs_t *data; + + /* Skip mounts we should ignore. 
*/ + if ((vfsp->vfs_flag & VFS_NOMNTTAB)) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) != 0 || + (mntpt[fsmplen] != '\0' && mntpt[fsmplen] != '/')) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Something got mounted over the autofs mountpoint + * after we checked that this inidrect hierarchy was + * not busy. + */ + busy = B_TRUE; + break; + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_mnttype != LXAMT_OFFSET) { + /* + * Something mounted a non-offset autofs fs under this + * indirect mnt! + */ + busy = B_TRUE; + break; + } + + /* + * Attempt to umount - set busy if fails. + * + * umount2_engine will call VFS_RELE, so we need to take an + * extra hold to match the behavior during the normal umount + * path. + * + * We also need to drop the list lock to prevent deadlock + * during umount. + */ + VFS_HOLD(vfsp); + vfs_list_unlock(); + if (umount2_engine(vfsp, 0, cr, 0) != 0) { + busy = B_TRUE; + goto errexit; + } + + /* Retake list lock and look for more. */ + goto restart; + } while (vfsp != vfslist); + + vfs_list_unlock(); + +errexit: + return (busy); +} + + +/* + * Note that lx_autofs_automounter_call() only supports queries in the root + * directory, so all mntent names are relative to that. + */ +static int +lx_autofs_expire(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + lx_autofs_mntent_t *mp; + lx_autofs_automnt_req_t *laar; + boolean_t is_dup; + vfs_t *fnd_vfs; + int autofs_cnt; + boolean_t busy = B_FALSE; + char exp_path[MAXPATHLEN]; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* + * We process only the first element (i.e. do not do multi). This + * works fine for the automounter. 
+ */ + mutex_enter(&data->lav_lock); + mp = (lx_autofs_mntent_t *)list_remove_head(&data->lav_mnt_list); + mutex_exit(&data->lav_lock); + if (mp == NULL) { + if (data->lav_mnttype == LXAMT_OFFSET) { + /* + * During restart the automounter will openmount each + * offset mount for management. It won't closemount the + * offset mount until we expire it, even though nothing + * is mounted over that offset. We handle this as a + * special expiration case. + */ + int cnt; + + mutex_enter(&data->lav_lock); + cnt = data->lav_openmnt_cnt; + mutex_exit(&data->lav_lock); + + if (cnt == 1 && vn_ismntpt(data->lav_root) == 0) { + char *mntpt = (char *) + refstr_value(vfsp->vfs_mntpt); + char *nm = ZONE_PATH_TRANSLATE(mntpt, curzone); + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), + KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, + KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, + mp->lxafme_len); + + /* + * exp_path is normally filled in further down, + * but we are about to jump past that code and + * it is read again once the automounter has + * answered. An offset mount expires its own + * mountpoint, as in the non-indirect case. + */ + (void) strlcpy(exp_path, mntpt, + sizeof (exp_path)); + + goto exp_offset; + } + } + + return (EAGAIN); + } + + /* + * We only return an expired mount if it is inactive for the full + * timeout. This reduces overly aggressive umount/mount activity. 
+ */ + if (data->lav_timeout > 0) { + uint64_t now = TICK_TO_SEC(ddi_get_lbolt64()); + + if ((now - mp->lxafme_ts) < data->lav_timeout) { + /* put it back at the end of the line */ + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + } + + if (data->lav_mnttype == LXAMT_INDIR) { + (void) snprintf(exp_path, sizeof (exp_path), "%s/%s", + (char *)refstr_value(vfsp->vfs_mntpt), mp->lxafme_path); + } else { + (void) strlcpy(exp_path, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (exp_path)); + } + + fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt); + if (fnd_vfs != NULL) { + boolean_t skip = B_FALSE; + vfssw_t *vfssw; + + /* + * If it's an NFS file system (typical) then we check in + * advance to see if it can be unmounted, otherwise, proceed. + * The fs-specific umount attempted by the automounter will + * either succeed or fail. Both are valid outcomes but checking + * now for nfs will save a bunch of work by the automounter + * if the fs is busy. + * + * Unfortunately, for NFS the vfs_fstype is the same for all + * versions of NFS, so we need to check the vfs_op member to + * determine which version of NFS we're dealing with. + */ + if (!skip && (vfssw = vfs_getvfssw("nfs4")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable4(fnd_vfs)) + busy = B_TRUE; + skip = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + if (!skip && (vfssw = vfs_getvfssw("nfs3")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable(fnd_vfs)) + busy = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + VFS_RELE(fnd_vfs); + + } else if (autofs_cnt > 0) { + /* + * The automounter is asking us to expire and we pulled this + * name from our vfs mountpoint list, but if + * lx_autofs_get_mountvfs returns null then that means we + * didn't find a non-autofs mount under this name. 
Thus, the + * name could be a subdirectory under an autofs toplevel + * indirect mount with one or more offset mounts below. + * autofs_cnt will indicate how many autofs mounts exist below + * this subdirectory name. + * + * The automounter will take care of unmounting any fs mounted + * over one of these offset mounts (i.e. offset is like a + * direct mount which the automounter will manage) but the + * automounter will not unmount the actual autofs offset mount + * itself, so we have to do that before we can expire the + * top-level subrectory name. + */ + busy = lx_autofs_umount_offset(exp_path, cr); + } + + if (busy) { + /* + * Can't unmount this one right now, put it at the end of the + * list and return. The caller will return EAGAIN for the + * expire ioctl and the automounter will check again later. + */ + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + + /* + * See lx_autofs_automounter_call. We want to send a msg up the pipe + * to the automounter in a similar way. + */ + +exp_offset: + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + goto err_free; + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter expire structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_TRUE, + mp->lxafme_path)) == NULL) + goto err_free; + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if (!is_dup && lx_autofs_fifo_write(data, laar) != 0) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. 
+ */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + goto err_free; + } + + /* Wait for someone to signal us that this request has completed. */ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this request. */ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + goto err_free; + } + } + mutex_exit(&laar->laar_lock); + + /* + * If it failed or if the file system is still mounted after we get the + * response from our expire msg, then that means the automounter tried + * to unmount it but failed because the file system is busy, so we put + * this entry back on our list to try to expire it again later. + */ + fnd_vfs = NULL; + if (laar->laar_result == LXACR_FAIL || + (fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt)) != NULL || + autofs_cnt > 0) { + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } else { + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + lx_autofs_la_release(data, laar); + return (0); + +err_free: + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + return (EAGAIN); +} + +static int +lx_autofs_ack(int reqid, vfs_t *vfsp, enum lx_autofs_callres result) +{ + lx_autofs_vfs_t *data; + lx_autofs_automnt_req_t *laar; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if ((laar = lx_autofs_la_find(data, reqid)) == NULL) + return (ENXIO); + + /* Mark the request as complete and release it. 
*/ + laar->laar_result = result; + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (0); +} + +static int +lx_autofs_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg, cred_t *cr) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + int id = arg; + int v; + int err; + + /* + * Be strict. + * We only accept ioctls from the automounter process group. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp != curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + switch (cmd) { + case LX_AUTOFS_IOC_READY: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_READY)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_FAIL: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_FAIL)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_CATATONIC: + /* The automounter is shutting down. */ + lx_autofs_fifo_close(data); + return (0); + + case LX_AUTOFS_IOC_PROTOVER: + v = LX_AUTOFS_PROTO_VERS5; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_PROTOSUBVER: + v = LX_AUTOFS_PROTO_SUBVERSION; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_ASKUMOUNT: + /* + * This is asking if autofs can be unmounted, not asking to + * actually unmount it. We return 1 if it is busy or 0 if it + * can be unmounted. 
+ */ + v = 1; + if (lx_autofs_may_unmount(vp->v_vfsp, cr)) + v = 0; + + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_SETTIMEOUT: + if (copyin((caddr_t)arg, &data->lav_timeout, sizeof (ulong_t)) + != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_EXPIRE: + return (ENOTSUP); + + case LX_AUTOFS_IOC_EXPIRE_MULTI: + lx_autofs_expire(vp->v_vfsp, cr); + return (EAGAIN); + + default: + ASSERT(0); + return (ENOTSUP); + } +} + +static int +lx_autofs_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data) +{ + char *fd_str, *pgrp_str, *minproto_str, *maxproto_str; + int fd, pgrp, minproto, maxproto; + file_t *fp_wr, *fp_rd; + + /* Require these options to be present. */ + if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1)) + return (EINVAL); + + /* Get the values for each parameter. */ + if ((lx_autofs_str_to_int(fd_str, &fd) != 0) || + (lx_autofs_str_to_int(pgrp_str, &pgrp) != 0) || + (lx_autofs_str_to_int(minproto_str, &minproto) != 0) || + (lx_autofs_str_to_int(maxproto_str, &maxproto) != 0)) + return (EINVAL); + + /* + * We primarily support v2 & v5 of the linux kernel automounter + * protocol. The userland daemon typically needs v5. We'll reject + * unsupported ioctls later if we get one. + */ + if ((minproto > 5) || (maxproto < 2)) + return (EINVAL); + + /* + * Now we need to lookup the fifos we'll be using + * to talk to the userland automounter process. + */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + /* + * The automounter doesn't always have the same id as the pgrp. + * This happens when it is started via one of the various + * service managers. In this case the fifo lookup will fail + * so we retry with our own pid. 
+ */ + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) + return (EINVAL); + } + + if (vfs_optionisset(vfsp, LX_MNTOPT_INDIRECT, NULL)) { + data->lav_mnttype = LXAMT_INDIR; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_DIRECT, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_DIRECT; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_OFFSET, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_OFFSET; + } + /* The automounter does test mounts with none of the options */ + if (data->lav_mnttype == LXAMT_NONE) + data->lav_mnttype = LXAMT_DIRECT; + + /* Save the mount options and fifo pointers. */ + data->lav_fd = fd; + data->lav_min_proto = minproto; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + return (0); +} + +static uint64_t +s2l_dev(dev_t dev) +{ + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + return (LX_MAKEDEVICE(maj, min)); +} + +/* + * VFS entry points + */ +static int +lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lx_autofs_vfs_t *data; + dev_t dev; + char name[40]; + int error; + vattr_t va; + + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) + return (EBUSY); + + /* We don't support mounts in the global zone. */ + if (getzoneid() == GLOBAL_ZONEID) + return (EPERM); + + /* + * Offset mounts will occur below the top-level mountpoint so we + * need to allow for autofs mounts even though mvp is an autofs. + */ + + /* Allocate a vfs struct. */ + data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP); + + /* Parse mount options. */ + if ((error = lx_autofs_parse_mntopt(vfsp, data)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + + /* Initialize the backing store. 
*/ + lx_autofs_bs_destroy(mvp, LX_AUTOFS_BS_DIR); + data->lav_bs_vp = lx_autofs_bs_create(mvp, LX_AUTOFS_BS_DIR); + if (data->lav_bs_vp == NULL) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (EBUSY); + } + data->lav_bs_name = LX_AUTOFS_BS_DIR; + + /* Get the backing store inode for use in v5 protocol msgs */ + va.va_mask = AT_STAT; + if ((error = VOP_GETATTR(data->lav_bs_vp, &va, 0, cr, NULL)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + data->lav_ino = va.va_nodeid; + + /* We have to hold the underlying vnode we're mounted on. */ + data->lav_mvp = mvp; + VN_HOLD(mvp); + + /* Initialize vfs fields */ + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lx_autofs_fstype; + vfsp->vfs_data = data; + + /* Invent a dev_t (sigh) */ + do { + dev = makedevice(lx_autofs_major, + atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32); + } while (vfs_devismounted(dev)); + vfsp->vfs_dev = dev; + vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype); + + data->lav_dev = s2l_dev(vfsp->vfs_dev); + + /* Create an id space arena for automounter requests. */ + (void) snprintf(name, sizeof (name), "lx_autofs_id_%d", + getminor(vfsp->vfs_dev)); + data->lav_ids = id_space_create(name, 1, INT_MAX); + + /* Create hashes to keep track of automounter requests. */ + mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL); + (void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_path_hash = mod_hash_create_strhash(name, + LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor); + (void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_id_hash = mod_hash_create_idhash(name, + LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor); + + /* Create a hash to keep track of vnodes. 
*/ + (void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_vn_hash = mod_hash_create_ptrhash(name, + LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor, + sizeof (vnode_t)); + + list_create(&data->lav_mnt_list, sizeof (lx_autofs_mntent_t), + offsetof(lx_autofs_mntent_t, lxafme_lst)); + + /* Create root vnode */ + data->lav_root = lx_autofs_vn_alloc(vfsp, data->lav_bs_vp); + + data->lav_root->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP; + + /* + * For a direct mountpoint we need to allow a filesystem to be + * mounted overtop of this autofs mount. Otherwise, disallow that. + */ + if (data->lav_mnttype == LXAMT_INDIR) + data->lav_root->v_flag |= VNOMOUNT; + + return (0); +} + +static int +lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (EPERM); + + /* We do not currently support forced unmounts. */ + if (flag & MS_FORCE) + return (ENOTSUP); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (EBUSY); + + /* Check for any remaining holds on the root vnode. */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + if (data->lav_root->v_count > 1) + return (EBUSY); + + /* Close the fifo to the automount process. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + /* + * We have to release our hold on our root vnode before we can + * delete the backing store. (Since the root vnode is linked + * to the backing store.) + */ + VN_RELE(data->lav_root); + + /* Cleanup the backing store. */ + lx_autofs_bs_destroy(data->lav_mvp, data->lav_bs_name); + VN_RELE(data->lav_mvp); + + /* + * Delete all listed mounts. 
+ */ + for (;;) { + lx_autofs_mntent_t *mp; + + mp = list_remove_head(&data->lav_mnt_list); + if (mp == NULL) + break; + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + /* Cleanup out remaining data structures. */ + mod_hash_destroy_strhash(data->lav_path_hash); + mod_hash_destroy_idhash(data->lav_id_hash); + mod_hash_destroy_ptrhash(data->lav_vn_hash); + id_space_destroy(data->lav_ids); + list_destroy(&data->lav_mnt_list); + kmem_free(data, sizeof (lx_autofs_vfs_t)); + + return (0); +} + +static int +lx_autofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + *vpp = data->lav_root; + VN_HOLD(*vpp); + + return (0); +} + +static int +lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *urvp = data->lav_root->v_data; + dev32_t d32; + int error; + + if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0) + return (error); + + /* Update some of values before returning. */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + (void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name, + sizeof (sp->f_basetype)); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + return (0); +} + +static const fs_operation_def_t lx_autofs_vfstops[] = { + { VFSNAME_MOUNT, { .vfs_mount = lx_autofs_mount } }, + { VFSNAME_UNMOUNT, { .vfs_unmount = lx_autofs_unmount } }, + { VFSNAME_ROOT, { .vfs_root = lx_autofs_root } }, + { VFSNAME_STATVFS, { .vfs_statvfs = lx_autofs_statvfs } }, + { NULL, NULL } +}; + +/* + * VOP entry points - simple passthrough + * + * For most VOP entry points we can simply pass the request on to + * the underlying filesystem we're mounted on. 
+ */ +static int +lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_CLOSE(uvp, flag, count, offset, cr, ctp)); +} + +static int +lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ctp, int flags) +{ + vnode_t *uvp = vp->v_data; + return (VOP_READDIR(uvp, uiop, cr, eofp, ctp, flags)); +} + +static int +lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_ACCESS(uvp, mode, flags, cr, ctp)); +} + +static int +lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_RWLOCK(uvp, write_lock, ctp)); +} + +static void +lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + VOP_RWUNLOCK(uvp, write_lock, ctp); +} + +/* + * Check if attempting to access a 'direct' mount and if so, call the + * automounter to perform the mount. Once the mount occurs, the new filesystem + * will be mounted overtop of this autofs mountpoint and we will no longer + * come through this path. + */ +static vnode_t * +lx_autofs_do_direct(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *nvp; + boolean_t skip_am_call = B_FALSE; + + if (data->lav_mnttype == LXAMT_INDIR) + return (NULL); + + /* + * Check if the current process is in the automounter's process group. + * If it is, the current process is either the automounter itself or + * one of it's children. If so, don't call back into the automounter. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + skip_am_call = B_TRUE; + } + mutex_exit(&pidlock); + + /* + * It is possible there is already a new fs mounted on top of our vnode. 
+ * This can happen if the caller first did a lookup of a file name + * using our vnode as the directory vp. The lookup would trigger the + * autofs mount on top of ourself, but if the caller then uses our + * vnode to do a getattr on the directory, it will use the autofs + * vnode and not the newly mounted vnode. We need to skip re-calling + * the automounter for this case. + */ + if (!skip_am_call && vn_mountedvfs(vp) == NULL) { + char tbuf[MAXPATHLEN]; + char *nm; + /* Name the request by the last component of our mount point. */ + (void) strlcpy(tbuf, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (tbuf)); + nm = tbuf + strlen(tbuf); + while (*nm != '/' && nm != tbuf) + nm--; + if (*nm == '/') + nm++; + /* + * The result is ignored; the traverse below determines + * whether anything is mounted on us. + */ + (void) lx_autofs_automounter_call(vp, nm); + } + + /* + * We need to take an extra hold on our vp (which is the autofs + * root vp) to account for the rele done in traverse. traverse will + * take a hold on the new vp so the caller is responsible for calling + * VN_RELE on the returned vp. + */ + VN_HOLD(vp); + nvp = vp; + if (traverse(&nvp) != 0) { + VN_RELE(nvp); + return (NULL); + } + + /* Confirm that we have a non-autofs fs mounted now */ + if (nvp->v_op == lx_autofs_vn_ops) { + VN_RELE(nvp); + return (NULL); + } + + return (nvp); +} +/* + * Remove directory nm. For a direct mount with a filesystem mounted over us + * the request goes to that filesystem; otherwise it is passed to the + * underlying directory vnode. + */ +/*ARGSUSED*/ +static int +lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ctp, int flags) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + + /* handle direct mount here; nvp is held and must be released */ + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + int error; + + error = VOP_RMDIR(nvp, nm, cdir, cr, ctp, flags); + VN_RELE(nvp); + return (error); + } + + /* + * cdir is the calling process's current directory. + * If cdir is lx_autofs vnode then get its real underlying + * vnode ptr. (It seems like the only thing cdir is + * ever used for is to make sure the user doesn't delete + * their current directory.) 
+ */ + if (vn_matchops(cdir, lx_autofs_vn_ops)) { + vnode_t *ucdir = cdir->v_data; + return (VOP_RMDIR(udvp, nm, ucdir, cr, ctp, flags)); + } + + return (VOP_RMDIR(udvp, nm, cdir, cr, ctp, flags)); +} + +/* + * VOP entry points - special passthrough + * + * For some VOP entry points we will first pass the request on to + * the underlying filesystem we're mounted on. If there's an error + * then we immediately return the error, but if the request succeeds + * we have to do some extra work before returning. + */ +static int +lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp) +{ + vnode_t *ovp = *vpp; + vnode_t *uvp = ovp->v_data; + int error; + + /* direct mounts were handled by the lookup to get *vpp */ + + if ((error = VOP_OPEN(&uvp, flag, cr, ctp)) != 0) + return (error); + + /* + * Check for clone opens. If the underlying open left uvp unchanged + * this was not a clone open and *vpp is still the right vnode. + */ + if (uvp == ovp->v_data) + return (0); + + /* + * Deal with clone opens by returning a new vnode that wraps the + * vnode the underlying open handed back, and drop the original. + */ + *vpp = lx_autofs_vn_alloc(ovp->v_vfsp, uvp); + VN_RELE(ovp); + return (0); +} +/* + * Get attributes. For a direct mount with a filesystem mounted over us the + * attributes come from that filesystem, otherwise from the underlying vnode. + * On success the result is then stamped with our own ids as described below. + */ +static int +lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + vnode_t *dvp; + int error; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + + if ((dvp = lx_autofs_do_direct(vp)) != NULL) { + uvp = dvp; + } + + error = VOP_GETATTR(uvp, vap, flags, cr, ctp); + + if (dvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(dvp); + if (error == 0) { + /* + * During automounter restart recovery the automounter + * will fstat the fd provided in the setpipe ioctl. It + * uses the resulting inode & dev to correlate future + * autofs fifo requests to the correct entry. Thus, we + * have to update the attributes with our own id's. + */ + vap->va_fsid = data->lav_dev; + vap->va_nodeid = data->lav_ino; + } + } else if (error == 0) { + /* Update the attributes with our filesystem id. 
*/ + vap->va_fsid = data->lav_dev; + } + + return (error); +} +/* + * Create directory nm. For a direct mount with a filesystem mounted over us + * the directory is created there and its vnode is returned as is. Otherwise + * it is created in the underlying directory and the new vnode is wrapped in + * an autofs vnode. + */ +static int +lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + int error; + + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + udvp = nvp; + } + + error = VOP_MKDIR(udvp, nm, vap, vpp, cr, ctp, flags, vsecp); + + if (nvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(nvp); + } else if (error == 0) { + vnode_t *uvp = NULL; + + /* Update the attributes with our filesystem id. */ + vap->va_fsid = dvp->v_vfsp->vfs_dev; + + /* Allocate our new vnode to wrap the underlying one. */ + uvp = *vpp; + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + } + + return (error); +} + +/* + * VOP entry points - custom + */ +/*ARGSUSED*/ +static void +lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + + /* + * We need to hold the vfs lock because if we're going to free + * this vnode we have to prevent anyone from looking it up + * in the vnode hash. + */ + mutex_enter(&data->lav_lock); + mutex_enter(&vp->v_lock); + + if (vp->v_count < 1) { + panic("lx_autofs_inactive: bad v_count"); + /*NOTREACHED*/ + } + + /* Drop the temporary hold by vn_rele now. */ + if (--vp->v_count > 0) { + mutex_exit(&vp->v_lock); + mutex_exit(&data->lav_lock); + return; + } + + /* + * No one should have been blocked on this lock because we're + * about to free this vnode. + * + * NOTE(review): lav_lock and v_lock are both still held on this + * path; presumably lx_autofs_vn_free() drops them -- confirm. 
+ */ + lx_autofs_vn_free(vp); +} +/* + * Look up nm in dvp. An indirect mount first tries the underlying directory. + * If the name is not found (and the mount is not catatonic) the automounter + * is asked to mount it and the lookup is retried: in the underlying directory + * for an indirect mount, or in the filesystem now covering dvp otherwise. + */ +static int +lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *uvp = NULL; + lx_autofs_vfs_t *data; + int error = ENOENT; + + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* + * For an indirect mount first try to lookup if this path component + * already exists. + */ + if (data->lav_mnttype == LXAMT_INDIR) { + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } + + /* + * Only query the automounter if the path does not exist. For the + * other mount types error still holds its initial ENOENT here. + */ + if (error != ENOENT) + return (error); + /* A catatonic mount does not consult the automounter. */ + if (data->lav_catatonic) + return (ENOENT); + + /* Save the uid/gid for the requestor ioctl. */ + data->lav_uid = crgetuid(cr); + data->lav_gid = crgetgid(cr); + + /* Refer the lookup to the automounter. */ + if ((error = lx_autofs_automounter_call(dvp, nm)) != 0) + return (error); + + if (data->lav_mnttype == LXAMT_INDIR) { + /* + * Indirect mount. The automounter call should have mounted + * something on nm. Retry the lookup operation. + */ + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } else { + /* + * Direct or offset mount. The automounter call should have + * covered our 'dvp' with a new filesystem. Traverse into the + * new mount and retry the lookup. + * + * We need to take an extra hold on our vp (which is the autofs + * root vp) to account for the rele done in traverse. Our caller + * will also do a rele on the original dvp and that would leave + * us one ref short on our autofs root vnode. 
+ */ + VN_HOLD(dvp); + if ((error = traverse(&dvp)) != 0) { + VN_RELE(dvp); + return (error); + } + + error = VOP_LOOKUP(dvp, nm, vpp, pnp, flags, rdir, cr, ctp, + direntflags, realpnp); + + /* release the traverse hold on the vnode we looked up in */ + VN_RELE(dvp); + } + return (error); +} +/* + * ioctl entry point: the autofs control ioctls listed below are handled by + * lx_autofs_automounter_ioctl(); anything else goes to the underlying vnode. + */ +static int +lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, + int *rvalp, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + + /* Intercept our ioctls. */ + switch ((uint_t)cmd) { + case LX_AUTOFS_IOC_READY: + case LX_AUTOFS_IOC_FAIL: + case LX_AUTOFS_IOC_CATATONIC: + case LX_AUTOFS_IOC_PROTOVER: + case LX_AUTOFS_IOC_SETTIMEOUT: + case LX_AUTOFS_IOC_EXPIRE: + case LX_AUTOFS_IOC_EXPIRE_MULTI: + case LX_AUTOFS_IOC_PROTOSUBVER: + case LX_AUTOFS_IOC_ASKUMOUNT: + return (lx_autofs_automounter_ioctl(vp, cmd, arg, cr)); + } + + /* Pass any remaining ioctl on. */ + return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp, ctp)); +} + +/* + * VOP entry points definitions + * + * This template is turned into lx_autofs_vn_ops by the vn_make_ops() call + * in lx_autofs_init(). + */ +static const fs_operation_def_t lx_autofs_tops_root[] = { + { VOPNAME_OPEN, { .vop_open = lx_autofs_open } }, + { VOPNAME_CLOSE, { .vop_close = lx_autofs_close } }, + { VOPNAME_IOCTL, { .vop_ioctl = lx_autofs_ioctl } }, + { VOPNAME_RWLOCK, { .vop_rwlock = lx_autofs_rwlock } }, + { VOPNAME_RWUNLOCK, { .vop_rwunlock = lx_autofs_rwunlock } }, + { VOPNAME_GETATTR, { .vop_getattr = lx_autofs_getattr } }, + { VOPNAME_ACCESS, { .vop_access = lx_autofs_access } }, + { VOPNAME_READDIR, { .vop_readdir = lx_autofs_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = lx_autofs_lookup } }, + { VOPNAME_INACTIVE, { .vop_inactive = lx_autofs_inactive } }, + { VOPNAME_MKDIR, { .vop_mkdir = lx_autofs_mkdir } }, + { VOPNAME_RMDIR, { .vop_rmdir = lx_autofs_rmdir } }, + { NULL } +}; + +/* + * DEV-specific entry points + * + * The device keeps no per-open state, so open and close simply succeed; + * all of the work is done in the ioctl handler. + */ + +/*ARGSUSED*/ +static int +lx_autofs_dev_open(dev_t *devp, int flags, int otyp, cred_t *credp) +{ + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_dev_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + 
return (0); +} +/* + * Copy in the fixed-size ioctl command from user address arg and check its + * version: the major number must match ours and the minor number must not + * exceed ours. On success the version fields are overwritten with our own + * so callers can copy them back out. Returns 0, EFAULT or EINVAL. + */ +static int +lx_autofs_dev_validate_cmd(intptr_t arg, lx_autofs_dv_ioctl_t *dcmd) +{ + if (copyin((caddr_t)arg, dcmd, sizeof (lx_autofs_dv_ioctl_t)) != 0) + return (EFAULT); + + if (dcmd->lad_ver_major != LX_AUTOFS_DEV_VERSION_MAJOR || + dcmd->lad_ver_minor > LX_AUTOFS_DEV_VERSION_MINOR) + return (EINVAL); + + DTRACE_PROBE1(lx__dev__cmd, void *, dcmd); + + /* Fill in the version for return */ + dcmd->lad_ver_major = LX_AUTOFS_DEV_VERSION_MAJOR; + dcmd->lad_ver_minor = LX_AUTOFS_DEV_VERSION_MINOR; + return (0); +} +/* + * Search the current zone's vfs list for an lx_autofs vfs whose mount point, + * translated to be zone-relative, equals fs_mntpt. Returns that vfs with a + * hold which the caller must VFS_RELE, or NULL if there is no match. + */ +static vfs_t * +lx_autofs_dev_getvfs_bypath(char *fs_mntpt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (NULL); + } + + do { + if (vfsp->vfs_op == lx_autofs_vfsops) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(fs_mntpt, ZONE_PATH_TRANSLATE(mntpt, zone)) + == 0) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + return (fnd_vfs); +} +/* + * Common preamble for the ioctls that name a mount by lad_ioctlfd: validate + * the command, then check that the fd refers to a vnode on an lx_autofs vfs + * whose root vnode has a v_count greater than one. On success the vfs is + * returned in *vfspp with a hold which the caller must VFS_RELE. Returns 0, + * EBADF, or the error from lx_autofs_dev_validate_cmd(). + */ +static int +lx_autofs_dev_fd_preamble(intptr_t arg, lx_autofs_dv_ioctl_t *dc, vfs_t **vfspp) +{ + int err; + lx_autofs_vfs_t *data; + file_t *fp; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_validate_cmd(arg, dc)) != 0) + return (err); + + if ((fp = getf(dc->lad_ioctlfd)) == NULL) + return (EBADF); + + vfsp = fp->f_vnode->v_vfsp; + if (vfsp->vfs_op != lx_autofs_vfsops) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_root->v_count <= 1) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + VFS_HOLD(vfsp); + *vfspp = vfsp; + + releasef(dc->lad_ioctlfd); + return (0); +} +/* + * Version ioctl: lx_autofs_dev_validate_cmd() fills in our version numbers + * and the command is copied back out to the caller. + */ +static int +lx_autofs_dev_vers(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if 
(copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} +/* Protocol version ioctl: report LX_AUTOFS_PROTO_VERS5 in lad_arg1. */ +static int +lx_autofs_dev_protver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_VERS5; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} +/* Protocol sub-version ioctl: report LX_AUTOFS_PROTO_SUBVERSION in lad_arg1. */ +static int +lx_autofs_dev_protosubver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_SUBVERSION; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} +/* + * Copy in an ioctl command that carries a trailing path. lad_size must be + * larger than the fixed structure and no more than MAXPATHLEN beyond it; the + * path must start with '/' and the last byte of the buffer must be NUL. + * + * On success *dcp points to a kmem_alloc'd copy of lad_size bytes which the + * caller must release with kmem_free(dc, dc->lad_size). Returns 0, EFAULT, + * EINVAL, or the error from lx_autofs_dev_validate_cmd(). + */ +static int +lx_autofs_dev_get_path_cmd(intptr_t arg, lx_autofs_dv_ioctl_t **dcp) +{ + int err; + lx_autofs_dv_ioctl_t dcmd, *dc; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if (dcmd.lad_size <= sizeof (dcmd) || + dcmd.lad_size > (sizeof (dcmd) + MAXPATHLEN)) + return (EINVAL); + + dc = kmem_alloc(dcmd.lad_size, KM_SLEEP); + + /* re-copyin the full struct with the path */ + if (copyin((caddr_t)arg, dc, dcmd.lad_size) != 0) { + kmem_free(dc, dcmd.lad_size); + return (EFAULT); + } + /* + * Keep the size we validated and allocated with; the second copyin + * overwrote lad_size with whatever is in user memory now. + */ + dc->lad_size = dcmd.lad_size; + + if (dc->lad_path[0] != '/' || + dc->lad_path[dcmd.lad_size - sizeof (dcmd) - 1] != '\0') { + kmem_free(dc, dcmd.lad_size); + return (EINVAL); + } + + *dcp = dc; + return (0); +} +/* + * Open-mount ioctl: find the autofs mount at lad_path, open its root vnode + * on a new file descriptor and return that descriptor in lad_ioctlfd. + */ +static int +lx_autofs_dev_openmount(intptr_t arg) +{ + int err; + int fd; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + if ((vfsp = lx_autofs_dev_getvfs_bypath(dc->lad_path)) == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + /* lad_arg1 is the dev number of the mnt but we don't check that */ + + /* + * Do an "open" on the root vnode. 
To fully simulate "open" we also add + * a hold on the root vnode itself since lx_autofs_open will only open + * (and hold) the underlying vnode. + */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + VN_HOLD(data->lav_root); + if ((err = fassign(&data->lav_root, FWRITE|FREAD, &fd)) != 0) { + VN_RELE(data->lav_root); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (err); + } + /* Count this open; closemount decrements the count again. */ + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt++; + mutex_exit(&data->lav_lock); + + dc->lad_ioctlfd = fd; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + /* The caller never learns the fd, so undo the open. */ + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + (void) closeandsetf(fd, NULL); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + VFS_RELE(vfsp); + + kmem_free(dc, dc->lad_size); + return (0); +} +/* + * Close-mount ioctl: close lad_ioctlfd, which must refer to an autofs mount, + * and drop the open count taken by the open-mount ioctl. + */ +static int +lx_autofs_dev_closemount(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* "close" the vnode */ + if ((err = closeandsetf(dcmd.lad_ioctlfd, NULL)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + mutex_enter(&data->lav_lock); + ASSERT(data->lav_openmnt_cnt > 0); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + + VFS_RELE(vfsp); + return (0); +} +/* + * Ready ioctl: acknowledge the request identified by lad_arg1 on the mount + * named by lad_ioctlfd with LXACR_READY. + */ +static int +lx_autofs_dev_ready(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_READY)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + return (0); +} +/* + * Fail ioctl: acknowledge the request identified by lad_arg1 on the mount + * named by lad_ioctlfd with LXACR_FAIL. + */ +static int +lx_autofs_dev_fail(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_FAIL)) != 0) { + 
VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + return (0); +} + +/* + * Update the fifo pipe information we use to talk to the automounter. The + * ioctl is used when the automounter restarts. This logic is similar to the + * handling done in lx_autofs_parse_mntopt() when the filesystem is first + * mounted. + * + * lad_ioctlfd names the mount and lad_arg1 is the new pipe fd. The calling + * process's process group becomes the mount's automounter process group. + */ +static int +lx_autofs_dev_setpipefd(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + int fd, pgrp; + file_t *fp_wr, *fp_rd; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + mutex_enter(&pidlock); + pgrp = curproc->p_pgrp; + mutex_exit(&pidlock); + fd = dcmd.lad_arg1; + + /* + * Lookup the new fifos. See comment in lx_autofs_parse_mntopt. + * Try our process group id first and fall back to our own pid. + */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) { + VFS_RELE(vfsp); + return (EINVAL); + } + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* + * Close the old fifos. + * + * NOTE(review): the fifo, fd and pgrp fields are replaced here + * without taking lav_lock -- confirm nothing can be using the old + * fifos concurrently. + */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + data->lav_fd = fd; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + /* + * Not explicitly in the ioctl spec. 
but necessary for correct recovery + */ + data->lav_catatonic = B_FALSE; + + VFS_RELE(vfsp); + + return (0); +} +/* Catatonic ioctl: mark the mount named by lad_ioctlfd as catatonic. */ +static int +lx_autofs_dev_catatonic(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_catatonic = B_TRUE; + VFS_RELE(vfsp); + + return (0); +} +/* + * Expire ioctl: run lx_autofs_expire() with kcred on the mount named by + * lad_ioctlfd. Returns 0 if that succeeds; any failure from it is reported + * to the caller as EAGAIN. + */ +static int +lx_autofs_dev_expire(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + /* If it succeeds in expiring then we don't want to return EAGAIN */ + if ((err = lx_autofs_expire(vfsp, kcred)) == 0) { + VFS_RELE(vfsp); + return (0); + } + + VFS_RELE(vfsp); + return (EAGAIN); +} +/* Timeout ioctl: store lad_arg1 as the timeout of the mount named by lad_ioctlfd. */ +static int +lx_autofs_dev_timeout(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_timeout = dcmd.lad_arg1; + VFS_RELE(vfsp); + + return (0); +} +/* + * Requester ioctl: find the autofs mount responsible for lad_path and return + * the uid and gid saved by its most recent lookup in lad_arg1 and lad_arg2. + * Returns EINVAL if the path is not a mount point managed by autofs. + */ +static int +lx_autofs_dev_requestor(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + struct vfs *vfslist; + zone_t *zone = curzone; + lx_autofs_vfs_t *data; + uid_t uid; + gid_t gid; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + do { + /* Skip mounts we shouldn't show. 
*/ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Found an indirect mount (probably + * NFS) so we need to get the vfs it's + * mounted onto. + */ + vnode_t *vn = vfsp->vfs_vnodecovered; + vfsp = vn->v_vfsp; + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * autofs doesn't manage this + * path. + */ + break; + } + } + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + data = (lx_autofs_vfs_t *)fnd_vfs->vfs_data; + uid = data->lav_uid; + gid = data->lav_gid; + VFS_RELE(fnd_vfs); + + dc->lad_arg1 = uid; + dc->lad_arg2 = gid; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + return (0); +} + +static int +lx_autofs_dev_ismntpt(intptr_t arg) +{ + int err = 0; + lx_autofs_dv_ioctl_t *dc; + struct vfs *vfslist; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + /* + * The automounter will always pass a path. It can also either pass an + * ioctlfd or, if it's -1, arg1 can be an LX_AUTOFS_TYPE_* value. We + * currently don't need those for our algorithm. 
+ */ + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + do { + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + /* + * To handle direct mounts (on top of an autofs + * mount), we must prefer non-autofs vfs for + * this request. So an autofs match is only + * remembered (replacing any earlier match) + * while the scan continues, and the scan stops + * at the first non-autofs match. + */ + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + + if (fnd_vfs->vfs_op != lx_autofs_vfsops) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + /* + * arg1 is device number, arg2 is superblock magic number + * The superblock value only needs to tell the caller whether this is + * autofs or not, so every other filesystem reports the complement of + * the autofs magic number. + */ + dc->lad_arg1 = fnd_vfs->vfs_dev; + if (fnd_vfs->vfs_op == lx_autofs_vfsops) { + dc->lad_arg2 = LX_AUTOFS_SB_MAGIC; + } else { + dc->lad_arg2 = ~LX_AUTOFS_SB_MAGIC; + } + + VFS_RELE(fnd_vfs); + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + + /* + * We have to return 1 if it is a mount point. The lx ioctl autofs + * translator will convert a negative value back to a positive, + * non-error return value. 
+ */ + return (-1); +} +/* + * Ask-umount ioctl: set lad_arg1 to 0 if lx_autofs_may_unmount() reports that + * the mount named by lad_ioctlfd may be unmounted, and to 1 otherwise. + */ +static int +lx_autofs_dev_askumount(intptr_t arg) +{ + int err; + int v; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if (lx_autofs_may_unmount(vfsp, kcred)) { + v = 0; + } else { + v = 1; + } + VFS_RELE(vfsp); + + dcmd.lad_arg1 = v; + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} +/* + * Device ioctl entry point: dispatch each LX_AUTOFS_DEV_IOC_* command to its + * handler, passing along only the user argument address. Commands that are + * not recognized fail with EINVAL. + */ +/*ARGSUSED*/ +static int +lx_autofs_dev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + switch (cmd) { + case LX_AUTOFS_DEV_IOC_VERSION_CMD: + return (lx_autofs_dev_vers(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOVER_CMD: + return (lx_autofs_dev_protver(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD: + return (lx_autofs_dev_protosubver(arg)); + + case LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD: + return (lx_autofs_dev_openmount(arg)); + + case LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD: + return (lx_autofs_dev_closemount(arg)); + + case LX_AUTOFS_DEV_IOC_READY_CMD: + return (lx_autofs_dev_ready(arg)); + + case LX_AUTOFS_DEV_IOC_FAIL_CMD: + return (lx_autofs_dev_fail(arg)); + + case LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD: + return (lx_autofs_dev_setpipefd(arg)); + + case LX_AUTOFS_DEV_IOC_CATATONIC_CMD: + return (lx_autofs_dev_catatonic(arg)); + + case LX_AUTOFS_DEV_IOC_TIMEOUT_CMD: + return (lx_autofs_dev_timeout(arg)); + + case LX_AUTOFS_DEV_IOC_REQUESTER_CMD: + return (lx_autofs_dev_requestor(arg)); + + case LX_AUTOFS_DEV_IOC_EXPIRE_CMD: + return (lx_autofs_dev_expire(arg)); + + case LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD: + return (lx_autofs_dev_askumount(arg)); + + case LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD: + return (lx_autofs_dev_ismntpt(arg)); + } + + return (EINVAL); /* unrecognized command */ +} + +/* + * lx_autofs_init() gets invoked via the mod_install() call in + * this module's _init() routine. Therefore, the code that cleans + * up the structures we allocate below is actually found in + * our _fini() routine. 
+ */ +/* ARGSUSED */ +static int +lx_autofs_init(int fstype, char *name) +{ + int error; + + lx_autofs_major = ddi_name_to_major(LX_AUTOFS_NAME); + + lx_autofs_fstype = fstype; + if ((error = vfs_setfsops(fstype, lx_autofs_vfstops, + &lx_autofs_vfsops)) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template"); + return (error); + } + + if ((error = vn_make_ops(name, lx_autofs_tops_root, + &lx_autofs_vn_ops)) != 0) { + /* Undo the vfs_setfsops() above before failing. */ + VERIFY(vfs_freevfsops_by_type(fstype) == 0); + lx_autofs_vn_ops = NULL; + return (error); + } + + return (0); +} +/* + * Attach entry point: only DDI_ATTACH of instance 0 is supported. Creates + * the character minor node and remembers the dev_info pointer. + */ +/*ARGSUSED*/ +static int +lx_autofs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance = ddi_get_instance(dip); + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + ASSERT(instance == 0); + if (instance != 0) + return (DDI_FAILURE); + + /* create our minor node */ + if (ddi_create_minor_node(dip, LX_AUTOFS_MINORNAME, S_IFCHR, 0, + DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + lx_autofs_dip = dip; + + return (DDI_SUCCESS); +} +/* + * Detach entry point: only DDI_DETACH is supported. + * + * NOTE(review): the minor node created in lx_autofs_attach() is not removed + * here; presumably the DDI framework removes it after a successful detach -- + * confirm, or add a ddi_remove_minor_node() call. + */ +/*ARGSUSED*/ +static int +lx_autofs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + lx_autofs_dip = NULL; + + return (DDI_SUCCESS); +} +/* + * getinfo entry point: report the saved dev_info pointer or instance 0, the + * only instance this driver attaches. + */ +/*ARGSUSED*/ +static int +lx_autofs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, + void **resultp) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = lx_autofs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Character device entry points. Only open, close and ioctl are + * implemented. + */ +static struct cb_ops lx_autofs_cb_ops = { + lx_autofs_dev_open, /* open */ + lx_autofs_dev_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + lx_autofs_dev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ 
+}; + +/* + * Module linkage + * + * This one module registers both a filesystem (modlfs) and a device driver + * (modldrv) through a single modlinkage. + */ +static mntopt_t lx_autofs_mntopt[] = { + { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_INDIRECT, NULL, 0, 0 }, + { LX_MNTOPT_DIRECT, NULL, 0, 0 }, + { LX_MNTOPT_OFFSET, NULL, 0, 0 } +}; + +static mntopts_t lx_autofs_mntopts = { + sizeof (lx_autofs_mntopt) / sizeof (mntopt_t), + lx_autofs_mntopt +}; +/* Filesystem definition; lx_autofs_init() is its init routine. */ +static vfsdef_t vfw = { + VFSDEF_VERSION, + LX_AUTOFS_NAME, + lx_autofs_init, + VSW_HASPROTO | VSW_VOLATILEDEV | VSW_ZMOUNT, + &lx_autofs_mntopts +}; + +static struct dev_ops lx_autofs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + lx_autofs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_autofs_attach, /* attach */ + lx_autofs_detach, /* detach */ + nodev, /* reset */ + &lx_autofs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx autofs filesystem", &vfw +}; + +static struct modldrv modldrv = { + &mod_driverops, "lx autofs driver", &lx_autofs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlfs, + (void *)&modldrv, + NULL +}; +/* Module load: install both the filesystem and the driver. */ +int +_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) { + return (error); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} +/* + * Module unload: once mod_remove() succeeds, free the vnode ops created in + * lx_autofs_init(). + */ +int +_fini(void) +{ + int error; + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + if (lx_autofs_vn_ops != NULL) { + vn_freevnodeops(lx_autofs_vn_ops); + lx_autofs_vn_ops = NULL; + } + + /* + * In our init routine, if we get an error after calling + * vfs_setfsops() we clean up by calling vfs_freevfsops_by_type(). 
+ * But we don't need to call vfs_freevfsops_by_type() here + * because the fs framework did this for us as part of the + * mod_remove() call above. + */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/autofs/lxautofs.conf b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf new file mode 100644 index 0000000000..36e0119e33 --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2016 Joyent, Inc. +# + +name="lxautofs" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h new file mode 100644 index 0000000000..df938adcea --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h @@ -0,0 +1,223 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _LXCGRPS_H +#define _LXCGRPS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * cgrps.h: declarations, data structures and macros for lx_cgroup + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> + +/* + * cgrpmgr ioctl interface. + * + * NOTE(review): cgrpmgr_info_t is presumably the argument of CGRPFS_GETEVNT; + * its use is not visible in this header -- confirm. + */ +#define CGRPFS_IOC ('C' << 16 | 'G' << 8) +#define CGRPFS_GETEVNT (CGRPFS_IOC | 1) + +typedef struct cgrpmgr_info { + pid_t cgmi_pid; + char *cgmi_rel_agent_path; + char *cgmi_cgroup_path; +} cgrpmgr_info_t; + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> +/* + * Same members as cgrpmgr_info_t but with caddr32_t in place of the pointers; + * presumably the layout used by 32-bit callers -- confirm. + */ +typedef struct cgrpmgr_info32 { + pid_t cgmi_pid; + caddr32_t cgmi_rel_agent_path; + caddr32_t cgmi_cgroup_path; +} cgrpmgr_info32_t; + +#define CG_PSNSIZE 256 /* max size of pseudo file name entries */ +#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */ + +/* + * The order of these entries must be in sync with the cg_ssde_dir array. 
+ */ +typedef enum cgrp_ssid { + CG_SSID_GENERIC = 1, + CG_SSID_NUM /* last ssid for range checking */ +} cgrp_ssid_t; + +typedef enum cgrp_nodetype { + CG_CGROUP_DIR = 1, /* cgroup directory entry */ + CG_NOTIFY, /* notify_on_release file */ + CG_PROCS, /* cgroup.procs file */ + CG_REL_AGENT, /* release_agent file */ + CG_TASKS, /* tasks file */ +} cgrp_nodetype_t; +/* Pairs a pseudo file's node type with its file name. */ +typedef struct cgrp_subsys_dirent { + cgrp_nodetype_t cgrp_ssd_type; + char *cgrp_ssd_name; +} cgrp_subsys_dirent_t; +/* + * Pseudo file entries for mount m's subsystem, plus two more; presumably + * the "." and ".." entries -- confirm. + */ +#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2) + +/* + * A modern systemd-based Linux system typically has 50-60 cgroups so + * we size the hash for 2x that number. + */ +#define CGRP_HASH_SZ 128 +#define CGRP_AGENT_LEN (MAXPATHLEN + 1) + +/* + * cgroups per-mount data structure. + * + * All but the event related fields are protected by cg_contents. + * The evnt_list and counter is protected by cg_events. + * + * NOTE(review): the structure below declares neither a cg_events lock nor + * an event list or counter, so the two sentences about event fields look + * stale -- confirm and trim. + */ +typedef struct cgrp_mnt { + struct vfs *cg_vfsp; /* filesystem's vfs struct */ + struct cgrp_node *cg_rootnode; /* root cgrp_node */ + char *cg_mntpath; /* name of cgroup mount point */ + cgrp_ssid_t cg_ssid; /* subsystem type */ + dev_t cg_dev; /* unique dev # of mounted `device' */ + uint_t cg_gen; /* node ID source for files */ + uint_t cg_grp_gen; /* ID source for cgroups */ + kmutex_t cg_contents; /* global lock for most fs activity */ + char cg_agent[CGRP_AGENT_LEN]; /* release_agent path */ + /* ptr to zone data for containing zone */ + lx_zone_data_t *cg_lxzdata; + struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */ +} cgrp_mnt_t; + +/* + * cgrp_node is the file system dependent node for cgroups. + * + * The node is used to represent both directories (a cgroup) and pseudo files + * within the directory. + * + * Members are tagged in the comment to note which type of node they apply to: + * A - all + * D - dir (i.e. 
a cgroup) + * F - pseudo file + */ + +typedef struct cgrp_node { + struct cgrp_node *cgn_back; /* A linked list of cgrp_nodes */ + struct cgrp_node *cgn_forw; /* A linked list of cgrp_nodes */ + struct cgrp_dirent *cgn_dir; /* D dirent list */ + struct cgrp_node *cgn_parent; /* A dir containing this node */ + struct cgrp_node *cgn_next; /* D link in per-mount cgroup */ + /* hash table */ + uint_t cgn_dirents; /* D number of dirents */ + cgrp_nodetype_t cgn_type; /* A type for this node */ + uint_t cgn_notify; /* D notify_on_release value */ + uint_t cgn_task_cnt; /* D number of threads in grp */ + struct vnode *cgn_vnode; /* A vnode for this cgrp_node */ + uint_t cgn_id; /* D ID number for the cgroup */ + struct vattr cgn_attr; /* A attributes */ +} cgrp_node_t; + +/* + * File system independent to cgroups conversion macros + */ +#define VFSTOCGM(vfsp) ((cgrp_mnt_t *)(vfsp)->vfs_data) +#define VTOCGM(vp) ((cgrp_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOCGN(vp) ((struct cgrp_node *)(vp)->v_data) +#define CGNTOV(cn) ((cn)->cgn_vnode) +#define cgnode_hold(cn) VN_HOLD(CGNTOV(cn)) +#define cgnode_rele(cn) VN_RELE(CGNTOV(cn)) + +/* + * Attributes: shorthand names for the members of a node's embedded + * cgn_attr. + */ +#define cgn_mask cgn_attr.va_mask +#define cgn_mode cgn_attr.va_mode +#define cgn_uid cgn_attr.va_uid +#define cgn_gid cgn_attr.va_gid +#define cgn_fsid cgn_attr.va_fsid +#define cgn_nodeid cgn_attr.va_nodeid +#define cgn_nlink cgn_attr.va_nlink +#define cgn_size cgn_attr.va_size +#define cgn_atime cgn_attr.va_atime +#define cgn_mtime cgn_attr.va_mtime +#define cgn_ctime cgn_attr.va_ctime +#define cgn_rdev cgn_attr.va_rdev +#define cgn_blksize cgn_attr.va_blksize +#define cgn_nblocks cgn_attr.va_nblocks +#define cgn_seq cgn_attr.va_seq + +/* + * cgroup directories are made up of a linked list of cgrp_dirent structures + * hanging off directory cgrp_nodes. File names are not fixed length, + * but are null terminated. 
+ */ +typedef struct cgrp_dirent { + struct cgrp_node *cgd_cgrp_node; /* cg node for this file */ + struct cgrp_dirent *cgd_next; /* next directory entry */ + struct cgrp_dirent *cgd_prev; /* prev directory entry */ + uint_t cgd_offset; /* "offset" of dir entry */ + uint_t cgd_hash; /* a hash of cgd_name */ + struct cgrp_dirent *cgd_link; /* linked via hash table */ + struct cgrp_node *cgd_parent; /* parent, dir we are in */ + char *cgd_name; /* null terminated */ +} cgrp_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +extern struct vnodeops *cgrp_vnodeops; + +int cgrp_dirdelete(cgrp_node_t *, cgrp_node_t *, char *, enum dr_op, cred_t *); +int cgrp_direnter(cgrp_mnt_t *, cgrp_node_t *, char *, enum de_op, + cgrp_node_t *, struct vattr *, cgrp_node_t **, cred_t *, + caller_context_t *); +void cgrp_dirinit(cgrp_node_t *, cgrp_node_t *, cred_t *); +int cgrp_dirlookup(cgrp_node_t *, char *, cgrp_node_t **, cred_t *); +void cgrp_dirtrunc(cgrp_node_t *); +void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *); +int cgrp_taccess(void *, int, cred_t *); +ino_t cgrp_inode(cgrp_nodetype_t, unsigned int); +int cgrp_num_pseudo_ents(cgrp_ssid_t); +cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t); +void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *); + +#endif /* KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXCGRPS_H */ diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c new file mode 100644 index 0000000000..8950be1966 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c @@ -0,0 +1,1019 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "cgrps.h" + +static int cgrp_dirmakecgnode(cgrp_node_t *, cgrp_mnt_t *, struct vattr *, + enum de_op, cgrp_node_t **, struct cred *); +static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *, enum de_op); + +static cgrp_subsys_dirent_t cgrp_generic_dir[] = { + { CG_PROCS, "cgroup.procs" }, + { CG_NOTIFY, "notify_on_release" }, + { CG_TASKS, "tasks" } +}; + +typedef struct cgrp_ssde { + cgrp_subsys_dirent_t *cg_ssde_files; + int cg_ssde_nfiles; +} cgrp_ssde_t; + +#define CGDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +/* + * Note, these entries must be in the same order as the cgrp_ssid_t entries. 
+ */ +static cgrp_ssde_t cg_ssde_dir[] = { + /* subsystems start at 1 */ + {NULL, 0}, + + /* CG_SSID_GENERIC */ + {cgrp_generic_dir, CGDIRLISTSZ(cgrp_generic_dir)}, +}; + + +#define CG_HASH_SIZE 8192 /* must be power of 2 */ +#define CG_MUTEX_SIZE 64 + +static cgrp_dirent_t *cg_hashtable[CG_HASH_SIZE]; +static kmutex_t cg_hashmutex[CG_MUTEX_SIZE]; + +#define CG_HASH_INDEX(a) ((a) & (CG_HASH_SIZE-1)) +#define CG_MUTEX_INDEX(a) ((a) & (CG_MUTEX_SIZE-1)) + +#define CG_HASH(cp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(cp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +#define MODESHIFT 3 + +typedef enum cgrp_nodehold { + NOHOLD, + HOLD +} cgrp_nodehold_t; + +void +cgrp_hash_init(void) +{ + int i; + + for (i = 0; i < CG_MUTEX_SIZE; i++) + mutex_init(&cg_hashmutex[i], NULL, MUTEX_DEFAULT, NULL); +} + +static void +cgrp_hash_in(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + CG_HASH(c->cgd_parent, c->cgd_name, hash); + c->cgd_hash = hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + c->cgd_link = *prevpp; + *prevpp = c; + mutex_exit(cg_hmtx); +} + +static void +cgrp_hash_out(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + hash = c->cgd_hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + while (*prevpp != c) + prevpp = &(*prevpp)->cgd_link; + *prevpp = c->cgd_link; + mutex_exit(cg_hmtx); +} + +static cgrp_dirent_t * +cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold, + cgrp_node_t **found) +{ + cgrp_dirent_t *l; + uint_t hash; + kmutex_t *cg_hmtx; + cgrp_node_t *cnp; + + CG_HASH(parent, name, hash); + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + l = cg_hashtable[CG_HASH_INDEX(hash)]; + while (l) { + if 
((l->cgd_hash == hash) && + (l->cgd_parent == parent) && + (strcmp(l->cgd_name, name) == 0)) { + /* + * We need to make sure that the cgrp_node that + * we put a hold on is the same one that we pass back. + * Hence, temporary variable cnp is necessary. + */ + cnp = l->cgd_cgrp_node; + if (hold == HOLD) { + ASSERT(cnp); + cgnode_hold(cnp); + } + if (found) + *found = cnp; + mutex_exit(cg_hmtx); + return (l); + } else { + l = l->cgd_link; + } + } + mutex_exit(cg_hmtx); + return (NULL); +} + +/* + * The following functions maintain the per-mount cgroup hash table. + */ +static void +cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + cn->cgn_next = cgm->cg_grp_hash[hsh]; + cgm->cg_grp_hash[hsh] = cn; +} + +static void +cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + cgrp_node_t *np = NULL, *curp, *prevp = NULL; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + if (prevp == NULL) { + cgm->cg_grp_hash[hsh] = curp->cgn_next; + } else { + prevp->cgn_next = curp->cgn_next; + } + np = curp; + np->cgn_next = NULL; + break; + } + + prevp = curp; + } + + ASSERT(np != NULL); + ASSERT(np->cgn_task_cnt == 0); +} + +/* + * Count up the number of threads already running in the zone and initialize the + * first cgroup's task counter. + * + * We have to look at all of the processes to find applicable ones. 
+ */ +static void +cgrp_cg_hash_init(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + int i; + int cnt = 0; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Scan all of the process entries */ + mutex_enter(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, and processes + * that are not in the zone. Processes that are not lx-branded + * are skipped further down, once p_lock is held. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_zone->zone_id != zoneid) { + continue; + } + + /* + * The brand check and the read of p_lwpcnt are both done + * under p_lock; every LWP of an lx process counts as a task. + */ + mutex_enter(&p->p_lock); + if (p->p_brand != &lx_brand) { + mutex_exit(&p->p_lock); + continue; + } + cnt += p->p_lwpcnt; + mutex_exit(&p->p_lock); + } + + /* + * There should be at least the init process with 1 thread in the zone + */ + ASSERT(cnt > 0); + cn->cgn_task_cnt = cnt; + + DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt); + + mutex_exit(&pidlock); +} + +/* + * Find the cgroup with ID 'cgid' in the per-mount cgroup hash table. + * + * Returns the matching cgrp_node, or NULL if no cgroup in this mount has + * that ID. The caller must hold cg_contents. + */ +cgrp_node_t * +cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid) +{ + int hsh = cgid % CGRP_HASH_SZ; + cgrp_node_t *curp; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + return (curp); + } + } + + return (NULL); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them to give the inode number for + * a cgrp pseudo file node.
+ */ +ino_t +cgrp_inode(cgrp_nodetype_t type, unsigned int cgrpid) +{ + /* + * cgroup inode format: + * 00000000AABBBBBB + * + * AA - node type (from subsystem list) + * BBBBBB - id of the cgroup + */ + + return ((ino_t)(type << 24) | (cgrpid & 0xffffff)); +} + +/* + * Return the number of pseudo file entries in a cgroup directory for the + * given subsystem. + */ +int +cgrp_num_pseudo_ents(cgrp_ssid_t ssid) +{ + cgrp_ssde_t *ssdp = &cg_ssde_dir[ssid]; + + return (ssdp->cg_ssde_nfiles); +} + +int +cgrp_taccess(void *vcp, int mode, cred_t *cred) +{ + cgrp_node_t *cn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in cgrp_node. + */ + if (crgetuid(cred) != cn->cgn_uid) { + shift += MODESHIFT; + if (groupmember(cn->cgn_gid, cred) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cred, CGNTOV(cn), cn->cgn_uid, + cn->cgn_mode << shift, mode)); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found cgrp_node with its vnode held. + */ +int +cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp, + cred_t *cred) +{ + cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode); + int error; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + *foundcp = NULL; + if (parent->cgn_type != CG_CGROUP_DIR) + return (ENOTDIR); + + if ((error = cgrp_taccess(parent, VEXEC, cred))) + return (error); + + if (*name == '\0') { + cgnode_hold(parent); + *foundcp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the cgn_dir list + * so that it doesn't change out from underneath us. + * cgrp_hash_lookup() will pass back the cgrp_node + * with a hold on it. + */ + + if (cgrp_hash_lookup(name, parent, HOLD, foundcp) != NULL) { + ASSERT(*foundcp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry for 'name' and 'cp' into directory 'dir' + * + * Returns 0 on success. 
+ */ +int +cgrp_direnter( + cgrp_mnt_t *cgm, + cgrp_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + cgrp_node_t *cn, /* existing cgrp_node, if rename */ + struct vattr *va, + cgrp_node_t **cnp, /* return cgrp_node, if create/mkdir */ + cred_t *cred, + caller_context_t *ctp) +{ + cgrp_dirent_t *cdp; + cgrp_node_t *found = NULL; + int error = 0; + char *s; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("cgrp_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. + * Remember that we can only rename within the same directory. + */ + if (op == DE_RENAME) { + if (cn->cgn_nlink == 0) { + return (ENOENT); + } + + if (cn->cgn_nlink == MAXLINK) { + return (EMLINK); + } + cn->cgn_nlink++; + gethrestime(&cn->cgn_ctime); + } + + /* + * This might be a "dangling detached directory". + * it could have been removed, but a reference + * to it kept in u_cwd. don't bother searching + * it, and with any luck the user will get tired + * of dealing with us and cd to some absolute + * pathway. *sigh*, thus in ufs, too. + */ + if (dir->cgn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * Search for the entry. In all cases it is an error if it exists. + */ + cdp = cgrp_hash_lookup(name, dir, HOLD, &found); + + if (cdp) { + ASSERT(found != NULL); + error = EEXIST; + mutex_exit(&cgm->cg_contents); + cgnode_rele(found); + mutex_enter(&cgm->cg_contents); + } else { + + /* + * The entry does not exist. Check write permission in + * directory to see if entry can be created. 
+ */ + if ((error = cgrp_taccess(dir, VWRITE, cred)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new cgrp_node and directory entry as required. + */ + error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred); + if (error) + goto out; + + if (op == DE_MKDIR) { + /* + * inherit notify_on_release value from parent + */ + cn->cgn_notify = dir->cgn_notify; + } + } + + error = cgrp_diraddentry(dir, cn, name, op); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + if ((cn->cgn_type) == CG_CGROUP_DIR) { + ASSERT(cdp == NULL); + /* + * cleanup allocs made by cgrp_dirinit + * + * NOTE(review): cgrp_dirinit() also put cn on + * the per-mount cgroup hash (via + * cgrp_cg_hash_insert) and nothing on this + * path takes it back off; confirm the node + * cannot be freed while it is still hashed. + */ + cgrp_dirtrunc(cn); + } + cn->cgn_nlink = 0; + gethrestime(&cn->cgn_ctime); + /* cg_contents is dropped around the release */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + cn = NULL; + } + } else if (cnp) { + *cnp = cn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + /* caller did not ask for the new node back */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + cn->cgn_nlink--; + gethrestime(&cn->cgn_ctime); + } + return (error); +} + +/* + * Delete entry cn of name "nm" from parent dir. This is used to both remove + * a cgroup directory and to remove the pseudo file nodes within the cgroup + * directory (by recursively calling itself). It frees the dir entry space + * and decrements link count on cgrp_node(s). + * + * Return 0 on success. + */ +int +cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, + cred_t *cred) +{ + cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode); + cgrp_dirent_t *cndp; + int error; + size_t namelen; + cgrp_node_t *cnnp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + if (nm[0] == '\0') + panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn); + + /* + * return error when removing . and .. + */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.'
&& nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = cgrp_taccess(dir, VEXEC|VWRITE, cred)) != 0) + return (error); + + if (dir->cgn_dir == NULL) + return (ENOENT); + + if (op == DR_RMDIR) { + /* + * This is the top-level removal of a cgroup dir. Start by + * removing the fixed pseudo file entries from the dir. We do + * this by recursively calling back into this function with + * a different op code. The caller of this function has + * already verified that it is safe to remove this directory. + */ + cgrp_dirent_t *cdp; + + ASSERT(cn->cgn_type == CG_CGROUP_DIR); + + cdp = cn->cgn_dir; + while (cdp) { + cgrp_node_t *pseudo_node; + cgrp_dirent_t *nextp; + + if (strcmp(cdp->cgd_name, ".") == 0 || + strcmp(cdp->cgd_name, "..") == 0) { + cdp = cdp->cgd_next; + continue; + } + + pseudo_node = cdp->cgd_cgrp_node; + nextp = cdp->cgd_next; + + cgnode_hold(pseudo_node); + error = cgrp_dirdelete(cn, pseudo_node, + cdp->cgd_name, DR_REMOVE, cred); + mutex_exit(&cgm->cg_contents); + cgnode_rele(pseudo_node); + mutex_enter(&cgm->cg_contents); + + cdp = nextp; + } + + cgrp_cg_hash_remove(cgm, cn); + } + + cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp); + VERIFY(cndp != NULL); + VERIFY(cn == cnnp); + + cgrp_hash_out(cndp); + + /* Take cndp out of the directory list. */ + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + if (cndp->cgd_prev) { + cndp->cgd_prev->cgd_next = cndp->cgd_next; + } + if (cndp->cgd_next) { + cndp->cgd_next->cgd_prev = cndp->cgd_prev; + } + + /* + * If the roving slot pointer happens to match cndp, + * point it at the previous dirent. 
+ */ + if (dir->cgn_dir->cgd_prev == cndp) { + dir->cgn_dir->cgd_prev = cndp->cgd_prev; + } + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + + /* cndp points to the correct directory entry */ + namelen = strlen(cndp->cgd_name) + 1; + + kmem_free(cndp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + cn->cgn_ctime = now; + + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) { + cgrp_dirtrunc(cn); + ASSERT(cn->cgn_nlink == 0); + } + return (0); +} + +/* + * Initialize a cgrp_node and add it to file list under mount point. + */ +void +cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(vap != NULL); + + cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + cn->cgn_mask = 0; + cn->cgn_attr.va_type = vap->va_type; + cn->cgn_nlink = 1; + cn->cgn_size = 0; + + if (cred == NULL) { + cn->cgn_uid = vap->va_uid; + cn->cgn_gid = vap->va_gid; + } else { + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + } + + cn->cgn_fsid = cgm->cg_dev; + cn->cgn_rdev = vap->va_rdev; + cn->cgn_blksize = PAGESIZE; + cn->cgn_nblocks = 0; + gethrestime(&now); + cn->cgn_atime = now; + cn->cgn_mtime = now; + cn->cgn_ctime = now; + cn->cgn_seq = 0; + cn->cgn_dir = NULL; + + cn->cgn_vnode = vn_alloc(KM_SLEEP); + vp = CGNTOV(cn); + vn_setops(vp, cgrp_vnodeops); + vp->v_vfsp = cgm->cg_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)cn; + + cn->cgn_nodeid = cgm->cg_gen++; + + /* + * Add new cgrp_node to end of linked list of cgrp_nodes for this + * cgroup fs. Root directory is handled specially in cgrp_mount. 
+ */ + if (cgm->cg_rootnode != (cgrp_node_t *)NULL) { + cn->cgn_forw = NULL; + cn->cgn_back = cgm->cg_rootnode->cgn_back; + cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn; + } + vn_exists(vp); +} + +/* + * Create the pseudo file 'name' of node type 'type' in cgroup directory + * 'dir', using the attributes in 'nattr' and the credentials 'cr'. + * + * Must be called with cg_contents held; the lock is dropped and re-taken + * around the final release of the new node. If the directory entry cannot + * be created, nothing is added and the function simply returns. + */ +void +cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name, + cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr) +{ + cgrp_node_t *ncn = NULL; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* + * cgrp_direnter can fail (for example when the KM_NOSLEEP dirent + * allocation in cgrp_diraddentry fails). In that case it has already + * released any node it made and ncn was never set, so there is + * nothing to fix up or release here. + */ + if (cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, + nattr, &ncn, cr, NULL) != 0) + return; + + /* + * Fix the inode and assign the pseudo file type to be correct. + */ + ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid); + ncn->cgn_type = type; + + /* + * Since we're creating these entries here and not via the + * normal VOP_CREATE code path, we need to do the rele to drop + * our hold. This will leave the vnode v_count at 0 when we + * come out of cgrp_inactive but we won't reclaim the vnode + * there since the cgn_nlink value will still be 1. + */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(ncn); + mutex_enter(&cgm->cg_contents); +} + +/* + * cgrp_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. + */ +void +cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) +{ + cgrp_dirent_t *dot, *dotdot; + timestruc_t now; + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); + cgrp_ssde_t *ssdp; + cgrp_subsys_dirent_t *pseudo_files; + struct vattr nattr; + int i; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM); + ssdp = &cg_ssde_dir[cgm->cg_ssid]; + + /* + * If this is the top-level cgroup created by the mount then we need to + * count up the number of procs and tasks already running in the zone. + */ + + /* + * Set the cgroup ID for this cgrp_node by using a counter on each + * mount.
+ */ + dir->cgn_id = cgm->cg_grp_gen++; + cgrp_cg_hash_insert(cgm, dir); + /* Initialise the first cgroup if this is top-level group */ + if (parent == dir) + cgrp_cg_hash_init(cgm, dir); + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP); + dot->cgd_cgrp_node = dir; + dot->cgd_offset = 0; + dot->cgd_name = (char *)dot + sizeof (cgrp_dirent_t); + dot->cgd_name[0] = '.'; + dot->cgd_parent = dir; + cgrp_hash_in(dot); + + dotdot = kmem_zalloc(sizeof (cgrp_dirent_t) + 3, KM_SLEEP); + dotdot->cgd_cgrp_node = parent; + dotdot->cgd_offset = 1; + dotdot->cgd_name = (char *)dotdot + sizeof (cgrp_dirent_t); + dotdot->cgd_name[0] = '.'; + dotdot->cgd_name[1] = '.'; + dotdot->cgd_parent = dir; + cgrp_hash_in(dotdot); + + /* + * Initialize directory entry list. + */ + dot->cgd_next = dotdot; + dot->cgd_prev = dotdot; /* dot's cgd_prev holds roving slot pointer */ + dotdot->cgd_next = NULL; + dotdot->cgd_prev = dot; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + parent->cgn_nlink++; + parent->cgn_ctime = now; + + dir->cgn_dir = dot; + dir->cgn_size = 2 * sizeof (cgrp_dirent_t) + 5; /* dot and dotdot */ + dir->cgn_dirents = 2; + dir->cgn_nlink = 2; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; + + /* + * If this is the top-level dir in the file system then it always + * has a release_agent pseudo file. Only the top-level dir has this + * file. + */ + if (parent == dir) { + cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr, + cr); + } + + pseudo_files = ssdp->cg_ssde_files; + for (i = 0; i < ssdp->cg_ssde_nfiles; i++) { + cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name, + pseudo_files[i].cgrp_ssd_type, &nattr, cr); + } +} + +/* + * cgrp_dirtrunc is called to remove all directory entries under this directory. 
+ */ +void +cgrp_dirtrunc(cgrp_node_t *dir) +{ + cgrp_dirent_t *cgdp; + timestruc_t now; + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) { + size_t namelen; + cgrp_node_t *cn; + + ASSERT(cgdp->cgd_next != cgdp); + ASSERT(cgdp->cgd_prev != cgdp); + ASSERT(cgdp->cgd_cgrp_node); + + dir->cgn_dir = cgdp->cgd_next; + namelen = strlen(cgdp->cgd_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + cn = cgdp->cgd_cgrp_node; + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + + cgrp_hash_out(cgdp); + kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + } + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + ASSERT(dir->cgn_dir == NULL); + ASSERT(dir->cgn_size == 0); + ASSERT(dir->cgn_dirents == 0); +} + +static int +cgrp_diraddentry(cgrp_node_t *dir, cgrp_node_t *cn, char *name, enum de_op op) +{ + cgrp_dirent_t *cdp, *cpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->cgn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. */ + if (cn->cgn_vnode->v_vfsp != dir->cgn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (cgrp_dirent_t); + cdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); + if (cdp == NULL) + return (ENOSPC); + + cn->cgn_parent = dir; + + dir->cgn_size += alloc_size; + dir->cgn_dirents++; + cdp->cgd_cgrp_node = cn; + cdp->cgd_parent = dir; + + /* The directory entry and its name were allocated sequentially. 
*/ + cdp->cgd_name = (char *)cdp + sizeof (cgrp_dirent_t); + (void) strcpy(cdp->cgd_name, name); + + cgrp_hash_in(cdp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in cgrp_readdir. + */ + cpdp = dir->cgn_dir->cgd_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (cpdp->cgd_next != NULL && (cpdp->cgd_next->cgd_offset - + cpdp->cgd_offset) <= 1) { + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + ASSERT(cpdp->cgd_next->cgd_offset > cpdp->cgd_offset); + cpdp = cpdp->cgd_next; + } + cdp->cgd_offset = cpdp->cgd_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's cgd_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. 
+ */ + if (cpdp->cgd_next == NULL && cpdp->cgd_offset > 2 * dir->cgn_dirents) + dir->cgn_dir->cgd_prev = dir->cgn_dir->cgd_next; + else + dir->cgn_dir->cgd_prev = cdp; + + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + cdp->cgd_next = cpdp->cgd_next; + if (cdp->cgd_next) { + cdp->cgd_next->cgd_prev = cdp; + } + cdp->cgd_prev = cpdp; + cpdp->cgd_next = cdp; + + ASSERT(cdp->cgd_next != cdp); + ASSERT(cdp->cgd_prev != cdp); + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + return (0); +} + +static int +cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va, + enum de_op op, cgrp_node_t **newnode, struct cred *cred) +{ + cgrp_node_t *cn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + cn = kmem_zalloc(sizeof (cgrp_node_t), KM_SLEEP); + cgrp_node_init(cgm, cn, va, cred); + + cn->cgn_vnode->v_rdev = cn->cgn_rdev = NODEV; + cn->cgn_vnode->v_type = va->va_type; + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + + if (va->va_mask & AT_ATIME) + cn->cgn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + cn->cgn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + cn->cgn_type = CG_CGROUP_DIR; + cgrp_dirinit(dir, cn, cred); + } + + *newnode = cn; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c new file mode 100644 index 0000000000..a9bd783569 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c @@ -0,0 +1,1052 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The cgroup file system implements a subset of the Linux cgroup functionality + * for use by lx-branded zones. On Linux, cgroups are a generic process grouping + * mechanism which is used to apply various behaviors to the processes within + * the group, although its primary purpose is for resource management. + * + * In Linux, the cgroup file system provides two pieces of functionality: + * 1) A per-mount set of cgroups arranged in a tree, such that every task in + * the system is in one, and only one, of the cgroups in the tree. + * 2) A set of subsystems; each subsystem has subsystem-specific state and + * behavior and is associated with a cgroup mount. This provides a way to + * apply arbitrary functionality (but generally resource management related) + * to the processes associated with the nodes in the tree at that mount + * point. + * + * For example, it is common to see cgroup trees (each is its own mount with a + * different subsystem controller) for blkio, cpuset, memory, systemd (has no + * controller), etc. Within each tree there is a top-level directory with at + * least a cgroup.procs, notify_on_release, release_agent, and tasks file. + * The cgroup.procs file lists the processes within that group and the tasks + * file lists the threads in the group. There could be subdirectories, which + * define new cgroups, that then contain a subset of the processes. Each + * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and + * tasks file. + * + * Since we're using lx to run user-level code within zones, the majority (all?) + * of the cgroup resource management functionality simply doesn't apply to us. + * The primary need for cgroups is to support the init program 'systemd' as the + * consumer.
systemd only requires the process grouping hierarchy of cgroups, + * although it can also use the resource management features if they are + * available. Given this, our cgroup file system only implements the process + * hierarchy and does not report that any resource management controllers are + * available for separate mounts. + * + * In addition to the hierarchy, the other important component of cgroups that + * is used by systemd is the 'release_agent'. This provides a mechanism to + * run a command when a cgroup becomes empty (the last task in the group + * leaves, either by exit or move, and there are no more sub-cgroups). The + * 'release_agent' file only exists in the top-level cgroup of the mounted + * file system and holds the path to a command to run. The 'notify_on_release' + * file exists in each cgroup dir. If that file contains a '1' then the agent + * is run when that group becomes empty. The agent is passed a path string of + * the cgroup, relative to the file system mount point (e.g. a mount on + * /sys/fs/cgroups/systemd with a sub-cgroup of /sys/fs/cgroups/systemd/foo/bar + * gets the arg /foo/bar). + * + * Cgroup membership is implemented via hooks into the lx brand code. When + * the cgroup file system loads it installs callbacks for: + * lx_cgrp_initlwp + * lx_cgrp_freelwp + * and when it unloads it clears those hooks. The lx brand code calls those + * hooks when a lwp starts and when it exits. Internally we use a + * simple reference counter (cgn_task_cnt) on the cgroup node to track how many + * threads are in the group, so we can tell when a group becomes empty. + * To make this quick, a hash table (cg_grp_hash) is maintained on the + * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is + * sized so that there should typically only be 0 or 1 cgroups per bucket. 
+ * We also keep a reference to the file system in the zone-specific brand data + * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t + * when it runs the hook. + * + * Once a cgroup is about to become empty, the final process exiting the cgroup + * will launch a new user-level process which execs the release agent. The new + * process is created as a child of zsched (indicated by the -1 pid argument + * to newproc) and is not associated with the exiting process in any way. + * + * This file system is similar to tmpfs in that directories only exist in + * memory. Each subdirectory represents a different cgroup. Within the cgroup + * there are pseudo files (see cg_ssde_dir) with well-defined names which + * control the configuration and behavior of the cgroup (see cgrp_nodetype_t). + * The primary files within every cgroup are named 'cgroup.procs', + * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the + * top-level cgroup). The cgroup.procs and tasks files are used to control and + * list which processes/threads belong to the cgroup. In the general case there + * could be additional files in the cgroup, which define additional behavior + * (i.e. subsystem specific pseudo files), although none exist at this time. + * + * Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is + * used to correlate with the threads to determine cgroup membership. When + * assigning a PID to a cgroup (via write) the code updates the br_cgroupid + * member in the brand-specific lx_lwp_data structure to control which cgroup + * the thread belongs to. Note that because the br_cgroupid lives in + * lx_lwp_data, native processes will not appear in the cgroup hierarchy.
+ * + * An overview of the behavior for the various vnode operations is: + * - no hardlinks or symlinks + * - no file create (the subsystem-specific files are a fixed list of + * pseudo-files accessible within the directory) + * - no file remove + * - no file rename, but a directory (i.e. a cgroup) can be renamed within the + * containing directory, but not into a different directory + * - can mkdir and rmdir to create/destroy cgroups + * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup) + * - open, read/write, close on the subsystem-specific pseudo files are + * allowed, as this is the interface to configure and report on the cgroup. + * The pseudo file's mode controls write access and cannot be changed. + * + * The locking in this file system is simple since the file system is not + * subjected to heavy I/O activity and all data is in-memory. There is a single + * global mutex for each mount (cg_contents). This mutex is held for the life + * of most vnode operations. The most active path is probably the LWP start and + * exit hooks which increment/decrement the reference counter on the cgroup + * node. The lock is important for this case since we don't want concurrent + * activity (such as moving the process into another cgroup) while we're trying + * to look up the cgroup from the mount's hash table. We must be careful to + * avoid a deadlock while reading or writing since that code can take pidlock + * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of + * those is held. To prevent deadlock we always take cg_contents after pidlock + * and p_lock. + * + * EXTENDING THE FILE SYSTEM + * + * When adding support for a new subsystem, be sure to also update the + * lxpr_read_cgroups function in lx_procfs so that the subsystem is reported + * by proc. + * + * Although we don't currently support any subsystem controllers, the design + * allows for the file system to be extended to add controller emulation + * if needed. 
New controller IDs (i.e. different subsystems) for a mount can + * be defined in the cgrp_ssid_t enum (e.g. CG_SSID_CPUSET or CG_SSID_MEMORY) + * and new node types for additional pseudo files in the tree can be defined in + * the cgrp_nodetype_t enum (e.g. CG_CPUSET_CPUS or CG_MEMORY_USAGE_IN_BYTES). + * The cg_ssde_dir array would need a new entry for the new subsystem to + * control which nodes are visible in a directory for the new subsystem. + * + * New emulation would then need to be written to manage the behavior on the + * new pseudo file(s) associated with new cgrp_nodetype_t types. + * + * Within lx procfs the lxpr_read_pid_cgroup() function would need to be + * updated so that it reported the various subsystems used by the different + * mounts. + * + * In addition, in order to support more than one cgroup mount we would need a + * list of cgroup IDs associated with every thread, instead of just one ID + * (br_cgroupid). The thread data would need to become a struct which held + * both an ID and an indication as to which mounted cgroup file system instance + * the ID was associated with. We would also need a list of cgroup mounts per + * zone, instead of the current single zone reference. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/vmparam.h> +#include <sys/corectl.h> +#include <sys/contract_impl.h> +#include <sys/pool.h> +#include <sys/stack.h> +#include <sys/rt.h> +#include <sys/fx.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#include "cgrps.h" + +/* Module level parameters */ +static int cgrp_fstype; +static dev_t cgrp_dev; + +#define MAX_AGENT_EVENTS 32 /* max num queued events */ + +#define UMNT_DELAY_TIME drv_usectohz(50000) /* 500th of a second */ +#define UMNT_RETRY_MAX 100 /* 100 times - 2 secs */ + +/* + * cgrp_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t cgrp_mountcount; + +/* + * cgrp_minfree is the minimum amount of swap space that cgroups leaves for + * the rest of the zone. In other words, if the amount of free swap space + * in the zone drops below cgrp_minfree, cgroup anon allocations will fail. + * This number is only likely to become factor when DRAM and swap have both + * been capped low to allow for maximum tenancy. 
+ */ +size_t cgrp_minfree = 0; + +/* + * CGMINFREE -- the value from which cgrp_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for cgroups in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow cgroups to consume + * no more than half of this, yielding a CGMINFREE of 64MB. + */ +#define CGMINFREE 64 * 1024 * 1024 /* 64 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +/* + * cgroup vfs operations. + */ +static int cgrp_init(int, char *); +static int cgrp_mount(struct vfs *, struct vnode *, + struct mounta *, struct cred *); +static int cgrp_unmount(struct vfs *, int, struct cred *); +static int cgrp_root(struct vfs *, struct vnode **); +static int cgrp_statvfs(struct vfs *, struct statvfs64 *); +static void cgrp_freevfs(vfs_t *vfsp); + +/* Forward declarations for hooks */ +static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t); +static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_cgroup", + cgrp_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand cgroups", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (cgrp_mountcount) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* Disable hooks used by the lx brand module. 
*/ + lx_cgrp_initlwp = NULL; + lx_cgrp_freelwp = NULL; + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(cgrp_fstype); + vn_freevnodeops(cgrp_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading cgroup module. + */ +static int +cgrp_init(int fstype, char *name) +{ + static const fs_operation_def_t cgrp_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = cgrp_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = cgrp_unmount }, + VFSNAME_ROOT, { .vfs_root = cgrp_root }, + VFSNAME_STATVFS, { .vfs_statvfs = cgrp_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = cgrp_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def cgrp_vnodeops_template[]; + int error; + extern void cgrp_hash_init(); + major_t dev; + + cgrp_hash_init(); + cgrp_fstype = fstype; + ASSERT(cgrp_fstype != 0); + + error = vfs_setfsops(fstype, cgrp_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "cgrp_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, cgrp_vnodeops_template, &cgrp_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "cgrp_init: bad vnode ops template"); + return (error); + } + + /* + * cgrp_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (cgrp_minfree == 0) { + /* Set if not patched */ + cgrp_minfree = btopr(CGMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "cgrp_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + cgrp_dev = makedevice(dev, 0); + + /* Install the hooks used by the lx brand module. 
*/ + lx_cgrp_initlwp = cgrp_lwp_fork_helper; + lx_cgrp_freelwp = cgrp_lwp_exit_helper; + + return (0); +} + +static int +cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + cgrp_mnt_t *cgm = NULL; + struct cgrp_node *cp; + struct pathname dpn; + int error; + struct vattr rattr; + cgrp_ssid_t ssid = CG_SSID_GENERIC; + lx_zone_data_t *lxzdata; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Since we depend on per-thread lx brand data, only allow mounting + * within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Having the resource be anything but "swap" doesn't make sense. + */ + vfs_setresource(vfsp, "swap", 0); + + /* cgroups don't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + /* + * Here is where we could support subsystem-specific controller + * mounting. For example, if mounting a cgroup fs with the 'cpuset' + * option to specify that particular controller. + * + * char *argstr; + * if (vfs_optionisset(vfsp, "cpuset", &argstr)) { + * if (ssid != CG_SSID_GENERIC) { + * error = EINVAL; + * goto out; + * } + * ssid = CG_SSID_CPUSET; + * } + */ + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * We currently only support one mount per zone. 
+ */ + lxzdata = ztolxzd(curproc->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + if (lxzdata->lxzd_cgroup != NULL) { + mutex_exit(&lxzdata->lxzd_lock); + return (EINVAL); + } + + cgm = kmem_zalloc(sizeof (*cgm), KM_SLEEP); + + /* Set but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL); + + cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp; + mutex_exit(&lxzdata->lxzd_lock); + + cgm->cg_lxzdata = lxzdata; + cgm->cg_ssid = ssid; + + vfsp->vfs_data = (caddr_t)cgm; + vfsp->vfs_fstype = cgrp_fstype; + vfsp->vfs_dev = cgrp_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, cgrp_dev, cgrp_fstype); + cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(cgm->cg_mntpath, dpn.pn_path); + + cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ, + KM_SLEEP); + + /* allocate and initialize root cgrp_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP); + + mutex_enter(&cgm->cg_contents); + cgrp_node_init(cgm, cp, &rattr, cr); + + CGNTOV(cp)->v_flag |= VROOT; + + /* + * initialize linked list of cgrp_nodes so that the back pointer of + * the root cgrp_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + cp->cgn_back = cp; + cp->cgn_forw = NULL; + cp->cgn_nlink = 0; + cgm->cg_rootnode = cp; + + cp->cgn_type = CG_CGROUP_DIR; + cp->cgn_nodeid = cgrp_inode(ssid, cgm->cg_gen); + cgrp_dirinit(cp, cp, cr); + + mutex_exit(&cgm->cg_contents); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&cgrp_mountcount); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t 
*cgnp, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + int retry_cnt = 0; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + +retry: + mutex_enter(&cgm->cg_contents); + + /* + * In the normal unmount case, if there were no open files, only the + * root node would have a reference count. However, the user-level + * agent manager should have the root vnode open and be waiting in + * ioctl. We need to wake the manager and it may take some retries + * before it closes its file descriptor. + * + * With cg_contents held, nothing can be added or removed. + * There may be some dirty pages. To prevent fsflush from + * disrupting the unmount, put a hold on each node while scanning. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + cgnp = cgm->cg_rootnode; + + ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL); + + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + return (EINVAL); + } + + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + /* Likely because the user-level manager hasn't exited yet */ + if (retry_cnt++ < UMNT_RETRY_MAX) { + delay(UMNT_DELAY_TIME); + goto retry; + } + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (cgnp = cgnp->cgn_forw; cgnp; cgnp = cgnp->cgn_forw) { + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. 
*/ + mutex_exit(&vp->v_lock); + cancel = cgm->cg_rootnode->cgn_forw; + while (cancel != cgnp) { + vp = CGNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->cgn_forw; + } + mutex_exit(&cgm->cg_contents); + return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + mutex_enter(&cgm->cg_lxzdata->lxzd_lock); + cgm->cg_lxzdata->lxzd_cgroup = NULL; + mutex_exit(&cgm->cg_lxzdata->lxzd_lock); + kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ); + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&cgm->cg_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * cgrp_umount. + */ +void +cgrp_freevfs(vfs_t *vfsp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + struct vnode *vp; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. 
+ */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries + */ + for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) { + mutex_enter(&cgm->cg_contents); + if (cn->cgn_type == CG_CGROUP_DIR) + cgrp_dirtrunc(cn); + mutex_exit(&cgm->cg_contents); + } + + ASSERT(cgm->cg_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on + * a cgrp_node via its pages or anon slots from blowing it away + * (in cgrp_inactive) while we're trying to get to it here. Once + * we have a HOLD on it we know it'll stick around. + * + */ + mutex_enter(&cgm->cg_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((cn = cgm->cg_rootnode->cgn_back) != cgm->cg_rootnode) { + mutex_exit(&cgm->cg_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = CGNTOV(cn); + VN_RELE(vp); + mutex_enter(&cgm->cg_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. 
+ */ + if (cn == cgm->cg_rootnode->cgn_back) { + VN_HOLD(vp); + mutex_exit(&cgm->cg_contents); + delay(hz / 4); + mutex_enter(&cgm->cg_contents); + } + } + mutex_exit(&cgm->cg_contents); + + VN_RELE(CGNTOV(cgm->cg_rootnode)); + + ASSERT(cgm->cg_mntpath); + + kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1); + + mutex_destroy(&cgm->cg_contents); + kmem_free(cgm, sizeof (cgrp_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&cgrp_mountcount); +} + +/* + * return root cgnode for given vnode + */ +static int +cgrp_root(struct vfs *vfsp, struct vnode **vpp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cp = cgm->cg_rootnode; + struct vnode *vp; + + ASSERT(cp); + + vp = CGNTOV(cp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = cgm->cg_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > cgrp_minfree) + sbp->f_bfree = blocks - cgrp_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. 
+ */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of cgrp_nodes we can allocate from the remaining kernel memory + * available to cgroups. This is fairly inaccurate since it doesn't + * take into account the names stored in the directory entries. + */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (cgrp_node_t) + sizeof (cgrp_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[cgrp_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, cgm->cg_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen) +{ + cgrp_node_t *parent; + cgrp_dirent_t *dp; + + buf[0] = '\0'; + + parent = cn->cgn_parent; + if (parent == NULL || parent == cn) { + (void) strlcpy(buf, ".", blen); + return (0); + } + + /* + * Search the parent dir list to find this cn's name. 
+ */ + for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) { + if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) { + (void) strlcpy(buf, dp->cgd_name, blen); + return (0); + } + } + + return (-1); +} + +typedef struct cgrp_rra_arg { + char *crraa_agent_path; + char *crraa_event_path; +} cgrp_rra_arg_t; + +static void +cgrp_run_rel_agent(void *a) +{ + cgrp_rra_arg_t *rarg = a; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + struct core_globals *cg; + int res; + + ASSERT(!INGLOBALZONE(curproc)); + + /* The following block is derived from start_init_common */ + ASSERT_STACK_ALIGNED(); + + p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; + p->p_usrstack = (caddr_t)USRSTACK32; + p->p_model = DATAMODEL_ILP32; + p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; + p->p_datprot = PROT_ZFOD & ~PROT_EXEC; + p->p_stk_ctl = INT32_MAX; + + p->p_as = as_alloc(); + p->p_as->a_proc = p; + p->p_as->a_userlimit = (caddr_t)USERLIMIT32; + (void) hat_setup(p->p_as->a_hat, HAT_INIT); + + VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + curproc->p_corefile = cg->core_default_path; + curproc->p_content = cg->core_default_content; + + init_mstate(curthread, LMS_SYSTEM); + res = exec_init(rarg->crraa_agent_path, rarg->crraa_event_path); + + /* End of code derived from start_init_common */ + + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, CGRP_AGENT_LEN); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + + /* The following is derived from zone_start_init - see comments there */ + if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + if (proc_exit(CLD_EXITED, res) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + } else { + id_t cid = curthread->t_cid; + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { 
+ pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + (void) parmsset(&pcparms, curthread); + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + + /* cause the process to return to userland. */ + lwp_rtt(); + } +} + +/* + * Launch the user-level release_agent manager. The event data is the + * pathname (relative to the mount point of the file system) of the newly empty + * cgroup. + * + * The cg_contents mutex is held on entry and dropped before returning. + */ +void +cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + cgrp_node_t *parent; + char nm[MAXNAMELEN]; + char *argstr, *oldstr, *tmp; + id_t cid; + int agent_err; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + cgrp_rra_arg_t *rarg; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Nothing to do if the agent is not set */ + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return; + } + + parent = cn->cgn_parent; + /* Cannot remove the top-level cgroup (only via unmount) */ + if (parent == cn) { + mutex_exit(&cgm->cg_contents); + return; + } + + argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + *argstr = '\0'; + + /* + * Iterate up the directory tree to construct the agent argument string. 
+ */ + do { + cgrp_get_dirname(cn, nm, sizeof (nm)); + DTRACE_PROBE1(cgrp__dir__name, char *, nm); + if (*argstr == '\0') { + (void) snprintf(argstr, MAXPATHLEN, "/%s", nm); + } else { + tmp = oldstr; + oldstr = argstr; + argstr = tmp; + (void) snprintf(argstr, MAXPATHLEN, "/%s%s", nm, + oldstr); + } + + if (cn->cgn_parent == NULL) + break; + cn = cn->cgn_parent; + parent = cn->cgn_parent; + + /* + * The arg path is relative to the mountpoint so we stop when + * we get to the top level. + */ + if (parent == NULL || parent == cn) + break; + } while (parent != cn); + + kmem_free(oldstr, MAXPATHLEN); + + rarg = kmem_alloc(sizeof (cgrp_rra_arg_t), KM_SLEEP); + rarg->crraa_agent_path = kmem_alloc(sizeof (cgm->cg_agent), KM_SLEEP); + (void) strlcpy(rarg->crraa_agent_path, cgm->cg_agent, + sizeof (cgm->cg_agent)); + rarg->crraa_event_path = argstr; + + DTRACE_PROBE2(cgrp__agent__event, cgrp_rra_arg_t *, rarg, + int, plwpd->br_cgroupid); + + /* The release agent process cannot belong to our cgroup */ + plwpd->br_cgroupid = 0; + + /* + * The cg_contents mutex cannot be held while taking the pool lock + * or calling newproc. + */ + mutex_exit(&cgm->cg_contents); + + if (z->zone_defaultcid > 0) { + cid = z->zone_defaultcid; + } else { + pool_lock(); + cid = pool_get_class(z->zone_pool); + pool_unlock(); + } + if (cid == -1) + cid = defaultcid; + + if ((agent_err = newproc(cgrp_run_rel_agent, (void *)rarg, cid, + minclsyspri - 1, NULL, -1)) != 0) { + /* There's nothing we can do if creating the proc fails. 
*/ + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, sizeof (cgm->cg_agent)); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + } +} + +/*ARGSUSED*/ +static void +cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + cn->cgn_task_cnt++; + mutex_exit(&cgm->cg_contents); + + DTRACE_PROBE1(cgrp__lwp__fork, void *, cn); +} + +/*ARGSUSED*/ +static void +cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + if (cn->cgn_task_cnt == 0) { + /* top-level cgroup cnt can be 0 during reboot */ + mutex_exit(&cgm->cg_contents); + return; + } + cn->cgn_task_cnt--; + DTRACE_PROBE1(cgrp__lwp__exit, void *, cn); + + if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) && + cn->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, cn); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + } else { + mutex_exit(&cgm->cg_contents); + } +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c new file mode 100644 index 0000000000..bd571c8c18 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c @@ -0,0 +1,1608 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <vm/seg_vn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/vm.h> +#include <sys/prsystm.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#include "cgrps.h" + +typedef enum cgrp_wr_type { + CG_WR_PROCS = 1, + CG_WR_TASKS +} cgrp_wr_type_t; + +/* ARGSUSED1 */ +static int +cgrp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct) +{ + /* + * swapon to a cgrp file is not supported so access is denied on open + * if VISSWAP is set. + */ + if ((*vpp)->v_flag & VISSWAP) + return (EINVAL); + + return (0); +} + +/* ARGSUSED1 */ +static int +cgrp_close(struct vnode *vp, int flag, int count, offset_t offset, + struct cred *cred, caller_context_t *ct) +{ + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); +} + +/* + * Lookup proc or task based on pid and typ. + */ +static proc_t * +cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ) +{ + int i; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + ASSERT(MUTEX_HELD(&pidlock)); + + /* getting a proc from a pid is easy */ + if (typ == CG_WR_PROCS) + return (prfind(pid)); + + ASSERT(typ == CG_WR_TASKS); + + /* + * We have to scan all of the process entries to find the proc + * containing this task. 
+ */ + mutex_exit(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + kthread_t *t; + + mutex_enter(&pidlock); + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, its not an + * lx-branded process and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_brand != &lx_brand || + p->p_zone->zone_id != zoneid) { + mutex_exit(&pidlock); + continue; + } + + mutex_enter(&p->p_lock); + if ((t = p->p_tlist) == NULL) { + /* no threads, skip it */ + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + continue; + } + + /* + * Check all threads in this proc. + */ + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd != NULL && plwpd->br_pid == pid) { + mutex_exit(&p->p_lock); + return (p); + } + + t = t->t_forw; + } while (t != p->p_tlist); + + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + } + + mutex_enter(&pidlock); + return (NULL); +} + +/* + * Move a thread from one cgroup to another. If the old cgroup is empty + * we queue up an agent event. We return true in that case since we've + * dropped the locks and the caller needs to reacquire them. 
*/
static boolean_t
cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
    uint_t cg_id, proc_t *p)
{
	cgrp_node_t *ocn;	/* node of the cgroup the thread is leaving */

	/* Both locks must already be held by the caller. */
	ASSERT(MUTEX_HELD(&cgm->cg_contents));
	ASSERT(MUTEX_HELD(&p->p_lock));

	/* The lwp's brand data records the ID of its current cgroup. */
	ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
	VERIFY(ocn != NULL);

	/* Shift one task from the old node to the new and retag the lwp. */
	ASSERT(ocn->cgn_task_cnt > 0);
	atomic_dec_32(&ocn->cgn_task_cnt);
	atomic_inc_32(&ncn->cgn_task_cnt);
	plwpd->br_cgroupid = cg_id;

	/*
	 * The old cgroup now has no tasks, its entry count equals
	 * N_DIRENTS(cgm) (presumably just the built-in entries; confirm
	 * against the macro definition) and notification is enabled.
	 */
	if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
	    ocn->cgn_notify == 1) {
		/*
		 * We want to drop p_lock before queuing the event since
		 * that might sleep. Dropping p_lock might cause the caller to
		 * have to restart the move process from the beginning.
		 */
		mutex_exit(&p->p_lock);
		cgrp_rel_agent_event(cgm, ocn);
		/* The event call is expected to return without cg_contents. */
		ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
		return (B_TRUE);
	}

	/* Nothing was dropped; the caller still holds both locks. */
	return (B_FALSE);
}

/*
 * Assign either all of the threads, or a single thread, for the specified pid
 * to the new cgroup. Controlled by the typ argument.
 *
 * Returns 0 on success (including the "ignored" cases below) and ESRCH when
 * no suitable process is found or, for CG_WR_TASKS, when no thread was moved
 * or matched.
 */
static int
cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
{
	proc_t *p;
	kthread_t *t;
	int error;
	cgrp_node_t *ncn;

	/* A pid of 1 is translated to the zone's init pid. */
	if (pid == 1)
		pid = curproc->p_zone->zone_proc_initpid;

	/*
	 * Move one or all threads to this cgroup.
	 *
	 * For a tasks write the result stays ESRCH until the matching thread
	 * is found in the loop below.
	 */
	if (typ == CG_WR_TASKS) {
		error = ESRCH;
	} else {
		error = 0;
	}

restart:
	mutex_enter(&pidlock);

	/* pidlock is still held when cgrp_p_for_wr() returns. */
	p = cgrp_p_for_wr(pid, typ);
	if (p == NULL) {
		mutex_exit(&pidlock);
		return (ESRCH);
	}

	/*
	 * Fail writes for pids for which there is no corresponding process,
	 * system processes, a pid of 0, the pid for our zsched process,
	 * anything the security policy doesn't allow us to look at, and
	 * processes that are not in the zone.
	 */
	if (p->p_stat == SIDL ||
	    (p->p_flag & SSYS) != 0 ||
	    p->p_pid == 0 ||
	    p->p_pid == curproc->p_zone->zone_zsched->p_pid ||
	    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
	    p->p_zone->zone_id != curproc->p_zone->zone_id) {
		mutex_exit(&pidlock);
		return (ESRCH);
	}

	/*
	 * Ignore writes for PID which is not an lx-branded process or with
	 * no threads.
	 */

	/* Take p_lock before releasing pidlock. */
	mutex_enter(&p->p_lock);
	mutex_exit(&pidlock);
	if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
	    p->p_flag & SEXITING) {
		mutex_exit(&p->p_lock);
		return (0);
	}

	mutex_enter(&cgm->cg_contents);

	ncn = cgrp_cg_hash_lookup(cgm, cg_id);
	VERIFY(ncn != NULL);

	/*
	 * Walk the circular thread list. Threads already in cg_id are skipped
	 * by the outer test.
	 *
	 * NOTE(review): because of that test, a CG_WR_TASKS write naming a
	 * thread that is already in this cgroup leaves error at ESRCH;
	 * confirm that this is the intended result.
	 */
	do {
		lx_lwp_data_t *plwpd = ttolxlwp(t);
		if (plwpd != NULL && plwpd->br_cgroupid != cg_id) {
			if (typ == CG_WR_PROCS) {
				if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
					/*
					 * We dropped all of the locks so we
					 * need to start over.
					 */
					goto restart;
				}

			} else if (plwpd->br_pid == pid) {
				/* type is CG_WR_TASKS and we found the task */
				error = 0;
				if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
					/* locks already dropped by the move */
					goto done;
				} else {
					break;
				}
			}
		}
		t = t->t_forw;
	} while (t != p->p_tlist);

	mutex_exit(&cgm->cg_contents);
	mutex_exit(&p->p_lock);
done:

	return (error);
}

/*
 * User-level is writing a pid string. We need to get that string and convert
 * it to a pid. The user-level code has to completely write an entire pid
 * string at once. The user-level code could write multiple strings (delimited
 * by newline) although that is frowned upon. However, we must handle this
 * case too. Thus we consume the input one byte at a time until we get a whole
 * pid string. We can't consume more than a byte at a time since otherwise we
 * might be left with a partial pid string.
+ */ +static int +cgrp_get_pid_str(struct uio *uio, pid_t *pid) +{ + char buf[16]; /* big enough for a pid string */ + int i; + int error; + char *p = &buf[0]; + char *ep; + long pidnum; + + bzero(buf, sizeof (buf)); + for (i = 0; uio->uio_resid > 0 && i < sizeof (buf); i++, p++) { + error = uiomove(p, 1, UIO_WRITE, uio); + if (error != 0) + return (error); + if (buf[i] == '\n') { + buf[i] = '\0'; + break; + } + } + + if (buf[0] == '\0' || i >= sizeof (buf)) /* no input or too long */ + return (EINVAL); + + error = ddi_strtol(buf, &ep, 10, &pidnum); + if (error != 0 || *ep != '\0' || pidnum > maxpid || pidnum < 0) + return (EINVAL); + + *pid = (pid_t)pidnum; + return (0); +} + +static int +cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio) +{ + int error; + uint_t value; + + /* + * This is cheesy but since we only take a 0 or 1 value we can + * let the pid_str function do the uio string conversion. + */ + error = cgrp_get_pid_str(uio, (pid_t *)&value); + if (error != 0) + return (error); + + if (value != 0 && value != 1) + return (EINVAL); + + /* + * The flag is on the containing dir. We don't bother taking the + * cg_contents lock since this is a simple assignment. 
+ */ + cn->cgn_parent->cgn_notify = value; + return (0); +} + +static int +cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int error; + int len; + char *wrp; + + len = uio->uio_offset + uio->uio_resid; + if (len > MAXPATHLEN) + return (EFBIG); + + mutex_enter(&cgm->cg_contents); + + wrp = &cgm->cg_agent[uio->uio_offset]; + error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio); + cgm->cg_agent[len] = '\0'; + if (len > 1 && cgm->cg_agent[len - 1] == '\n') + cgm->cg_agent[len - 1] = '\0'; + + mutex_exit(&cgm->cg_contents); + return (error); +} + +static int +cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, + cgrp_wr_type_t typ) +{ + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + int error; + pid_t pidnum; + + while (uio->uio_resid > 0) { + error = cgrp_get_pid_str(uio, &pidnum); + if (error != 0) + return (error); + + error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ); + if (error != 0) + return (error); + } + + return (0); +} + +static int +cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + struct vnode *vp; + int error = 0; + rlim64_t limit = uio->uio_llimit; + + vp = CGNTOV(cn); + ASSERT(vp->v_type == VREG); + + if (uio->uio_loffset < 0) + return (EINVAL); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + if (uio->uio_loffset >= MAXOFF_T) + return (EFBIG); + + if (uio->uio_resid == 0) + return (0); + + if (limit > MAXOFF_T) + limit = MAXOFF_T; + + switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_wr_notify(cn, uio); + break; + case CG_PROCS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS); + break; + case CG_REL_AGENT: + error = cgrp_wr_rel_agent(cgm, uio); + break; + case CG_TASKS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS); + break; + default: + VERIFY(0); + } + + return (error); +} + +/* + * pidlock is held on entry but dropped on exit. 
Because we might have to drop
 * locks and loop if the process is already P_PR_LOCKed, it is possible that
 * the process might be gone when we return from this function.
 *
 * Returns the process with P_PR_LOCK set and p_lock released, or NULL if
 * the process is exiting or could not be found again after waiting.
 */
static proc_t *
cgrp_p_lock(proc_t *p)
{
	kmutex_t *mp;
	pid_t pid;

	ASSERT(MUTEX_HELD(&pidlock));

	/* first try the fast path */
	mutex_enter(&p->p_lock);
	if (p->p_flag & SEXITING) {
		mutex_exit(&p->p_lock);
		mutex_exit(&pidlock);
		return (NULL);
	}

	/* Nobody holds P_PR_LOCK: claim it and return right away. */
	if (!(p->p_proc_flag & P_PR_LOCK)) {
		p->p_proc_flag |= P_PR_LOCK;
		mutex_exit(&p->p_lock);
		mutex_exit(&pidlock);
		THREAD_KPRI_REQUEST();
		return (p);
	}
	mutex_exit(&p->p_lock);

	/* Slow path: remember the pid so p can be looked up again. */
	pid = p->p_pid;
	for (;;) {
		/*
		 * p_lock is persistent, but p itself is not -- it could
		 * vanish during cv_wait(). Load p->p_lock now so we can
		 * drop it after cv_wait() without referencing p.
		 */
		mp = &p->p_lock;
		mutex_enter(mp);
		mutex_exit(&pidlock);

		if (p->p_flag & SEXITING) {
			mutex_exit(mp);
			return (NULL);
		}

		/* Leave the loop with mp held and pidlock released. */
		if (!(p->p_proc_flag & P_PR_LOCK))
			break;

		/* Wait for the holder to signal, then revalidate by pid. */
		cv_wait(&pr_pid_cv[p->p_slot], mp);
		mutex_exit(mp);

		mutex_enter(&pidlock);
		p = prfind(pid);
		if (p == NULL || p->p_stat == SIDL) {
			mutex_exit(&pidlock);
			return (NULL);
		}
	}

	p->p_proc_flag |= P_PR_LOCK;
	mutex_exit(mp);
	ASSERT(!MUTEX_HELD(&pidlock));
	THREAD_KPRI_REQUEST();
	return (p);
}

/*
 * Release the P_PR_LOCK taken by cgrp_p_lock(). The caller must hold
 * p->p_lock, which is dropped here, and must not hold pidlock.
 */
static void
cgrp_p_unlock(proc_t *p)
{
	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(!MUTEX_HELD(&pidlock));

	p->p_proc_flag &= ~P_PR_LOCK;
	/* Wake one waiter blocked in cgrp_p_lock() on this slot. */
	cv_signal(&pr_pid_cv[p->p_slot]);
	mutex_exit(&p->p_lock);
	THREAD_KPRI_RELEASE();
}

/*
 * Read value from the notify_on_release pseudo file on the parent node
 * (which is the actual cgroup node). We don't bother taking the cg_contents
 * lock since it's a single instruction so an empty group action/read will
 * only see one value or the other.
+ */ +/* ARGSUSED */ +static int +cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int len; + int error = 0; + char buf[16]; + char *rdp; + /* the flag is on the containing dir */ + uint_t value = cn->cgn_parent->cgn_notify; + + len = snprintf(buf, sizeof (buf), "%u\n", value); + if (uio->uio_offset > len) + return (0); + + len -= uio->uio_offset; + rdp = &buf[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + return (error); +} + +/* + * Read value from the release_agent pseudo file. + */ +static int +cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int len; + int error = 0; + char *rdp; + + mutex_enter(&cgm->cg_contents); + + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len = strlen(cgm->cg_agent); + if (uio->uio_offset > len) { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len -= uio->uio_offset; + rdp = &cgm->cg_agent[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + + mutex_exit(&cgm->cg_contents); + + return (error); +} + +/* + * Read pids from the cgroup.procs pseudo file. We have to look at all of the + * processes to find applicable ones, then report pids for any process which + * has all of its threads in the same cgroup. 
*/
static int
cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
{
	int i;
	ssize_t offset = 0;	/* position in the generated pseudo file */
	ssize_t uresid;		/* uio_resid sampled at the top of each pass */
	zoneid_t zoneid = curproc->p_zone->zone_id;
	int error = 0;
	pid_t initpid = curproc->p_zone->zone_proc_initpid;
	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
	/* the cgroup ID is on the containing dir */
	uint_t cg_id = cn->cgn_parent->cgn_id;

	/* Scan all of the process entries */
	for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) {
		proc_t *p;
		int len;
		pid_t pid;
		char buf[16];
		char *rdp;
		kthread_t *t;
		boolean_t in_cg;

		mutex_enter(&pidlock);
		/*
		 * Skip indices for which there is no pid_entry, PIDs for
		 * which there is no corresponding process, system processes,
		 * a PID of 0, the pid for our zsched process, anything the
		 * security policy doesn't allow us to look at, its not an
		 * lx-branded process and processes that are not in the zone.
		 */
		if ((p = pid_entry(i)) == NULL ||
		    p->p_stat == SIDL ||
		    (p->p_flag & SSYS) != 0 ||
		    p->p_pid == 0 ||
		    p->p_pid == schedpid ||
		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
		    p->p_brand != &lx_brand ||
		    p->p_zone->zone_id != zoneid) {
			mutex_exit(&pidlock);
			continue;
		}

		mutex_enter(&p->p_lock);
		if ((t = p->p_tlist) == NULL) {
			/* no threads, skip it */
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			continue;
		}

		/*
		 * Check if all threads are in this cgroup.
		 */
		in_cg = B_TRUE;
		mutex_enter(&cgm->cg_contents);
		do {
			lx_lwp_data_t *plwpd = ttolxlwp(t);
			if (plwpd == NULL || plwpd->br_cgroupid != cg_id) {
				in_cg = B_FALSE;
				break;
			}

			t = t->t_forw;
		} while (t != p->p_tlist);
		mutex_exit(&cgm->cg_contents);

		mutex_exit(&p->p_lock);
		if (!in_cg) {
			/*
			 * This proc, or at least one of its threads, is not
			 * in this cgroup.
			 */
			mutex_exit(&pidlock);
			continue;
		}

		/*
		 * Convert pid to the Linux default of 1 if we're the zone's
		 * init process, otherwise use the value from the proc struct
		 */
		if (p->p_pid == initpid) {
			pid = 1;
		} else {
			pid = p->p_pid;
		}

		/* p is not referenced again once pidlock is released. */
		mutex_exit(&pidlock);

		/*
		 * Generate pid line and write all or part of it if we're
		 * in the right spot within the pseudo file.
		 */
		len = snprintf(buf, sizeof (buf), "%u\n", pid);
		if ((offset + len) > uio->uio_offset) {
			/* bytes of this line that were already consumed */
			int diff = (int)(uio->uio_offset - offset);

			ASSERT(diff < len);
			offset += diff;
			rdp = &buf[diff];
			len -= diff;
			if (len > uresid)
				len = uresid;

			error = uiomove(rdp, len, UIO_READ, uio);
			if (error != 0)
				return (error);
		}
		offset += len;
	}

	return (0);
}

/*
 * We are given a locked process we know is valid, report on any of its threads
 * that are in the cgroup.
 */
static int
cgrp_rd_proc_tasks(uint_t cg_id, proc_t *p, pid_t initpid, ssize_t *offset,
    struct uio *uio)
{
	int error = 0;
	uint_t tid;
	char buf[16];
	char *rdp;
	kthread_t *t;

	ASSERT(p->p_proc_flag & P_PR_LOCK);

	/*
	 * Report all threads in this cgroup.
	 */
	t = p->p_tlist;
	do {
		lx_lwp_data_t *plwpd = ttolxlwp(t);
		if (plwpd == NULL) {
			/* no brand data for this thread; advance and retest */
			t = t->t_forw;
			continue;
		}

		if (plwpd->br_cgroupid == cg_id) {
			int len;

			/*
			 * Convert taskid to the Linux default of 1 if
			 * we're the zone's init process.
*/
			tid = plwpd->br_pid;
			if (tid == initpid)
				tid = 1;

			/*
			 * Emit the line, or the unread part of it, once the
			 * running offset reaches the reader's position.
			 */
			len = snprintf(buf, sizeof (buf), "%u\n", tid);
			if ((*offset + len) > uio->uio_offset) {
				int diff;

				diff = (int)(uio->uio_offset - *offset);
				ASSERT(diff < len);
				*offset = *offset + diff;
				rdp = &buf[diff];
				len -= diff;
				if (len > uio->uio_resid)
					len = uio->uio_resid;

				error = uiomove(rdp, len, UIO_READ, uio);
				if (error != 0)
					return (error);
			}
			*offset = *offset + len;
		}

		t = t->t_forw;
	} while (t != p->p_tlist && uio->uio_resid > 0);

	return (0);
}

/*
 * Read pids from the tasks pseudo file. We have to look at all of the
 * processes to find applicable ones, then report pids for any thread in the
 * cgroup. We return the emulated lx thread pid here, not the internal thread
 * ID. Because we're possibly doing IO for each taskid we lock the process
 * so that the threads don't change while we're working on it (although threads
 * can change if we fill up the read buffer and come back later for a
 * subsequent read).
 */
int
cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
{
	int i;
	ssize_t offset = 0;	/* position in the generated pseudo file */
	ssize_t uresid;		/* only used by the loop condition */
	zoneid_t zoneid = curproc->p_zone->zone_id;
	int error = 0;
	pid_t initpid = curproc->p_zone->zone_proc_initpid;
	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
	/* the cgroup ID is on the containing dir */
	uint_t cg_id = cn->cgn_parent->cgn_id;

	/* Scan all of the process entries */
	for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) {
		proc_t *p;

		mutex_enter(&pidlock);
		/*
		 * Skip indices for which there is no pid_entry, PIDs for
		 * which there is no corresponding process, system processes,
		 * a PID of 0, the pid for our zsched process, anything the
		 * security policy doesn't allow us to look at, its not an
		 * lx-branded process and processes that are not in the zone.
		 */
		if ((p = pid_entry(i)) == NULL ||
		    p->p_stat == SIDL ||
		    (p->p_flag & SSYS) != 0 ||
		    p->p_pid == 0 ||
		    p->p_pid == schedpid ||
		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
		    p->p_brand != &lx_brand ||
		    p->p_zone->zone_id != zoneid) {
			mutex_exit(&pidlock);
			continue;
		}

		if (p->p_tlist == NULL) {
			/* no threads, skip it */
			mutex_exit(&pidlock);
			continue;
		}

		/* cgrp_p_lock() drops pidlock; NULL means the proc is gone. */
		p = cgrp_p_lock(p);
		ASSERT(!MUTEX_HELD(&pidlock));
		if (p == NULL)
			continue;

		mutex_enter(&cgm->cg_contents);
		error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio);
		mutex_exit(&cgm->cg_contents);

		/* cgrp_p_unlock() requires p_lock held and releases it. */
		mutex_enter(&p->p_lock);
		cgrp_p_unlock(p);

		if (error != 0)
			return (error);
	}

	return (0);
}

/*
 * Common read entry point: validate the offset, then dispatch on the type
 * of pseudo file being read. The ct argument is not used here.
 */
static int
cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
{
	int error = 0;

	if (uio->uio_loffset >= MAXOFF_T)
		return (0);
	if (uio->uio_loffset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);

	switch (cn->cgn_type) {
	case CG_NOTIFY:
		error = cgrp_rd_notify(cgm, cn, uio);
		break;
	case CG_PROCS:
		error = cgrp_rd_procs(cgm, cn, uio);
		break;
	case CG_REL_AGENT:
		error = cgrp_rd_rel_agent(cgm, uio);
		break;
	case CG_TASKS:
		error = cgrp_rd_tasks(cgm, cn, uio);
		break;
	default:
		VERIFY(0);
	}

	return (error);
}

/* ARGSUSED2 */
static int
cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
    struct caller_context *ct)
{
	cgrp_node_t *cn = VTOCGN(vp);
	cgrp_mnt_t *cgm = VTOCGM(vp);
	int error;

	/*
	 * We don't support reading non-regular files
	 */
	if (vp->v_type == VDIR)
		return (EISDIR);
	if (vp->v_type != VREG)
		return (EINVAL);
	error = cgrp_rd(cgm, cn, uiop, ct);

	return (error);
}

static int
cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)
{
	cgrp_node_t *cn = VTOCGN(vp);
	cgrp_mnt_t *cgm = VTOCGM(vp);
	int error;

	/*
	 * We don't support writing to non-regular
files + */ + if (vp->v_type != VREG) + return (EINVAL); + + if (ioflag & FAPPEND) { + /* In append mode start at end of file. */ + uiop->uio_loffset = cn->cgn_size; + } + + error = cgrp_wr(cgm, cn, uiop, cred, ct); + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + struct vattr va; + int attrs = 1; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + if (attrs == 0) { + cn->cgn_uid = va.va_uid; + cn->cgn_gid = va.va_gid; + } + vap->va_type = vp->v_type; + vap->va_mode = cn->cgn_mode & MODEMASK; + vap->va_uid = cn->cgn_uid; + vap->va_gid = cn->cgn_gid; + vap->va_fsid = cn->cgn_fsid; + vap->va_nodeid = (ino64_t)cn->cgn_nodeid; + vap->va_nlink = cn->cgn_nlink; + vap->va_size = (u_offset_t)cn->cgn_size; + vap->va_atime = cn->cgn_atime; + vap->va_mtime = cn->cgn_mtime; + vap->va_ctime = cn->cgn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = cn->cgn_rdev; + vap->va_seq = cn->cgn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + mutex_exit(&cgm->cg_contents); + return (0); +} + +/*ARGSUSED4*/ +static int +cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error = 0; + struct vattr *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR) || + (vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE)) + return (EINVAL); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + get = &cn->cgn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. 
+ */ + error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, cgrp_taccess, + cn); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&cn->cgn_ctime); + +out: + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_taccess(cn, mode, cred); + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED3 */ +static int +cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + cgrp_node_t *cn = VTOCGN(dvp); + cgrp_mnt_t *cgm; + cgrp_node_t *ncn = NULL; + int error; + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* + * Null component name is a synonym for directory being searched. 
*/
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}
	ASSERT(cn);

	cgm = VTOCGM(cn->cgn_vnode);
	mutex_enter(&cgm->cg_contents);
	error = cgrp_dirlookup(cn, nm, &ncn, cred);
	mutex_exit(&cgm->cg_contents);

	/* On success hand the found node's vnode back to the caller. */
	if (error == 0) {
		ASSERT(ncn);
		*vpp = CGNTOV(ncn);
	}

	return (error);
}

/*
 * "Create" only succeeds for a name that already exists, in which case the
 * existing vnode is returned. New names are refused with EPERM.
 */
/*ARGSUSED7*/
static int
cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
    enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred,
    int flag, caller_context_t *ct, vsecattr_t *vsecp)
{
	cgrp_node_t *parent = VTOCGN(dvp);
	cgrp_node_t *cn = NULL;
	cgrp_mnt_t *cgm;
	int error;

	if (*nm == '\0')
		return (EPERM);

	cgm = VTOCGM(parent->cgn_vnode);
	mutex_enter(&cgm->cg_contents);
	error = cgrp_dirlookup(parent, nm, &cn, cred);
	if (error == 0) {	/* name found */
		ASSERT(cn);

		mutex_exit(&cgm->cg_contents);
		/*
		 * Creating an existing file, allow it except for the following
		 * errors.
		 */
		if (exclusive == EXCL) {
			error = EEXIST;
		} else if ((CGNTOV(cn)->v_type == VDIR) && (mode & VWRITE)) {
			error = EISDIR;
		} else {
			error = cgrp_taccess(cn, mode, cred);
		}
		if (error != 0) {
			/* release the node obtained from the lookup */
			cgnode_rele(cn);
			return (error);
		}
		*vpp = CGNTOV(cn);
		return (0);
	}
	mutex_exit(&cgm->cg_contents);

	/*
	 * cgroups doesn't allow creation of additional, non-subsystem specific
	 * files in a dir
	 */
	return (EPERM);
}

/* ARGSUSED3 */
static int
cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
    caller_context_t *ct, int flags)
{
	cgrp_node_t *parent = VTOCGN(dvp);
	int error;
	cgrp_node_t *cn = NULL;
	cgrp_mnt_t *cgm;

	/*
	 * Removal of subsystem-specific files is not allowed but we need
	 * to return the correct error if they try to remove a non-existent
	 * file.
*/

	cgm = VTOCGM(parent->cgn_vnode);
	mutex_enter(&cgm->cg_contents);
	error = cgrp_dirlookup(parent, nm, &cn, cred);
	mutex_exit(&cgm->cg_contents);
	if (error)
		return (error);

	/* The name exists: release the looked-up node and refuse. */
	ASSERT(cn);
	cgnode_rele(cn);
	return (EPERM);
}

/* ARGSUSED4 */
static int
cgrp_link(struct vnode *dvp, struct vnode *srcvp, char *cnm, struct cred *cred,
    caller_context_t *ct, int flags)
{
	/* cgroups doesn't support hard links */
	return (EPERM);
}

/*
 * Rename of subsystem-specific files is not allowed but we can rename
 * directories (i.e. sub-groups). We cannot mv subdirs from one group to
 * another so the src and dest vnode must be the same.
 */
/* ARGSUSED5 */
static int
cgrp_rename(
	struct vnode *odvp,	/* source parent vnode */
	char *onm,		/* source name */
	struct vnode *ndvp,	/* destination parent vnode */
	char *nnm,		/* destination name */
	struct cred *cred,
	caller_context_t *ct,
	int flags)
{
	cgrp_node_t *fromparent;
	cgrp_node_t *toparent;
	cgrp_node_t *fromcn = NULL;	/* source cgrp_node */
	cgrp_mnt_t *cgm = VTOCGM(odvp);
	int error, err;

	fromparent = VTOCGN(odvp);
	toparent = VTOCGN(ndvp);

	/* Source and destination must be the same directory. */
	if (fromparent != toparent)
		return (EIO);

	/* discourage additional use of toparent */
	toparent = NULL;

	mutex_enter(&cgm->cg_contents);

	/*
	 * Look up cgrp_node of file we're supposed to rename.
	 */
	error = cgrp_dirlookup(fromparent, onm, &fromcn, cred);
	if (error) {
		mutex_exit(&cgm->cg_contents);
		return (error);
	}

	/* Only cgroup directories may be renamed. */
	if (fromcn->cgn_type != CG_CGROUP_DIR) {
		error = EPERM;
		goto done;
	}

	/*
	 * Make sure we can delete the old (source) entry. This
	 * requires write permission on the containing directory.
	 */
	if (((error = cgrp_taccess(fromparent, VWRITE, cred)) != 0))
		goto done;

	/*
	 * Check for renaming to or from '.' or '..' or that
	 * fromcn == fromparent
	 */
	if ((onm[0] == '.' &&
	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
	    (nnm[0] == '.' &&
	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) ||
	    (fromparent == fromcn)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Link source to new target
	 */
	error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME,
	    fromcn, (struct vattr *)NULL,
	    (cgrp_node_t **)NULL, cred, ct);

	if (error)
		goto done;

	/*
	 * Unlink from source.
	 *
	 * err keeps the raw result so the rename events below are sent only
	 * when the delete itself succeeded.
	 */
	error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred);

	/*
	 * The following handles the case where our source cgrp_node was
	 * removed before we got to it.
	 */
	if (error == ENOENT)
		error = 0;

	if (err == 0) {
		vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct);
		vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct);
	}

done:
	mutex_exit(&cgm->cg_contents);
	/* release the node obtained from the lookup */
	cgnode_rele(fromcn);

	return (error);
}

/* ARGSUSED5 */
static int
cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
    struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	cgrp_node_t *parent = VTOCGN(dvp);
	cgrp_node_t *self = NULL;
	cgrp_mnt_t *cgm = VTOCGM(dvp);
	int error;

	/*
	 * Might be dangling directory. Catch it here, because a ENOENT
	 * return from cgrp_dirlookup() is an "ok return".
*/
	if (parent->cgn_nlink == 0)
		return (ENOENT);

	mutex_enter(&cgm->cg_contents);
	error = cgrp_dirlookup(parent, nm, &self, cred);
	if (error == 0) {
		/* the name already exists */
		ASSERT(self != NULL);
		mutex_exit(&cgm->cg_contents);
		cgnode_rele(self);
		return (EEXIST);
	}
	/* Only "not found" lets the mkdir go ahead. */
	if (error != ENOENT) {
		mutex_exit(&cgm->cg_contents);
		return (error);
	}

	error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL,
	    va, &self, cred, ct);
	if (error) {
		mutex_exit(&cgm->cg_contents);
		if (self != NULL)
			cgnode_rele(self);
		return (error);
	}
	mutex_exit(&cgm->cg_contents);
	*vpp = CGNTOV(self);
	return (0);
}

/*
 * Remove a cgroup directory. Fails with EEXIST while the cgroup still has
 * tasks or sub-cgroup directories.
 */
/* ARGSUSED4 */
static int
cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
    caller_context_t *ct, int flags)
{
	cgrp_node_t *parent = VTOCGN(dvp);
	cgrp_mnt_t *cgm;
	cgrp_node_t *self = NULL;
	struct vnode *vp;
	int error = 0;

	/*
	 * Return error when removing . and ..
	 */
	if (strcmp(nm, ".") == 0)
		return (EINVAL);
	if (strcmp(nm, "..") == 0)
		return (EEXIST); /* Should be ENOTEMPTY */

	cgm = VTOCGM(parent->cgn_vnode);
	mutex_enter(&cgm->cg_contents);

	error = cgrp_dirlookup(parent, nm, &self, cred);
	if (error) {
		mutex_exit(&cgm->cg_contents);
		return (error);
	}

	/* Refuse to remove the parent itself or the current directory. */
	vp = CGNTOV(self);
	if (vp == dvp || vp == cdir) {
		error = EINVAL;
		goto done;
	}
	if (self->cgn_type != CG_CGROUP_DIR) {
		error = ENOTDIR;
		goto done;
	}

	cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp);

	/*
	 * Check for the existence of any sub-cgroup directories or tasks in
	 * the cgroup.
*/
	if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) {
		error = EEXIST;
		/*
		 * Update atime because checking cn_dirents is logically
		 * equivalent to reading the directory
		 */
		gethrestime(&self->cgn_atime);
		goto done;
	}

	if (vn_vfswlock(vp)) {
		error = EBUSY;
		goto done;
	}
	if (vn_mountedvfs(vp) != NULL) {
		error = EBUSY;
	} else {
		error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
	}

	vn_vfsunlock(vp);

	/*
	 * If the parent now has no tasks, its entry count equals
	 * N_DIRENTS(cgm) and notification is enabled, queue a release agent
	 * event for it. The ASSERT shows that call is expected to return
	 * with cg_contents released, so the unlock at "done" is skipped.
	 */
	if (parent->cgn_task_cnt == 0 &&
	    parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) {
		cgrp_rel_agent_event(cgm, parent);
		ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
		goto dropped;
	}

done:
	mutex_exit(&cgm->cg_contents);
dropped:
	/* Reached on error paths as well as on success. */
	vnevent_rmdir(CGNTOV(self), dvp, nm, ct);
	cgnode_rele(self);

	return (error);
}

/*
 * Read directory entries into the caller's single iovec, starting at the
 * entry whose offset is at or after uio_offset.
 */
/* ARGSUSED2 */
static int
cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	cgrp_node_t *cn = VTOCGN(vp);
	cgrp_mnt_t *cgm;
	cgrp_dirent_t *cdp;
	int error = 0;
	size_t namelen;
	struct dirent64 *dp;
	ulong_t offset;
	ulong_t total_bytes_wanted;
	long outcount = 0;
	long bufsize;
	int reclen;
	caddr_t outbuf;

	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (vp->v_type != VDIR)
		return (ENOTDIR);

	cgm = VTOCGM(cn->cgn_vnode);
	mutex_enter(&cgm->cg_contents);

	/* A directory with no entry list must already be unlinked. */
	if (cn->cgn_dir == NULL) {
		VERIFY(cn->cgn_nlink == 0);
		mutex_exit(&cgm->cg_contents);
		return (0);
	}

	/*
	 * Get space for multiple directory entries
	 */
	total_bytes_wanted = uiop->uio_iov->iov_len;
	bufsize = total_bytes_wanted + sizeof (struct dirent64);
	outbuf = kmem_alloc(bufsize, KM_SLEEP);

	dp = (struct dirent64 *)outbuf;

	offset = 0;
	cdp = cn->cgn_dir;
	while (cdp) {
		namelen = strlen(cdp->cgd_name);	/* no +1 needed */
		offset = cdp->cgd_offset;
		if (offset >= uiop->uio_offset) {
			reclen = (int)DIRENT64_RECLEN(namelen);
			if (outcount + reclen > total_bytes_wanted) {
				if (!outcount) {
					/* Buffer too small for any entries. */
					error = EINVAL;
				}
				break;
			}
			ASSERT(cdp->cgd_cgrp_node != NULL);

			/* use strncpy(9f) to zero out uninitialized bytes */

			(void) strncpy(dp->d_name, cdp->cgd_name,
			    DIRENT64_NAMELEN(reclen));
			dp->d_reclen = (ushort_t)reclen;
			dp->d_ino = (ino64_t)cdp->cgd_cgrp_node->cgn_nodeid;
			dp->d_off = (offset_t)cdp->cgd_offset + 1;
			/* advance to where the next record will be built */
			dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
			outcount += reclen;
			ASSERT(outcount <= bufsize);
		}
		cdp = cdp->cgd_next;
	}

	if (!error)
		error = uiomove(outbuf, outcount, UIO_READ, uiop);

	if (!error) {
		/*
		 * If we reached the end of the list our offset should now be
		 * just past the end.
		 */
		if (!cdp) {
			offset += 1;
			if (eofp)
				*eofp = 1;
		} else if (eofp)
			*eofp = 0;
		uiop->uio_offset = offset;
	}
	gethrestime(&cn->cgn_atime);

	mutex_exit(&cgm->cg_contents);

	kmem_free(outbuf, bufsize);
	return (error);
}

/* ARGSUSED5 */
static int
cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm,
    struct cred *cred, caller_context_t *ct, int flags)
{
	/* cgroups doesn't support symlinks */
	return (EPERM);
}

/* ARGSUSED */
static void
cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
{
	cgrp_node_t *cn = VTOCGN(vp);
	cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp);

	mutex_enter(&cgm->cg_contents);
	mutex_enter(&vp->v_lock);
	ASSERT(vp->v_count >= 1);

	/*
	 * If we don't have the last hold or the link count is non-zero,
	 * there's little to do -- just drop our hold.
*/
	if (vp->v_count > 1 || cn->cgn_nlink != 0) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		mutex_exit(&cgm->cg_contents);
		return;
	}

	/*
	 * Last hold on an unlinked node: take it off the mount's node list.
	 * When there is no forward neighbour, the root node's back pointer is
	 * updated instead.
	 */
	if (cn->cgn_forw == NULL)
		cgm->cg_rootnode->cgn_back = cn->cgn_back;
	else
		cn->cgn_forw->cgn_back = cn->cgn_back;
	cn->cgn_back->cgn_forw = cn->cgn_forw;

	mutex_exit(&vp->v_lock);
	mutex_exit(&cgm->cg_contents);

	/* Here's our chance to send invalid event */
	vn_invalid(CGNTOV(cn));

	/* Free the vnode and then the node itself. */
	vn_free(CGNTOV(cn));
	kmem_free(cn, sizeof (cgrp_node_t));
}

/*
 * Accept any new offset in the range [0, MAXOFFSET_T].
 */
/* ARGSUSED */
static int
cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

/*
 * No lock is taken; the requested lock type is returned unchanged.
 */
/* ARGSUSED */
static int
cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
	return (write_lock);
}

/*
 * Nothing to release; see cgrp_rwlock().
 */
/* ARGSUSED */
static void
cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
}

/*
 * Answer the pathconf queries handled locally and pass everything else to
 * fs_pathconf().
 */
static int
cgrp_pathconf(struct vnode *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	switch (cmd) {
	case _PC_XATTR_EXISTS:
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			*valp = 0;	/* assume no attributes */
			error = 0;	/* okay to ask */
		} else {
			error = EINVAL;
		}
		break;
	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		error = 0;
		break;
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		error = 0;
		break;
	default:
		error = fs_pathconf(vp, cmd, valp, cr, ct);
	}
	return (error);
}


struct vnodeops *cgrp_vnodeops;

/* Table pairing each vnode operation name with its cgrp_* handler. */
const fs_operation_def_t cgrp_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = cgrp_open },
	VOPNAME_CLOSE,		{ .vop_close = cgrp_close },
	VOPNAME_READ,		{ .vop_read = cgrp_read },
	VOPNAME_WRITE,		{ .vop_write = cgrp_write },
	VOPNAME_GETATTR,	{ .vop_getattr = cgrp_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = cgrp_setattr
}, + VOPNAME_ACCESS, { .vop_access = cgrp_access }, + VOPNAME_LOOKUP, { .vop_lookup = cgrp_lookup }, + VOPNAME_CREATE, { .vop_create = cgrp_create }, + VOPNAME_REMOVE, { .vop_remove = cgrp_remove }, + VOPNAME_LINK, { .vop_link = cgrp_link }, + VOPNAME_RENAME, { .vop_rename = cgrp_rename }, + VOPNAME_MKDIR, { .vop_mkdir = cgrp_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = cgrp_rmdir }, + VOPNAME_READDIR, { .vop_readdir = cgrp_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = cgrp_symlink }, + VOPNAME_INACTIVE, { .vop_inactive = cgrp_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = cgrp_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = cgrp_rwunlock }, + VOPNAME_SEEK, { .vop_seek = cgrp_seek }, + VOPNAME_PATHCONF, { .vop_pathconf = cgrp_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h new file mode 100644 index 0000000000..cd256c27c5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd.h @@ -0,0 +1,232 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LXD_H +#define _LXD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxd.h: declarations, data structures and macros for lxd (lxd devfs). 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> +#include <sys/lx_types.h> + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +/* + * It's unlikely that we need to create more than 50-60 subdirs/symlinks + * as front files so we size the file system hash for 2x that number. + * The back devfs typically has ~80 nodes so this is also a comfortable size + * for the back hash table. + */ +#define LXD_HASH_SZ 128 + +#define LXD_BACK_HASH(v) ((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1)) + +#define LXD_NM_HASH(ldn, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(ldn) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + hash &= (LXD_HASH_SZ - 1); \ + } + + +enum lxd_node_type { LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT }; + +/* + * lxd per-mount data structure. + * + * All fields are protected by lxdm_contents. + * File renames on a specific file system are protected by lxdm_renamelck.
+ */ +typedef struct lxd_mnt { + struct vfs *lxdm_vfsp; /* filesystem's vfs struct */ + struct lxd_node *lxdm_rootnode; /* root lxd_node */ + char *lxdm_mntpath; /* name of lxd mount point */ + dev_t lxdm_dev; /* unique dev # of mounted `device' */ + kmutex_t lxdm_contents; /* per-mount lock */ + kmutex_t lxdm_renamelck; /* rename lock for this mount */ + uint_t lxdm_gen; /* node ID source for files */ + + /* protects buckets in both "dir ent" and "back" hash tables */ + kmutex_t lxdm_hash_mutex[LXD_HASH_SZ]; + + /* per-mount data for "back" vnodes in the fs */ + uint_t lxdm_back_refcnt; /* # outstanding "back" vnodes */ + struct lxd_node *lxdm_back_htable[LXD_HASH_SZ]; + + /* + * Per-mount directory data for "front" nodes in the fs. + * Each front node has a directory entry but directory entries can live + * on either front or back nodes. + */ + uint_t lxdm_dent_refcnt; /* # outstanding dir ents */ + struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ]; +} lxd_mnt_t; + +/* + * lxd_node is the file system dependent node for lxd. + * + * The node is used to represent both front and back files. For front files + * the node can represent either a directory or symlink. 
+ */ +typedef struct lxd_node { + enum lxd_node_type lxdn_type; + + /* Data for "front" nodes */ + struct lxd_node *lxdn_prev; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_next; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_parent; /* dir containing this node */ + krwlock_t lxdn_rwlock; /* serialize mods/dir updates */ + kmutex_t lxdn_tlock; /* time, flag, and nlink lock */ + + /* these could be in a union ala tmpfs but not really necessary */ + uint_t lxdn_dirents; /* number of dirents */ + struct lxd_dirent *lxdn_dir; /* dirent list */ + char *lxdn_symlink; /* pointer to symlink */ + struct vattr lxdn_attr; /* attributes */ + + /* Hash table link */ + struct lxd_node *lxdn_hnxt; /* link in per-mount entry */ + /* hash table */ + vnode_t *lxdn_vnode; /* vnode for this lxd_node */ + + vnode_t *lxdn_real_vp; /* back file - real vnode */ +} lxd_node_t; + +/* + * Attributes + */ +#define lxdn_mask lxdn_attr.va_mask +#define lxdn_mode lxdn_attr.va_mode +#define lxdn_uid lxdn_attr.va_uid +#define lxdn_gid lxdn_attr.va_gid +#define lxdn_fsid lxdn_attr.va_fsid +#define lxdn_nodeid lxdn_attr.va_nodeid +#define lxdn_nlink lxdn_attr.va_nlink +#define lxdn_size lxdn_attr.va_size +#define lxdn_atime lxdn_attr.va_atime +#define lxdn_mtime lxdn_attr.va_mtime +#define lxdn_ctime lxdn_attr.va_ctime +#define lxdn_rdev lxdn_attr.va_rdev +#define lxdn_blksize lxdn_attr.va_blksize +#define lxdn_nblocks lxdn_attr.va_nblocks +#define lxdn_seq lxdn_attr.va_seq + +/* + * lx devfs conversion macros + */ +#define VFSTOLXDM(vfsp) ((lxd_mnt_t *)(vfsp)->vfs_data) +#define VTOLXDM(vp) ((lxd_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOLDN(vp) ((lxd_node_t *)(vp)->v_data) +#define LDNTOV(ln) ((ln)->lxdn_vnode) +#define ldnode_hold(ln) VN_HOLD(LDNTOV(ln)) +#define ldnode_rele(ln) VN_RELE(LDNTOV(ln)) + +#define REALVP(vp) (VTOLDN(vp)->lxdn_real_vp) + +/* + * front directories are made up of a linked list of lxd_dirent structures + * hanging off directory lxdn_nodes. 
File names are not fixed length, but are + * null terminated. + */ +typedef struct lxd_dirent { + lxd_node_t *lddir_node; /* lxd node for this file */ + struct lxd_dirent *lddir_next; /* next directory entry */ + struct lxd_dirent *lddir_prev; /* prev directory entry */ + uint_t lddir_offset; /* "offset" of dir entry */ + uint_t lddir_hash; /* a hash of lddir_name */ + struct lxd_dirent *lddir_link; /* linked via hash table */ + lxd_node_t *lddir_parent; /* parent, dir we are in */ + char *lddir_name; /* null terminated */ +} lxd_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +typedef struct lxd_minor_translator { + char *lxd_mt_path; /* illumos minor node path */ + minor_t lxd_mt_minor; /* illumos minor node number */ + int lxd_mt_lx_major; /* linux major node number */ + int lxd_mt_lx_minor; /* linux minor node number */ +} lxd_minor_translator_t; + +enum lxd_xl_tp { DTT_INVALID, DTT_LIST, DTT_CUSTOM }; + +#define xl_list lxd_xl_minor.lxd_xl_list +#define xl_custom lxd_xl_minor.lxd_xl_custom + +typedef struct lxd_devt_translator { + char *lxd_xl_driver; /* driver name */ + major_t lxd_xl_major; /* driver number */ + + enum lxd_xl_tp lxd_xl_type; /* dictates how we interpret
xl_minor */ + union { + uintptr_t lxd_xl_foo; /* required to compile */ + lxd_minor_translator_t *lxd_xl_list; + void (*lxd_xl_custom)(dev_t, dev_t *); + } lxd_xl_minor; +} lxd_devt_translator_t; + +extern struct vnodeops *lxd_vnodeops; +extern lxd_devt_translator_t lxd_devt_translators[]; + +vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *); +void lxd_free_back_node(lxd_node_t *); +int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *); +int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *, + lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *, + caller_context_t *); +void lxd_dirinit(lxd_node_t *, lxd_node_t *, cred_t *); +int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *); +void lxd_dirtrunc(lxd_node_t *); +void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *); +int lxd_naccess(void *, int, cred_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXD_H */ diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c new file mode 100644 index 0000000000..9e67f988bc --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c @@ -0,0 +1,1004 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc.
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "lxd.h" + +#define LXD_HASH_SIZE 8192 /* must be power of 2 */ +#define LXD_MUTEX_SIZE 64 + + +#define MODESHIFT 3 + +typedef enum lxd_nodehold { + NOHOLD, + HOLD +} lxd_nodehold_t; + +/* + * The following functions maintain the per-mount "front" files. + */ +static void +lxd_save_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(de->lddir_parent, de->lddir_name, hash); + de->lddir_hash = hash; + + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + ASSERT(de->lddir_link == NULL); + de->lddir_link = lxdm->lxdm_dent_htable[hash]; + lxdm->lxdm_dent_htable[hash] = de; + mutex_exit(hmtx); + + atomic_inc_32(&lxdm->lxdm_dent_refcnt); +} + +static void +lxd_rm_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + lxd_dirent_t **prevpp; + kmutex_t *hmtx; + + hash = de->lddir_hash; + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + prevpp = &lxdm->lxdm_dent_htable[hash]; + while (*prevpp != de) + prevpp = &(*prevpp)->lddir_link; + *prevpp = de->lddir_link; + de->lddir_link = NULL; + mutex_exit(hmtx); + + ASSERT(lxdm->lxdm_dent_refcnt > 0); + atomic_dec_32(&lxdm->lxdm_dent_refcnt); +} + +static lxd_dirent_t * +lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold, + lxd_node_t **found) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(parent)); + lxd_dirent_t *de; + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(parent, name, hash); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + de = lxdm->lxdm_dent_htable[hash]; + while (de) { + if (de->lddir_hash == hash 
&& de->lddir_parent == parent && + strcmp(de->lddir_name, name) == 0) { + lxd_node_t *ldn = de->lddir_node; + + if (do_hold == HOLD) { + ASSERT(ldn != NULL); + ldnode_hold(ldn); + } + if (found != NULL) + *found = ldn; + mutex_exit(hmtx); + return (de); + } + + de = de->lddir_link; + } + mutex_exit(hmtx); + return (NULL); +} + +int +lxd_naccess(void *vcp, int mode, cred_t *cr) +{ + lxd_node_t *ldn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in lxd_node. + */ + if (crgetuid(cr) != ldn->lxdn_uid) { + shift += MODESHIFT; + if (groupmember(ldn->lxdn_gid, cr) == 0) + shift += MODESHIFT; + } + + if (ldn->lxdn_type == LXDNT_FRONT) + return (secpolicy_vnode_access2(cr, LDNTOV(ldn), + ldn->lxdn_uid, ldn->lxdn_mode << shift, mode)); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL)); +} + +static lxd_node_t * +lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm) +{ + lxd_node_t *l; + + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; l = l->lxdn_hnxt) { + if (l->lxdn_real_vp == vp) { + ASSERT(l->lxdn_type == LXDNT_BACK); + + VN_HOLD(LDNTOV(l)); + return (l); + } + } + return (NULL); +} + +static void +lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm) +{ + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(l->lxdn_real_vp != NULL); + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + atomic_inc_32(&lxdm->lxdm_back_refcnt); + + l->lxdn_hnxt = lxdm->lxdm_back_htable[hash]; + lxdm->lxdm_back_htable[hash] = l; +} + + +struct vnode * +lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + + hash = LXD_BACK_HASH(vp); /* Note: hashing with realvp */ + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + l = lxd_find_back(vp, hash, lxdm); + if (l == NULL) { + vnode_t *nvp; + + l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + nvp = vn_alloc(KM_SLEEP); + + 
rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + + l->lxdn_vnode = nvp; + l->lxdn_type = LXDNT_BACK; + l->lxdn_real_vp = vp; + + VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type, + vp->v_rdev); + nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN)); + vn_setops(nvp, lxd_vnodeops); + nvp->v_data = (caddr_t)l; + + lxd_save_back(l, hash, lxdm); + vn_exists(vp); + } else { + VN_RELE(vp); + } + + mutex_exit(hmtx); + return (LDNTOV(l)); +} + +void +lxd_free_back_node(lxd_node_t *lp) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + lxd_node_t *lprev = NULL; + vnode_t *vp = LDNTOV(lp); + vnode_t *realvp = REALVP(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + /* in lxd_make_back_node we call lxd_find_back with the realvp */ + hash = LXD_BACK_HASH(realvp); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + mutex_enter(&vp->v_lock); + if (vp->v_count > 1) { + vp->v_count--; /* release our hold from vn_rele */ + mutex_exit(&vp->v_lock); + mutex_exit(hmtx); + return; + } + mutex_exit(&vp->v_lock); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; + lprev = l, l = l->lxdn_hnxt) { + + if (l != lp) + continue; + + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(lxdm->lxdm_back_refcnt > 0); + + atomic_dec_32(&lxdm->lxdm_back_refcnt); + vn_invalid(vp); + + if (lprev == NULL) { + lxdm->lxdm_back_htable[hash] = l->lxdn_hnxt; + } else { + lprev->lxdn_hnxt = l->lxdn_hnxt; + } + + mutex_exit(hmtx); + rw_destroy(&l->lxdn_rwlock); + mutex_destroy(&l->lxdn_tlock); + kmem_free(l, sizeof (lxd_node_t)); + vn_free(vp); + VN_RELE(realvp); + return; + } + + panic("lxd_free_back_node"); + /*NOTREACHED*/ +} +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found lxd_node with its vnode held. 
+ */ +int +lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr) +{ + int error; + + *foundnp = NULL; + if (parent->lxdn_vnode->v_type != VDIR) + return (ENOTDIR); + + if ((error = lxd_naccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + ldnode_hold(parent); + *foundnp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the lxdn_dir list + * so that it doesn't change out from underneath us. + * lxd_find_dirent() will pass back the lxd_node + * with a hold on it. + */ + + if (lxd_find_dirent(name, parent, HOLD, foundnp) != NULL) { + ASSERT(*foundnp); + return (0); + } + + return (ENOENT); +} + +/* + * Check if the source directory is in the path of the target directory. + * The target directory is locked by the caller. + */ +static int +lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent, cred_t *cr) +{ + int error = 0; + lxd_node_t *dir, *dotdot; + + ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock)); + ASSERT(toparent->lxdn_vnode->v_type == VDIR); + + dotdot = toparent->lxdn_parent; + if (dotdot == NULL) + return (ENOENT); + ldnode_hold(dotdot); + + if (dotdot == toparent) { + /* root of fs. search trivially satisfied. */ + ldnode_rele(dotdot); + return (0); + } + + for (;;) { + /* + * Return error for cases like "mv c c/d", + * "mv c c/d/e" and so on. + */ + if (dotdot == fromnode) { + ldnode_rele(dotdot); + error = EINVAL; + break; + } + + dir = dotdot; + dotdot = dir->lxdn_parent; + if (dotdot == NULL) { + ldnode_rele(dir); + error = ENOENT; + break; + } + ldnode_hold(dotdot); + + /* + * We're okay if we traverse the directory tree up to + * the root directory and don't run into the + * parent directory. 
+ */ + if (dir == dotdot) { + ldnode_rele(dir); + ldnode_rele(dotdot); + break; + } + ldnode_rele(dir); + } + + return (error); +} + +static int +lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va, + enum de_op op, lxd_node_t **newnode, struct cred *cred) +{ + lxd_node_t *ldn; + + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + ldn = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + + ldn->lxdn_type = LXDNT_FRONT; + lxd_node_init(lxdm, ldn, NULL, va, cred); + + ldn->lxdn_vnode->v_rdev = ldn->lxdn_rdev = NODEV; + ldn->lxdn_vnode->v_type = va->va_type; + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + if (va->va_mask & AT_ATIME) + ldn->lxdn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + ldn->lxdn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + lxd_dirinit(dir, ldn, cred); + } + + *newnode = ldn; + return (0); +} + +static int +lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name, enum de_op op) +{ + lxd_dirent_t *dp, *pdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->lxdn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. */ + if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (lxd_dirent_t); + dp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); + if (dp == NULL) + return (ENOSPC); + + ldn->lxdn_parent = dir; + + dir->lxdn_size += alloc_size; + dir->lxdn_dirents++; + dp->lddir_node = ldn; + dp->lddir_parent = dir; + + /* The directory entry and its name were allocated sequentially. 
*/ + dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t); + (void) strcpy(dp->lddir_name, name); + + lxd_save_dirent(dp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in lxd_readdir. + */ + pdp = dir->lxdn_dir->lddir_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (pdp->lddir_next != NULL && + (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) { + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset); + pdp = pdp->lddir_next; + } + dp->lddir_offset = pdp->lddir_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's lddir_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. 
+ */ + if (pdp->lddir_next == NULL && + pdp->lddir_offset > 2 * dir->lxdn_dirents) + dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next; + else + dir->lxdn_dir->lddir_prev = dp; + + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + dp->lddir_next = pdp->lddir_next; + if (dp->lddir_next) { + dp->lddir_next->lddir_prev = dp; + } + dp->lddir_prev = pdp; + pdp->lddir_next = dp; + + ASSERT(dp->lddir_next != dp); + ASSERT(dp->lddir_prev != dp); + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + return (0); +} + +/* + * Enter a directory entry for 'name' into directory 'dir' + * + * Returns 0 on success. + */ +int +lxd_direnter( + lxd_mnt_t *lxdm, + lxd_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + lxd_node_t *fromparent, /* original directory if rename */ + lxd_node_t *ldn, /* existing lxd_node, if rename */ + struct vattr *va, + lxd_node_t **rnp, /* return lxd_node, if create/mkdir */ + cred_t *cr, + caller_context_t *ctp) +{ + lxd_dirent_t *dirp; + lxd_node_t *found = NULL; + int error = 0; + char *s; + + /* lxdn_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("lxd_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. 
+ */ + if (op == DE_RENAME) { + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink == 0) { + mutex_exit(&ldn->lxdn_tlock); + return (ENOENT); + } + + if (ldn->lxdn_nlink == MAXLINK) { + mutex_exit(&ldn->lxdn_tlock); + return (EMLINK); + } + ldn->lxdn_nlink++; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + + /* + * This might be a "dangling detached directory" (it could have been + * removed, but a reference to it kept in u_cwd). Don't bother + * searching it, and with any luck the user will get tired of dealing + * with us and cd to some absolute pathway (thus in ufs, too). + */ + if (dir->lxdn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * If this is a rename of a directory and the parent is different + * (".." must be changed), then the source directory must not be in the + * directory hierarchy above the target, as this would orphan + * everything below the source directory. + */ + if (op == DE_RENAME) { + if (ldn == dir) { + error = EINVAL; + goto out; + } + if ((ldn->lxdn_vnode->v_type) == VDIR) { + if ((fromparent != dir) && + (error = lxd_dircheckpath(ldn, dir, cr)) != 0) { + goto out; + } + } + } + + /* Search for an existing entry. */ + dirp = lxd_find_dirent(name, dir, HOLD, &found); + if (dirp != NULL) { + ASSERT(found != NULL); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (rnp != NULL) { + *rnp = found; + error = EEXIST; + } else { + ldnode_rele(found); + } + break; + + case DE_RENAME: + /* + * Note that we only hit this path when we're renaming + * a symlink from one directory to another and there is + * a pre-existing symlink as the target. lxd_rename + * will unlink the src from the original directory but + * here we need to unlink the dest that we collided + * with, then create the new directory entry as we do + * below when there is no pre-existing symlink. 
+ */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + + ASSERT(found->lxdn_vnode->v_type == VLNK); + /* dir rw lock is already held and asserted above */ + rw_enter(&found->lxdn_rwlock, RW_WRITER); + error = lxd_dirdelete(dir, found, name, DR_RENAME, cr); + rw_exit(&found->lxdn_rwlock); + ldnode_rele(found); + if (error != 0) + goto out; + + error = lxd_diraddentry(dir, ldn, name, op); + if (error == 0 && rnp != NULL) + *rnp = ldn; + break; + } + } else { + + /* + * The directory entry does not exist, but the node might if + * this is a rename. Check write permission in directory to + * see if entry can be created. + */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new lxd_node and directory entry as required. + */ + error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr); + if (error) + goto out; + } + + error = lxd_diraddentry(dir, ldn, name, op); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if ((ldn->lxdn_vnode->v_type) == VDIR) { + ASSERT(dirp == NULL); + /* + * cleanup allocs made by lxd_dirinit + */ + lxd_dirtrunc(ldn); + } + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink = 0; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + ldnode_rele(ldn); + ldn = NULL; + } + } else if (rnp != NULL) { + *rnp = ldn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + ldnode_rele(ldn); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + return (error); +} + +/* + * Delete entry ldn of name "nm" from parent dir. This is used to both remove + * a directory and to remove file nodes within the directory (by recursively + * calling itself). 
It frees the dir entry space and decrements link count on + * lxd_node(s). + * + * Return 0 on success. + */ +int +lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op, + cred_t *cred) +{ + lxd_dirent_t *dirp; + int error; + size_t namelen; + lxd_node_t *fndnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + if (nm[0] == '\0') + panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn); + + /* + * return error when removing . and .. + */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0) + return (error); + + if (dir->lxdn_dir == NULL) + return (ENOENT); + + if (op == DR_RMDIR) { + /* + * This is the top-level removal of a directory. Start by + * removing any file entries from the dir. We do this by + * recursively calling back into this function with a different + * op code. The caller of this function has already verified + * that it is safe to remove this directory. + */ + lxd_dirent_t *dirp; + + ASSERT(ldn->lxdn_vnode->v_type == VDIR); + + dirp = ldn->lxdn_dir; + while (dirp) { + lxd_node_t *dn; + lxd_dirent_t *nextp; + + if (strcmp(dirp->lddir_name, ".") == 0 || + strcmp(dirp->lddir_name, "..") == 0) { + dirp = dirp->lddir_next; + continue; + } + + dn = dirp->lddir_node; + nextp = dirp->lddir_next; + + ldnode_hold(dn); + error = lxd_dirdelete(ldn, dn, dirp->lddir_name, + DR_REMOVE, cred); + ldnode_rele(dn); + + dirp = nextp; + } + } + + dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp); + VERIFY(dirp != NULL); + VERIFY(ldn == fndnp); + + lxd_rm_dirent(dirp); + + /* Take dirp out of the directory list. 
*/ + ASSERT(dirp->lddir_next != dirp); + ASSERT(dirp->lddir_prev != dirp); + if (dirp->lddir_prev) { + dirp->lddir_prev->lddir_next = dirp->lddir_next; + } + if (dirp->lddir_next) { + dirp->lddir_next->lddir_prev = dirp->lddir_prev; + } + + /* + * If the roving slot pointer happens to match dirp, + * point it at the previous dirent. + */ + if (dir->lxdn_dir->lddir_prev == dirp) { + dir->lxdn_dir->lddir_prev = dirp->lddir_prev; + } + ASSERT(dirp->lddir_next != dirp); + ASSERT(dirp->lddir_prev != dirp); + + /* dirp points to the correct directory entry */ + namelen = strlen(dirp->lddir_name) + 1; + + kmem_free(dirp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + ldn->lxdn_ctime = now; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) { + lxd_dirtrunc(ldn); + ASSERT(ldn->lxdn_nlink == 0); + } + return (0); +} + +/* + * Initialize a lxd_node and add it to file list under mount point. 
+ */ +void +lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap, + cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + ldn->lxdn_mask = 0; + ldn->lxdn_attr.va_type = vap->va_type; + ldn->lxdn_nlink = 1; + ldn->lxdn_size = 0; + + if (cred == NULL) { + ldn->lxdn_uid = vap->va_uid; + ldn->lxdn_gid = vap->va_gid; + } else { + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + } + + ldn->lxdn_fsid = lxdm->lxdm_dev; + ldn->lxdn_rdev = vap->va_rdev; + ldn->lxdn_blksize = PAGESIZE; + ldn->lxdn_nblocks = 0; + gethrestime(&now); + ldn->lxdn_atime = now; + ldn->lxdn_mtime = now; + ldn->lxdn_ctime = now; + ldn->lxdn_seq = 0; + ldn->lxdn_dir = NULL; + + ldn->lxdn_real_vp = realvp; + + ldn->lxdn_vnode = vn_alloc(KM_SLEEP); + vp = LDNTOV(ldn); + vn_setops(vp, lxd_vnodeops); + vp->v_vfsp = lxdm->lxdm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)ldn; + + mutex_enter(&lxdm->lxdm_contents); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Add new lxd_node to end of linked list of lxd_nodes for this + * lxdevfs. Root directory is handled specially in lxd_mount. + */ + if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) { + ldn->lxdn_next = NULL; + ldn->lxdn_prev = lxdm->lxdm_rootnode->lxdn_prev; + ldn->lxdn_prev->lxdn_next = lxdm->lxdm_rootnode->lxdn_prev = + ldn; + } + mutex_exit(&lxdm->lxdm_contents); + vn_exists(vp); +} + +/* + * lxd_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. 
+ */ +void +lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir, cred_t *cr) +{ + lxd_dirent_t *dot, *dotdot; + timestruc_t now; + lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode); + struct vattr nattr; + + ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + dir->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP); + dot->lddir_node = dir; + dot->lddir_offset = 0; + dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t); + dot->lddir_name[0] = '.'; + dot->lddir_parent = dir; + lxd_save_dirent(dot); + + dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP); + dotdot->lddir_node = parent; + dotdot->lddir_offset = 1; + dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t); + dotdot->lddir_name[0] = '.'; + dotdot->lddir_name[1] = '.'; + dotdot->lddir_parent = dir; + lxd_save_dirent(dotdot); + + /* + * Initialize directory entry list. + */ + dot->lddir_next = dotdot; + dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */ + dotdot->lddir_next = NULL; + dotdot->lddir_prev = dot; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + parent->lxdn_nlink++; + parent->lxdn_ctime = now; + + dir->lxdn_dir = dot; + dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5; /* dot and dotdot */ + dir->lxdn_dirents = 2; + dir->lxdn_nlink = 2; + dir->lxdn_parent = parent; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; +} + +/* + * lxd_dirtrunc is called to remove all directory entries under this directory. 
+ */ +void +lxd_dirtrunc(lxd_node_t *dir) +{ + lxd_dirent_t *ldp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + for (ldp = dir->lxdn_dir; ldp; ldp = dir->lxdn_dir) { + size_t namelen; + lxd_node_t *ldn; + + ASSERT(ldp->lddir_next != ldp); + ASSERT(ldp->lddir_prev != ldp); + ASSERT(ldp->lddir_node); + + dir->lxdn_dir = ldp->lddir_next; + namelen = strlen(ldp->lddir_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + ldn = ldp->lddir_node; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + + lxd_rm_dirent(ldp); + kmem_free(ldp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + } + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + ASSERT(dir->lxdn_dir == NULL); + ASSERT(dir->lxdn_size == 0); + ASSERT(dir->lxdn_dirents == 0); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c new file mode 100644 index 0000000000..b474c329ad --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c @@ -0,0 +1,830 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The lx devfs (lxd) file system is used within lx branded zones to provide + * the Linux view of /dev. + * + * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev. 
+ * lxd now provides the Linux /dev. + * + * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file + * system which is the special device and corresponds to the special device in + * a lofs mount. As with lofs, all files in the special device are accessible + * through the lxd mount. Because the zone's devfs is not directly modifiable + * within the zone (also mknod(2) is not generally allowed within a zone) it is + * impossible to create files in devfs. For lx, in some cases it's useful to be + * able to make new symlinks or new directories under /dev. lxd implements + * these operations by creating "files" in memory in the same way as tmpfs + * does. Within lxd these are referred to as "front" files. For operations such + * as lookup or readdir, lxd provides a merged view of both the front and back + * files. lxd does not support regular front files or simple I/O (read/write) + * to front files, since there is no need for that. For back files, all + * operations are simply passed through to the real vnode, as is done with + * lofs. Front files are not allowed to mask back files. + * + * The Linux /dev is now a lxd mount with the special file (i.e. the back + * file system) as /native/dev. + * + * In addition, lx has a need for some illumos/Linux translation for the + * various *stat(2) system calls when used on a device. This translation can + * be centralized within lxd's getattr vnode entry point. 
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ptm.h>
+#include <sys/lx_impl.h>
+
+#include "lxd.h"
+
+/* Module level parameters */
+static int lxd_fstype;
+static dev_t lxd_dev;
+
+/*
+ * lxd_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t lxd_mountcount;
+
+/*
+ * lxd_minfree is the minimum amount of swap space that lx devfs leaves for
+ * the rest of the zone. A value of 0 means "not patched"; lxd_init() then
+ * derives it from LXDMINFREE, converted to pages.
+ */
+size_t lxd_minfree = 0;
+
+/*
+ * LXDMINFREE -- the value from which lxd_minfree is derived -- should be
+ * configured to a value that is roughly the smallest practical value for
+ * memory + swap minus the largest reasonable size for lxd in such
+ * a configuration. As of this writing, the smallest practical memory + swap
+ * configuration is 128MB, and it seems reasonable to allow lxd to consume
+ * no more than ~10% of this, yielding a LXDMINFREE of 12MB.
+ *
+ * The replacement list is parenthesized so that the macro evaluates as a
+ * single value wherever it is expanded.
+ */
+#define	LXDMINFREE	(12 * 1024 * 1024)	/* 12 Megabytes */
+
+extern pgcnt_t swapfs_minfree;
+
+extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *,
+    caller_context_t *, int);
+extern int stat64(char *, struct stat64 *);
+
+/*
+ * lxd vfs operations.
+ */
+static int lxd_init(int, char *);
+static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int lxd_unmount(vfs_t *, int, cred_t *);
+static int lxd_root(vfs_t *, vnode_t **);
+static int lxd_statvfs(vfs_t *, statvfs64_t *);
+static void lxd_freevfs(vfs_t *vfsp);
+
+/*
+ * Loadable module wrapper
+ *
+ * vfw names the file system type "lx_devfs" and supplies lxd_init as its
+ * initialization routine.
+ */
+#include <sys/modctl.h>
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_devfs",
+	lxd_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand devfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, &modlfs, NULL
+};
+
+/*
+ * Definitions and translators for devt's.
+ *
+ * Each lxd_devt_translators entry is keyed by a native driver name; its
+ * major number field starts as 0 and is filled in by lxd_xlate_init().
+ * DTT_LIST entries point at a table of device paths terminated by a NULL
+ * path, whose native minor numbers are also looked up by lxd_xlate_init().
+ * DTT_CUSTOM entries point at a function that computes the Linux devt.
+ *
+ * NOTE(review): the numeric columns of the minor tables appear to be
+ * { native minor (filled in at run time), Linux major, Linux minor } --
+ * confirm against lxd_minor_translator_t in lxd.h.
+ */
+static void lxd_pts_devt_translator(dev_t, dev_t *);
+static void lxd_ptm_devt_translator(dev_t, dev_t *);
+
+static kmutex_t lxd_xlate_lock;
+static boolean_t lxd_xlate_initialized = B_FALSE;
+
+static lxd_minor_translator_t lxd_mtranslator_mm[] = {
+	{ "/dev/null", 0, 1, 3 },
+	{ "/dev/zero", 0, 1, 5 },
+	{ NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_random[] = {
+	{ "/dev/random", 0, 1, 8 },
+	{ "/dev/urandom", 0, 1, 9 },
+	{ NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_sy[] = {
+	{ "/dev/tty", 0, LX_TTY_MAJOR, 0 },
+	{ NULL, 0, 0, 0 }
+};
+static lxd_minor_translator_t lxd_mtranslator_zcons[] = {
+	{ "/dev/console", 0, LX_TTY_MAJOR, 1 },
+	{ NULL, 0, 0, 0 }
+};
+lxd_devt_translator_t lxd_devt_translators[] = {
+	{ "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm },
+	{ "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random },
+	{ "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy },
+	{ "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons },
+	{ LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator },
+	{ "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator },
+	{ NULL, 0, DTT_INVALID, NULL }
+};
+
+int
+_init()
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+	int error;
+
+	if (lxd_mountcount > 0)
+		return (EBUSY);
+
+	if ((error = mod_remove(&modlinkage)) != 0)
+		return (error);
+
+	/*
+	 * Tear down the operations vectors
+	 *
+	 * This point is reached only when lxd_mountcount was zero above and
+	 * mod_remove() succeeded. The translation lock created by lxd_init()
+	 * is destroyed here as well.
+	 */
+	(void) vfs_freevfsops_by_type(lxd_fstype);
+	vn_freevnodeops(lxd_vnodeops);
+	mutex_destroy(&lxd_xlate_lock);
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Initialize global locks, etc. Called when loading lxd module.
+ *
+ * Records the file system type index, registers the vfs and vnode operation
+ * vectors, sets lxd_minfree if it has not been patched, builds lxd_dev from
+ * a device number obtained with getudev(), and initializes lxd_xlate_lock.
+ *
+ * Returns 0 on success, or the error from vfs_setfsops() or vn_make_ops();
+ * when vn_make_ops() fails the vfs operations registered just before are
+ * released again. Failing to obtain a unique device number only produces a
+ * warning; major number 0 is used instead.
+ */
+static int
+lxd_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxd_vfsops_template[] = {
+		VFSNAME_MOUNT,		{ .vfs_mount = lxd_mount },
+		VFSNAME_UNMOUNT,	{ .vfs_unmount = lxd_unmount },
+		VFSNAME_ROOT,		{ .vfs_root = lxd_root },
+		VFSNAME_STATVFS,	{ .vfs_statvfs = lxd_statvfs },
+		VFSNAME_FREEVFS,	{ .vfs_freevfs = lxd_freevfs },
+		NULL,			NULL
+	};
+	extern const struct fs_operation_def lxd_vnodeops_template[];
+	int error;
+	major_t dev;
+
+	lxd_fstype = fstype;
+	ASSERT(lxd_fstype != 0);
+
+	error = vfs_setfsops(fstype, lxd_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "lxd_init: bad vfs ops template");
+		return (error);
+	}
+
+	error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxd_init: bad vnode ops template");
+		return (error);
+	}
+
+	/*
+	 * lxd_minfree doesn't need to be some function of configured
+	 * swap space since it really is an absolute limit of swap space
+	 * which still allows other processes to execute.
+	 */
+	if (lxd_minfree == 0) {
+		/* Set if not patched */
+		lxd_minfree = btopr(LXDMINFREE);
+	}
+
+	if ((dev = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN, "lxd_init: Can't get unique device number.");
+		dev = 0;
+	}
+
+	/*
+	 * Make the pseudo device
+	 */
+	lxd_dev = makedevice(dev, 0);
+
+	mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	return (0);
+}
+
+/*
+ * Initialize device translator mapping table.
+ *
+ * Note that we cannot do this in lxd_init since that can lead to a recursive
+ * rw_enter while we're doing lookupnameat (via sdev_lookup/prof_make_maps/
+ * devi_attach_node/modload). Thus we do it in the mount path and keep track
+ * so that we only initialize the table once.
+ *
+ * For every translator the native major number is resolved from the driver
+ * name. For DTT_LIST translators each listed device path is then looked up
+ * and its native minor number recorded; a path that cannot be looked up gets
+ * a minor of -1. lxd_xlate_lock serializes callers and protects the
+ * lxd_xlate_initialized flag.
+ */
+static void
+lxd_xlate_init(void)
+{
+	int i;
+
+	mutex_enter(&lxd_xlate_lock);
+	if (lxd_xlate_initialized) {
+		mutex_exit(&lxd_xlate_lock);
+		return;
+	}
+
+	for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) {
+		lxd_minor_translator_t *mt;
+		int j;
+
+		lxd_devt_translators[i].lxd_xl_major =
+		    mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver);
+
+		/* if this translator doesn't use a list mapping we're done. */
+		if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST)
+			continue;
+
+		/* for each device listed, lookup the minor node number */
+		mt = lxd_devt_translators[i].xl_list;
+		for (j = 0; mt[j].lxd_mt_path != NULL; j++) {
+			vnode_t *vp;
+			struct vattr va;
+			char *tpath;
+			char tnm[MAXPATHLEN];
+
+			/*
+			 * The attach might be triggered in either the global
+			 * zone or in a non-global zone, so we may need to
+			 * adjust the path if we're in a NGZ. zone_id is a
+			 * zone ID, so it is compared against GLOBAL_ZONEID.
+			 */
+			if (curproc->p_zone->zone_id == GLOBAL_ZONEID) {
+				tpath = mt[j].lxd_mt_path;
+			} else {
+				(void) snprintf(tnm, sizeof (tnm), "/native%s",
+				    mt[j].lxd_mt_path);
+				tpath = tnm;
+			}
+
+			if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL,
+			    &vp, NULL) != 0) {
+				mt[j].lxd_mt_minor = -1;
+				continue;
+			}
+
+			va.va_mask = AT_RDEV;
+			if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) {
+				va.va_rdev = NODEV;
+			} else {
+				ASSERT(getmajor(va.va_rdev) ==
+				    lxd_devt_translators[i].lxd_xl_major);
+				ASSERT(mt[j].lxd_mt_lx_minor < LX_MAXMIN);
+			}
+
+			mt[j].lxd_mt_minor = getminor(va.va_rdev);
+
+			/* Drop the hold obtained by lookupnameat(). */
+			VN_RELE(vp);
+		}
+	}
+
+	lxd_xlate_initialized = B_TRUE;
+	mutex_exit(&lxd_xlate_lock);
+}
+
+static int
+lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	lxd_mnt_t *lxdm = NULL;
+	struct lxd_node *ldn;
+	struct pathname dpn;
+	int error;
+	int i;
+	int nodev;
+	struct vattr rattr;
+	vnode_t *realrootvp;
+	vnode_t *tvp;
+	lx_zone_data_t *lxzdata;
+	lx_virt_disk_t *vd;
+	vattr_t vattr;
+
+	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		return (error);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	lxd_xlate_init();
+
+	/*
+	 * This is the same behavior as with lofs.
+	 * Loopback devices which get "nodevices" added can be done without
+	 * "nodevices" set because we cannot import devices into a zone
+	 * with loopback. Note that we have all zone privileges when
+	 * this happens; if not, we'd have gotten "nosuid".
+	 */
+	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
+
+	/*
+	 * Only allow mounting within lx zones.
+ */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* lxd doesn't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * Find real root + */ + if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) { + pn_free(&dpn); + return (error); + } + + if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (error); + } + + /* If realroot is not a devfs, error out */ + if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (EINVAL); + } + + lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP); + + /* init but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL); + + /* Initialize the hash table mutexes */ + for (i = 0; i < LXD_HASH_SZ; i++) { + mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT, + NULL); + } + + lxdm->lxdm_vfsp = vfsp; + lxdm->lxdm_gen = 1; /* start inode counter at 1 */ + + vfsp->vfs_data = (caddr_t)lxdm; + vfsp->vfs_fstype = lxd_fstype; + vfsp->vfs_dev = lxd_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype); + lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path); + + /* allocate and initialize root lxd_node structure */ + bzero(&rattr, sizeof (struct vattr)); + 
rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + + tvp = lxd_make_back_node(realrootvp, lxdm); + ldn = VTOLDN(tvp); + + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + LDNTOV(ldn)->v_flag |= VROOT; + + /* + * initialize linked list of lxd_nodes so that the back pointer of + * the root lxd_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + ldn->lxdn_prev = ldn; + ldn->lxdn_next = NULL; + ldn->lxdn_nlink = 0; + lxdm->lxdm_rootnode = ldn; + + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + lxd_dirinit(ldn, ldn, cr); + + rw_exit(&ldn->lxdn_rwlock); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&lxd_mountcount); + + lxzdata = ztolxzd(curproc->p_zone); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vattr.va_mask = AT_TYPE | AT_MODE; + vattr.va_type = VLNK; + vattr.va_mode = 0777; + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + /* only create links for actual zvols */ + if (vd->lxvd_type == LXVD_ZVOL) { + char lnknm[MAXPATHLEN]; + + (void) snprintf(lnknm, sizeof (lnknm), + "./zvol/dsk/%s", vd->lxvd_real_name); + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + lnknm, cr, NULL, 0); + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + + mutex_enter(&lxdm->lxdm_contents); + + /* + * In the normal unmount case only the root node would have a reference + * count. + * + * With lxdm_contents held, nothing can be added or removed. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. 
+ */ + ldn = lxdm->lxdm_rootnode; + + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EINVAL); + } + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) { + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = lxdm->lxdm_rootnode->lxdn_next; + while (cancel != ldn) { + vp = LDNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->lxdn_next; + } + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } else { + /* + * It may seem incorrect for us to have a vnode with + * a count of 0, but this is modeled on tmpfs and works + * the same way. See lxd_front_inactive. There we allow + * the v_count to go to 0 but rely on the link count to + * keep the vnode alive. Since we now want to cleanup + * these vnodes we manually add a VN_HOLD so that the + * VN_RELEs that occur in the lxd_freevfs() cleanup + * will take us down the lxd_inactive code path. We + * can directly add a VN_HOLD since we have the lock. + */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&lxdm->lxdm_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * lxd_unmount. 
+ */
+void
+lxd_freevfs(vfs_t *vfsp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	lxd_node_t *ldn;
+	struct vnode *vp;
+	int i;
+
+	/*
+	 * Free all kmemalloc'd and anonalloc'd memory associated with
+	 * this filesystem. To do this, we go through the file list twice,
+	 * once to remove all the directory entries, and then to remove
+	 * all the pseudo files.
+	 */
+
+	/*
+	 * Now that we are tearing ourselves down we need to remove the
+	 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
+	 * files from the system causing us to have a negative value. Doing this
+	 * seems a bit better than trying to set a flag on the lxd_mnt_t that
+	 * says we're tearing down.
+	 */
+	vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
+	/*
+	 * Remove all directory entries (this doesn't remove top-level dirs).
+	 */
+	for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) {
+		rw_enter(&ldn->lxdn_rwlock, RW_WRITER);
+		if (ldn->lxdn_vnode->v_type == VDIR)
+			lxd_dirtrunc(ldn);
+		rw_exit(&ldn->lxdn_rwlock);
+	}
+
+	ASSERT(lxdm->lxdm_rootnode != NULL);
+
+	/*
+	 * All links are gone, v_count is keeping nodes in place.
+	 * VN_RELE should make the node disappear, unless somebody
+	 * is holding pages against it. Nap and retry until it disappears.
+	 *
+	 * We re-acquire the lock to prevent others who have a HOLD on a
+	 * lxd_node from blowing it away (in lxd_inactive) while we're trying
+	 * to get to it here. Once we have a HOLD on it we know it'll stick
+	 * around.
+	 */
+	mutex_enter(&lxdm->lxdm_contents);
+
+	/*
+	 * Remove all the files (except the rootnode) backwards.
+	 */
+	while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) {
+		mutex_exit(&lxdm->lxdm_contents);
+		/*
+		 * All nodes will be released here. Note we handled the link
+		 * count above.
+		 */
+		vp = LDNTOV(ldn);
+		ASSERT(vp->v_type == VLNK || vp->v_type == VDIR ||
+		    vp->v_type == VSOCK);
+		VN_RELE(vp);
+		mutex_enter(&lxdm->lxdm_contents);
+		/*
+		 * It's still there after the RELE. Someone else like pageout
+		 * has a hold on it so wait a bit and then try again - we know
+		 * they'll give it up soon.
+		 */
+		if (ldn == lxdm->lxdm_rootnode->lxdn_prev) {
+			VN_HOLD(vp);
+			mutex_exit(&lxdm->lxdm_contents);
+			delay(hz / 4);
+			mutex_enter(&lxdm->lxdm_contents);
+		}
+	}
+	mutex_exit(&lxdm->lxdm_contents);
+
+	ASSERT(lxdm->lxdm_back_refcnt == 1);
+	ASSERT(lxdm->lxdm_dent_refcnt == 0);
+
+	VN_RELE(LDNTOV(lxdm->lxdm_rootnode));
+
+	ASSERT(lxdm->lxdm_mntpath != NULL);
+	kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1);
+
+	mutex_destroy(&lxdm->lxdm_contents);
+	mutex_destroy(&lxdm->lxdm_renamelck);
+
+	/*
+	 * lxd_mount() initialized one mutex per hash bucket; destroy them
+	 * too before the mount structure holding them is freed.
+	 */
+	for (i = 0; i < LXD_HASH_SZ; i++)
+		mutex_destroy(&lxdm->lxdm_hash_mutex[i]);
+
+	kmem_free(lxdm, sizeof (lxd_mnt_t));
+
+	/* Allow _fini() to succeed now */
+	atomic_dec_32(&lxd_mountcount);
+}
+
+/*
+ * return root lxdnode for given vnode
+ */
+static int
+lxd_root(struct vfs *vfsp, struct vnode **vpp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	lxd_node_t *ldn = lxdm->lxdm_rootnode;
+	struct vnode *vp;
+
+	ASSERT(ldn != NULL);
+
+	vp = LDNTOV(ldn);
+	VN_HOLD(vp);
+	*vpp = vp;
+	return (0);
+}
+
+static int
+lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp)
+{
+	lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp);
+	ulong_t	blocks;
+	dev32_t d32;
+	zoneid_t eff_zid;
+	struct zone *zp;
+
+	zp = lxdm->lxdm_vfsp->vfs_zone;
+
+	if (zp == NULL)
+		eff_zid = GLOBAL_ZONEUNIQID;
+	else
+		eff_zid = zp->zone_id;
+
+	sbp->f_bsize = PAGESIZE;
+	sbp->f_frsize = PAGESIZE;
+
+	/*
+	 * Find the amount of available physical and memory swap
+	 */
+	mutex_enter(&anoninfo_lock);
+	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+	blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+	mutex_exit(&anoninfo_lock);
+
+	if (blocks > lxd_minfree)
+		sbp->f_bfree = blocks - lxd_minfree;
+	else
+		sbp->f_bfree = 0;
+
+	sbp->f_bavail = sbp->f_bfree;
+
+	/*
+	 * Total number of blocks is just what's available
+	 */
+	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+	if (eff_zid != GLOBAL_ZONEUNIQID &&
+	    zp->zone_max_swap_ctl != UINT64_MAX) {
+		/*
+		 * If
the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of lxd_nodes we can allocate from the remaining kernel memory + * available to lxdevfs in this zone. This is fairly inaccurate since + * it doesn't take into account the names stored in the directory + * entries. + */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (lxd_node_t) + sizeof (lxd_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static void +lxd_pts_devt_translator(dev_t dev, dev_t *jdev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + /* + * Linux uses a range of major numbers for pts devices to address the + * relatively small minor number space (20 bits). + */ + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + if (lx_maj > LX_PTS_MAJOR_MAX) { + /* + * The major is outside the acceptable range but there's little + * we can presently do about it short of overhauling the + * translation logic. 
+ */ + lx_unsupported("pts major out of translation range"); + } + + *jdev = LX_MAKEDEVICE(lx_maj, lx_min); +} + +static void +lxd_ptm_devt_translator(dev_t dev, dev_t *jdev) +{ + *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c new file mode 100644 index 0000000000..bee93f6aad --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c @@ -0,0 +1,1506 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <sys/lx_brand.h> +#include <sys/brand.h> + +#include "lxd.h" + +static int +lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(*vpp); + vnode_t *vp = *vpp; + vnode_t *rvp; + vnode_t *oldvp; + int error; + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + oldvp = vp; + vp = rvp = REALVP(vp); + /* + * Need to hold new reference to vp since VOP_OPEN() may + * decide to release it. 
+ */ + VN_HOLD(vp); + error = VOP_OPEN(&rvp, flag, cr, ct); + + if (!error && rvp != vp) { + /* + * the FS which we called should have released the + * new reference on vp + */ + *vpp = lxd_make_back_node(rvp, VFSTOLXDM(oldvp->v_vfsp)); + + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + VN_RELE(oldvp); + } else { + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + } + + return (error); +} + +static int +lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_CLOSE(vp, flag, count, offset, cr, ct)); +} + +static int +lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READ(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_WRITE(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr, + int *rvalp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct)); +} + +static int +lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); 
+ + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETFL(vp, oflags, nflags, cr, ct)); +} + +/* + * Translate SunOS devt to Linux devt. + */ +static void +lxd_s2l_devt(dev_t dev, dev_t *rdev) +{ + lxd_minor_translator_t *mt; + int i, j; + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + /* look for a devt translator for this major number */ + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + if (lxd_devt_translators[i].lxd_xl_major == maj) + break; + } + + if (lxd_devt_translators[i].lxd_xl_driver != NULL) { + /* try to translate the illumos devt to a linux devt */ + switch (lxd_devt_translators[i].lxd_xl_type) { + case DTT_INVALID: + ASSERT(0); + break; + + case DTT_LIST: + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + if (mt[j].lxd_mt_minor == min) { + ASSERT(mt[j].lxd_mt_minor < LX_MAXMIN); + + /* found a translation */ + *rdev = LX_MAKEDEVICE( + mt[j].lxd_mt_lx_major, + mt[j].lxd_mt_lx_minor); + return; + } + } + break; + + case DTT_CUSTOM: + lxd_devt_translators[i].xl_custom(dev, rdev); + return; + } + } + + /* we don't have a translator for this device */ + *rdev = LX_MAKEDEVICE(maj, min); +} + +static int +lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + int error; + vnode_t *rvp; + + if (ldn->lxdn_type == LXDNT_FRONT) { + mutex_enter(&ldn->lxdn_tlock); + + vap->va_type = vp->v_type; + vap->va_mode = ldn->lxdn_mode & MODEMASK; + vap->va_uid = ldn->lxdn_uid; + vap->va_gid = ldn->lxdn_gid; + vap->va_fsid = ldn->lxdn_fsid; + vap->va_nodeid = (ino64_t)ldn->lxdn_nodeid; + vap->va_nlink = ldn->lxdn_nlink; + vap->va_size = (u_offset_t)ldn->lxdn_size; + vap->va_atime = ldn->lxdn_atime; + vap->va_mtime = ldn->lxdn_mtime; + vap->va_ctime = ldn->lxdn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = 0; /* no devs in front */ + vap->va_seq = ldn->lxdn_seq; + + vap->va_nblocks = 
(fsblkcnt64_t)btodb(ptob(btopr( + vap->va_size))); + mutex_exit(&ldn->lxdn_tlock); + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + rvp = REALVP(vp); + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct))) + return (error); + + /* Skip devt translation for native programs */ + if (curproc->p_brand != &lx_brand) { + return (0); + } else { + /* + * We also skip translation when called from the user-land + * emulation code. + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + if (lwpd == NULL || lwpd->br_stack_mode != LX_STACK_MODE_BRAND) + return (0); + } + + if (rvp->v_type == VCHR) { + dev_t ldev; + + lxd_s2l_devt(vap->va_rdev, &ldev); + DTRACE_PROBE3(lxd__devxl, void *, rvp, void *, vap, int, ldev); + vap->va_rdev = ldev; + } + + return (0); +} + +static int +lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error = 0; + struct vattr *set; + long mask = vap->va_mask; + + /* Cannot set these attributes */ + if ((mask & AT_NOSET) || (mask & AT_XVATTR) || + (mask & AT_MODE && vap->va_mode & (S_ISUID | S_ISGID)) || + (mask & AT_SIZE)) + return (EINVAL); + + mutex_enter(&ldn->lxdn_tlock); + + set = &ldn->lxdn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. 
+ */ + error = secpolicy_vnode_setattr(cr, vp, vap, set, flags, + lxd_naccess, ldn); + if (error) { + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + if (mask & AT_MODE) { + set->va_mode &= S_IFMT; + set->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + set->va_uid = vap->va_uid; + if (mask & AT_GID) + set->va_gid = vap->va_gid; + if (mask & AT_ATIME) + set->va_atime = vap->va_atime; + if (mask & AT_MTIME) + set->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&ldn->lxdn_ctime); + + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETATTR(vp, vap, flags, cr, ct)); +} + +static int +lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + mutex_enter(&ldn->lxdn_tlock); + error = lxd_naccess(ldn, mode, cr); + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + vp = REALVP(vp); + return (VOP_ACCESS(vp, mode, flags, cr, ct)); +} + +static int +lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FSYNC(vp, syncflag, cr, ct)); +} + +static void +lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + mutex_enter(&ldn->lxdn_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. 
+ */ + if (vp->v_count > 1 || ldn->lxdn_nlink != 0) { + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + return; + } + + /* + * We have the last hold *and* the link count is zero, so this node is + * dead from the filesystem's viewpoint. + */ + if (ldn->lxdn_size != 0) { + if (ldn->lxdn_vnode->v_type == VLNK) + kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1); + } + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + + vn_invalid(LDNTOV(ldn)); + + mutex_enter(&lxdm->lxdm_contents); + if (ldn->lxdn_next == NULL) + lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev; + else + ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev; + ldn->lxdn_prev->lxdn_next = ldn->lxdn_next; + + mutex_exit(&lxdm->lxdm_contents); + rw_exit(&ldn->lxdn_rwlock); + rw_destroy(&ldn->lxdn_rwlock); + mutex_destroy(&ldn->lxdn_tlock); + + vn_free(LDNTOV(ldn)); + kmem_free(ldn, sizeof (lxd_node_t)); +} + +/*ARGSUSED*/ +static void +lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + lxd_front_inactive(vp, cr, ct); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + lxd_free_back_node(ldn); +} + +/* ARGSUSED */ +static int +lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FID(vp, fidp, ct)); +} + +/* + * For a front node lookup in the dirent hash table and return a shadow vnode + * (lxd_node_t type) of type LXDNT_FRONT. + * + * For a back node, lookup nm name and return a shadow vnode (lxd_node_t type) + * of the real vnode found. 
+ */ +static int +lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *vp = NULL; + int error; + vnode_t *realdvp; + lxd_mnt_t *lxdm = VTOLXDM(dvp); + int doingdotdot = 0; + lxd_node_t *ldn = VTOLDN(dvp); + lxd_node_t *nldn = NULL; + + /* + * First check for front file which could be instantiated on either a + * front or back node (e.g. the top-level mount point directory node is + * a back node which can have front files created in it). + */ + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for dir being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(ldn, nm, &nldn, cr); + rw_exit(&ldn->lxdn_rwlock); + + if (error == 0) { + /* found */ + ASSERT(nldn != NULL); + *vpp = LDNTOV(nldn); + return (0); + } + + /* At this point, if dir node is a front node, error */ + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOENT); + } + + realdvp = REALVP(dvp); + + if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { + doingdotdot++; + /* + * Handle ".." out of mounted filesystem + */ + while ((realdvp->v_flag & VROOT) && realdvp != rootdir) { + realdvp = realdvp->v_vfsp->vfs_vnodecovered; + ASSERT(realdvp != NULL); + } + } + + *vpp = NULL; /* default(error) case */ + + /* + * Do the normal lookup + */ + if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr, + ct, direntflags, realpnp)) != 0) { + vp = NULL; + goto out; + } + + /* + * We do this check here to avoid returning a stale file handle to the + * caller. + */ + if (nm[0] == '.' 
&& nm[1] == '\0') { + ASSERT(vp == realdvp); + VN_HOLD(dvp); + VN_RELE(vp); + *vpp = dvp; + return (0); + } + + if (doingdotdot) { + *vpp = lxd_make_back_node(vp, lxdm); + return (0); + } + + /* + * If this vnode is mounted on, then we + * traverse to the vnode which is the root of + * the mounted file system. + */ + if ((error = traverse(&vp)) != 0) + goto out; + + /* + * Make a lxd node for the real vnode. + */ + *vpp = lxd_make_back_node(vp, lxdm); + if (vp->v_type != VDIR) { + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + VN_RELE(vp); + error = ENOSYS; + } else { + *vpp = svp; + } + } + return (error); + } + +out: + if (error != 0 && vp != NULL) + VN_RELE(vp); + + return (error); +} + +/*ARGSUSED*/ +static int +lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, + int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + int error; + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *lnp = NULL; + + rw_enter(&parent->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(parent, nm, &lnp, cr); + rw_exit(&parent->lxdn_rwlock); + /* + * If this vnode already exists in lx devfs, we should pass the create + * operation through to the underlying resource it represents. For + * existing back nodes, the VOP_CREATE is done directly against the + * returned lxd node with an empty name (to avoid a redundant lookup). 
+ * For existing front nodes, an appropriate error must be chosen since + * they cannot represent regular files. + */ + if (error == 0) { + if (lnp->lxdn_type == LXDNT_BACK) { + error = VOP_CREATE(lnp->lxdn_real_vp, "\0", va, + exclusive, mode, vpp, cr, flag, ct, vsecp); + } else { + if (exclusive == EXCL) { + error = EEXIST; + } else if (LDNTOV(lnp)->v_type == VDIR && + (mode & S_IWRITE)) { + error = EISDIR; + } else { + error = ENOTSUP; + } + } + if (error != 0) { + ldnode_rele(lnp); + } + return (error); + } + + /* + * We cannot create files in the back devfs but we want to allow for + * O_CREAT on existing files. Pass this through and let the back file + * system allow or deny it. + */ + if (parent->lxdn_type == LXDNT_BACK) { + vnode_t *vp = NULL; + + if (*nm == '\0') { + ASSERT(vpp && dvp == *vpp); + vp = REALVP(*vpp); + } + if ((error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode, + &vp, cr, flag, ct, vsecp)) == 0) { + *vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp)); + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, + (*vpp)->v_type, cr); + VN_RELE(*vpp); + /* + * NOTE(review): the matching path in lxd_lookup() + * also does VN_RELE(vp) before failing; confirm + * which of the two is correct. + */ + if (svp == NULL) { + return (ENOSYS); + } + *vpp = svp; + } + return (0); + } + /* + * If we were unable to perform the VOP_CREATE for any reason + * other than sdev being read-only, we should bail. + */ + if (error != ENOTSUP && error != EROFS) { + return (error); + } + } + + /* + * While we don't allow create data-containing files under LX devfs, we + * must allow VSOCK front nodes to be created so that paths such as + * /dev/log can be used as AF_UNIX sockets. 
+ */ + if (va->va_type == VSOCK) { + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + lnp = NULL; + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + va, &lnp, cr, ct); + rw_exit(&parent->lxdn_rwlock); + + if (error == 0) { + *vpp = LDNTOV(lnp); + } else if (lnp != NULL) { + /* + * It's possible that a racing process created an entry + * at this name since we last performed the lookup. + */ + ldnode_rele(lnp); + } + } else { + error = ENOTSUP; + } + + return (error); +} + +static int +lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct, + int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *ldn = NULL; + int error; + + /* can only remove existing front nodes */ + error = lxd_dirlookup(parent, nm, &ldn, cr); + if (error) { + return (error); + } + + ASSERT(ldn != NULL); + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(parent, ldn, nm, DR_REMOVE, cr); + + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + + ldnode_rele(ldn); + + return (error); +} + +static int +lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + return (ENOTSUP); +} + +static int +lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + lxd_node_t *oldparent = VTOLDN(odvp); + lxd_node_t *newparent; + lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode); + lxd_node_t *fromnode = NULL; + int error; + int samedir = 0; + + if (!vn_matchops(ndvp, lxd_vnodeops)) { + /* cannot rename out of this file system */ + return (EACCES); + } + + mutex_enter(&lxdm->lxdm_renamelck); + + newparent = VTOLDN(ndvp); + + /* + * We can only rename front nodes. 
+ */ + error = lxd_dirlookup(oldparent, onm, &fromnode, cr); + if (error != 0) { + /* not found in front */ + mutex_exit(&lxdm->lxdm_renamelck); + return (error); + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. If + * that directory is "sticky" it requires further checks. + */ + if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromnode == oldparent + */ + if ((onm[0] == '.' && + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' && + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (oldparent == fromnode)) { + error = EINVAL; + goto done; + } + + samedir = (oldparent == newparent); + + /* + * Make sure we can search and rename into the destination directory. + */ + if (!samedir) { + if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0) + goto done; + } + + /* + * Link source to new target + */ + rw_enter(&newparent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME, + oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL, + cr, ct); + rw_exit(&newparent->lxdn_rwlock); + + if (error) + goto done; + + /* + * Unlink from source. + */ + rw_enter(&oldparent->lxdn_rwlock, RW_WRITER); + rw_enter(&fromnode->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr); + + /* + * The following handles the case where our source node was + * removed before we got to it. 
+ */ + if (error == ENOENT) + error = 0; + + rw_exit(&fromnode->lxdn_rwlock); + rw_exit(&oldparent->lxdn_rwlock); + +done: + ldnode_rele(fromnode); + mutex_exit(&lxdm->lxdm_renamelck); + return (error); +} + +static int +lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, + struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + int error; + vnode_t *tvp; + lxd_node_t *ndir = NULL; + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + /* check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make front directory */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_MKDIR, NULL, NULL, + va, &ndir, cr, ct); + rw_exit(&parent->lxdn_rwlock); + + if (error != 0) { + if (ndir != NULL) + ldnode_rele(ndir); + } else { + *vpp = LDNTOV(ndir); + } + + return (error); +} + +static int +lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + *vpp = vp; + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + while (vn_matchops(vp, lxd_vnodeops)) + vp = REALVP(vp); + + if (VOP_REALVP(vp, vpp, ct) != 0) + *vpp = vp; + return (0); +} + +static int +lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr, + caller_context_t *ct, int flags) +{ + int error; + lxd_node_t *ldn; + struct vnode *vp; + lxd_node_t *parent = VTOLDN(dvp); + + /* + * Return error if trying to remove . or .. 
+ */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); + + error = lxd_dirlookup(VTOLDN(dvp), nm, &ldn, cr); + if (error != 0) { + /* not found in front */ + return (error); + } + + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + vp = LDNTOV(ldn); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto err; + } + + if (ldn->lxdn_vnode->v_type != VDIR) { + error = ENOTDIR; + goto err; + } + + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink > 2) { + mutex_exit(&ldn->lxdn_tlock); + error = EEXIST; + goto err; + } + mutex_exit(&ldn->lxdn_tlock); + + /* Check for an empty directory */ + if (ldn->lxdn_dirents > 2) { + error = EEXIST; + gethrestime(&ldn->lxdn_atime); + goto err; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto err; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + vn_vfsunlock(vp); + goto err; + } + + error = lxd_dirdelete(parent, ldn, nm, DR_RMDIR, cr); + vn_vfsunlock(vp); + +err: + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + ldnode_rele(ldn); + + return (error); +} + +/* Not static so it can be used during mount. 
*/ +int +lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm, + struct cred *cr, caller_context_t *ct, int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + lxd_node_t *self = NULL; + vnode_t *tvp; + char *cp = NULL; + int error; + size_t len; + + /* this will check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make symlink in the front */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + tva, &self, cr, ct); + rw_exit(&parent->lxdn_rwlock); + + if (error) { + if (self != NULL) + ldnode_rele(self); + return (error); + } + + len = strlen(tnm) + 1; + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); + if (cp == NULL) { + ldnode_rele(self); + return (ENOSPC); + } + (void) strcpy(cp, tnm); + + self->lxdn_symlink = cp; + self->lxdn_size = len - 1; + ldnode_rele(self); + + return (error); +} + +static int +lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + if (vp->v_type != VLNK) + return (EINVAL); + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = uiomove(ldn->lxdn_symlink, ldn->lxdn_size, UIO_READ, + uiop); + gethrestime(&ldn->lxdn_atime); + rw_exit(&ldn->lxdn_rwlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READLINK(vp, uiop, cr, ct)); +} + +static int +lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp) +{ + lxd_node_t *ldn = VTOLDN(vp); + struct dirent *sd; + lxd_dirent_t *ldp; + enum lxd_node_type type = ldn->lxdn_type; + ssize_t uresid; + off_t front_off; + int error = 0; + int sdlen; + + /* skip the front entries if the back read was incomplete */ + if (*eofp == 0) + return (0); + + /* + * If this 
was a back node then reading that node has completed and we + * may have a partially full uio struct. eof should be set to true. + * Leave it set since we're likely to hit eof for the front nodes (if + * any). + */ + + front_off = uiop->uio_offset + 1; + sdlen = sizeof (struct dirent) + MAXPATHLEN; + /* zalloc to ensure we don't have anything in the d_name buffer */ + sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP); + ldp = ldn->lxdn_dir; + while (ldp != NULL && (uresid = uiop->uio_resid) > 0) { + int namelen; + int reclen; + + /* + * Skip dot and dotdot for back nodes since we have them + * already. + */ + if (type == LXDNT_BACK && + (strcmp(ldp->lddir_name, ".") == 0 || + strcmp(ldp->lddir_name, "..") == 0)) { + ldp = ldp->lddir_next; + continue; + } + + /* + * Might have previously had a partial readdir of the front + * nodes, and now we're back for more, or we may just be + * doing a follow-up readdir after we've previously + * returned all front and back nodes. + */ + if (front_off > req_off) { + namelen = strlen(ldp->lddir_name); /* no +1 needed */ + reclen = (int)DIRENT64_RECLEN(namelen); + + /* + * If the size of the data to transfer is greater + * than that requested, then we can't do it this + * transfer. + * + * NOTE(review): front_off starts at uio_offset + 1 and + * only increases, so the front_off == 0 test below + * looks unreachable for non-negative offsets -- + * confirm. + */ + if (reclen > uresid) { + *eofp = 0; + /* Buffer too small for any entries. 
*/ + if (front_off == 0) + error = EINVAL; + break; + } + + (void) strncpy(sd->d_name, ldp->lddir_name, + DIRENT64_NAMELEN(reclen)); + sd->d_reclen = (ushort_t)reclen; + sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid; + sd->d_off = front_off; + + /* uiomove will adjust iov_base properly */ + if ((error = uiomove((caddr_t)sd, reclen, UIO_READ, + uiop)) != 0) { + *eofp = 0; + break; + } + } + + /* + * uiomove() above updates both uio_resid and uio_offset by the + * same amount but we want uio_offset to change in increments + * of 1, which is different from the number of bytes being + * returned to the caller, so we set uio_offset explicitly, + * ignoring what uiomove() did. + */ + uiop->uio_offset = front_off; + front_off++; + + ldp = ldp->lddir_next; + } + + kmem_free(sd, sdlen); + return (error); +} + +static int +lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxd_node_t *ldn = VTOLDN(vp); + vnode_t *rvp; + int res; + off_t req_off; + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + req_off = uiop->uio_offset; + + /* First read the back node (if it is one) */ + if (ldn->lxdn_type == LXDNT_BACK) { + rvp = REALVP(vp); + res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags); + if (res != 0) + return (res); + } else { + /* setup for merge_front */ + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + /* caller should have already called lxd_rwlock */ + ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock)); + + *eofp = 1; + /* + * The merge code starts the offset calculation from uio_offset, + * which is normally already set to the high value by the back + * code, but in this case we need to count up from 0. + */ + uiop->uio_offset = 0; + } + + /* + * Our back nodes can also have front entries hanging on them so we + * need to merge those in. Or, we may simply have a front node (i.e. a + * front subdir). 
+ */ + res = lx_merge_front(vp, uiop, req_off, eofp); + return (res); +} + +static int +lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + if (write_lock) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + } else { + rw_enter(&ldn->lxdn_rwlock, RW_READER); + } + return (write_lock); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_RWLOCK(vp, write_lock, ct)); +} + +static void +lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + rw_exit(&ldn->lxdn_rwlock); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + VOP_RWUNLOCK(vp, write_lock, ct); +} + +static int +lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SEEK(vp, ooff, noffp, ct)); +} + +static int +lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + while (vn_matchops(vp1, lxd_vnodeops) && + VTOLDN(vp1)->lxdn_type == LXDNT_BACK) { + vp1 = REALVP(vp1); + } + while (vn_matchops(vp2, lxd_vnodeops) && + VTOLDN(vp2)->lxdn_type == LXDNT_BACK) { + vp2 = REALVP(vp2); + } + + if (vn_matchops(vp1, lxd_vnodeops) || vn_matchops(vp2, lxd_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +static int +lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, + struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); +} + +static int +lxd_space(vnode_t *vp, int cmd, struct 
flock64 *bfp, int flag, offset_t offset, + struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct)); +} + +static int +lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot, + struct page *parr[], size_t psz, struct seg *seg, caddr_t addr, + enum seg_rw rw, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr, + ct)); +} + +static int +lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PUTPAGE(vp, off, len, flags, cr, ct)); +} + +static int +lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct)); +} + +static int +lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_delmap(vnode_t *vp, offset_t off, struct as *as, 
caddr_t addr, size_t len, + uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp, + struct pollhead **phpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct)); +} + +static int +lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DUMP(vp, addr, bn, count, ct)); +} + +static int +lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PATHCONF(vp, cmd, valp, cr, ct)); +} + +static int +lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len, + int flags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct)); +} + +static void +lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + if (vp != NULL && !VN_ISKAS(vp)) + 
VOP_DISPOSE(vp, pp, fl, dn, cr, ct); +} + +static int +lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (vn_is_readonly(vp)) + return (EROFS); + + vp = REALVP(vp); + return (VOP_SETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct)); +} + +/* + * Loopback vnode operations vector. 
+ */ + +struct vnodeops *lxd_vnodeops; + +const fs_operation_def_t lxd_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxd_open }, + VOPNAME_CLOSE, { .vop_close = lxd_close }, + VOPNAME_READ, { .vop_read = lxd_read }, + VOPNAME_WRITE, { .vop_write = lxd_write }, + VOPNAME_IOCTL, { .vop_ioctl = lxd_ioctl }, + VOPNAME_SETFL, { .vop_setfl = lxd_setfl }, + VOPNAME_GETATTR, { .vop_getattr = lxd_getattr }, + VOPNAME_SETATTR, { .vop_setattr = lxd_setattr }, + VOPNAME_ACCESS, { .vop_access = lxd_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxd_lookup }, + VOPNAME_CREATE, { .vop_create = lxd_create }, + VOPNAME_REMOVE, { .vop_remove = lxd_remove }, + VOPNAME_LINK, { .vop_link = lxd_link }, + VOPNAME_RENAME, { .vop_rename = lxd_rename }, + VOPNAME_MKDIR, { .vop_mkdir = lxd_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = lxd_rmdir }, + VOPNAME_READDIR, { .vop_readdir = lxd_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = lxd_symlink }, + VOPNAME_READLINK, { .vop_readlink = lxd_readlink }, + VOPNAME_FSYNC, { .vop_fsync = lxd_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = lxd_inactive }, + VOPNAME_FID, { .vop_fid = lxd_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = lxd_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = lxd_rwunlock }, + VOPNAME_SEEK, { .vop_seek = lxd_seek }, + VOPNAME_CMP, { .vop_cmp = lxd_cmp }, + VOPNAME_FRLOCK, { .vop_frlock = lxd_frlock }, + VOPNAME_SPACE, { .vop_space = lxd_space }, + VOPNAME_REALVP, { .vop_realvp = lxd_realvp }, + VOPNAME_GETPAGE, { .vop_getpage = lxd_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = lxd_putpage }, + VOPNAME_MAP, { .vop_map = lxd_map }, + VOPNAME_ADDMAP, { .vop_addmap = lxd_addmap }, + VOPNAME_DELMAP, { .vop_delmap = lxd_delmap }, + VOPNAME_POLL, { .vop_poll = lxd_poll }, + VOPNAME_DUMP, { .vop_dump = lxd_dump }, + VOPNAME_DUMPCTL, { .error = fs_error }, + VOPNAME_PATHCONF, { .vop_pathconf = lxd_pathconf }, + VOPNAME_PAGEIO, { .vop_pageio = lxd_pageio }, + VOPNAME_DISPOSE, { .vop_dispose = lxd_dispose }, + VOPNAME_SETSECATTR, { 
.vop_setsecattr = lxd_setsecattr }, + VOPNAME_GETSECATTR, { .vop_getsecattr = lxd_getsecattr }, + VOPNAME_SHRLOCK, { .vop_shrlock = lxd_shrlock }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c new file mode 100644 index 0000000000..510626d220 --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -0,0 +1,497 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + + +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/frame.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + +/* + * We store the syscall number in the low 16 bits (which limits us to 64k + * syscalls). The next bit indicates entry/return probe and the next bit + * indicates 64bit/32bit syscall. 
+ */ +#define SCALL_MASK 0xffff +#define ENTRY_FLAG 0x10000 /* bit 16: entry (not return) probe */ +#define SYSC_64_BIT 0x100000 /* bit 20: 64-bit syscall table */ + +#define LX_SYSTRACE_IS64BIT(x) ((int)(x) & SYSC_64_BIT) +#define LX_SYSTRACE_ISENTRY(x) ((int)(x) & ENTRY_FLAG) +#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & SCALL_MASK) + +#define LX_SYSTRACE32_ENTRY(id) (ENTRY_FLAG | (id)) +#define LX_SYSTRACE32_RETURN(id) (id) + +#define LX_SYSTRACE64_ENTRY(id) (SYSC_64_BIT | ENTRY_FLAG | (id)) +#define LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | (id)) + +#define LX_SYSTRACE_ENTRY_AFRAMES 2 +#define LX_SYSTRACE_RETURN_AFRAMES 4 + +typedef struct lx_systrace_sysent { + const char *lss_name; + dtrace_id_t lss_entry; + dtrace_id_t lss_return; +} lx_systrace_sysent_t; + +static dev_info_t *lx_systrace_devi; +static dtrace_provider_id_t lx_systrace_id; +static kmutex_t lx_systrace_lock; +static uint_t lx_systrace_nenabled; + +static int lx_systrace_nsysent32; +static lx_systrace_sysent_t *lx_systrace_sysent32; + +#if defined(_LP64) +static int lx_systrace_nsysent64; +static lx_systrace_sysent_t *lx_systrace_sysent64; +#endif + +/*ARGSUSED*/ +static void +lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_entry; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_entry; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_return; + } else +#endif + { + if (sysnum 
>= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_return; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc) +{ + int i; + + if (desc != NULL) + return; + + for (i = 0; i < lx_systrace_nsysent32; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_RETURN(i))); + + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + for (i = 0; i < lx_systrace_nsysent64; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_RETURN(i))); + + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif +} + +/*ARGSUSED*/ +static int +lx_systrace_enable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + mutex_enter(&lx_systrace_lock); + if (lx_systrace_nenabled++ == 0) + lx_brand_systrace_enable(); + mutex_exit(&lx_systrace_lock); + + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { +#if defined(_LP64) + ASSERT(sysnum < 
lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = id; + } else { + lx_systrace_sysent64[sysnum].lss_return = id; + } +#endif + } else { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = id; + } else { + lx_systrace_sysent32[sysnum].lss_return = id; + } + } + return (0); +} + +/*ARGSUSED*/ +static void +lx_systrace_disable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { +#if defined(_LP64) + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE; + } +#endif + } else { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE; + } + } + + mutex_enter(&lx_systrace_lock); + if (--lx_systrace_nenabled == 0) + lx_brand_systrace_disable(); + mutex_exit(&lx_systrace_lock); +} + +/*ARGSUSED*/ +static void +lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg) +{ +} + +/*ARGSUSED*/ +static uint64_t +lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct frame *fp = (struct frame *)dtrace_getfp(); + uintptr_t *stack; + uint64_t val = 0; + int i; + + if (argno >= 6) + return (0); + + /* + * Walk the four frames down the stack to the entry or return callback. + * Our callback calls dtrace_probe() which calls dtrace_dif_variable() + * which invokes this function to get the extended arguments. We get + * the frame pointer in via call to dtrace_getfp() above which makes for + * four frames. 
+ */ + for (i = 0; i < 4; i++) { + fp = (struct frame *)fp->fr_savfp; + } + + stack = (uintptr_t *)&fp[1]; /* first word past the frame header */ + + /* + * Skip the first argument to the callback -- the system call number. + */ + argno++; + +#ifdef __amd64 + /* + * On amd64, the first 6 arguments are passed in registers while + * subsequent arguments are on the stack. + */ + argno -= 6; +#endif + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[argno]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + +/* + * Stability attribute table passed to dtrace_register() in + * lx_systrace_attach(). + */ +static const dtrace_pattr_t lx_systrace_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; +/* + * Provider operations vector passed to dtrace_register(); slots this provider + * does not implement are NULL. + */ +static dtrace_pops_t lx_systrace_pops = { + lx_systrace_provide, + NULL, + lx_systrace_enable, + lx_systrace_disable, + NULL, + NULL, + NULL, + lx_systrace_getarg, + NULL, + lx_systrace_destroy +}; +/* + * attach(9E) entry point. DDI_RESUME succeeds trivially and unknown commands + * fail. For DDI_ATTACH: create the minor node, register the "lx-syscall" + * provider, build the per-syscall probe tables from the lx_sysent tables and + * finally install the entry/return trigger pointers. + */ +static int +lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int i; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR, + 0, DDI_PSEUDO, NULL) == DDI_FAILURE || + dtrace_register("lx-syscall", &lx_systrace_attr, + DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL, + &lx_systrace_id) != 0) { + ddi_remove_minor_node(devi, NULL); /* undo on either failure */ + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + lx_systrace_devi = devi; + + /* + * Initialize the 32-bit table. 
+ */ + VERIFY(lx_nsysent32 > 0); + lx_systrace_nsysent32 = lx_nsysent32; + lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent32; i++) { + lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name; + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + /* + * Initialize the 64-bit table. + */ + VERIFY(lx_nsysent64 > 0); + lx_systrace_nsysent64 = lx_nsysent64; + lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent64; i++) { + lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name; + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif + + /* + * Install probe triggers. + */ + lx_systrace_entry_ptr = lx_systrace_entry; + lx_systrace_return_ptr = lx_systrace_return; + + return (DDI_SUCCESS); +} +/* + * detach(9E) entry point. DDI_SUSPEND succeeds trivially and unknown commands + * fail. For DDI_DETACH the provider must unregister successfully before the + * probe tables are freed and the trigger pointers cleared. + * NOTE(review): the tables are freed before the trigger pointers are reset; + * this presumably relies on no probe being enabled once dtrace_unregister() + * has succeeded -- confirm. + */ +/*ARGSUSED*/ +static int +lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (dtrace_unregister(lx_systrace_id) != 0) + return (DDI_FAILURE); + + /* + * Free tables. + */ + kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent32 = NULL; + lx_systrace_nsysent32 = 0; + +#if defined(_LP64) + kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent64 = NULL; + lx_systrace_nsysent64 = 0; +#endif + + /* + * Reset probe triggers. 
+ */ + lx_systrace_entry_ptr = NULL; + lx_systrace_return_ptr = NULL; + + return (DDI_SUCCESS); +} +/* + * open(9E) entry point: nothing to set up, so every open succeeds. + */ +/*ARGSUSED*/ +static int +lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} +/* + * Character/block entry points. Only open is implemented by this driver; the + * remaining slots hold the nodev/nulldev/nochpoll stubs. + */ +static struct cb_ops lx_systrace_cb_ops = { + lx_systrace_open, /* open */ + nodev, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; +/* + * Device operations: wires lx_systrace_attach()/lx_systrace_detach() and the + * cb_ops table above into the driver framework. + */ +static struct dev_ops lx_systrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_systrace_attach, /* attach */ + lx_systrace_detach, /* detach */ + nodev, /* reset */ + &lx_systrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +/* + * Module linkage information for the kernel. 
+ */ +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Linux Brand System Call Tracing", /* name of module */ + &lx_systrace_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf new file mode 100644 index 0000000000..e4499c8a5b --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_systrace" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c new file mode 100644 index 0000000000..6fec9ef4cb --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c @@ -0,0 +1,1684 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Compatibility for the Linux netlink(7) kernel/user transport, as well as + * for in-kernel netlink(7) providers like rtnetlink(7). See RFC 3549 for + * details of the protocol, and the Linux man pages for details of the Linux + * implementation that we're mimicking. 
+ */ + +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/brand.h> +#include <sys/debug.h> +#include <sys/ucred.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_impl.h> +#include <inet/ip_ire.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/policy.h> + +/* + * Flags in netlink header + */ +#define LX_NETLINK_NLM_F_REQUEST 1 +#define LX_NETLINK_NLM_F_MULTI 2 +#define LX_NETLINK_NLM_F_ACK 4 +#define LX_NETLINK_NLM_F_ECHO 8 +#define LX_NETLINK_NLM_F_DUMP_INTR 16 +#define LX_NETLINK_NLM_F_ROOT 0x100 +#define LX_NETLINK_NLM_F_MATCH 0x200 +#define LX_NETLINK_NLM_F_ATOMIC 0x400 + +/* + * Generic message type constants + */ +#define LX_NETLINK_NLMSG_NONE 0 +#define LX_NETLINK_NLMSG_NOOP 1 +#define LX_NETLINK_NLMSG_ERROR 2 +#define LX_NETLINK_NLMSG_DONE 3 +#define LX_NETLINK_NLMSG_OVERRUN 4 + +/* + * Protocol constants. 
+ */ +#define LX_NETLINK_ROUTE 0 +#define LX_NETLINK_UNUSED 1 +#define LX_NETLINK_USERSOCK 2 +#define LX_NETLINK_FIREWALL 3 +#define LX_NETLINK_SOCK_DIAG 4 +#define LX_NETLINK_NFLOG 5 +#define LX_NETLINK_XFRM 6 +#define LX_NETLINK_SELINUX 7 +#define LX_NETLINK_ISCSI 8 +#define LX_NETLINK_AUDIT 9 +#define LX_NETLINK_FIB_LOOKUP 10 +#define LX_NETLINK_CONNECTOR 11 +#define LX_NETLINK_NETFILTER 12 +#define LX_NETLINK_IP6_FW 13 +#define LX_NETLINK_DNRTMSG 14 +#define LX_NETLINK_KOBJECT_UEVENT 15 +#define LX_NETLINK_GENERIC 16 +#define LX_NETLINK_SCSITRANSPORT 18 +#define LX_NETLINK_ECRYPTFS 19 +#define LX_NETLINK_RDMA 20 +#define LX_NETLINK_CRYPTO 21 + +/* + * rtnetlink(7) attribute-related constants + */ +#define LX_NETLINK_NLA_ALIGNTO 4 + +#define LX_NETLINK_RTM_NEWLINK 16 +#define LX_NETLINK_RTM_DELLINK 17 +#define LX_NETLINK_RTM_GETLINK 18 +#define LX_NETLINK_RTM_SETLINK 19 +#define LX_NETLINK_RTM_NEWADDR 20 +#define LX_NETLINK_RTM_DELADDR 21 +#define LX_NETLINK_RTM_GETADDR 22 +#define LX_NETLINK_RTM_NEWROUTE 24 +#define LX_NETLINK_RTM_DELROUTE 25 +#define LX_NETLINK_RTM_GETROUTE 26 +#define LX_NETLINK_RTM_NEWNEIGH 28 +#define LX_NETLINK_RTM_DELNEIGH 29 +#define LX_NETLINK_RTM_GETNEIGH 30 +#define LX_NETLINK_RTM_NEWRULE 32 +#define LX_NETLINK_RTM_DELRULE 33 +#define LX_NETLINK_RTM_GETRULE 34 +#define LX_NETLINK_RTM_NEWQDISC 36 +#define LX_NETLINK_RTM_DELQDISC 37 +#define LX_NETLINK_RTM_GETQDISC 38 +#define LX_NETLINK_RTM_NEWTCLASS 40 +#define LX_NETLINK_RTM_DELTCLASS 41 +#define LX_NETLINK_RTM_GETTCLASS 42 +#define LX_NETLINK_RTM_NEWTFILTER 44 +#define LX_NETLINK_RTM_DELTFILTER 45 +#define LX_NETLINK_RTM_GETTFILTER 46 +#define LX_NETLINK_RTM_NEWACTION 48 +#define LX_NETLINK_RTM_DELACTION 49 +#define LX_NETLINK_RTM_GETACTION 50 +#define LX_NETLINK_RTM_NEWPREFIX 52 +#define LX_NETLINK_RTM_GETMULTICAST 58 +#define LX_NETLINK_RTM_GETANYCAST 62 +#define LX_NETLINK_RTM_NEWNEIGHTBL 64 +#define LX_NETLINK_RTM_GETNEIGHTBL 66 +#define LX_NETLINK_RTM_SETNEIGHTBL 67 +#define 
LX_NETLINK_RTM_NEWNDUSEROPT 68 +#define LX_NETLINK_RTM_NEWADDRLABEL 72 +#define LX_NETLINK_RTM_DELADDRLABEL 73 +#define LX_NETLINK_RTM_GETADDRLABEL 74 +#define LX_NETLINK_RTM_GETDCB 78 +#define LX_NETLINK_RTM_SETDCB 79 +#define LX_NETLINK_RTM_NEWNETCONF 80 +#define LX_NETLINK_RTM_GETNETCONF 82 +#define LX_NETLINK_RTM_NEWMDB 84 +#define LX_NETLINK_RTM_DELMDB 85 +#define LX_NETLINK_RTM_GETMDB 86 +#define LX_NETLINK_RTM_MAX 87 + +/* + * rtnetlink(7) attribute constants + */ +#define LX_NETLINK_RTA_UNSPEC 0 +#define LX_NETLINK_RTA_DST 1 +#define LX_NETLINK_RTA_SRC 2 +#define LX_NETLINK_RTA_IIF 3 +#define LX_NETLINK_RTA_OIF 4 +#define LX_NETLINK_RTA_GATEWAY 5 +#define LX_NETLINK_RTA_PRIORITY 6 +#define LX_NETLINK_RTA_PREFSRC 7 +#define LX_NETLINK_RTA_METRICS 8 +#define LX_NETLINK_RTA_MULTIPATH 9 +#define LX_NETLINK_RTA_PROTOINFO 10 +#define LX_NETLINK_RTA_FLOW 11 +#define LX_NETLINK_RTA_CACHEINFO 12 +#define LX_NETLINK_RTA_SESSION 13 +#define LX_NETLINK_RTA_MP_ALGO 14 +#define LX_NETLINK_RTA_TABLE 15 +#define LX_NETLINK_RTA_MARK 16 +#define LX_NETLINK_RTA_MFC_STATS 17 +#define LX_NETLINK_MAX_RTA LX_NETLINK_RTA_MFC_STATS + +/* + * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants + */ +#define LX_NETLINK_IFLA_UNSPEC 0 +#define LX_NETLINK_IFLA_ADDRESS 1 +#define LX_NETLINK_IFLA_BROADCAST 2 +#define LX_NETLINK_IFLA_IFNAME 3 +#define LX_NETLINK_IFLA_MTU 4 +#define LX_NETLINK_IFLA_LINK 5 +#define LX_NETLINK_IFLA_QDISC 6 +#define LX_NETLINK_IFLA_STATS 7 +#define LX_NETLINK_IFLA_COST 8 +#define LX_NETLINK_IFLA_PRIORITY 9 +#define LX_NETLINK_IFLA_MASTER 10 +#define LX_NETLINK_IFLA_WIRELESS 11 +#define LX_NETLINK_IFLA_PROTINFO 12 +#define LX_NETLINK_IFLA_TXQLEN 13 +#define LX_NETLINK_IFLA_MAP 14 +#define LX_NETLINK_IFLA_WEIGHT 15 +#define LX_NETLINK_IFLA_OPERSTATE 16 +#define LX_NETLINK_IFLA_LINKMODE 17 +#define LX_NETLINK_IFLA_LINKINFO 18 +#define LX_NETLINK_IFLA_NET_NS_PID 19 +#define LX_NETLINK_IFLA_IFALIAS 20 +#define LX_NETLINK_IFLA_NUM_VF 21 +#define 
LX_NETLINK_IFLA_VFINFO_LIST 22 +#define LX_NETLINK_IFLA_STATS64 23 +#define LX_NETLINK_IFLA_VF_PORTS 24 +#define LX_NETLINK_IFLA_PORT_SELF 25 +#define LX_NETLINK_IFLA_AF_SPEC 26 +#define LX_NETLINK_IFLA_GROUP 27 +#define LX_NETLINK_IFLA_NET_NS_FD 28 +#define LX_NETLINK_IFLA_EXT_MASK 29 +#define LX_NETLINK_IFLA_PROMISCUITY 30 +#define LX_NETLINK_IFLA_NUM_TX_QUEUES 31 +#define LX_NETLINK_IFLA_NUM_RX_QUEUES 32 +#define LX_NETLINK_IFLA_CARRIER 33 +#define LX_NETLINK_IFLA_PHYS_PORT_ID 34 +#define LX_NETLINK_IFLA_CARRIER_CHANGES 35 +#define LX_NETLINK_IFLA_MAX 36 + +/* + * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants + */ +#define LX_NETLINK_IFA_UNSPEC 0 +#define LX_NETLINK_IFA_ADDRESS 1 +#define LX_NETLINK_IFA_LOCAL 2 +#define LX_NETLINK_IFA_LABEL 3 +#define LX_NETLINK_IFA_BROADCAST 4 +#define LX_NETLINK_IFA_ANYCAST 5 +#define LX_NETLINK_IFA_CACHEINFO 6 +#define LX_NETLINK_IFA_MULTICAST 7 +#define LX_NETLINK_IFA_FLAGS 8 +#define LX_NETLINK_IFA_MAX 9 + +#define LX_NETLINK_IFA_F_SECONDARY 0x01 +#define LX_NETLINK_IFA_F_TEMPORARY LX_NETLINK_IFA_F_SECONDARY +#define LX_NETLINK_IFA_F_NODAD 0x02 +#define LX_NETLINK_IFA_F_OPTIMISTIC 0x04 +#define LX_NETLINK_IFA_F_DADFAILED 0x08 +#define LX_NETLINK_IFA_F_HOMEADDRESS 0x10 +#define LX_NETLINK_IFA_F_DEPRECATED 0x20 +#define LX_NETLINK_IFA_F_TENTATIVE 0x40 +#define LX_NETLINK_IFA_F_PERMANENT 0x80 +#define LX_NETLINK_IFA_F_MANAGETEMPADDR 0x100 +#define LX_NETLINK_IFA_F_NOPREFIXROUTE 0x200 + +/* + * Linux interface flags. 
+ */ +#define LX_IFF_UP (1<<0) +#define LX_IFF_BROADCAST (1<<1) +#define LX_IFF_DEBUG (1<<2) +#define LX_IFF_LOOPBACK (1<<3) +#define LX_IFF_POINTOPOINT (1<<4) +#define LX_IFF_NOTRAILERS (1<<5) +#define LX_IFF_RUNNING (1<<6) +#define LX_IFF_NOARP (1<<7) +#define LX_IFF_PROMISC (1<<8) +#define LX_IFF_ALLMULTI (1<<9) +#define LX_IFF_MASTER (1<<10) +#define LX_IFF_SLAVE (1<<11) +#define LX_IFF_MULTICAST (1<<12) +#define LX_IFF_PORTSEL (1<<13) +#define LX_IFF_AUTOMEDIA (1<<14) +#define LX_IFF_DYNAMIC (1<<15) +#define LX_IFF_LOWER_UP (1<<16) +#define LX_IFF_DORMANT (1<<17) +#define LX_IFF_ECHO (1<<18) + +/* rtm_table */ +#define LX_ROUTE_TABLE_MAIN 254 + +/* rtm_type */ +#define LX_RTN_UNSPEC 0 +#define LX_RTN_UNICAST 1 +#define LX_RTN_LOCAL 2 +#define LX_RTN_BROADCAST 3 +#define LX_RTN_ANYCAST 4 +#define LX_RTN_MULTICAST 5 +#define LX_RTN_BLACKHOLE 6 +#define LX_RTN_UNREACHABLE 7 +#define LX_RTN_PROHIBIT 8 +#define LX_RTN_THROW 9 +#define LX_RTN_NAT 10 +#define LX_RTN_XRESOLVE 11 + +/* rtm_protocol */ +#define LX_RTPROT_UNSPEC 0 +#define LX_RTPROT_REDIRECT 1 /* From ICMP redir */ +#define LX_RTPROT_KERNEL 2 /* From kernel */ +#define LX_RTPROT_BOOT 3 /* From boot */ +#define LX_RTPROT_STATIC 4 /* From administrator */ +#define LX_RTPROT_NULL 0xff /* Uninitialized */ + +/* rtm_scope */ +#define LX_RTSCOPE_UNIVERSE 0 +#define LX_RTSCOPE_SITE 200 +#define LX_RTSCOPE_LINK 253 +#define LX_RTSCOPE_HOST 254 +#define LX_RTSCOPE_NOWHERE 255 + + +/* + * Netlink sockopts + */ +#define SOL_LX_NETLINK 270 + +#define LX_NETLINK_SO_ADD_MEMBERSHIP 1 +#define LX_NETLINK_SO_DROP_MEMBERSHIP 2 +#define LX_NETLINK_SO_PKTINFO 3 +#define LX_NETLINK_SO_BROADCAST_ERROR 4 +#define LX_NETLINK_SO_NO_ENOBUFS 5 +#define LX_NETLINK_SO_RX_RING 6 +#define LX_NETLINK_SO_TX_RING 7 + +/* Internal socket flags */ +#define LXNLF_RECVUCRED 0x1 + +/* nlmsg structure macros */ +#define LXNLMSG_ALIGNTO 4 +#define LXNLMSG_ALIGN(len) \ + (((len) + LXNLMSG_ALIGNTO - 1) & ~(LXNLMSG_ALIGNTO - 1)) +#define 
LXNLMSG_HDRLEN \ + ((int)LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t))) +/* + * The helpers below mirror Linux's NLMSG_LENGTH/SPACE/DATA/PAYLOAD. They must + * be spelled in terms of the LX-prefixed macros and the lx_netlink_hdr_t field + * names: the unprefixed NLMSG_* macros and an nlmsg_len member are not defined + * by anything visible in this file, so the previous expansions could not + * compile at a point of use. + */ +#define LXNLMSG_LENGTH(len) ((len) + LXNLMSG_HDRLEN) +#define LXNLMSG_SPACE(len) LXNLMSG_ALIGN(LXNLMSG_LENGTH(len)) +#define LXNLMSG_DATA(nlh) ((void*)(((char *)(nlh)) + LXNLMSG_LENGTH(0))) +#define LXNLMSG_PAYLOAD(nlh, len) \ + ((nlh)->lxnh_len - LXNLMSG_SPACE((len))) + +/* Attribute helpers: payload pointer, aligned header size, aligned total. */ +#define LXATTR_PAYLOAD(lxa) \ + ((void*)((caddr_t)(lxa) + sizeof (lx_netlink_attr_t))) +#define LXATTR_HDRLEN LXNLMSG_ALIGN(sizeof (lx_netlink_attr_t)) +#define LXATTR_LEN(len) (LXATTR_HDRLEN + LXNLMSG_ALIGN(len)) + +/* Netlink message header; precedes every message on the wire. */ +typedef struct lx_netlink_hdr { + uint32_t lxnh_len; /* length of message */ + uint16_t lxnh_type; /* type of message */ + uint16_t lxnh_flags; /* flags */ + uint32_t lxnh_seq; /* sequence number */ + uint32_t lxnh_pid; /* sending pid */ +} lx_netlink_hdr_t; + +/* Body of an LX_NETLINK_NLMSG_ERROR message. */ +typedef struct lx_netlink_err { + lx_netlink_hdr_t lxne_hdr; /* header */ + int32_t lxne_errno; /* errno */ + lx_netlink_hdr_t lxne_failed; /* header of err */ +} lx_netlink_err_t; + +/* Attribute header; lxna_len counts this header plus the payload. */ +typedef struct lx_netlink_attr { + uint16_t lxna_len; /* length of attribute */ + uint16_t lxna_type; /* type of attribute */ +} lx_netlink_attr_t; + +typedef struct lx_netlink_ifinfomsg { + uint8_t lxnl_ifi_family; /* family: AF_UNSPEC */ + uint8_t lxnl_ifi__pad; + uint16_t lxnl_ifi_type; /* device type */ + uint32_t lxnl_ifi_index; /* interface index */ + uint32_t lxnl_ifi_flags; /* device flags */ + uint32_t lxnl_ifi_change; /* unused; must be -1 */ +} lx_netlink_ifinfomsg_t; + +/* + * Linux's struct ifaddrmsg carries a 32-bit ifa_index after the four one-byte + * fields. A uint8_t here truncated interface indices of 256 and above and + * made the structure 5 bytes instead of the 8 bytes Linux consumers expect. + */ +typedef struct lx_netlink_ifaddrmsg { + uint8_t lxnl_ifa_family; /* address type */ + uint8_t lxnl_ifa_prefixlen; /* prefix length of address */ + uint8_t lxnl_ifa_flags; /* address flags */ + uint8_t lxnl_ifa_scope; /* address scope */ + uint32_t lxnl_ifa_index; /* interface index */ +} lx_netlink_ifaddrmsg_t; + +typedef struct lx_netlink_rtmsg { + uint8_t rtm_family; /* route AF */ + uint8_t rtm_dst_len; /* destination addr length */ + uint8_t rtm_src_len; /* source addr length */ + uint8_t rtm_tos; /* TOS filter 
*/ + uint8_t rtm_table; /* routing table ID */ + uint8_t rtm_protocol; /* routing protocol */ + uint8_t rtm_scope; /* LX_RTSCOPE_* */ + uint8_t rtm_type; /* LX_RTN_* */ + uint32_t rtm_flags; +} lx_netlink_rtmsg_t; +/* + * Netlink socket address as exchanged with bind/getsockname and placed in the + * T_UNITDATA_IND source address. + */ +typedef struct lx_netlink_sockaddr { + sa_family_t lxnl_family; /* AF_LX_NETLINK */ + uint16_t lxnl_pad; /* padding */ + uint32_t lxnl_port; /* port id */ + uint32_t lxnl_groups; /* multicast groups mask */ +} lx_netlink_sockaddr_t; +/* + * Per-socket state; the sock_lower_handle_t handed to the socket callbacks is + * a pointer to one of these. + */ +typedef struct lx_netlink_sock { + struct lx_netlink_sock *lxns_next; /* list of lx_netlink sockets */ + sock_upcalls_t *lxns_upcalls; /* pointer to socket upcalls */ + sock_upper_handle_t lxns_uphandle; /* socket upcall handle */ + ldi_handle_t lxns_iphandle; /* handle to /dev/ip */ + ldi_handle_t lxns_ip6handle; /* handle to /dev/ip6 */ + ldi_handle_t lxns_current; /* current ip handle */ + int lxns_proto; /* protocol */ + uint32_t lxns_port; /* port identifier */ + uint32_t lxns_groups; /* group subscriptions */ + uint32_t lxns_bufsize; /* buffer size */ + uint32_t lxns_flags; /* socket flags */ +} lx_netlink_sock_t; +/* + * State for one in-progress reply to a request; created by lx_netlink_reply() + * and released by lx_netlink_reply_done(). + */ +typedef struct lx_netlink_reply { + lx_netlink_hdr_t lxnr_hdr; /* header that we're replying to */ + lx_netlink_sock_t *lxnr_sock; /* socket */ + uint32_t lxnr_seq; /* sequence number */ + uint16_t lxnr_type; /* type of reply */ + mblk_t *lxnr_mp; /* current mblk */ + mblk_t *lxnr_err; /* error mblk */ + mblk_t *lxnr_mp1; /* T_UNITDATA_IND mblk */ + int lxnr_errno; /* errno, if any */ +} lx_netlink_reply_t; + +static lx_netlink_sock_t *lx_netlink_head; /* head of lx_netlink sockets */ +static kmutex_t lx_netlink_lock; /* lock to protect state */ +static ldi_ident_t lx_netlink_ldi; /* LDI handle */ +static int lx_netlink_bufsize = 4096; /* default buffer size */ +static int lx_netlink_flowctrld; /* # of times flow controlled */ +/* + * Socket activate callback: remembers the upcall vector and upper handle on + * the lx_netlink socket and pushes the protocol properties (write offset, + * receive water marks, address and packet size limits) to the socket layer. + */ +/*ARGSUSED*/ +static void +lx_netlink_activate(sock_lower_handle_t handle, + sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, + int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = 
(lx_netlink_sock_t *)handle; + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ | + SOCKOPT_MAXBLK | SOCKOPT_MINPSZ; + sopp.sopp_wroff = 0; + sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; + sopp.sopp_rxlowat = SOCKET_RECVLOWATER; + sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl); + sopp.sopp_maxpsz = INFPSZ; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_minpsz = 0; + + lxsock->lxns_upcalls = sock_upcalls; + lxsock->lxns_uphandle = sock_handle; + + sock_upcalls->su_set_proto_props(sock_handle, &sopp); +} +/* + * Socket setsockopt callback. SOL_SOCKET/SO_RECVUCRED toggles + * LXNLF_RECVUCRED (EINVAL unless optlen is sizeof (int)); every other + * SOL_SOCKET option and the known SOL_LX_NETLINK options are accepted without + * effect. Other levels get EOPNOTSUPP and unknown netlink options EINVAL. + */ +/*ARGSUSED*/ +static int +lx_netlink_setsockopt(sock_lower_handle_t handle, int level, + int option_name, const void *optval, socklen_t optlen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + + if (level == SOL_SOCKET && option_name == SO_RECVUCRED) { + int *ival; + if (optlen != sizeof (int)) { + return (EINVAL); + } + ival = (int *)optval; + if (*ival == 0) { + lxsock->lxns_flags &= ~LXNLF_RECVUCRED; + } else { + lxsock->lxns_flags |= LXNLF_RECVUCRED; + } + return (0); + } else if (level == SOL_SOCKET) { + /* Punt on the other SOL_SOCKET options */ + return (0); + } else if (level != SOL_LX_NETLINK) { + return (EOPNOTSUPP); + } + + switch (option_name) { + case LX_NETLINK_SO_ADD_MEMBERSHIP: + case LX_NETLINK_SO_DROP_MEMBERSHIP: + case LX_NETLINK_SO_PKTINFO: + case LX_NETLINK_SO_BROADCAST_ERROR: + case LX_NETLINK_SO_NO_ENOBUFS: + case LX_NETLINK_SO_RX_RING: + case LX_NETLINK_SO_TX_RING: + /* Blatant lie */ + return (0); + default: + return (EINVAL); + } +} +/* + * Socket bind callback. Rejects (EINVAL) anything that is not an + * AF_LX_NETLINK lx_netlink_sockaddr_t, records the requested group mask after + * a privilege check (EACCES on failure) and stores the port, substituting the + * caller's pid when the requested port is 0. + */ +/*ARGSUSED*/ +static int +lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name, + socklen_t namelen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)name; + + if (namelen != sizeof (lx_netlink_sockaddr_t) || + lxsa->lxnl_family != AF_LX_NETLINK) { + return (EINVAL); + } + + + if 
(lxsa->lxnl_groups != 0) { + /* + * On Linux, CAP_NET_ADMIN is needed to bind to netlink groups. + * This roughly maps to PRIV_SYS_IP_CONFIG. + */ + if (secpolicy_ip_config(cr, B_FALSE) != 0) { + return (EACCES); + } + + /* Lie about group subscription for now */ + lxsock->lxns_groups = lxsa->lxnl_groups; + } + + /* + * Linux netlink uses nl_port to identify distinct netlink sockets. + * Binding to an address of nl_port=0 triggers the kernel to + * automatically assign a free nl_port identifier. Originally, + * consumers of lx_netlink were required to bind with that automatic + * address. We now support non-zero values for nl_port although strict + * checking to identify conflicts is not performed. Use of the + * id_space facility could be a convenient solution, if a need arose. + */ + if (lxsa->lxnl_port == 0) { + /* + * Because we are not doing conflict detection, there is no + * need to expend effort selecting a unique port for automatic + * addressing during bind. + */ + lxsock->lxns_port = curproc->p_pid; + } else { + lxsock->lxns_port = lxsa->lxnl_port; + } + + return (0); +} +/* + * Socket getsockname callback: fills in the caller's buffer with the bound + * port and group mask and sets *len to the address size. Returns EINVAL when + * the buffer is smaller than lx_netlink_sockaddr_t. + */ +/*ARGSUSED*/ +static int +lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa, + socklen_t *len, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)sa; + + if (*len < sizeof (lx_netlink_sockaddr_t)) + return (EINVAL); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_pad = 0; + lxsa->lxnl_port = lxsock->lxns_port; + lxsa->lxnl_groups = lxsock->lxns_groups; + + *len = sizeof (lx_netlink_sockaddr_t); + + return (0); +} +/* + * Allocates the M_PROTO T_UNITDATA_IND block that is sent ahead of each reply. + * The source address is a zero-port AF_LX_NETLINK address and, when the socket + * has LXNLF_RECVUCRED set, an SCM_UCRED control message is appended. Returns + * NULL if allocb() fails. + */ +static mblk_t * +lx_netlink_alloc_mp1(lx_netlink_sock_t *lxsock) +{ + mblk_t *mp; + size_t size; + struct T_unitdata_ind *tunit; + lx_netlink_sockaddr_t *lxsa; + boolean_t send_ucred; + + /* + * Certain netlink clients (such as systemd) will set SO_RECVUCRED + * (via the Linux SCM_CREDENTIALS) on the expectation that all replies + * will contain credentials passed 
via cmsg. They require this to + * authenticate those messages as having originated in the kernel by + * checking uc_pid == 0. + */ + VERIFY(lxsock != NULL); + send_ucred = ((lxsock->lxns_flags & LXNLF_RECVUCRED) != 0); + + /* + * Message structure: + * +----------------------------+ + * | struct T_unitdata_ind | + * +----------------------------+ + * | lx_netlink_sockaddr_t | + * +----------------------------+ -+ + * | struct cmsghdr (SCM_UCRED) | | + * +----------------------------+ +-(optional) + * | struct ucred_s (cmsg data) | | + * +----------------------------+ -+ + */ + size = sizeof (*tunit) + sizeof (*lxsa); + if (send_ucred) { + size += sizeof (struct cmsghdr) + + ROUNDUP_cmsglen(sizeof (struct ucred_s)); + } + mp = allocb(size, 0); + if (mp == NULL) { + return (NULL); + } + + tunit = (struct T_unitdata_ind *)mp->b_rptr; + lxsa = (lx_netlink_sockaddr_t *)((caddr_t)tunit + sizeof (*tunit)); + mp->b_wptr += size; /* the whole block is payload */ + + mp->b_datap->db_type = M_PROTO; + tunit->PRIM_type = T_UNITDATA_IND; + tunit->SRC_length = sizeof (*lxsa); + tunit->SRC_offset = (caddr_t)lxsa - (caddr_t)mp->b_rptr; + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_port = 0; + lxsa->lxnl_groups = 0; + lxsa->lxnl_pad = 0; + + if (send_ucred) { + struct cmsghdr *cmsg; + struct ucred_s *ucred; + + cmsg = (struct cmsghdr *)((caddr_t)lxsa + sizeof (*lxsa)); + ucred = (struct ucred_s *)CMSG_CONTENT(cmsg); + cmsg->cmsg_len = sizeof (*cmsg) + sizeof (*ucred); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + bzero(ucred, sizeof (*ucred)); /* all fields zero except those set below */ + ucred->uc_size = sizeof (*ucred); + ucred->uc_zoneid = getzoneid(); + + tunit->OPT_length = sizeof (*cmsg) + + ROUNDUP_cmsglen(sizeof (*ucred)); + tunit->OPT_offset = (caddr_t)cmsg - (caddr_t)mp->b_rptr; + } else { + tunit->OPT_length = 0; + tunit->OPT_offset = 0; + } + + return (mp); +} +/* + * Creates the reply state for the request header hdr on lxsock, with type as + * the message type of data messages. The error mblk and the final + * T_UNITDATA_IND mblk are allocated up front; returns NULL if either + * allocation fails. + */ +static lx_netlink_reply_t * +lx_netlink_reply(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, uint16_t type) +{ + lx_netlink_reply_t *reply; + mblk_t 
*err, *mp1; + + /* + * We always allocate an error block to ensure that even if subsequent + * allocations fail, we can return an error. + */ + if ((err = allocb(sizeof (lx_netlink_err_t), 0)) == NULL) + return (NULL); + + if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) { + freeb(err); + return (NULL); + } + + reply = kmem_zalloc(sizeof (lx_netlink_reply_t), KM_SLEEP); + reply->lxnr_err = err; + reply->lxnr_sock = lxsock; + reply->lxnr_hdr = *hdr; + reply->lxnr_type = type; + reply->lxnr_mp1 = mp1; + + return (reply); +} +/* + * Appends size bytes of payload to the current reply message, advancing the + * header length and write pointer by the aligned size. Does nothing once an + * error is recorded; records E2BIG if the message would exceed the socket's + * buffer size. Reads the header from lxnr_mp, so a message must have been + * started with lx_netlink_reply_msg() first. + */ +static void +lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + uint32_t aligned; + mblk_t *mp = reply->lxnr_mp; + + if (reply->lxnr_errno) + return; + + aligned = LXNLMSG_ALIGN(size); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + + if (hdr->lxnh_len + aligned > lxsock->lxns_bufsize) { + reply->lxnr_errno = E2BIG; + return; + } + + bcopy(payload, mp->b_wptr, size); + hdr->lxnh_len += aligned; + mp->b_wptr += aligned; +} +/* + * Starts a new reply message: allocates and zeroes a buffer of the socket's + * buffer size, writes the netlink header (multi-part flag, request sequence + * number, socket port) and, for a non-NULL payload, appends it. Records + * ENOMEM on allocation failure and does nothing once an error is recorded. + */ +static void +lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + if (reply->lxnr_errno) + return; + + VERIFY(reply->lxnr_mp == NULL); + + if ((reply->lxnr_mp = mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + + mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + + if (payload == NULL) { + /* + * A NULL payload denotes a "done" message. 
+ */ + hdr->lxnh_type = LX_NETLINK_NLMSG_DONE; + } else { + hdr->lxnh_type = reply->lxnr_type; + lx_netlink_reply_add(reply, payload, size); + } +} +/* + * Appends one attribute to the current message: an lx_netlink_attr_t header + * whose length covers header plus payload, followed by the payload itself. + */ +static void +lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type, + void *payload, uint32_t size) +{ + lx_netlink_attr_t attr; + + attr.lxna_len = size + sizeof (lx_netlink_attr_t); + attr.lxna_type = type; + + lx_netlink_reply_add(reply, &attr, sizeof (attr)); + lx_netlink_reply_add(reply, payload, size); +} +/* + * Appends a string attribute; the terminating NUL is included in the payload. + */ +static void +lx_netlink_reply_attr_string(lx_netlink_reply_t *reply, + uint16_t type, const char *str) +{ + lx_netlink_reply_attr(reply, type, (void *)str, strlen(str) + 1); +} +/* + * Appends a 32-bit integer attribute. + */ +static void +lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply, + uint16_t type, int32_t val) +{ + int32_t v = val; + + lx_netlink_reply_attr(reply, type, &v, sizeof (int32_t)); +} +/* + * Issues an in-kernel ioctl on the socket's current IP handle. If the reply + * already carries an error, that error is returned without issuing the ioctl; + * otherwise a failing ioctl's error is recorded in the reply and returned. + */ +static int +lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg) +{ + int rval; + + if (reply->lxnr_errno != 0) + return (reply->lxnr_errno); + + if ((rval = ldi_ioctl(reply->lxnr_sock->lxns_current, + cmd, (intptr_t)arg, FKIOCTL, kcred, NULL)) != 0) { + reply->lxnr_errno = rval; + } + + return (rval); +} +/* + * Hands one message (mp) to the socket layer, chained behind the + * T_UNITDATA_IND block mp1. A non-zero error from the upcall is only counted + * in lx_netlink_flowctrld. + */ +static void +lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + int error; + + /* + * To prevent the stream head from coalescing messages and to indicate + * their origin, we send them as T_UNITDATA_IND messages, not as raw + * M_DATA. 
+ */ + mp1->b_cont = mp; + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1, + msgdsize(mp1), 0, &error, NULL); + + if (error != 0) + lx_netlink_flowctrld++; +} + +static void +lx_netlink_reply_send(lx_netlink_reply_t *reply) +{ + mblk_t *mp1; + + if (reply->lxnr_errno) + return; + + if ((mp1 = lx_netlink_alloc_mp1(reply->lxnr_sock)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + lx_netlink_reply_sendup(reply, reply->lxnr_mp, mp1); + reply->lxnr_mp = NULL; +} + +static void +lx_netlink_reply_done(lx_netlink_reply_t *reply) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + /* + * Denote that we're done via a message with a NULL payload. + */ + lx_netlink_reply_msg(reply, NULL, 0); + + if (reply->lxnr_errno) { + /* + * If anything failed, we'll send up an error message. + */ + lx_netlink_hdr_t *hdr; + lx_netlink_err_t *err; + + if (reply->lxnr_mp != NULL) { + freeb(reply->lxnr_mp); + reply->lxnr_mp = NULL; + } + + mp = reply->lxnr_err; + VERIFY(mp != NULL); + reply->lxnr_err = NULL; + err = (lx_netlink_err_t *)mp->b_rptr; + hdr = &err->lxne_hdr; + mp->b_wptr += sizeof (lx_netlink_err_t); + + err->lxne_failed = reply->lxnr_hdr; + err->lxne_errno = reply->lxnr_errno; + hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR; + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_len = sizeof (lx_netlink_err_t); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + } else { + mp = reply->lxnr_mp; + VERIFY(mp != NULL); + reply->lxnr_mp = NULL; + } + + lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1); + + if (reply->lxnr_mp != NULL) + freeb(reply->lxnr_mp); + + if (reply->lxnr_err != NULL) + freeb(reply->lxnr_err); + + kmem_free(reply, sizeof (lx_netlink_reply_t)); +} + +static int +lx_netlink_reply_error(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, int errno) +{ + /* + * The type of the message doesn't matter, as we're going to explicitly + * set lxnr_errno and therefore send only an error message. 
+ */ + lx_netlink_reply_t *reply = lx_netlink_reply(lxsock, hdr, 0); + + if (reply == NULL) + return (ENOMEM); + + reply->lxnr_errno = errno; + lx_netlink_reply_done(reply); + + return (0); +} + +/* + * Locates the fixed-size body and the attributes of the request in mp. + * + * On success *msgp points at the body (msg_size bytes, rounded up to the + * netlink alignment) that follows the netlink header. If attrp is non-NULL + * and *attr_max is non-zero, up to *attr_max attribute headers are stored in + * attrp and *attr_max is updated to the number found. + * + * Returns 0 on success and -1 if the body or an attribute does not fit in the + * data block or in the length claimed by the header, or if an attribute claims + * a length smaller than its own header. + * + * NOTE(review): msg_left starts from lxnh_len, which still includes the + * netlink header, while buf_left does not. The buf_left checks bound every + * access, so this only makes the message-length check looser than intended; + * it is left as is because callers are not visible here. + */ +static int +lx_netlink_parse_msg_attrs(mblk_t *mp, void **msgp, unsigned int msg_size, + lx_netlink_attr_t **attrp, unsigned int *attr_max) +{ + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + lx_netlink_attr_t *lxa; + unsigned char *buf = mp->b_rptr + LXNLMSG_HDRLEN; + unsigned int i; + uint32_t buf_left = MBLKL(mp) - LXNLMSG_HDRLEN; + uint32_t msg_left = hdr->lxnh_len; + uint32_t step; + + msg_size = LXNLMSG_ALIGN(msg_size); + if (msg_size > buf_left || msg_size > msg_left) { + return (-1); + } + + *msgp = (void *)buf; + buf += msg_size; + buf_left -= msg_size; + msg_left -= msg_size; + + /* Do not bother with attr parsing if not requested */ + if (attrp == NULL || *attr_max == 0) { + return (0); + } + + for (i = 0; i < *attr_max; i++) { + if (buf_left < LXATTR_HDRLEN || msg_left < LXATTR_HDRLEN) { + break; + } + + lxa = (lx_netlink_attr_t *)buf; + /* + * The attribute length counts its own header, so anything + * shorter is malformed; a length of 0 would otherwise fill + * every remaining slot with this same attribute. + */ + if (lxa->lxna_len < sizeof (lx_netlink_attr_t) || + lxa->lxna_len > buf_left || lxa->lxna_len > msg_left) { + return (-1); + } + + attrp[i] = lxa; + + /* + * Attributes are padded to LX_NETLINK_NLA_ALIGNTO on the wire, + * so step over the padding as well. The final attribute may + * be unpadded; never step past what is left. + */ + step = LXNLMSG_ALIGN((uint32_t)lxa->lxna_len); + if (step > buf_left) + step = buf_left; + if (step > msg_left) + step = msg_left; + + buf += step; + buf_left -= step; + msg_left -= step; + } + *attr_max = i; + + return (0); +} + +/* + * Takes an IPv4 address (in network byte order) and returns the address scope. + */ +static uint8_t +lx_ipv4_rtscope(in_addr_t nbo_addr) +{ + in_addr_t addr = ntohl(nbo_addr); + if ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + return (LX_RTSCOPE_HOST); + } else if ((addr & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET) { + return (LX_RTSCOPE_LINK); + } else if ((addr & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET || + (addr & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET || + (addr & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +/* + * Takes an IPv6 address and returns the address scope. 
+ */ +static uint8_t +lx_ipv6_rtscope(const in6_addr_t *addr) { + if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) { + return (LX_RTSCOPE_HOST); + } else if (IN6_IS_ADDR_LINKLOCAL(addr)) { + return (LX_RTSCOPE_LINK); + } else if (IN6_IS_ADDR_SITELOCAL(addr)) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +static void +lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifinfomsg_t ifi; + int i; + char if_name[IFNAMSIZ]; + struct sockaddr_dl *sdl; + struct sockaddr hwaddr; + int hwaddr_size; + boolean_t is_loopback; + + struct { + int native; + int lx; + } flags[] = { + { IFF_UP, LX_IFF_UP }, + { IFF_BROADCAST, LX_IFF_BROADCAST }, + { IFF_DEBUG, LX_IFF_DEBUG }, + { IFF_LOOPBACK, LX_IFF_LOOPBACK }, + { IFF_POINTOPOINT, LX_IFF_POINTOPOINT }, + { IFF_NOTRAILERS, LX_IFF_NOTRAILERS }, + { IFF_RUNNING, LX_IFF_RUNNING }, + { IFF_NOARP, LX_IFF_NOARP }, + { IFF_PROMISC, LX_IFF_PROMISC }, + { IFF_ALLMULTI, LX_IFF_ALLMULTI }, + { IFF_MULTICAST, LX_IFF_MULTICAST }, + { 0 } + }; + + /* + * illumos interfaces that contain a ':' are non-zero logical + * interfaces. We should only emit the name of the zeroth logical + * interface, since RTM_GETLINK only expects to see the name of + * devices. The addresses of all logical devices will be + * returned via an RTM_GETADDR. + */ + if (strchr(lifr->lifr_name, ':') != NULL) + return; + + /* + * Most of the lx_netlink module is architected to emit information in + * an illumos-native manner. Socket syscalls such as getsockname will + * not translate fields to values Linux programs would expect since + * that conversion is performed by the generic socket emulation. + * + * This is _not_ true of the actual protocol output from lx_netlink. + * Since translating it at the socket layer would be onerous, all + * output (including constants and names) is pre-translated to values + * valid for Linux. 
+ */ + + bzero(&ifi, sizeof (ifi)); + ifi.lxnl_ifi_family = AF_UNSPEC; + ifi.lxnl_ifi_change = (uint32_t)-1; + + /* Convert the name to be Linux-friendly */ + (void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ); + lx_ifname_convert(if_name, LX_IF_FROMNATIVE); + is_loopback = (strncmp(if_name, "lo", 2) == 0); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifi.lxnl_ifi_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + for (i = 0; flags[i].native; i++) { + if (lifr->lifr_flags & flags[i].native) + ifi.lxnl_ifi_flags |= flags[i].lx; + } + + /* + * Query the datalink address. + * The interface type will be included in the outgoing infomsg while + * the address itself will be output separately. + */ + sdl = (struct sockaddr_dl *)&lifr->lifr_addr; + bzero(sdl, sizeof (*sdl)); + if (!is_loopback) { + lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr); + } else { + /* Simulate an empty hwaddr for loopback */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + } + lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size); + + ifi.lxnl_ifi_type = hwaddr.sa_family; + lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t)); + + lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0) + return; + + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu); + + if (hwaddr_size != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS, + hwaddr.sa_data, hwaddr_size); + } + + /* Emulate a txqlen of 1. (0 for loopbacks) */ + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN, + (is_loopback) ? 
0 : 1);
+
+	lx_netlink_reply_send(reply);
+}
+
+/*
+ * Call func once for every logical interface reported by SIOCGLIFCONF, first
+ * for AF_INET and then for AF_INET6, using the socket's per-family IP handle
+ * (published through lxns_current while that family is being processed).
+ *
+ * If distinct is set, an interface name that already appeared in the list of
+ * an earlier family is skipped, so each name is handed to func only once.
+ *
+ * A failing ioctl ends the walk early; the interface lists fetched so far
+ * are still freed.
+ */
+static void
+lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply,
+    void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct)
+{
+	lx_netlink_sock_t *sock = reply->lxnr_sock;
+	int nlifr, i;
+
+	struct {
+		int family;
+		ldi_handle_t handle;
+		struct lifconf lifc;
+		struct lifnum lifn;
+		size_t bufsize;		/* bytes allocated for lifc_buf */
+	} families[] = {
+		{ AF_INET, sock->lxns_iphandle },
+		{ AF_INET6, sock->lxns_ip6handle },
+		{ AF_UNSPEC }
+	}, *family, *check;
+
+	for (family = families; family->family != AF_UNSPEC; family++) {
+		struct lifconf *lifc = &family->lifc;
+		struct lifnum *lifn = &family->lifn;
+
+		lifn->lifn_family = family->family;
+		sock->lxns_current = family->handle;
+
+		if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0)
+			break;
+
+		lifc->lifc_family = lifn->lifn_family;
+		lifc->lifc_flags = 0;
+		lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq);
+		if (lifn->lifn_count == 0) {
+			lifc->lifc_buf = NULL;
+			continue;
+		}
+
+		/*
+		 * Remember how much was allocated: lifc_len is read back
+		 * below as the result of SIOCGLIFCONF, so it cannot be
+		 * trusted to still hold the allocation size when the buffer
+		 * is freed.
+		 */
+		family->bufsize = lifc->lifc_len;
+		lifc->lifc_buf = kmem_alloc(family->bufsize, KM_SLEEP);
+
+		if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0)
+			break;
+
+		nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]);
+
+		for (i = 0; i < nlifr; i++) {
+			if (!distinct) {
+				func(reply, &lifc->lifc_req[i]);
+				continue;
+			}
+
+			/*
+			 * If we have been asked to provide each interface
+			 * exactly once, we need to (annoyingly) check this
+			 * name against others that we've already processed for
+			 * other families.  Yes, this is quadratic time -- but
+			 * the number of interfaces per family is expected to
+			 * be very small.
+			 */
+			for (check = families; check != family; check++) {
+				struct lifconf *clifc = &check->lifc;
+				int cnlifr = clifc->lifc_len /
+				    sizeof (clifc->lifc_req[0]), j;
+				char *nm = lifc->lifc_req[i].lifr_name, *cnm;
+
+				for (j = 0; j < cnlifr; j++) {
+					cnm = clifc->lifc_req[j].lifr_name;
+
+					if (strcmp(nm, cnm) == 0)
+						break;
+				}
+
+				if (j != cnlifr)
+					break;
+			}
+
+			if (check != family)
+				continue;
+
+			func(reply, &lifc->lifc_req[i]);
+		}
+	}
+
+	for (family = families; family->family != AF_UNSPEC; family++) {
+		struct lifconf *lifc = &family->lifc;
+
+		/* Free with the size that was allocated, not lifc_len. */
+		if (lifc->lifc_buf != NULL)
+			kmem_free(lifc->lifc_buf, family->bufsize);
+	}
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp)
+{
+	lx_netlink_reply_t *reply;
+
+	reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK);
+
+	if (reply == NULL)
+		return (ENOMEM);
+
+	lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE);
+	lx_netlink_reply_done(reply);
+
+	return (0);
+}
+
+static void
+lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+	lx_netlink_ifaddrmsg_t ifa;
+
+	bzero(&ifa, sizeof (ifa));
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0)
+		return;
+
+	ifa.lxnl_ifa_index = lifr->lifr_index;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0)
+		return;
+
+	/*
+	 * Don't report on-link subnets
+	 */
+	if ((lifr->lifr_flags & IFF_NOLOCAL) != 0)
+		return;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0)
+		return;
+
+	ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0)
+		return;
+
+	if (lifr->lifr_addr.ss_family == AF_INET) {
+		struct sockaddr_in *sin;
+
+		ifa.lxnl_ifa_family = LX_AF_INET;
+
+		sin = (struct sockaddr_in *)&lifr->lifr_addr;
+		ifa.lxnl_ifa_scope = lx_ipv4_rtscope(
+		    sin->sin_addr.s_addr);
+
+		lx_netlink_reply_msg(reply, &ifa,
+		    sizeof (lx_netlink_ifaddrmsg_t));
+
+		
lx_netlink_reply_attr_int32(reply, + LX_NETLINK_IFA_ADDRESS, sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin; + + ifa.lxnl_ifa_family = LX_AF_INET6; + + sin = (struct sockaddr_in6 *)&lifr->lifr_addr; + ifa.lxnl_ifa_scope = lx_ipv6_rtscope(&sin->sin6_addr); + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS, + &sin->sin6_addr, sizeof (sin->sin6_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE); + lx_netlink_reply_done(reply); + + return (0); +} + +struct lx_getroute_ctx { + lx_netlink_reply_t *lgrtctx_reply; + lx_netlink_rtmsg_t *lgrtctx_rtmsg; + lx_netlink_attr_t *lgrtctx_attrs[LX_NETLINK_MAX_RTA]; + unsigned int lgrtctx_max_attr; + lx_netlink_attr_t *lgrtctx_rtadst; +}; + +static void +lx_netlink_getroute_ipv4(ire_t *ire, struct lx_getroute_ctx *ctx) +{ + lx_netlink_reply_t *reply = ctx->lgrtctx_reply; + lx_netlink_rtmsg_t *rtmsg = ctx->lgrtctx_rtmsg; + lx_netlink_attr_t *rtadst = ctx->lgrtctx_rtadst; + lx_netlink_rtmsg_t res; + ill_t *ill = NULL; + + /* Certain IREs are too specific for netlink */ + if ((ire->ire_type & (IRE_BROADCAST | IRE_MULTICAST | IRE_NOROUTE | + IRE_LOOPBACK | IRE_LOCAL)) != 0 || ire->ire_testhidden != 0) { + return; + } + /* + * When listing routes, CLONE entries are undesired. + * They are required for 'ip route get' on a local address. 
+ */ + if (rtmsg->rtm_dst_len == 0 && (ire->ire_type & IRE_IF_CLONE) != 0) { + return; + } + + bzero(&res, sizeof (res)); + res.rtm_family = LX_AF_INET; + res.rtm_table = LX_ROUTE_TABLE_MAIN; + res.rtm_type = LX_RTN_UNICAST; + res.rtm_dst_len = ire->ire_masklen; + + if (ire->ire_type & (IRE_IF_NORESOLVER|IRE_IF_RESOLVER)) { + /* Interface-local networks considered kernel-created */ + res.rtm_protocol = LX_RTPROT_KERNEL; + res.rtm_scope = LX_RTSCOPE_LINK; + } else if (ire->ire_flags & RTF_STATIC) { + res.rtm_protocol = LX_RTPROT_STATIC; + } + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* + * SpecifY single-destination route. + * RTA_DST details will be added later + */ + res.rtm_dst_len = rtmsg->rtm_dst_len; + } + + + lx_netlink_reply_msg(reply, &res, sizeof (res)); + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* Add RTA_DST details for single-destination route. */ + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + LXATTR_PAYLOAD(rtadst), sizeof (ipaddr_t)); + } else if (ire->ire_masklen != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + &ire->ire_addr, sizeof (ire->ire_addr)); + } + + if (ire->ire_ill != NULL) { + ill = ire->ire_ill; + } else if (ire->ire_dep_parent != NULL) { + ill = ire->ire_dep_parent->ire_ill; + } + + if (ill != NULL) { + uint32_t ifindex, addr_src; + + ifindex = ill->ill_phyint->phyint_ifindex; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_OIF, + &ifindex, sizeof (ifindex)); + + addr_src = ill->ill_ipif->ipif_lcl_addr; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_PREFSRC, + &addr_src, sizeof (addr_src)); + } + + if (ire->ire_flags & RTF_GATEWAY) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_GATEWAY, + &ire->ire_gateway_addr, sizeof (ire->ire_gateway_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getroute(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, + mblk_t *mp) +{ + struct lx_getroute_ctx ctx; + lx_netlink_reply_t *reply; + lx_netlink_rtmsg_t rtmsg, 
*rtmsgp; + int rtmsg_size = sizeof (rtmsg); + netstack_t *ns; + int i; + + bzero(&ctx, sizeof (ctx)); + ctx.lgrtctx_max_attr = LX_NETLINK_MAX_RTA; + + if (lx_netlink_parse_msg_attrs(mp, (void **)&rtmsgp, + rtmsg_size, ctx.lgrtctx_attrs, &ctx.lgrtctx_max_attr) != 0) { + return (EPROTO); + } + + /* + * Older version of libnetlink send a truncated rtmsg struct for + * certain RTM_GETROUTE queries. We must detect this condition and + * truncate our input to prevent later confusion. + */ + if (curproc->p_zone->zone_brand == &lx_brand && + lx_kern_release_cmp(curproc->p_zone, "2.6.32") <= 0 && + rtmsgp->rtm_dst_len == 0) { + rtmsg_size = sizeof (rtmsg.rtm_family); + } + bzero(&rtmsg, sizeof (rtmsg)); + bcopy(rtmsgp, &rtmsg, rtmsg_size); + ctx.lgrtctx_rtmsg = &rtmsg; + + /* If RTA_DST was passed, it effects later decisions */ + for (i = 0; i < ctx.lgrtctx_max_attr; i++) { + lx_netlink_attr_t *attr = ctx.lgrtctx_attrs[i]; + + if (attr->lxna_type == LX_NETLINK_RTA_DST && + attr->lxna_len == LXATTR_LEN(sizeof (ipaddr_t))) { + ctx.lgrtctx_rtadst = attr; + break; + } + } + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWROUTE); + if (reply == NULL) { + return (ENOMEM); + } + ctx.lgrtctx_reply = reply; + + /* Do not report anything outside the main table */ + if (rtmsg.rtm_table != LX_ROUTE_TABLE_MAIN && + rtmsg.rtm_table != 0) { + lx_netlink_reply_done(reply); + return (0); + } + + ns = netstack_get_current(); + if (ns == NULL) { + lx_netlink_reply_done(reply); + return (0); + } + if (rtmsg.rtm_family == LX_AF_INET || rtmsg.rtm_family == 0) { + if (rtmsg.rtm_dst_len == 0x20 && ctx.lgrtctx_rtadst != NULL) { + /* resolve route for host */ + ipaddr_t *dst = LXATTR_PAYLOAD(ctx.lgrtctx_rtadst); + ire_t *ire_dst; + + ire_dst = ire_route_recursive_dstonly_v4(*dst, 0, 0, + ns->netstack_ip); + lx_netlink_getroute_ipv4(ire_dst, &ctx); + ire_refrele(ire_dst); + } else { + /* get route listing */ + ire_walk_v4(&lx_netlink_getroute_ipv4, &ctx, ALL_ZONES, + ns->netstack_ip); + } + 
} + if (rtmsg.rtm_family == LX_AF_INET6) { + /* punt on ipv6 for now */ + netstack_rele(ns); + lx_netlink_reply_done(reply); + return (EPROTO); + } + netstack_rele(ns); + + lx_netlink_reply_done(reply); + return (0); +} + + +/*ARGSUSED*/ +static int +lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For all auditing messages, we return ECONNREFUSED, which seems to + * keep user-level auditing happy. (Or at least, non-suicidal.) + */ + return (ECONNREFUSED); +} + +/*ARGSUSED*/ +static int +lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For udev, we just silently accept all writes and never actually + * reply with anything -- which appears to be sufficient for things + * to work. + */ + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp, + struct nmsghdr *msg, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + int i, rval; + + static struct { + int proto; + uint16_t type; + int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *); + } handlers[] = { + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETLINK, lx_netlink_getlink }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETROUTE, lx_netlink_getroute }, + { LX_NETLINK_AUDIT, + LX_NETLINK_NLMSG_NONE, lx_netlink_audit }, + { LX_NETLINK_KOBJECT_UEVENT, + LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent }, + { LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL } + }; + + if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) { + freemsg(mp); + return (EPROTO); + } + + for (i = 0; handlers[i].func != NULL; i++) { + if (lxsock->lxns_proto != handlers[i].proto) + continue; + + if (handlers[i].type != LX_NETLINK_NLMSG_NONE && + hdr->lxnh_type != handlers[i].type) + continue; + + rval = handlers[i].func(lxsock, hdr, mp); + 
freemsg(mp); + + return (rval); + } + + /* + * An unrecognized message. We will bounce up an EOPNOTSUPP reply. + */ + rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP); + freemsg(mp); + + return (rval); +} + +/*ARGSUSED*/ +static int +lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev; + + mutex_enter(&lx_netlink_lock); + + prev = &lx_netlink_head; + + for (sock = *prev; sock != lxsock; sock = sock->lxns_next) + prev = &sock->lxns_next; + + *prev = sock->lxns_next; + + mutex_exit(&lx_netlink_lock); + + (void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred); + (void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred); + kmem_free(lxsock, sizeof (lx_netlink_sock_t)); + + return (0); +} + +static sock_downcalls_t sock_lx_netlink_downcalls = { + lx_netlink_activate, /* sd_activate */ + sock_accept_notsupp, /* sd_accept */ + lx_netlink_bind, /* sd_bind */ + sock_listen_notsupp, /* sd_listen */ + sock_connect_notsupp, /* sd_connect */ + sock_getpeername_notsupp, /* sd_getpeername */ + lx_netlink_getsockname, /* sd_getsockname */ + sock_getsockopt_notsupp, /* sd_getsockopt */ + lx_netlink_setsockopt, /* sd_setsockopt */ + lx_netlink_send, /* sd_send */ + NULL, /* sd_send_uio */ + NULL, /* sd_recv_uio */ + NULL, /* sd_poll */ + sock_shutdown_notsupp, /* sd_shutdown */ + sock_clr_flowctrl_notsupp, /* sd_setflowctrl */ + sock_ioctl_notsupp, /* sd_ioctl */ + lx_netlink_close /* sd_close */ +}; + +/*ARGSUSED*/ +static sock_lower_handle_t +lx_netlink_create(int family, int type, int proto, + sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, + int flags, cred_t *credp) +{ + lx_netlink_sock_t *lxsock; + ldi_handle_t handle, handle6; + cred_t *kcred = zone_kcred(); + int err; + + if (family != AF_LX_NETLINK || + (type != SOCK_DGRAM && type != SOCK_RAW)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + switch (proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: 
+ case LX_NETLINK_KOBJECT_UEVENT: + break; + + default: + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred, + &handle, lx_netlink_ldi)) != 0) { + *errorp = err; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred, + &handle6, lx_netlink_ldi)) != 0) { + (void) ldi_close(handle, FREAD, kcred); + *errorp = err; + return (NULL); + } + + *sock_downcalls = &sock_lx_netlink_downcalls; + *smodep = SM_ATOMIC; + + lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP); + lxsock->lxns_iphandle = handle; + lxsock->lxns_ip6handle = handle6; + lxsock->lxns_bufsize = lx_netlink_bufsize; + lxsock->lxns_proto = proto; + + mutex_enter(&lx_netlink_lock); + + lxsock->lxns_next = lx_netlink_head; + lx_netlink_head = lxsock; + + mutex_exit(&lx_netlink_lock); + + return ((sock_lower_handle_t)lxsock); +} + +static void +lx_netlink_init(void) +{ + major_t major = mod_name_to_major("ip"); + int err; + + VERIFY(major != DDI_MAJOR_T_NONE); + + err = ldi_ident_from_major(major, &lx_netlink_ldi); + VERIFY(err == 0); +} + +static void +lx_netlink_fini(void) +{ + ldi_ident_release(lx_netlink_ldi); +} + +static smod_reg_t sinfo = { + SOCKMOD_VERSION, + "lx_netlink", + SOCK_UC_VERSION, + SOCK_DC_VERSION, + lx_netlink_create, + NULL +}; + +/* modldrv structure */ +static struct modlsockmod sockmod = { + &mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo +}; + +/* modlinkage structure */ +static struct modlinkage ml = { + MODREV_1, + &sockmod, + NULL +}; + +int +_init(void) +{ + int err; + + lx_netlink_init(); + + if ((err = mod_install(&ml)) != 0) + lx_netlink_fini(); + + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&ml, modinfop)); +} + +int +_fini(void) +{ + int err = 0; + + mutex_enter(&lx_netlink_lock); + + if (lx_netlink_head != NULL) + err = EBUSY; + + mutex_exit(&lx_netlink_lock); + + if (err == 0 && (err = mod_remove(&ml)) == 0) + lx_netlink_fini(); + + return (err); +} diff 
--git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c new file mode 100644 index 0000000000..23e0c6f459 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c @@ -0,0 +1,1188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + + +/* + * This driver attempts to emulate some of the behaviors of + * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris + * + * It does this by layering over the /dev/ptmx device and intercepting + * opens to it. + * + * This driver makes the following assumptions about the way the ptm/pts + * drivers on Solaris work: + * + * - all opens of the /dev/ptmx device node return a unique dev_t. + * + * - the dev_t minor node value for each open ptm instance corresponds + * to its associated slave terminal device number. i.e. the path to + * the slave terminal device associated with an open ptm instance + * whose dev_t minor node value is 5, is /dev/pts/5. 
+ * + * - the ptm driver always allocates the lowest numbered slave terminal + * device possible. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/devops.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/kstr.h> +#include <sys/lx_ptm.h> +#include <sys/modctl.h> +#include <sys/pathname.h> +#include <sys/ptms.h> +#include <sys/ptyvar.h> +#include <sys/stat.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/sdt.h> + +#define LP_PTM_PATH "/dev/ptmx" +#define LP_PTS_PATH "/dev/pts/" +#define LP_PTS_DRV_NAME "pts" +#define LP_PTS_USEC_DELAY (5 * 1000) /* 5 ms */ +#define LP_PTS_USEC_DELAY_MAX (5 * MILLISEC) /* 5 ms */ + +/* + * this driver is layered on top of the ptm driver. we'd like to + * make this driver's minor name space a mirror of the ptm driver's + * namespace, but we can't actually do this. the reason is that the + * ptm driver is opened via the clone driver. therefore no minor nodes + * of the ptm driver are actually accessible via the filesystem. + * since we're not a streams device we can't be opened by the clone + * driver. therefore we need to have at least one minor node accessible + * via the filesystem so that consumers can open it. we use the device + * node with a minor number of 0 for this purpose. what this means is + * that minor node 0 can't be used to map ptm minor node 0. since this + * minor node is now reserved we need to shift our ptm minor node + * mappings by one. i.e. a ptm minor node with a value of 0 will + * correspond to our minor node with a value of 1. these mappings are + * managed with the following macros. 
+ */ +#define DEVT_TO_INDEX(x) LX_PTM_DEV_TO_PTS(x) +#define INDEX_TO_MINOR(x) ((x) + 1) + +/* + * grow our layered handle array by the same size increment that the ptm + * driver uses to grow the pty device space - PTY_MAXDELTA + */ +#define LP_PTY_INC 128 + +/* + * lx_ptm_ops contains state information about outstanding operations on the + * underlying master terminal device. Currently we only track information + * for read operations. + * + * Note that this data has not been rolled directly into the lx_ptm_handle + * structure because we can't put mutex's of condition variables into + * lx_ptm_handle structure. The reason is that the array of lx_ptm_handle + * structures linked to from the global lx_ptm state can be resized + * dynamically, and when it's resized, the new array is at a different + * memory location and the old array memory is discarded. Mutexs and cvs + * are accessed based off their address, so if this array was re-sized while + * there were outstanding operations on any mutexs or cvs in the array + * then the system would tip over. In the future the lx_ptm_handle structure + * array should probably be replaced with either an array of pointers to + * lx_ptm_handle structures or some other kind of data structure containing + * pointers to lx_ptm_handle structures. Then the lx_ptm_ops structure + * could be folded directly into the lx_ptm_handle structures. (This will + * also require the definition of a new locking mechanism to protect the + * contents of lx_ptm_handle structures.) + */ +typedef struct lx_ptm_ops { + int lpo_rops; + kcondvar_t lpo_rops_cv; + kmutex_t lpo_rops_lock; +} lx_ptm_ops_t; + +/* + * Every open of the master terminal device in a zone results in a new + * lx_ptm_handle handle allocation. These handles are stored in an array + * hanging off the lx_ptm_state structure. + */ +typedef struct lx_ptm_handle { + /* Device handle to the underlying real /dev/ptmx master terminal. 
*/ + ldi_handle_t lph_handle; + + /* Flag to indicate if TIOCPKT mode has been enabled. */ + int lph_pktio; + + /* Number of times the slave device has been opened/closed. */ + int lph_eofed; + + /* Callback handler in the ptm driver to check if slave is open. */ + ptmptsopencb_t lph_ppocb; + + /* Pointer to state for operations on underlying device. */ + lx_ptm_ops_t *lph_lpo; +} lx_ptm_handle_t; + +/* + * Global state for the lx_ptm driver. + */ +typedef struct lx_ptm_state { + /* lx_ptm device devinfo pointer */ + dev_info_t *lps_dip; + + /* LDI ident used to open underlying real /dev/ptmx master terminals. */ + ldi_ident_t lps_li; + + /* pts drivers major number */ + major_t lps_pts_major; + + /* rw lock used to manage access and growth of lps_lh_array */ + krwlock_t lps_lh_rwlock; + + /* number of elements in lps_lh_array */ + uint_t lps_lh_count; + + /* Array of handles to underlying real /dev/ptmx master terminals. */ + lx_ptm_handle_t *lps_lh_array; +} lx_ptm_state_t; + +/* Pointer to the lx_ptm global state structure. */ +static lx_ptm_state_t lps; + +/* + * List of modules to be autopushed onto slave terminal devices when they + * are opened in an lx branded zone. + */ +static char *lx_pts_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +static void +lx_ptm_lh_grow(uint_t index) +{ + uint_t new_lh_count, old_lh_count; + lx_ptm_handle_t *new_lh_array, *old_lh_array; + + /* + * allocate a new array. we drop the rw lock on the array so that + * readers can still access devices in case our memory allocation + * blocks. 
+ */ + new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1); + new_lh_array = + kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP); + + /* + * double check that we still actually need to increase the size + * of the array + */ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + if (index < lps.lps_lh_count) { + /* someone beat us to it so there's nothing more to do */ + rw_exit(&lps.lps_lh_rwlock); + kmem_free(new_lh_array, + sizeof (lx_ptm_handle_t) * new_lh_count); + return; + } + + /* copy the existing data into the new array */ + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_count != 0) { + bcopy(lps.lps_lh_array, new_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + } + + /* save info on the old array */ + old_lh_array = lps.lps_lh_array; + old_lh_count = lps.lps_lh_count; + + /* install the new array */ + lps.lps_lh_array = new_lh_array; + lps.lps_lh_count = new_lh_count; + + rw_exit(&lps.lps_lh_rwlock); + + /* free the old array */ + if (old_lh_array != NULL) { + kmem_free(old_lh_array, + sizeof (lx_ptm_handle_t) * old_lh_count); + } +} + +static void +lx_ptm_lh_insert(uint_t index, ldi_handle_t lh) +{ + lx_ptm_ops_t *lpo; + + ASSERT(lh != NULL); + + /* Allocate and initialize the ops structure */ + lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP); + mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + /* check if we need to grow the size of the layered handle array */ + if (index >= lps.lps_lh_count) { + rw_exit(&lps.lps_lh_rwlock); + lx_ptm_lh_grow(index); + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + } + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle == NULL); + ASSERT(lps.lps_lh_array[index].lph_pktio == 0); + ASSERT(lps.lps_lh_array[index].lph_eofed == 0); + 
ASSERT(lps.lps_lh_array[index].lph_lpo == NULL);
+
+	/* insert the new handle and return */
+	lps.lps_lh_array[index].lph_handle = lh;
+	lps.lps_lh_array[index].lph_pktio = 0;
+	lps.lps_lh_array[index].lph_eofed = 0;
+	lps.lps_lh_array[index].lph_lpo = lpo;
+
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/*
+ * Clear the layered-handle slot at index and return the LDI handle it held.
+ * The slot's read-operation state is torn down and freed here; the returned
+ * handle is not closed by this function.  The slot must be populated and
+ * have no read operation in progress.
+ */
+static ldi_handle_t
+lx_ptm_lh_remove(uint_t index)
+{
+	ldi_handle_t lh;
+	lx_ptm_ops_t *lpo;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+	ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0);
+	ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock));
+
+	/*
+	 * Destroy the cv and mutex set up by lx_ptm_lh_insert() before the
+	 * ops structure that contains them is freed.
+	 */
+	lpo = lps.lps_lh_array[index].lph_lpo;
+	cv_destroy(&lpo->lpo_rops_cv);
+	mutex_destroy(&lpo->lpo_rops_lock);
+	kmem_free(lpo, sizeof (lx_ptm_ops_t));
+	lps.lps_lh_array[index].lph_lpo = NULL;
+
+	/* remove the handle and return it */
+	lh = lps.lps_lh_array[index].lph_handle;
+	lps.lps_lh_array[index].lph_handle = NULL;
+	lps.lps_lh_array[index].lph_pktio = 0;
+	lps.lps_lh_array[index].lph_eofed = 0;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lh);
+}
+
+/* Copy the slave-open callback stored for the handle at index into *ppocb. */
+static void
+lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	*ppocb = lps.lps_lh_array[index].lph_ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Store *ppocb as the slave-open callback for the handle at index. */
+static void
+lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	lps.lps_lh_array[index].lph_ppocb = *ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+/* Return the LDI handle stored in the (populated) slot at index. */
+static ldi_handle_t
+lx_ptm_lh_lookup(uint_t index)
+{
+	ldi_handle_t lh;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* return the handle */
+	lh = lps.lps_lh_array[index].lph_handle;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lh);
+}
+
+static lx_ptm_ops_t *
+lx_ptm_lpo_lookup(uint_t index) +{ + lx_ptm_ops_t *lpo; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_lpo != NULL); + + /* return the handle */ + lpo = lps.lps_lh_array[index].lph_lpo; + rw_exit(&lps.lps_lh_rwlock); + return (lpo); +} + +static int +lx_ptm_lh_pktio_get(uint_t index) +{ + int pktio; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the pktio state */ + pktio = lps.lps_lh_array[index].lph_pktio; + rw_exit(&lps.lps_lh_rwlock); + return (pktio); +} + +static void +lx_ptm_lh_pktio_set(uint_t index, int pktio) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the pktio state */ + lps.lps_lh_array[index].lph_pktio = pktio; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_lh_eofed_get(uint_t index) +{ + int eofed; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the eofed state */ + eofed = lps.lps_lh_array[index].lph_eofed; + rw_exit(&lps.lps_lh_rwlock); + return (eofed); +} + +static void +lx_ptm_lh_eofed_set(uint_t index) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the eofed state */ + lps.lps_lh_array[index].lph_eofed++; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_read_start(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* Wait for other read operations to finish */ + while (lpo->lpo_rops != 0) { + if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) { + mutex_exit(&lpo->lpo_rops_lock); + return (-1); + } + } + + /* Start a read operation */ + 
VERIFY(++lpo->lpo_rops == 1); + mutex_exit(&lpo->lpo_rops_lock); + return (0); +} + +static void +lx_ptm_read_end(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* End a read operation */ + VERIFY(--lpo->lpo_rops == 0); + cv_signal(&lpo->lpo_rops_cv); + + mutex_exit(&lpo->lpo_rops_lock); +} + +static int +lx_ptm_pts_isopen(dev_t dev) +{ + ptmptsopencb_t ppocb; + + lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb); + return (ppocb.ppocb_func(ppocb.ppocb_arg)); +} + +static void +lx_ptm_eof_read(ldi_handle_t lh) +{ + struct uio uio; + iovec_t iov; + char junk[1]; + + /* + * We can remove any EOF message from the head of the stream by + * doing a zero byte read from the stream. + */ + iov.iov_len = 0; + iov.iov_base = junk; + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_resid = iov.iov_len; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + (void) ldi_read(lh, &uio, kcred); +} + +static int +lx_ptm_eof_drop_1(dev_t dev, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, msg_size, msg_count; + + *rvalp = 0; + + /* + * Check if there is an EOF message (represented by a zero length + * data message) at the head of the stream. Note that the + * I_NREAD ioctl is a streams framework ioctl so it will succeed + * even if there have been previous write errors on this stream. + */ + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + + if ((msg_count == 0) || (msg_size != 0)) { + /* No EOF message found */ + return (0); + } + + /* Record the fact that the slave device has been closed. 
*/ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + + /* drop the EOF */ + lx_ptm_eof_read(lh); + *rvalp = 1; + return (0); +} + +static int +lx_ptm_eof_drop(dev_t dev, int *rvalp) +{ + int rval, err; + + if (rvalp != NULL) + *rvalp = 0; + for (;;) { + if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0) + return (err); + if (rval == 0) + return (0); + if (rvalp != NULL) + *rvalp = 1; + } +} + +static int +lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + *rvalp = 0; + if (ignore_eof) { + int size, rval; + + if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size, + FKIOCTL, kcred, &rval)) != 0) + return (err); + if (size != 0) + *rvalp = 1; + } else { + int msg_size, msg_count; + + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + if (msg_count != 0) + *rvalp = 1; + } + return (0); +} + +static int +lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int err; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + err = ldi_ident_from_dip(dip, &lps.lps_li); + if (err != 0) { + ddi_remove_minor_node(dip, ddi_get_name(dip)); + return (DDI_FAILURE); + } + + lps.lps_dip = dip; + lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME); + + rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL); + lps.lps_lh_count = 0; + lps.lps_lh_array = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ldi_ident_release(lps.lps_li); + lps.lps_dip = NULL; + + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_array != NULL) { + kmem_free(lps.lps_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + 
lps.lps_lh_array = NULL; + lps.lps_lh_count = 0; + } + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + struct strioctl iocb; + ptmptsopencb_t ppocb = { NULL, NULL }; + ldi_handle_t lh; + major_t maj, our_major = getmajor(*devp); + minor_t min, lastmin; + uint_t index, anchor = 1; + dev_t ptm_dev; + int err, rval = 0; + + /* + * Don't support the FNDELAY flag and FNONBLOCK until we either + * find a Linux app that opens /dev/ptmx with the O_NDELAY + * or O_NONBLOCK flags explicitly, or until we create test cases + * to determine how reads of master terminal devices opened with + * these flags behave in different situations on Linux. Supporting + * these flags will involve enhancing our read implementation + * and changing the way it deals with EOF notifications. + */ + if (flag & (FNDELAY | FNONBLOCK)) + return (ENOTSUP); + + /* + * we're layered on top of the ptm driver so open that driver + * first. (note that we're opening /dev/ptmx in the global + * zone, not ourselves in the lx zone.) + */ + err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li); + if (err != 0) + return (err); + + /* get the devt returned by the ptmx open */ + err = ldi_get_dev(lh, &ptm_dev); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (err); + } + + /* + * we're a cloning driver so here's where we'll change the devt that we + * return. the ptmx is also a cloning driver so we'll just use + * it's minor number as our minor number (it already manages it's + * minor name space so no reason to duplicate the effort.) + */ + index = getminor(ptm_dev); + *devp = makedevice(our_major, INDEX_TO_MINOR(index)); + + /* Get a callback function to query if the pts device is open. 
*/ + iocb.ic_cmd = PTMPTSOPENCB; + iocb.ic_timout = 0; + iocb.ic_len = sizeof (ppocb); + iocb.ic_dp = (char *)&ppocb; + + err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval); + if ((err != 0) || (rval != 0)) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + ASSERT(ppocb.ppocb_func != NULL); + + /* + * now set up autopush for the terminal slave device. this is + * necessary so that when a Linux program opens the device we + * can push required strmod modules onto the stream. in Solaris + * this is normally done by the application that actually + * allocates the terminal. + */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin, + &anchor, lx_pts_mods); + if (err != 0 && err != EEXIST) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + + /* save off this layered handle for future accesses */ + lx_ptm_lh_insert(index, lh); + lx_ptm_lh_set_ppocb(index, &ppocb); + return (0); +} + /* + * close(9E) entry point. Drops our saved handle, clears the autopush + * configuration of the slave device (retrying up to five times, one + * second apart) and then closes the layered ptm handle. Returns the + * result of ldi_close(). + */ +/*ARGSUSED*/ +static int +lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + ldi_handle_t lh; + major_t maj; + minor_t min, lastmin; + uint_t index; + int err; + int i; + + index = DEVT_TO_INDEX(dev); + + /* + * we must clean up all the state associated with this major/minor + * terminal pair before actually closing the ptm master device. + * this is required because once the close of the ptm device is + * complete the major/minor terminal pair is immediately available + * for re-use in any zone. 
+ */ + + /* free up our saved reference for this layered handle */ + lh = lx_ptm_lh_remove(index); + + /* unconfigure autopush for the associated terminal slave device */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + for (i = 0; i < 5; i++) { + /* + * we loop here because we don't want to release this ptm + * node if autopush can't be disabled on the associated + * slave device because then bad things could happen if + * another brand were to get this terminal allocated + * to them. If we keep failing we eventually drive on so that + * things don't hang. + */ + err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin, + 0, NULL); + if (err == 0) + break; + + cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush", + getzoneid(), err); + + /* wait one second and try again */ + delay(drv_usectohz(1000000)); + } + + err = ldi_close(lh, flag, credp); + + /* + * note that we don't have to bother with changing the permissions + * on the associated slave device here. the reason is that no one + * can actually open the device until its associated master + * device is re-opened, which will result in the permissions on + * it being reset. + */ + return (err); +} + /* + * Perform one read attempt on the underlying ptm stream. Returns an + * errno on failure. On success *loop is 0 if data was transferred, or 1 + * if the read returned zero bytes (an EOF), in which case the EOF is + * recorded and the caller is expected to retry. + */ +static int +lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, rval; + struct uio uio = *uiop; + + *loop = 0; + + /* + * Here's another way that Linux master terminals behave differently + * from Solaris master terminals. If you do a read on a Linux + * master terminal (that was opened without NDELAY and NONBLOCK) + * whose corresponding slave terminal is currently closed and + * has been opened and closed at least once, Linux returns -1 and + * sets errno to EIO whereas Solaris blocks. + */ + if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) { + /* Slave has been opened and closed at least once. */ + if (lx_ptm_pts_isopen(dev) == 0) { + /* + * Slave is closed. 
Make sure that data is available + * before attempting a read. + */ + if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0) + return (err); + + /* If there is no data available then return. */ + if (rval == 0) + return (EIO); + } + } + + /* Actually do the read operation. */ + if ((err = ldi_read(lh, uiop, credp)) != 0) + return (err); + + /* If read returned actual data then return. */ + if (uio.uio_resid != uiop->uio_resid) + return (0); + + /* + * This was a zero byte read (ie, an EOF). This indicates + * that the slave terminal device has been closed. Record + * the fact that the slave device has been closed and retry + * the read operation. + */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + *loop = 1; + return (0); +} + /* + * read(9E) entry point. Serializes with other read-side operations, + * retries across EOF messages, and when packet mode is enabled prefixes + * the returned data with a TIOCPKT_DATA status byte. + */ +static int +lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev)); + int err, loop; + struct uio uio; + struct iovec iovp; + + ASSERT(uiop->uio_iovcnt > 0); + + /* + * If packet mode has been enabled (via TIOCPKT) we need to pad + * all read requests with a leading byte that indicates any + * relevant control status information. + */ + if (pktio != 0) { + /* + * We'd like to write the control information into + * the current buffer but we can't yet. We don't + * want to modify userspace memory here only to have + * the read operation fail later. So instead + * what we'll do here is read one character from the + * beginning of the memory pointed to by the uio + * structure. This will advance the output pointer + * by one. Then when the read completes successfully + * we can update the byte that we passed over. Before + * we do the read make a copy of the current uiop and + * iovec structs so we can write to them later. + */ + uio = *uiop; + iovp = *uiop->uio_iov; + uio.uio_iov = &iovp; + + if (uwritec(uiop) == -1) + return (EFAULT); + } + + do { + /* + * Before we actually attempt a read operation we need + * to make sure there's some buffer space to actually + * read in some data. 
We do this because if we're in + * pktio mode and the caller only requested one byte, + * then we've already used up that one byte and we + * don't want to pass this read request on. Doing a 0 + * byte read (unless there is a problem with the stream + * head) always returns success. Normally when a streams + * read returns 0 bytes we interpret that as an EOF on + * the stream (ie, the slave side has been opened and + * closed) and we ignore it and re-try the read operation. + * So if we pass on a 0 byte read here lx_ptm_read_loop() + * will tell us to loop around and we'll end up in an + * infinite loop. + */ + if (uiop->uio_resid == 0) + break; + + /* + * Serialize all reads. We need to do this so that we can + * properly emulate the behavior of master terminals on Linux. + * In reality this serialization should not pose any kind of + * performance problem since it would be very strange to have + * multiple threads trying to read from the same master + * terminal device concurrently. + */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_read_loop(dev, uiop, credp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + + if (pktio != 0) { + uint8_t pktio_data = TIOCPKT_DATA; + + /* + * Note that the control status information we + * pass back is faked up in the sense that we + * don't actually report any events, we always + * report a status of 0. 
+ */ + if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0) + return (EFAULT); + } + + return (0); +} + /* + * write(9E) entry point: hand the request straight to the layered ptm + * handle and return its result. + */ +static int +lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + err = ldi_write(lh, uiop, credp); + + return (err); +} + /* + * ioctl(9E) entry point. Rejects I_SETSIG and I_ESETSIG, emulates + * TIOCPKT locally and passes every other command to the layered ptm + * handle. + */ +static int +lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + /* + * here we need to make sure that we never allow the + * I_SETSIG and I_ESETSIG ioctls to pass through. we + * do this because we can't support them. + * + * the native Solaris ptm device supports these ioctls because + * they are streams framework ioctls and all streams devices + * support them by default. these ioctls cause the current + * process to be registered with a stream and receive signals + * when certain stream events occur. + * + * a problem arises with cleanup of these registrations + * for layered drivers. + * + * normally the streams framework is notified whenever a + * process closes any reference to a stream and it goes ahead + * and cleans up these registrations. but actual device drivers + * are not notified when a process performs a close operation + * unless the process is closing the last opened reference to + * the device on the entire system. + * + * so while we could pass these ioctls on and allow processes + * to register for signal delivery, we would never receive + * any notification when those processes exit (or close a + * stream) and we wouldn't be able to unregister them. + * + * luckily these operations are streams specific and Linux + * doesn't support streams devices. so it doesn't actually + * seem like we need to support these ioctls. if it turns + * out that we do need to support them for some reason in + * the future, the current driver model will have to be + * enhanced to better support streams device layering. 
+ */ + if ((cmd == I_SETSIG) || (cmd == I_ESETSIG)) + return (EINVAL); + + /* + * here we fake up support for TIOCPKT. Linux applications expect + * /dev/ptmx to support this ioctl, but on Solaris it doesn't. + * (it is supported on older bsd style ptys.) so we'll fake + * up support for it here. + * + * the reason that this ioctl is emulated here instead of in + * userland is that this ioctl affects the results returned + * from read() operations. if this ioctl was emulated in + * userland the brand library would need to intercept all + * read operations and check to see if pktio was enabled + * for the fd being read from. since this ioctl only needs + * to be supported on the ptmx device it makes more sense + * to support it here where we can easily update the results + * returned for read() operations performed on ourselves. + */ + if (cmd == TIOCPKT) { + int pktio; + + if (ddi_copyin((void *)arg, &pktio, sizeof (pktio), + mode) != DDI_SUCCESS) + return (EFAULT); + + if (pktio == 0) + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0); + else + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1); + + return (0); + } + + err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp); + + /* + * On recent versions of Linux some apps issue the following ioctls to + * the master side of the ptm before opening the slave side. Because + * our streams modules (specifically ptem) aren't autopushed until the + * slave side has been opened, these ioctls will fail. To alleviate the + * issue we simply pretend that these ioctls have succeeded. + * + * We could push our own "lx_ptem" module onto the master side of the + * stream in lx_ptm_open if we need better emulation, but that would + * require an "lx_ptem" module which duplicates most of ptem. ptem + * doesn't work properly when pushed on the master side. 
+ */ + if (err == EINVAL && (cmd == TIOCSWINSZ || cmd == TCSETS) && + lx_ptm_pts_isopen(dev) == 0) { + /* slave side not open, assume we need to succeed */ + DTRACE_PROBE1(lx_ptm_ioctl__override, int, cmd); + return (0); + } + + return (err); +} + /* + * One pass of poll handling. If the slave has been opened and closed at + * least once and is currently closed, POLLHUP is reported together with + * any readable/writable state. Otherwise the ptm stream is polled and + * leading EOF messages are dropped; *loop is set to 1 when an EOF was + * dropped and the caller must run another pass. + */ +static int +lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + short reventsp2; + int err, rval; + + *loop = 0; + + /* + * If the slave device has been opened and closed at least + * once and the slave device is currently closed, then poll + * always needs to return immediately. + */ + if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) && + (lx_ptm_pts_isopen(dev) == 0)) { + /* In this case always return POLLHUP */ + *reventsp = POLLHUP; + + /* + * Check if there really is data on the stream. + * If so set the correct return flags. + */ + if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) { + /* Something went wrong. */ + return (err); + } + if (rval != 0) + *reventsp |= (events & (POLLIN | POLLRDNORM)); + + /* + * Is the user checking for writability? Note that for ptm + * devices Linux seems to ignore the POLLWRBAND write flag. + */ + if ((events & POLLWRNORM) == 0) + return (0); + + /* + * To check if the stream is writable we have to actually + * call poll, but make sure to set anyyet to 1 to prevent + * the streams framework from setting up callbacks. + */ + if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0) + return (err); + + *reventsp |= (reventsp2 & POLLWRNORM); + } else { + int lockstate; + + /* + * The slave device is open (or no EOF has been recorded + * yet), do the poll + */ + if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0) + return (err); + + /* + * Drop any leading EOFs on the stream. + * + * Note that we have to use pollunlock() here to avoid + * recursive mutex enters in the poll framework. 
The + * reason is that if there is an EOF message on the stream + * then the act of reading from the queue to remove the + * message can cause the ptm driver's event service + * routine to be invoked, and if there is no open + * slave device then the ptm driver may generate + * error messages and put them on the stream. This + * in turn will generate a poll event and the poll + * framework will try to invoke any poll callbacks + * associated with the stream. In the process of + * doing that the poll framework will try to acquire + * locks that we are already holding. So we need to + * drop those locks here before we do our read. + */ + if (pollunlock(&lockstate) != 0) { + *reventsp = POLLNVAL; + return (0); + } + err = lx_ptm_eof_drop(dev, &rval); + pollrelock(lockstate); + if (err) + return (err); + + /* If no EOF was dropped then return */ + if (rval == 0) + return (0); + + /* + * An EOF was removed from the stream. Retry the entire + * poll operation from the top because polls on the ptm + * device should behave differently now. + */ + *loop = 1; + } + return (0); +} + /* + * chpoll(9E) entry point: run lx_ptm_poll_loop(), serialized against + * reads, until it no longer asks for a retry. Returns EINTR if the wait + * for the read slot was interrupted. + */ +static int +lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int loop, err; + + do { + /* Serialize ourselves wrt read operations. 
*/ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_poll_loop(dev, + events, anyyet, reventsp, phpp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + return (0); +} + /* character device entry points for the lx_ptm driver */ +static struct cb_ops lx_ptm_cb_ops = { + lx_ptm_open, /* open */ + lx_ptm_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lx_ptm_read, /* read */ + lx_ptm_write, /* write */ + lx_ptm_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + lx_ptm_poll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + NULL, /* aread */ + NULL /* awrite */ +}; + /* device operations for the lx_ptm driver */ +static struct dev_ops lx_ptm_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + ddi_getinfo_1to1, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_ptm_attach, /* attach */ + lx_ptm_detach, /* detach */ + nodev, /* reset */ + &lx_ptm_cb_ops, /* cb_ops */ + NULL, /* bus_ops */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "Linux master terminal driver", /* description of module */ + &lx_ptm_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + /* loadable module entry points: install, query and remove modlinkage */ +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf new file mode 100644 index 0000000000..481b4e3c74 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_ptm" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c new file mode 100644 index 0000000000..6cff045a80 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -0,0 +1,2586 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016, Joyent, Inc. All rights reserved. + */ + +/* + * The LX Brand: emulation of a Linux operating environment within a zone. 
+ * + * OVERVIEW + * + * The LX brand enables a full Linux userland -- including a C library, + * init(1) framework, and some set of applications -- to run unmodified + * within an illumos zone. Unlike illumos, where applications are expected + * to link against and consume functions exported from libraries, the + * supported Linux binary compatibility boundary is the system call + * interface. By accurately emulating the behaviour of Linux system calls, + * Linux software can be executed in this environment as if it were running + * on a native Linux system. + * + * EMULATING LINUX SYSTEM CALLS + * + * Linux system calls are made in 32-bit processes via the "int 0x80" + * instruction; in 64-bit processes the "syscall" instruction is used, as it + * is with native illumos processes. In both cases, arguments to system + * calls are generally passed in registers and the usermode stack is not + * interpreted or modified by the Linux kernel. + * + * When the emulated Linux process makes a system call, it traps into the + * illumos kernel. The in-kernel brand module contains various emulation + * routines, and can fully service some emulated system calls; e.g. read(2) + * and write(2). Other system calls require assistance from the illumos + * libc, bouncing back out to the brand library ("lx_brand.so.1") for + * emulation. + * + * The brand mechanism allows for the provision of an alternative trap + * handler for the various system call mechanisms. Traditionally this was + * used to immediately revector execution to the usermode emulation library, + * which was responsible for handling all system calls. In the interests of + * more accurate emulation and increased performance, much of the regular + * illumos system call path is now invoked. Only the argument processing and + * handler dispatch are replaced by the brand, via the per-LWP + * "lwp_brand_syscall" interposition function pointer. + * + * THE NATIVE AND BRAND STACKS + * + * Some runtime environments (e.g. 
the Go language) allocate very small + * thread stacks, preferring to grow or split the stack as necessary. The + * Linux kernel generally does not use the usermode stack when servicing + * system calls, so this is not a problem. In order for our emulation to + * have the same zero stack impact, we must execute usermode emulation + * routines on an _alternate_ stack. This is similar, in principle, to the + * use of sigaltstack(3C) to run signal handlers off the main thread stack. + * + * To this end, the brand library allocates and installs an alternate stack + * (called the "native" stack) for each LWP. The in-kernel brand code uses + * this stack for usermode emulation calls and interposed signal delivery, + * while the emulated Linux process sees only the data on the main thread + * stack, known as the "brand" stack. The stack mode is tracked in the + * per-LWP brand-private data, using the LX_STACK_MODE_* enum. + * + * The stack mode doubles as a system call "mode bit". When in the + * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux + * system calls. In other modes, system calls are assumed to be native + * illumos system calls as made during brand library initialisation and + * usermode emulation. + * + * USERMODE EMULATION + * + * When a Linux system call cannot be emulated within the kernel, we preserve + * the register state of the Linux process and revector the LWP to the brand + * library usermode emulation handler: the "lx_emulate()" function in + * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, + * and is performed in "lx_emulate_user()". + * + * First, the emulated process state is written out to the usermode stack of + * the process as a "ucontext_t" object. Arguments to the emulation routine + * are passed on the stack or in registers, depending on the ABI. 
When the + * usermode emulation is complete, the result is passed back to the kernel + * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context + * for restoration. + * + * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT + * + * When servicing emulated system calls in the usermode brand library, or + * during signal delivery, various state is preserved by the kernel so that + * the running LWP may be revectored to a handling routine. The context + * allows the kernel to restart the program at the point of interruption, + * either at the return of the signal handler, via setcontext(3C); or after + * the usermode emulation request has been serviced, via B_EMULATION_DONE. + * + * In illumos native processes, the saved context (a "ucontext_t" object) + * includes the state of registers and the current signal mask at the point + * of interruption. The context also includes a link to the most recently + * saved context, forming a chain to be unwound as requests complete. The LX + * brand requires additional book-keeping to describe the machine state: in + * particular, the current stack mode and the occupied extent of the native + * stack. + * + * The brand code is able to interpose on the context save and restore + * operations in the kernel -- see "lx_savecontext()" and + * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to + * function correctly in the face of a dual stack LWP. The brand also + * interposes on the signal delivery mechanism -- see "lx_sendsig()" and + * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand + * library interposer on the native stack, regardless of the interrupted + * execution mode. Linux sigaltstack(2) emulation is performed entirely by + * the usermode brand library during signal handler interposition. 
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/exec.h> +#include <sys/lx_impl.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <sys/lx_futex.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/param.h> +#include <sys/termios.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/auxv.h> +#include <sys/priv.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/archsystm.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sdt.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <sys/core.h> +#include <sys/stack.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <lx_signum.h> +#include <util/sscanf.h> +#include <sys/lx_brand.h> +#include <sys/zfs_ioctl.h> + +int lx_debug = 0; + +void lx_init_brand_data(zone_t *, kmutex_t *); +void lx_free_brand_data(zone_t *); +void lx_setbrand(proc_t *); +int lx_getattr(zone_t *, int, void *, size_t *); +int lx_setattr(zone_t *, int, void *, size_t); +int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t); +void lx_set_kern_version(zone_t *, char *); +void lx_copy_procdata(proc_t *, proc_t *); + +extern int getsetcontext(int, void *); +extern int waitsys(idtype_t, id_t, siginfo_t *, int); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +extern int waitsys32(idtype_t, id_t, siginfo_t *, int); +#endif + +extern int zvol_name2minor(const char *, minor_t *); +extern int zvol_create_minor(const char *); + +extern void lx_proc_exit(proc_t *); +extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); + +extern void lx_ioctl_init(); +extern void lx_ioctl_fini(); +extern 
void lx_socket_init(); +extern void lx_socket_fini(); + +lx_systrace_f *lx_systrace_entry_ptr; +lx_systrace_f *lx_systrace_return_ptr; + +static int lx_systrace_enabled; + +/* + * cgroup file system maintenance functions which are set when cgroups loads. + */ +void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +/* + * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly + * want an MMU dependency here (and should there be a microprocessor without + * a hole, we don't want to start allocating from the top of the VA range). + */ +#define LX_MAXSTACK64 0x7ffffff00000 + +uint64_t lx_maxstack64 = LX_MAXSTACK64; + +static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, long *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action); + +static boolean_t lx_native_exec(uint8_t, const char **); +static uint32_t lx_map32limit(proc_t *); + +static void lx_savecontext(ucontext_t *); +static void lx_restorecontext(ucontext_t *); +static caddr_t lx_sendsig_stack(int); +static void lx_sendsig(int); +#if defined(_SYSCALL32_IMPL) +static void lx_savecontext32(ucontext32_t *); +#endif +static int lx_setid_clear(vattr_t *, cred_t *); +#if defined(_LP64) +static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); +#endif + +typedef struct lx_zfs_ds { + list_node_t ds_link; + char ds_name[MAXPATHLEN]; + uint64_t ds_cookie; +} lx_zfs_ds_t; + +/* lx brand */ +struct brand_ops lx_brops = { + lx_init_brand_data, /* b_init_brand_data */ + lx_free_brand_data, /* b_free_brand_data */ + lx_brandsys, /* b_brandsys */ + lx_setbrand, /* b_setbrand */ + lx_getattr, /* b_getattr */ + lx_setattr, /* b_setattr */ + lx_copy_procdata, /* b_copy_procdata */ + lx_proc_exit, /* b_proc_exit */ + lx_exec, /* b_exec */ + lx_setrval, /* b_lwp_setrval */ + lx_lwpdata_alloc, /* b_lwpdata_alloc */ + lx_lwpdata_free, /* 
b_lwpdata_free */ + lx_initlwp, /* b_initlwp */ + lx_initlwp_post, /* b_initlwp_post */ + lx_forklwp, /* b_forklwp */ + lx_freelwp, /* b_freelwp */ + lx_exitlwp, /* b_lwpexit */ + lx_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + lx_sigfd_translate, /* b_sigfd_translate */ + NSIG, /* b_nsig */ + lx_exit_with_sig, /* b_exit_with_sig */ + lx_wait_filter, /* b_wait_filter */ + lx_native_exec, /* b_native_exec */ + lx_map32limit, /* b_map32limit */ + lx_stop_notify, /* b_stop_notify */ + lx_waitid_helper, /* b_waitid_helper */ + lx_sigcld_repost, /* b_sigcld_repost */ + lx_ptrace_issig_stop, /* b_issig_stop */ + lx_ptrace_sig_ignorable, /* b_sig_ignorable */ + lx_savecontext, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + lx_savecontext32, /* b_savecontext32 */ +#endif + lx_restorecontext, /* b_restorecontext */ + lx_sendsig_stack, /* b_sendsig_stack */ + lx_sendsig, /* b_sendsig */ + lx_setid_clear, /* b_setid_clear */ +#if defined(_LP64) + lx_pagefault, /* b_pagefault */ +#else + NULL, +#endif + B_FALSE /* b_intp_parse_arg */ +}; + +struct brand_mach_ops lx_mops = { + NULL, + NULL, + NULL, + NULL, + NULL, + lx_fixsegreg, + lx_fsbase +}; + +struct brand lx_brand = { + BRAND_VER_1, + "lx", + &lx_brops, + &lx_mops, + sizeof (struct lx_proc_data) +}; + +static struct modlbrand modlbrand = { + &mod_brandops, "lx brand", &lx_brand +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +lx_proc_exit(proc_t *p) +{ + lx_proc_data_t *lxpd; + proc_t *cp; + + mutex_enter(&p->p_lock); + VERIFY(lxpd = ptolxproc(p)); + VERIFY(lxpd->l_ptrace == 0); + if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) { + mutex_exit(&p->p_lock); + return; + } + mutex_exit(&p->p_lock); + + /* Check for children which desire notification of parental death. 
*/ + mutex_enter(&pidlock); + for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) { + mutex_enter(&cp->p_lock); + if ((lxpd = ptolxproc(cp)) == NULL) { + mutex_exit(&cp->p_lock); + continue; + } + if (lxpd->l_parent_deathsig != 0) { + sigtoproc(cp, NULL, lxpd->l_parent_deathsig); + } + mutex_exit(&cp->p_lock); + } + mutex_exit(&pidlock); +} + +void +lx_setbrand(proc_t *p) +{ + /* Send SIGCHLD to parent by default when child exits */ + ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; + + lx_read_argv_bounds(p); +} + +/* ARGSUSED */ +int +lx_setattr(zone_t *zone, int attr, void *ubuf, size_t ubufsz) +{ + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + + switch (attr) { + case LX_ATTR_KERN_RELEASE: { + char buf[LX_KERN_RELEASE_MAX]; + bzero(buf, LX_KERN_RELEASE_MAX); + if (ubufsz >= LX_KERN_RELEASE_MAX) { + return (ERANGE); + } + if (copyin(ubuf, buf, ubufsz) != 0) { + return (EFAULT); + } + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(lxzd->lxzd_kernel_release, buf, + LX_KERN_RELEASE_MAX); + mutex_exit(&lxzd->lxzd_lock); + return (0); + } + case LX_ATTR_KERN_VERSION: { + char buf[LX_KERN_VERSION_MAX]; + bzero(buf, LX_KERN_VERSION_MAX); + if (ubufsz >= LX_KERN_VERSION_MAX) { + return (ERANGE); + } + if (copyin(ubuf, buf, ubufsz) != 0) { + return (EFAULT); + } + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(lxzd->lxzd_kernel_version, buf, + LX_KERN_VERSION_MAX); + mutex_exit(&lxzd->lxzd_lock); + return (0); + } + default: + return (EINVAL); + } +} + +/* ARGSUSED */ +int +lx_getattr(zone_t *zone, int attr, void *ubuf, size_t *ubufsz) +{ + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + int len; + + switch (attr) { + case LX_ATTR_KERN_RELEASE: { + char buf[LX_KERN_RELEASE_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + len = strnlen(lxzd->lxzd_kernel_release, LX_KERN_RELEASE_MAX); + len++; + if (*ubufsz < len) { + mutex_exit(&lxzd->lxzd_lock); + return (ERANGE); + } + bzero(buf, sizeof (buf)); + (void) strncpy(buf, 
lxzd->lxzd_kernel_release, sizeof (buf)); + mutex_exit(&lxzd->lxzd_lock); + if (copyout(buf, ubuf, len) != 0) { + return (EFAULT); + } + *ubufsz = len; + return (0); + } + case LX_ATTR_KERN_VERSION: { + char buf[LX_KERN_VERSION_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + len = strnlen(lxzd->lxzd_kernel_version, LX_KERN_VERSION_MAX); + len++; + if (*ubufsz < len) { + mutex_exit(&lxzd->lxzd_lock); + return (ERANGE); + } + bzero(buf, sizeof (buf)); + (void) strncpy(buf, lxzd->lxzd_kernel_version, sizeof (buf)); + mutex_exit(&lxzd->lxzd_lock); + if (copyout(buf, ubuf, len) != 0) { + return (EFAULT); + } + *ubufsz = len; + return (0); + } + default: + return (EINVAL); + } +} + +uint32_t +lx_map32limit(proc_t *p) +{ + /* + * To be bug-for-bug compatible with Linux, we have MAP_32BIT only + * allow mappings in the first 31 bits. This was a nuance in the + * original Linux implementation circa 2002, and applications have + * come to depend on its behavior. + * + * This is only relevant for 64-bit processes. 
+ */ + if (p->p_model == DATAMODEL_LP64) + return (1 << 31); + + return ((uint32_t)USERLIMIT32); +} + +void +lx_brand_systrace_enable(void) +{ + VERIFY(!lx_systrace_enabled); + + lx_systrace_enabled = 1; +} + +void +lx_brand_systrace_disable(void) +{ + VERIFY(lx_systrace_enabled); + + lx_systrace_enabled = 0; +} + +void +lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) +{ + VERIFY(lwpd->br_ntv_stack != 0); + + /* + * The "brand-lx-set-ntv-stack-current" probe has arguments: + * arg0: stack pointer before change + * arg1: stack pointer after change + * arg2: current stack base + */ + DTRACE_PROBE3(brand__lx__set__ntv__stack__current, + uintptr_t, lwpd->br_ntv_stack_current, + uintptr_t, new_sp, + uintptr_t, lwpd->br_ntv_stack); + + lwpd->br_ntv_stack_current = new_sp; +} + +#if defined(_LP64) +static int +lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, + enum seg_rw rw) +{ + int syscall_num; + + /* + * We only want to handle a very specific set of circumstances. + * Namely: this is a 64-bit LX-branded process attempting to execute an + * address in a page for which it does not have a valid mapping. If + * this is not the case, we bail out as fast as possible. + */ + VERIFY(PROC_IS_BRANDED(p)); + if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != + DATAMODEL_NATIVE) { + return (-1); + } + + if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { + return (-1); + } + + /* + * This is a valid vsyscall address. We service the system call and + * return 0 to signal that the pagefault has been handled completely. + */ + lx_vsyscall_enter(p, lwp, syscall_num); + return (0); +} +#endif + +/* + * This hook runs prior to sendsig() processing and allows us to nominate + * an alternative stack pointer for delivery of the signal handling frame. + * Critically, this routine should _not_ modify any LWP state as the + * savecontext() does not run until after this hook. 
+ */ +static caddr_t +lx_sendsig_stack(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We want to take signal delivery on the native stack, but only if + * one has been allocated and installed for this LWP. + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The program is not running on the native stack. Return + * the native stack pointer from our brand-private data so + * that we may switch to it for signal handling. + */ + return ((caddr_t)lwpd->br_ntv_stack_current); + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * Either the program is already running on the native stack, + * or one has not yet been allocated for this LWP. Use the + * current stack pointer value. + */ + return ((caddr_t)rp->r_sp); + } +} + +/* + * This hook runs after sendsig() processing and allows us to update the + * per-LWP mode flags for system calls and stacks. The pre-signal + * context has already been saved and delivered to the user at this point. + */ +static void +lx_sendsig(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * In lx_sendsig_stack(), we nominated a stack pointer from the + * native stack. Update the stack mode, and the current in-use + * extent of the native stack, accordingly: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, rp->r_sp); + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + break; + + default: + /* + * Otherwise, the brand library has not yet installed the + * alternate stack for this LWP. Signals will be handled on + * the regular stack thread. + */ + return; + } +} + +/* + * This hook runs prior to the context restoration, allowing us to take action + * or modify the context before it is loaded. 
+ */ +static void +lx_restorecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; + caddr_t sp = ucp->uc_brand_data[1]; + + /* + * We have a saved native stack pointer value that we must restore + * into the per-LWP data. + */ + if (flags & LX_UC_RESTORE_NATIVE_SP) { + lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); + } + + /* + * We do not wish to restore the value of uc_link in this context, + * so replace it with the value currently in the LWP. + */ + if (flags & LX_UC_IGNORE_LINK) { + ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; + } + + /* + * Restore the stack mode: + */ + if (flags & LX_UC_STACK_NATIVE) { + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + } else if (flags & LX_UC_STACK_BRAND) { + lwpd->br_stack_mode = LX_STACK_MODE_BRAND; + } + +#if defined(__amd64) + /* + * Override the fs/gsbase in the context with the value provided + * through the Linux arch_prctl(2) system call. + */ + if (flags & LX_UC_STACK_BRAND) { + if (lwpd->br_lx_fsbase != 0) { + ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; + } + if (lwpd->br_lx_gsbase != 0) { + ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; + } + } +#endif +} + +static void +lx_savecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". 
+ */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = + (void *)(uintptr_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = (void *)flags; +} + +#if defined(_SYSCALL32_IMPL) +static void +lx_savecontext32(ucontext32_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + unsigned int flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". 
+ */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = flags; +} +#endif + +static int +lx_zfs_ioctl(ldi_handle_t lh, int cmd, zfs_cmd_t *zc, size_t *dst_alloc_size) +{ + uint64_t cookie; + size_t dstsize; + int rc, unused; + + cookie = zc->zc_cookie; + + dstsize = (dst_alloc_size == NULL ? 0 : 8192); + +again: + if (dst_alloc_size != NULL) { + zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(dstsize, + KM_SLEEP); + zc->zc_nvlist_dst_size = dstsize; + } + + rc = ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred, &unused); + if (rc == ENOMEM && dst_alloc_size != NULL) { + /* + * Our nvlist_dst buffer was too small, retry with a bigger + * buffer. ZFS will tell us the exact needed size. 
+ */ + size_t newsize = zc->zc_nvlist_dst_size; + ASSERT(newsize > dstsize); + + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, dstsize); + dstsize = newsize; + zc->zc_cookie = cookie; + + goto again; + } + + if (dst_alloc_size != NULL) { + *dst_alloc_size = dstsize; + } + + return (rc); +} + +static int +lx_zone_zfs_open(ldi_handle_t *lh, dev_t *zfs_dev) +{ + ldi_ident_t li; + + if (ldi_ident_from_mod(&modlinkage, &li) != 0) { + return (-1); + } + if (ldi_open_by_name("/dev/zfs", FREAD|FWRITE, kcred, lh, li) != 0) { + ldi_ident_release(li); + return (-1); + } + ldi_ident_release(li); + if (ldi_get_dev(*lh, zfs_dev) != 0) { + ldi_close(*lh, FREAD|FWRITE, kcred); + return (-1); + } + return (0); +} + +/* + * We only get the relevant properties for zvols. This is because we're + * essentially iterating all of the ZFS datasets/zvols on the entire system + * when we boot the zone and there is a significant performance penalty if we + * have to retrieve all of the properties for everything. Especially since we + * don't care about any of them except the zvols actually in our delegated + * datasets. + * + * Note that the two properties we care about, volsize & volblocksize, are + * mandatory for zvols and should always be present. Also, note that the + * blocksize property value cannot change after the zvol has been created. 
+ */ +static void +lx_zvol_props(ldi_handle_t lh, zfs_cmd_t *zc, uint64_t *vsz, uint64_t *bsz) +{ + int rc; + size_t size; + nvlist_t *nv = NULL, *nv2; + + rc = lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size); + if (rc != 0) + return; + + rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, &nv, 0); + ASSERT(rc == 0); + + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); + zc->zc_nvlist_dst = NULL; + zc->zc_nvlist_dst_size = 0; + + if ((rc = nvlist_lookup_nvlist(nv, "volsize", &nv2)) == 0) { + uint64_t val; + + rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val); + if (rc == 0) { + *vsz = val; + } + } + + if ((rc = nvlist_lookup_nvlist(nv, "volblocksize", &nv2)) == 0) { + uint64_t val; + + rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val); + if (rc == 0) { + *bsz = val; + } + } + + nvlist_free(nv); +} + +/* + * Unlike ZFS proper, which does dynamic zvols, we currently only generate the + * zone's "disk" list once at zone boot time and use that consistently in all + * of the various subsystems (devfs, sysfs, procfs). This allows us to avoid + * re-iterating the datasets every time one of those subsystems accesses a + * "disk" and allows us to keep the view consistent across all subsystems, but + * it does mean a reboot is required to see new "disks". This is somewhat + * mitigated by its similarity to actual disk drives on a real system. 
+ */ +static void +lx_zone_get_zvols(zone_t *zone, ldi_handle_t lh, minor_t *emul_minor) +{ + lx_zone_data_t *lxzd; + list_t *zvol_lst, ds_lst; + int rc; + unsigned int devnum = 0; + size_t size; + zfs_cmd_t *zc; + nvpair_t *elem = NULL; + nvlist_t *pnv = NULL; + + lxzd = ztolxzd(zone); + ASSERT(lxzd != NULL); + zvol_lst = lxzd->lxzd_vdisks; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + if (lx_zfs_ioctl(lh, ZFS_IOC_POOL_CONFIGS, zc, &size) != 0) { + goto out; + } + ASSERT(zc->zc_cookie > 0); + + rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, &pnv, 0); + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); + if (rc != 0) + goto out; + + /* + * We use a dataset list to process all of the datasets in the pool + * without doing recursion so that we don't risk blowing the kernel + * stack. + */ + list_create(&ds_lst, sizeof (lx_zfs_ds_t), + offsetof(lx_zfs_ds_t, ds_link)); + + while ((elem = nvlist_next_nvpair(pnv, elem)) != NULL) { + lx_zfs_ds_t *ds; + + ds = kmem_zalloc(sizeof (lx_zfs_ds_t), KM_SLEEP); + (void) strcpy(ds->ds_name, nvpair_name(elem)); + list_insert_head(&ds_lst, ds); + + while (ds != NULL) { + int w; /* dummy variable */ + + bzero(zc, sizeof (zfs_cmd_t)); + zc->zc_cookie = ds->ds_cookie; + (void) strcpy(zc->zc_name, ds->ds_name); + + rc = lx_zfs_ioctl(lh, ZFS_IOC_DATASET_LIST_NEXT, + zc, NULL); + /* Update the cookie before doing anything else. */ + ds->ds_cookie = zc->zc_cookie; + + if (rc != 0) { + list_remove(&ds_lst, ds); + kmem_free(ds, sizeof (lx_zfs_ds_t)); + ds = list_tail(&ds_lst); + continue; + } + + /* Reserved internal names, skip over these. 
*/ + if (strchr(zc->zc_name, '$') != NULL || + strchr(zc->zc_name, '%') != NULL) + continue; + + if (!zone_dataset_visible_inzone(zone, zc->zc_name, &w)) + continue; + + if (zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) { + lx_virt_disk_t *vd; + minor_t m = 0; + char *znm = zc->zc_name; + + /* Create a virtual disk entry for the zvol */ + vd = kmem_zalloc(sizeof (lx_virt_disk_t), + KM_SLEEP); + vd->lxvd_type = LXVD_ZVOL; + (void) snprintf(vd->lxvd_name, + sizeof (vd->lxvd_name), + "zvol%u", devnum++); + (void) strlcpy(vd->lxvd_real_name, + zc->zc_name, + sizeof (vd->lxvd_real_name)); + + /* Record emulated and real dev_t values */ + vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, + (*emul_minor)++); + if (zvol_name2minor(znm, &m) != 0) { + (void) zvol_create_minor(znm); + zvol_name2minor(znm, &m); + } + if (m != 0) { + vd->lxvd_real_dev = makedevice( + getmajor(lxzd->lxzd_zfs_dev), m); + } + + /* Query volume size properties */ + lx_zvol_props(lh, zc, &vd->lxvd_volsize, + &vd->lxvd_blksize); + + list_insert_tail(zvol_lst, vd); + } else { + lx_zfs_ds_t *nds; + + /* Create a new ds_t for the child. */ + nds = kmem_zalloc(sizeof (lx_zfs_ds_t), + KM_SLEEP); + (void) strcpy(nds->ds_name, zc->zc_name); + list_insert_after(&ds_lst, ds, nds); + + /* Depth-first, so do the one just created. */ + ds = nds; + } + } + + ASSERT(list_is_empty(&ds_lst)); + } + + list_destroy(&ds_lst); + +out: + nvlist_free(pnv); + kmem_free(zc, sizeof (zfs_cmd_t)); +} + +static void +lx_zone_get_zfsds(zone_t *zone, minor_t *emul_minor) +{ + lx_zone_data_t *lxzd = ztolxzd(zone); + vfs_t *vfsp = zone->zone_rootvp->v_vfsp; + + /* + * Only the root will be mounted at zone init time. + * Finding means of discovering other datasets mounted in the zone + * would be a good enhancement later. 
+ */ + if (getmajor(vfsp->vfs_dev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = kmem_zalloc(sizeof (lx_virt_disk_t), KM_SLEEP); + vd->lxvd_type = LXVD_ZFS_DS; + vd->lxvd_real_dev = vfsp->vfs_dev; + vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, (*emul_minor)++); + snprintf(vd->lxvd_name, sizeof (vd->lxvd_name), + "zfsds%u", 0); + (void) strlcpy(vd->lxvd_real_name, + refstr_value(vfsp->vfs_resource), + sizeof (vd->lxvd_real_name)); + + list_insert_tail(lxzd->lxzd_vdisks, vd); + } +} + +/* Cleanup virtual disk list */ +static void +lx_zone_cleanup_vdisks(lx_zone_data_t *lxzd) +{ + lx_virt_disk_t *vd; + + ASSERT(lxzd->lxzd_vdisks != NULL); + vd = (list_remove_head(lxzd->lxzd_vdisks)); + while (vd != NULL) { + kmem_free(vd, sizeof (lx_virt_disk_t)); + vd = list_remove_head(lxzd->lxzd_vdisks); + } + + list_destroy(lxzd->lxzd_vdisks); + kmem_free(lxzd->lxzd_vdisks, sizeof (list_t)); + lxzd->lxzd_vdisks = NULL; +} + +void +lx_init_brand_data(zone_t *zone, kmutex_t *zsl) +{ + lx_zone_data_t *data; + ldi_handle_t lh; + + ASSERT(MUTEX_HELD(zsl)); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(zone->zone_brand_data == NULL); + + data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); + mutex_init(&data->lxzd_lock, NULL, MUTEX_DEFAULT, NULL); + + /* No need to hold mutex now since zone_brand_data is not set yet. */ + + /* + * Set the default lxzd_kernel_version to 2.4. + * This can be changed by a call to setattr() during zone boot. + */ + (void) strlcpy(data->lxzd_kernel_release, "2.4.21", + LX_KERN_RELEASE_MAX); + (void) strlcpy(data->lxzd_kernel_version, "BrandZ virtual linux", + LX_KERN_VERSION_MAX); + + zone->zone_brand_data = data; + + /* + * In Linux, if the init(1) process terminates the system panics. + * The zone must reboot to simulate this behaviour. 
+ */ + zone->zone_reboot_on_init_exit = B_TRUE; + + /* + * We cannot hold the zone_status_lock while performing zfs operations + * so we drop the lock, get the zfs devs as the last step in this + * function, then reaquire the lock. Don't add any code after this + * which requires that the zone_status_lock was continuously held. + */ + mutex_exit(zsl); + + data->lxzd_vdisks = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(data->lxzd_vdisks, sizeof (lx_virt_disk_t), + offsetof(lx_virt_disk_t, lxvd_link)); + + if (lx_zone_zfs_open(&lh, &data->lxzd_zfs_dev) == 0) { + minor_t emul_minor = 1; + + lx_zone_get_zfsds(zone, &emul_minor); + lx_zone_get_zvols(zone, lh, &emul_minor); + ldi_close(lh, FREAD|FWRITE, kcred); + } else { + /* Avoid matching any devices */ + data->lxzd_zfs_dev = makedevice(-1, 0); + } + mutex_enter(zsl); +} + +void +lx_free_brand_data(zone_t *zone) +{ + lx_zone_data_t *data = ztolxzd(zone); + ASSERT(data != NULL); + mutex_enter(&data->lxzd_lock); + if (data->lxzd_ioctl_sock != NULL) { + /* + * Since zone_kcred has been cleaned up already, close the + * socket using the global kcred. + */ + ksocket_close(data->lxzd_ioctl_sock, kcred); + data->lxzd_ioctl_sock = NULL; + } + ASSERT(data->lxzd_cgroup == NULL); + + lx_zone_cleanup_vdisks(data); + + mutex_exit(&data->lxzd_lock); + zone->zone_brand_data = NULL; + mutex_destroy(&data->lxzd_lock); + kmem_free(data, sizeof (*data)); +} + +void +lx_unsupported(char *dmsg) +{ + lx_proc_data_t *pd = ttolxproc(curthread); + + DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); + + if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { + /* + * If this process was run with strict mode enabled + * (via LX_STRICT in the environment), we mark this + * LWP as having triggered an unsupported behaviour. + * This flag will be checked at an appropriate point + * by lx_check_strict_failure(). 
+ */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + lwpd->br_strict_failure = B_TRUE; + } +} + +void +lx_check_strict_failure(lx_lwp_data_t *lwpd) +{ + proc_t *p; + + if (!lwpd->br_strict_failure) { + return; + } + + lwpd->br_strict_failure = B_FALSE; + + /* + * If this process is operating in strict mode (via LX_STRICT in + * the environment), and has triggered a call to + * lx_unsupported(), we drop SIGSYS on it as we return. + */ + p = curproc; + mutex_enter(&p->p_lock); + sigtoproc(p, curthread, SIGSYS); + mutex_exit(&p->p_lock); +} + +void +lx_trace_sysenter(int syscall_num, uintptr_t *args) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_entry_ptr != NULL); + + (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], + args[2], args[3], args[4], args[5]); + } +} + +void +lx_trace_sysreturn(int syscall_num, long ret) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); + } +} + +/* + * Get the addresses of the user-space system call handler and attach it to + * the proc structure. Returning 0 indicates success; the value returned + * by the system call is the value stored in rval. Returning a non-zero + * value indicates a failure; the value returned is used to set errno, -1 + * is returned from the syscall and the contents of rval are ignored. To + * set errno and have the syscall return a value other than -1 we can + * manually set errno and rval and return 0. + */ +int +lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lx_proc_data_t *pd; + struct termios *termios; + uint_t termios_len; + int error; + int code; + int sig; + lx_brand_registration_t reg; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * There is one operation that is suppored for non-branded + * process. B_EXEC_BRAND. 
This is the equilivant of an + * exec call, but the new process that is created will be + * a branded process. + */ + if (cmd == B_EXEC_BRAND) { + VERIFY(p->p_zone != NULL); + VERIFY(p->p_zone->zone_brand == &lx_brand); + return (exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_BRAND)); + } + + /* For all other operations this must be a branded process. */ + if (p->p_brand == NULL) + return (ENOSYS); + + VERIFY(p->p_brand == &lx_brand); + VERIFY(p->p_brand_data != NULL); + + switch (cmd) { + case B_REGISTER: + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("stack mode was not PREINIT during " + "REGISTER\n"); + return (EINVAL); + } + + if (p->p_model == DATAMODEL_NATIVE) { + if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + } +#ifdef _LP64 + else { + /* 32-bit userland on 64-bit kernel */ + lx_brand_registration32_t reg32; + + if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + + reg.lxbr_version = (uint_t)reg32.lxbr_version; + reg.lxbr_handler = + (void *)(uintptr_t)reg32.lxbr_handler; + reg.lxbr_flags = reg32.lxbr_flags; + } +#endif + + if (reg.lxbr_version != LX_VERSION_1) { + lx_print("Invalid brand library version (%u)\n", + reg.lxbr_version); + return (EINVAL); + } + + if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { + lx_print("Invalid brand flags (%u)\n", + reg.lxbr_flags); + return (EINVAL); + } + + lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", + (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); + pd = p->p_brand_data; + pd->l_handler = (uintptr_t)reg.lxbr_handler; + pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; + + return (0); + + case B_TTYMODES: + /* This is necessary for emulating TCGETS ioctls. 
*/ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, + &termios_len) != DDI_SUCCESS) + return (EIO); + + ASSERT(termios_len == sizeof (*termios)); + + if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { + ddi_prop_free(termios); + return (EFAULT); + } + + ddi_prop_free(termios); + return (0); + + case B_ELFDATA: { + mutex_enter(&p->p_lock); + pd = curproc->p_brand_data; + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_elf_data_t led; + + bcopy(&pd->l_elf_data, &led, sizeof (led)); + mutex_exit(&p->p_lock); + + if (copyout(&led, (void *)arg1, + sizeof (lx_elf_data_t)) != 0) { + return (EFAULT); + } + } +#if defined(_LP64) + else { + /* 32-bit userland on 64-bit kernel */ + lx_elf_data32_t led32; + + led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; + led32.ed_phent = (int)pd->l_elf_data.ed_phent; + led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; + led32.ed_entry = (int)pd->l_elf_data.ed_entry; + led32.ed_base = (int)pd->l_elf_data.ed_base; + led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; + mutex_exit(&p->p_lock); + + if (copyout(&led32, (void *)arg1, + sizeof (led32)) != 0) { + return (EFAULT); + } + } +#endif + return (0); + } + + case B_EXEC_NATIVE: + return (exec_common((char *)arg1, (const char **)arg2, + (const char **)arg3, EBA_NATIVE)); + + /* + * The B_TRUSS_POINT subcommand is used so that we can make a no-op + * syscall for debugging purposes (dtracing) from within the user-level + * emulation. + */ + case B_TRUSS_POINT: + return (0); + + case B_LPID_TO_SPAIR: { + /* + * Given a Linux pid as arg1, return the Solaris pid in arg2 and + * the Solaris LWP in arg3. We also translate pid 1 (which is + * hardcoded in many applications) to the zone's init process. 
+ */ + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { + return (ESRCH); + } + + if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { + return (EFAULT); + } + + return (0); + } + + case B_SIGEV_THREAD_ID: { + /* + * Emulate Linux's timer_create(2) SIGEV_THREAD_ID + * notification method. This mechanism is only meant + * for userland threading libraries such as glibc and + * is documented as such. Therefore, assume this is + * only ever invoked for the purpose of alerting a + * Linux threading library. Assume that the tid is a + * member of the caller's process and the signal + * number is valid. See lx_sigev_thread_id() for the + * userland side of this emulation. + * + * The return code from this function is not checked + * by the caller since it executes in an asynchronous + * context and there is nothing much to be done. If + * this function does fail then it will manifest as + * Linux threads waiting for a signal they will never + * receive. 
+ * + * arg1 -- Linux tid + * arg2 -- Linux signal number + * arg3 -- sigval pointer + */ + + int native_sig = lx_ltos_signo((int)arg2, 0); + pid_t native_pid; + int native_tid; + sigqueue_t *sqp; + + if (native_sig == 0) + return (EINVAL); + + lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid); + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + mutex_enter(&curproc->p_lock); + + if ((t = idtot(curproc, native_tid)) == NULL) { + mutex_exit(&curproc->p_lock); + kmem_free(sqp, sizeof (sigqueue_t)); + return (ESRCH); + } + + sqp->sq_info.si_signo = native_sig; + sqp->sq_info.si_code = SI_TIMER; + sqp->sq_info.si_pid = curproc->p_pid; + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sqp->sq_info.si_value.sival_ptr = (void *)arg3; + sigaddqa(curproc, t, sqp); + + mutex_exit(&curproc->p_lock); + + return (0); + } + + case B_SET_AFFINITY_MASK: + case B_GET_AFFINITY_MASK: + /* + * Retrieve or store the CPU affinity mask for the + * requested linux pid. + * + * arg1 is a linux PID (0 means curthread). + * arg2 is the size of the given mask. + * arg3 is the address of the affinity mask. + */ + return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); + + case B_PTRACE_STOP_FOR_OPT: + return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); + + case B_PTRACE_CLONE_BEGIN: + return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE)); + + case B_HELPER_WAITID: { + idtype_t idtype = (idtype_t)arg1; + id_t id = (id_t)arg2; + siginfo_t *infop = (siginfo_t *)arg3; + int options = (int)arg4; + + lwpd = ttolxlwp(curthread); + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. 
Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. + */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = (int)arg5; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (waitsys32(idtype, id, infop, options)); + } else +#endif + { + return (waitsys(idtype, id, infop, options)); + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (0); + } + + case B_UNSUPPORTED: { + char dmsg[256]; + + if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { + lx_print("Failed to copyin unsupported msg " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + dmsg[255] = '\0'; + lx_unsupported(dmsg); + + lx_check_strict_failure(lwpd); + + return (0); + } + + case B_STORE_ARGS: { + /* + * B_STORE_ARGS subcommand + * arg1 = address of struct to be copied in + * arg2 = size of the struct being copied in + * arg3-arg6 ignored + * rval = the amount of data copied. + */ + void *buf; + + /* only have upper limit because arg2 is unsigned */ + if (arg2 > LX_BR_ARGS_SIZE_MAX) { + return (EINVAL); + } + + buf = kmem_alloc(arg2, KM_SLEEP); + if (copyin((void *)arg1, buf, arg2) != 0) { + lx_print("Failed to copyin scall arg at 0x%p\n", + (void *) arg1); + kmem_free(buf, arg2); + /* + * Purposely not setting br_scall_args to NULL + * to preserve data for debugging. 
+ */ + return (EFAULT); + } + + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, + lwpd->br_args_size); + } + + lwpd->br_scall_args = buf; + lwpd->br_args_size = arg2; + *rval = arg2; + return (0); + } + + case B_HELPER_CLONE: + return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, + (void *)arg4)); + + case B_HELPER_SETGROUPS: + return (lx_helper_setgroups(arg1, (gid_t *)arg2)); + + case B_HELPER_SIGQUEUE: + return (lx_helper_rt_sigqueueinfo(arg1, arg2, + (siginfo_t *)arg3)); + + case B_HELPER_TGSIGQUEUE: + return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, + (siginfo_t *)arg4)); + + case B_SET_THUNK_PID: + lwpd->br_lx_thunk_pid = arg1; + return (0); + + case B_GETPID: + /* + * The usermode clone(2) code needs to be able to call + * lx_getpid() from native code: + */ + *rval = lx_getpid(); + return (0); + + case B_SET_NATIVE_STACK: + /* + * B_SET_NATIVE_STACK subcommand + * arg1 = the base of the stack to use for emulation + */ + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("B_SET_NATIVE_STACK when stack was already " + "set to %p\n", (void *)arg1); + return (EEXIST); + } + + /* + * We move from the PREINIT state, where we have no brand + * emulation stack, to the INIT state. Here, we are still + * running on what will become the BRAND stack, but are running + * emulation (i.e. native) code. Once the initialisation + * process for this thread has finished, we will jump to + * brand-specific code, while moving to the BRAND mode. + * + * When a new LWP is created, lx_initlwp() will clear the + * stack data. If that LWP is actually being duplicated + * into a child process by fork(2), lx_forklwp() will copy + * it so that the cloned thread will keep using the same + * alternate stack. 
+ */ + lwpd->br_ntv_stack = arg1; + lwpd->br_stack_mode = LX_STACK_MODE_INIT; + lx_lwp_set_native_stack_current(lwpd, arg1); + + return (0); + + case B_GET_CURRENT_CONTEXT: + /* + * B_GET_CURRENT_CONTEXT subcommand: + * arg1 = address for pointer to current ucontext_t + */ + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; + + error = copyout(&addr, (void *)arg1, sizeof (addr)); + } else +#endif + { + error = copyout(&lwp->lwp_oldcontext, (void *)arg1, + sizeof (lwp->lwp_oldcontext)); + } + + return (error != 0 ? EFAULT : 0); + + case B_JUMP_TO_LINUX: + /* + * B_JUMP_TO_LINUX subcommand: + * arg1 = ucontext_t pointer for jump state + */ + + if (arg1 == NULL) + return (EINVAL); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_NATIVE: { + struct regs *rp = lwptoregs(lwp); + + /* + * We are on the NATIVE stack, so we must preserve + * the extent of that stack. The pointer will be + * reset by a future setcontext(). + */ + lx_lwp_set_native_stack_current(lwpd, + (uintptr_t)rp->r_sp); + break; + } + + case LX_STACK_MODE_INIT: + /* + * The LWP is transitioning to Linux code for the first + * time. + */ + break; + + case LX_STACK_MODE_PREINIT: + /* + * This LWP has not installed an alternate stack for + * usermode emulation handling. + */ + return (ENOENT); + + case LX_STACK_MODE_BRAND: + /* + * The LWP should not be on the BRAND stack. + */ + exit(CLD_KILLED, SIGSYS); + return (0); + } + + /* + * Transfer control to Linux: + */ + return (lx_runexe(lwp, (void *)arg1)); + + case B_EMULATION_DONE: + /* + * B_EMULATION_DONE subcommand: + * arg1 = ucontext_t * to restore + * arg2 = system call number + * arg3 = return code + * arg4 = if operation failed, the errno value + */ + + /* + * The first part of this operation is a setcontext() to + * restore the register state to the copy we preserved + * before vectoring to the usermode emulation routine. 
+ * If that fails, we return (hopefully) to the emulation + * routine and it will handle the error. + */ +#if (_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + error = getsetcontext32(SETCONTEXT, (void *)arg1); + } else +#endif + { + error = getsetcontext(SETCONTEXT, (void *)arg1); + } + + if (error != 0) { + return (error); + } + + /* + * The saved Linux context has been restored. We handle the + * return value or errno with code common to the in-kernel + * system call emulation. + */ + if ((error = (int)arg4) != 0) { + /* + * lx_syscall_return() looks at the errno in the LWP, + * so set it here: + */ + set_errno(error); + } + lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); + + return (0); + + case B_EXIT_AS_SIG: + code = CLD_KILLED; + sig = (int)arg1; + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + ttolwp(curthread)->lwp_cursig = sig; + if (sig == SIGSEGV) { + if (core(sig, 0) == 0) + code = CLD_DUMPED; + } + exit(code, sig); + /* NOTREACHED */ + break; + + case B_OVERRIDE_KERN_VER: { + void *urel = (void *)arg1; + void *uver = (void *)arg2; + size_t len; + + pd = ptolxproc(p); + if (urel != NULL) { + if (copyinstr(urel, pd->l_uname_release, + LX_KERN_RELEASE_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_release[LX_KERN_RELEASE_MAX - 1] = '\0'; + } + if (uver != NULL) { + if (copyinstr(uver, pd->l_uname_version, + LX_KERN_VERSION_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_version[LX_KERN_VERSION_MAX - 1] = '\0'; + } + + return (0); + } + + case B_GET_PERSONALITY: { + unsigned int result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + result = pd->l_personality; + mutex_exit(&p->p_lock); + return (result); + } + + } + + return (EINVAL); +} + +/* + * Compare linux kernel version to the one set for the zone. + * Returns greater than 0 if zone version is higher, less than 0 if the zone + * version is lower, and 0 if the versions are equal. 
+ */
+int
+lx_kern_release_cmp(zone_t *zone, const char *vers)
+{
+	int zvers[3] = {0, 0, 0};
+	int cvers[3] = {0, 0, 0};
+	int i;
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+	VERIFY(zone->zone_brand == &lx_brand);
+
+	/*
+	 * Parse up to three dotted numeric components from each version
+	 * string. The sscanf() results are deliberately ignored: components
+	 * that are not parsed keep their initial value of 0. The zone's
+	 * release string is read while holding lxzd_lock.
+	 */
+	mutex_enter(&lxzd->lxzd_lock);
+	(void) sscanf(lxzd->lxzd_kernel_release, "%d.%d.%d", &zvers[0],
+	    &zvers[1], &zvers[2]);
+	mutex_exit(&lxzd->lxzd_lock);
+	(void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
+
+	/* The most significant differing component decides the result. */
+	for (i = 0; i < 3; i++) {
+		if (zvers[i] > cvers[i]) {
+			return (1);
+		} else if (zvers[i] < cvers[i]) {
+			return (-1);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Linux unconditionally removes the setuid and setgid bits when changing
+ * file ownership. This brand hook overrides the illumos native behaviour,
+ * which is based on the PRIV_FILE_SETID privilege.
+ *
+ * Directories are left untouched. For other files S_ISUID is always cleared,
+ * while S_ISGID is cleared only when S_IXGRP is also set. AT_MODE is added
+ * to va_mask whenever a bit is cleared. The cr argument is unused and the
+ * function always returns 0.
+ */
+static int
+lx_setid_clear(vattr_t *vap, cred_t *cr)
+{
+	if (S_ISDIR(vap->va_mode)) {
+		return (0);
+	}
+
+	if (vap->va_mode & S_ISUID) {
+		vap->va_mask |= AT_MODE;
+		vap->va_mode &= ~S_ISUID;
+	}
+	if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+		vap->va_mask |= AT_MODE;
+		vap->va_mode &= ~S_ISGID;
+	}
+
+	return (0);
+}
+
+/*
+ * Copy the per-process brand data from a parent proc to a child.
+ *
+ * cp is the child and pp the parent; both must already be lx-branded and
+ * carry brand data (enforced with VERIFY). After the copy, the child's
+ * l_ptrace count and its fake resource limits are reset rather than
+ * inherited.
+ */
+void
+lx_copy_procdata(proc_t *cp, proc_t *pp)
+{
+	lx_proc_data_t *cpd, *ppd;
+
+	/*
+	 * Since b_copy_procdata is called during getproc(), while the child
+	 * process is still being initialized, acquiring cp->p_lock should not
+	 * be required.
+	 */
+	VERIFY(cp->p_brand == &lx_brand);
+	cpd = cp->p_brand_data;
+	VERIFY(cpd != NULL);
+
+	mutex_enter(&pp->p_lock);
+	VERIFY(pp->p_brand == &lx_brand);
+	ppd = pp->p_brand_data;
+	VERIFY(ppd != NULL);
+
+	bcopy(ppd, cpd, sizeof (lx_proc_data_t));
+	mutex_exit(&pp->p_lock);
+
+	/*
+	 * The l_ptrace count is normally manipulated only while holding
+	 * p_lock. Since this is a freshly created process, it's safe to zero
+	 * out. If it is to be inherited, the attach will occur later.
+	 */
+	cpd->l_ptrace = 0;
+
+	cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
+	cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
+
+	cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
+	cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
+
+	cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
+	cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
+
+	cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
+	cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
+}
+
+#if defined(_LP64)
+/*
+ * Widen a 32-bit ELF header into the native Ehdr form: e_ident is copied
+ * byte-for-byte and every other field is assigned individually. Only built
+ * for _LP64 kernels.
+ */
+static void
+Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
+{
+	bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
+	dst->e_type = src->e_type;
+	dst->e_machine = src->e_machine;
+	dst->e_version = src->e_version;
+	dst->e_entry = src->e_entry;
+	dst->e_phoff = src->e_phoff;
+	dst->e_shoff = src->e_shoff;
+	dst->e_flags = src->e_flags;
+	dst->e_ehsize = src->e_ehsize;
+	dst->e_phentsize = src->e_phentsize;
+	dst->e_phnum = src->e_phnum;
+	dst->e_shentsize = src->e_shentsize;
+	dst->e_shnum = src->e_shnum;
+	dst->e_shstrndx = src->e_shstrndx;
+}
+#endif /* _LP64 */
+
+/*
+ * Put back exec state saved before a failed exec attempt: reinstall the
+ * saved execenv through setexecenv() and restore the current LWP's signal
+ * alternate stack (base, size and flags) from sp.
+ */
+static void
+restoreexecenv(struct execenv *ep, stack_t *sp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+
+	setexecenv(ep);
+	lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
+	lwp->lwp_sigaltstack.ss_size = sp->ss_size;
+	lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
+}
+
+/*
+ * Native and 32-bit ELF exec entry points, declared locally.
+ */
+extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
+    long *, int, caddr_t, cred_t *, int *);
+
+extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
+    long *, int, caddr_t, cred_t *, int *);
+
+/*
+ * Map the Linux VDSO image for the data model being exec'd so that it ends
+ * LX_VDSO_SIZE bytes below the comm page.
+ *
+ * Returns the address at which the VDSO was mapped, or 0 if the comm page
+ * is not present, the VDSO file cannot be looked up, its attributes cannot
+ * be read, it is larger than LX_VDSO_SIZE, or the mapping fails. The return
+ * type is an integer (uintptr_t), so failure is reported as 0 rather than
+ * with the NULL pointer constant.
+ */
+static uintptr_t
+lx_map_vdso(struct uarg *args, struct cred *cred)
+{
+	int err;
+	char *fpath = LX_VDSO_PATH;
+	vnode_t *vp;
+	vattr_t attr;
+	caddr_t addr;
+
+#if defined(_LP64)
+	if (args->to_model != DATAMODEL_NATIVE) {
+		fpath = LX_VDSO_PATH32;
+	}
+#endif
+
+	/*
+	 * The comm page should have been mapped in already.
+	 */
+	if (args->commpage == 0) {
+		return (0);
+	}
+
+	/*
+	 * Ensure the VDSO library is present and appropriately sized.
+	 * This lookup is started at the zone root to avoid complications for
+	 * processes which have chrooted. For the specified lookup root to be
+	 * used, the leading slash must be dropped from the path.
+	 */
+	ASSERT(fpath[0] == '/');
+	fpath++;
+	if (lookupnameat(fpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp,
+	    curzone->zone_rootvp) != 0) {
+		return (0);
+	}
+
+	/*
+	 * The VDSO requires data exposed via the comm page in order to
+	 * function properly. The VDSO is always mapped in at a fixed known
+	 * offset from the comm page, providing an easy means to locate it.
+	 */
+	addr = (caddr_t)(args->commpage - LX_VDSO_SIZE);
+	attr.va_mask = AT_SIZE;
+	if (VOP_GETATTR(vp, &attr, 0, cred, NULL) != 0 ||
+	    attr.va_size > LX_VDSO_SIZE) {
+		VN_RELE(vp);
+		return (0);
+	}
+
+	/* The vnode hold is dropped whether or not the mapping succeeds. */
+	err = execmap(vp, addr, attr.va_size, 0, 0,
+	    PROT_USER|PROT_READ|PROT_EXEC, 1, 0);
+	VN_RELE(vp);
+	if (err != 0) {
+		return (0);
+	}
+	return ((uintptr_t)addr);
+}
+
+/*
+ * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
+ * binaries.
+ */
+static int
+lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+    struct intpdata *idata, int level, long *execsz, int setid,
+    caddr_t exec_file, struct cred *cred, int *brand_action)
+{
+	int		error, i;
+	vnode_t		*nvp;
+	Ehdr		ehdr;
+	Addr		uphdr_vaddr;
+	intptr_t	voffset;
+	char		*interp = NULL;
+	uintptr_t	ldaddr = 0;
+	proc_t		*p = ttoproc(curthread);
+	klwp_t		*lwp = ttolwp(curthread);
+	lx_proc_data_t	*lxpd = ptolxproc(p);
+	struct execenv	env, origenv;
+	stack_t		orig_sigaltstack;
+	struct user	*up = PTOU(ttoproc(curthread));
+	lx_elf_data_t	edp;
+	char		*lib_path = LX_LIB_PATH;
+	boolean_t	execstk = B_TRUE;
+	unsigned int	personality;
+
+	ASSERT(p->p_brand == &lx_brand);
+	ASSERT(lxpd != NULL);
+
+	/*
+	 * Start with a separate struct for ELF data instead of inheriting
+	 * values from the currently running binary. This ensures that fields
+	 * such as ed_base are cleared if the new binary does not utilize an
+	 * interpreter.
+	 */
+	bzero(&edp, sizeof (edp));
+
+#if defined(_LP64)
+	if (args->to_model != DATAMODEL_NATIVE) {
+		lib_path = LX_LIB_PATH32;
+	}
+#endif
+
+	/*
+	 * Set the brandname and library name for the new process so that
+	 * elfexec() puts them onto the stack.
+	 */
+	args->brandname = LX_BRANDNAME;
+	args->emulator = lib_path;
+
+#if defined(_LP64)
+	/*
+	 * To conform with the way Linux lays out the address space, we clamp
+	 * the stack to be the top of the lower region of the x86-64 canonical
+	 * form address space -- which has the side-effect of laying out the
+	 * entire address space in that lower region. Note that this only
+	 * matters on 64-bit processes (this value will always be greater than
+	 * the size of a 32-bit address space) and doesn't actually affect
+	 * USERLIMIT: if a Linux-branded process wishes to map something
+	 * into the top half of the address space, it can do so -- but with
+	 * the user stack starting at the top of the bottom region, those high
+	 * virtual addresses won't be used unless explicitly directed.
+	 */
+	args->maxstack = lx_maxstack64;
+#endif
+
+	/*
+	 * Search the binary for a PT_GNU_STACK header. The PF_X bit contained
+	 * within is used to dictate protection defaults for the stack, among
+	 * other things.
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		Ehdr ehdr;
+		Phdr *phdrp;
+		caddr_t phdrbase = NULL;
+		ssize_t phdrsize = 0;
+		int nphdrs, hsize;
+
+		if ((error = elfreadhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+		    &phdrsize)) != 0) {
+			return (error);
+		}
+
+		/* Step through the headers using the file's own entry size. */
+		hsize = ehdr.e_phentsize;
+		phdrp = (Phdr *)phdrbase;
+		for (i = nphdrs; i > 0; i--) {
+			switch (phdrp->p_type) {
+			case PT_GNU_STACK:
+				if ((phdrp->p_flags & PF_X) == 0) {
+					execstk = B_FALSE;
+				}
+				break;
+			}
+			phdrp = (Phdr *)((caddr_t)phdrp + hsize);
+		}
+		kmem_free(phdrbase, phdrsize);
+	}
+#if defined(_LP64)
+	else {
+		Elf32_Ehdr ehdr;
+		Elf32_Phdr *phdrp;
+		caddr_t phdrbase = NULL;
+		ssize_t phdrsize = 0;
+		int nphdrs, hsize;
+
+		if ((error = elf32readhdr(vp, cred, &ehdr, &nphdrs, &phdrbase,
+		    &phdrsize)) != 0) {
+			return (error);
+		}
+
+		/* Step through the headers using the file's own entry size. */
+		hsize = ehdr.e_phentsize;
+		phdrp = (Elf32_Phdr *)phdrbase;
+		for (i = nphdrs; i > 0; i--) {
+			switch (phdrp->p_type) {
+			case PT_GNU_STACK:
+				if ((phdrp->p_flags & PF_X) == 0) {
+					execstk = B_FALSE;
+				}
+				break;
+			}
+			phdrp = (Elf32_Phdr *)((caddr_t)phdrp + hsize);
+		}
+		kmem_free(phdrbase, phdrsize);
+	}
+#endif
+
+	/*
+	 * Revert the base personality while maintaining any existing flags.
+	 */
+	personality = LX_PER_LINUX | (lxpd->l_personality & ~LX_PER_MASK);
+
+	/*
+	 * Linux defaults to an executable stack unless the aforementioned
+	 * PT_GNU_STACK entry in the elf header dictates otherwise. Enabling
+	 * the READ_IMPLIES_EXEC personality flag is also implied in this case.
+	 */
+	if (execstk) {
+		args->stk_prot |= PROT_EXEC;
+		args->stk_prot_override = B_TRUE;
+		personality |= LX_PER_READ_IMPLIES_EXEC;
+	}
+
+	/*
+	 * We will first exec the brand library, then map in the linux
+	 * executable and the linux linker.
+	 */
+	if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
+	    &nvp))) {
+		uprintf("%s: not found.", lib_path);
+		return (error);
+	}
+
+	/*
+	 * We will eventually set the p_exec member to be the vnode for the new
+	 * executable when we call setexecenv(). However, if we get an error
+	 * before that call we need to restore the execenv to its original
+	 * values so that when we return to the caller fop_close() works
+	 * properly while cleaning up from the failed exec(). Restoring the
+	 * original value will also properly decrement the 2nd VN_RELE that we
+	 * took on the brand library.
+	 */
+	origenv.ex_bssbase = p->p_bssbase;
+	origenv.ex_brkbase = p->p_brkbase;
+	origenv.ex_brksize = p->p_brksize;
+	origenv.ex_vp = p->p_exec;
+	orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
+	orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
+	orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
+
+	if (args->to_model == DATAMODEL_NATIVE) {
+		error = elfexec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+		    execsz, setid, exec_file, cred, brand_action);
+	}
+#if defined(_LP64)
+	else {
+		error = elf32exec(nvp, uap, args, idata, INTP_MAXDEPTH + 1,
+		    execsz, setid, exec_file, cred, brand_action);
+	}
+#endif
+	VN_RELE(nvp);
+	if (error != 0) {
+		restoreexecenv(&origenv, &orig_sigaltstack);
+		return (error);
+	}
+
+	/*
+	 * exec-ed in the brand library above.
+	 * The u_auxv vectors are now setup by elfexec to point to the
+	 * brand emulation library and its linker.
+	 */
+
+	/*
+	 * After execing the brand library (which should have implicitly mapped
+	 * in the comm page), map the VDSO into the appropriate place in the AS.
+	 */
+	lxpd->l_vdso = lx_map_vdso(args, cred);
+
+	bzero(&env, sizeof (env));
+
+	/*
+	 * map in the Linux executable
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
+		    &voffset, exec_file, &interp, &env.ex_bssbase,
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+	}
+#if defined(_LP64)
+	else {
+		Elf32_Ehdr ehdr32;
+		Elf32_Addr uphdr_vaddr32;
+
+		error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
+		    &voffset, exec_file, &interp, &env.ex_bssbase,
+		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
+
+		/*
+		 * Only widen the 32-bit results on success; on failure the
+		 * locals above may never have been written and the error
+		 * path below does not consume them.
+		 */
+		if (error == 0) {
+			Ehdr32to64(&ehdr32, &ehdr);
+
+			if (uphdr_vaddr32 == (Elf32_Addr)-1)
+				uphdr_vaddr = (Addr)-1;
+			else
+				uphdr_vaddr = uphdr_vaddr32;
+		}
+	}
+#endif
+	if (error != 0) {
+		restoreexecenv(&origenv, &orig_sigaltstack);
+
+		if (interp != NULL)
+			kmem_free(interp, MAXPATHLEN);
+
+		return (error);
+	}
+
+	/*
+	 * Save off the important properties of the lx executable. The brand
+	 * library will ask us for this data later, when it is ready to set
+	 * things up for the lx executable.
+	 */
+	edp.ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
+	    voffset + uphdr_vaddr;
+	edp.ed_entry = voffset + ehdr.e_entry;
+	edp.ed_phent = ehdr.e_phentsize;
+	edp.ed_phnum = ehdr.e_phnum;
+
+	if (interp != NULL) {
+		if (ehdr.e_type == ET_DYN) {
+			/*
+			 * This is a shared object executable, so we need to
+			 * pick a reasonable place to put the heap. Just don't
+			 * use the first page.
+			 */
+			env.ex_brkbase = (caddr_t)PAGESIZE;
+			env.ex_bssbase = (caddr_t)PAGESIZE;
+		}
+
+		/*
+		 * If the program needs an interpreter (most do), map it in and
+		 * store relevant information about it in the aux vector, where
+		 * the brand library can find it.
+		 */
+		if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
+		    NULLVPP, &nvp))) {
+			uprintf("%s: not found.", interp);
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			kmem_free(interp, MAXPATHLEN);
+			return (error);
+		}
+
+		kmem_free(interp, MAXPATHLEN);
+		interp = NULL;
+
+		/*
+		 * map in the Linux linker
+		 */
+		if (args->to_model == DATAMODEL_NATIVE) {
+			error = mapexec_brand(nvp, args, &ehdr,
+			    &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+		}
+#if defined(_LP64)
+		else {
+			Elf32_Ehdr ehdr32;
+			Elf32_Addr uphdr_vaddr32;
+
+			error = mapexec32_brand(nvp, args, &ehdr32,
+			    &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+
+			/* As above: convert only when the mapping worked. */
+			if (error == 0) {
+				Ehdr32to64(&ehdr32, &ehdr);
+
+				if (uphdr_vaddr32 == (Elf32_Addr)-1)
+					uphdr_vaddr = (Addr)-1;
+				else
+					uphdr_vaddr = uphdr_vaddr32;
+			}
+		}
+#endif
+
+		VN_RELE(nvp);
+		if (error != 0) {
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			return (error);
+		}
+
+		/*
+		 * Now that we know the base address of the brand's linker,
+		 * we also save this for later use by the brand library.
+		 */
+		edp.ed_base = voffset;
+		edp.ed_ldentry = voffset + ehdr.e_entry;
+	} else {
+		/*
+		 * This program has no interpreter. The lx brand library will
+		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
+		 * so in this case, put the entry point of the main executable
+		 * there.
+		 */
+		if (ehdr.e_type == ET_EXEC) {
+			/*
+			 * An executable with no interpreter, this must be a
+			 * statically linked executable, which means we loaded
+			 * it at the address specified in the elf header, in
+			 * which case the e_entry field of the elf header is an
+			 * absolute address.
+			 */
+			edp.ed_ldentry = ehdr.e_entry;
+			edp.ed_entry = ehdr.e_entry;
+		} else {
+			/*
+			 * A shared object with no interpreter, we use the
+			 * calculated address from above.
+			 */
+			edp.ed_ldentry = edp.ed_entry;
+
+			/*
+			 * In all situations except an ET_DYN elf object with no
+			 * interpreter, we want to leave the brk and base
+			 * values set by mapexec_brand alone. Normally when
+			 * running ET_DYN objects on Solaris (most likely
+			 * /lib/ld.so.1) the kernel sets brk and base to 0 since
+			 * it doesn't know where to put the heap, and later the
+			 * linker will call brk() to initialize the heap in:
+			 *	usr/src/cmd/sgs/rtld/common/setup.c:setup()
+			 * after it has determined where to put it. (This
+			 * decision is made after the linker loads and inspects
+			 * elf properties of the target executable being run.)
+			 *
+			 * So for ET_DYN Linux executables, we also don't know
+			 * where the heap should go, so we'll set the brk and
+			 * base to 0. But in this case the Solaris linker will
+			 * not initialize the heap, so when the Linux linker
+			 * starts running there is no heap allocated. This
+			 * seems to be ok on Linux 2.4 based systems because the
+			 * Linux linker/libc fall back to using mmap() to
+			 * allocate memory. But on 2.6 systems, running
+			 * applications by specifying them as command line
+			 * arguments to the linker results in segfaults for an
+			 * as yet undetermined reason (which seems to indicate
+			 * that a more permanent fix for heap initialization in
+			 * these cases may be necessary).
+			 */
+			if (ehdr.e_type == ET_DYN) {
+				env.ex_bssbase = (caddr_t)0;
+				env.ex_brkbase = (caddr_t)0;
+				env.ex_brksize = 0;
+			}
+		}
+	}
+
+	env.ex_vp = vp;
+	setexecenv(&env);
+
+	/*
+	 * We try to keep /proc's view of the aux vector consistent with
+	 * what's on the process stack.
+	 */
+	if (args->to_model == DATAMODEL_NATIVE) {
+		auxv_t phdr_auxv[4] = {
+		    { AT_SUN_BRAND_LX_PHDR, 0 },
+		    { AT_SUN_BRAND_LX_INTERP, 0 },
+		    { AT_SUN_BRAND_LX_CLKTCK, 0 },
+		    { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+		};
+		phdr_auxv[0].a_un.a_val = edp.ed_phdr;
+		phdr_auxv[1].a_un.a_val = ldaddr;
+		phdr_auxv[2].a_un.a_val = hz;
+		phdr_auxv[3].a_un.a_val = lxpd->l_vdso;
+
+		if (copyout(&phdr_auxv, args->auxp_brand,
+		    sizeof (phdr_auxv)) == -1)
+			return (EFAULT);
+	}
+#if defined(_LP64)
+	else {
+		auxv32_t phdr_auxv32[4] = {
+		    { AT_SUN_BRAND_LX_PHDR, 0 },
+		    { AT_SUN_BRAND_LX_INTERP, 0 },
+		    { AT_SUN_BRAND_LX_CLKTCK, 0 },
+		    { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+		};
+		phdr_auxv32[0].a_un.a_val = edp.ed_phdr;
+		phdr_auxv32[1].a_un.a_val = ldaddr;
+		phdr_auxv32[2].a_un.a_val = hz;
+		phdr_auxv32[3].a_un.a_val = lxpd->l_vdso;
+
+		if (copyout(&phdr_auxv32, args->auxp_brand,
+		    sizeof (phdr_auxv32)) == -1)
+			return (EFAULT);
+	}
+#endif
+
+	/*
+	 * /proc uses the AT_ENTRY aux vector entry to deduce
+	 * the location of the executable in the address space. The user
+	 * structure contains a copy of the aux vector that needs to have those
+	 * entries patched with the values of the real lx executable (they
+	 * currently contain the values from the lx brand library that was
+	 * elfexec'd, above).
+	 *
+	 * For live processes, AT_BASE is used to locate the linker segment,
+	 * which /proc and friends will later use to find Solaris symbols
+	 * (such as rtld_db_preinit). However, for core files, /proc uses
+	 * AT_ENTRY to find the right segment to label as the executable.
+	 * So we set AT_ENTRY to be the entry point of the linux executable,
+	 * but leave AT_BASE to be the address of the Solaris linker.
+	 *
+	 * NOTE(review): AT_SUN_BRAND_LX_SYSINFO_EHDR is written to the
+	 * on-stack vector above but is not patched here; confirm whether
+	 * u_auxv carries that entry and should receive lxpd->l_vdso too.
+	 */
+	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
+		switch (up->u_auxv[i].a_type) {
+		case AT_ENTRY:
+			up->u_auxv[i].a_un.a_val = edp.ed_entry;
+			break;
+
+		case AT_SUN_BRAND_LX_PHDR:
+			up->u_auxv[i].a_un.a_val = edp.ed_phdr;
+			break;
+
+		case AT_SUN_BRAND_LX_INTERP:
+			up->u_auxv[i].a_un.a_val = ldaddr;
+			break;
+
+		case AT_SUN_BRAND_LX_CLKTCK:
+			up->u_auxv[i].a_un.a_val = hz;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * Record the brand ELF data and new personality now that the exec has
+	 * proceeded successfully.
+	 */
+	bcopy(&edp, &lxpd->l_elf_data, sizeof (edp));
+	lxpd->l_personality = personality;
+
+	return (0);
+}
+
+/*
+ * Report whether an ELF object with the given OSABI should be treated as a
+ * native executable. Returns B_FALSE for anything other than
+ * ELFOSABI_SOLARIS. Otherwise returns B_TRUE and, when the process root is
+ * the zone root, sets *interp to the "/native" prefix.
+ */
+boolean_t
+lx_native_exec(uint8_t osabi, const char **interp)
+{
+	if (osabi != ELFOSABI_SOLARIS)
+		return (B_FALSE);
+
+	/*
+	 * If the process root matches the zone root, prepend /native to the
+	 * interpreter path for native executables. Absolute precision from
+	 * VN_CMP is not necessary since any change of process root is likely
+	 * to make native binaries inaccessible via /native.
+	 *
+	 * Processes which chroot directly into /native will be able to
+	 * function as expected with no need for the prefix.
+	 */
+	if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
+		*interp = "/native";
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Count the populated entries of the Linux system call tables and record
+ * the totals in lx_nsysent32 (and lx_nsysent64 on _LP64 kernels). Counting
+ * stops at the first entry with a NULL sy_name or after LX_NSYSCALLS + 1
+ * entries, whichever comes first.
+ */
+static void
+lx_syscall_init(void)
+{
+	int i;
+
+	/*
+	 * Count up the 32-bit Linux system calls. Note that lx_sysent32
+	 * has (LX_NSYSCALLS + 1) entries.
+	 */
+	for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
+		continue;
+	lx_nsysent32 = i;
+
+#if defined(_LP64)
+	/*
+	 * Count up the 64-bit Linux system calls. Note that lx_sysent64
+	 * has (LX_NSYSCALLS + 1) entries.
+	 */
+	for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
+		continue;
+	lx_nsysent64 = i;
+#endif
+}
+
+/*
+ * Module load entry point: initialise the brand's subsystems and install
+ * the module. Returns the mod_install() result. If installation fails, the
+ * subsystems initialised here that have a fini routine are torn down again.
+ */
+int
+_init(void)
+{
+	int err = 0;
+
+	lx_syscall_init();
+	lx_pid_init();
+	lx_ioctl_init();
+	lx_futex_init();
+	lx_ptrace_init();
+	lx_socket_init();
+
+	err = mod_install(&modlinkage);
+	if (err != 0) {
+		cmn_err(CE_WARN, "Couldn't install lx brand module");
+
+		/*
+		 * This looks drastic, but it should never happen. The state
+		 * set up above should be completely free-able until it is
+		 * used by Linux processes. Since the brand wasn't loaded
+		 * there should be no Linux processes, and thus no way for
+		 * this state to have been modified. The teardown order
+		 * mirrors _fini() so that the ptrace and socket state is
+		 * released along with the pid, ioctl and futex state.
+		 */
+		lx_ptrace_fini();
+		lx_pid_fini();
+		lx_ioctl_fini();
+		lx_socket_fini();
+		if (lx_futex_fini())
+			panic("lx brand module cannot be loaded or unloaded.");
+	}
+	return (err);
+}
+
+/*
+ * Module information entry point; defers entirely to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Module unload entry point. Returns EBUSY while any zone uses the brand.
+ * Otherwise tears down the brand's subsystems and removes the module; if
+ * either the futex teardown or mod_remove() fails, the subsystems that were
+ * already torn down are initialised again and the error is returned.
+ */
+int
+_fini(void)
+{
+	int err;
+	int futex_done = 0;
+
+	/*
+	 * If there are any zones using this brand, we can't allow it to be
+	 * unloaded.
+	 */
+	if (brand_zone_count(&lx_brand))
+		return (EBUSY);
+
+	lx_ptrace_fini();
+	lx_pid_fini();
+	lx_ioctl_fini();
+	lx_socket_fini();
+
+	if ((err = lx_futex_fini()) != 0) {
+		goto done;
+	}
+	futex_done = 1;
+
+	err = mod_remove(&modlinkage);
+
+done:
+	if (err) {
+		/*
+		 * If we can't unload the module, then we have to get it
+		 * back into a sane state.
+		 */
+		lx_ptrace_init();
+		lx_pid_init();
+		lx_ioctl_init();
+		lx_socket_init();
+
+		/* Futex state is rebuilt only if its teardown succeeded. */
+		if (futex_done) {
+			lx_futex_init();
+		}
+	}
+
+	return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
new file mode 100644
index 0000000000..7ede833ca4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -0,0 +1,1103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016, Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/privregs.h> +#include <sys/exec.h> +#include <sys/lwp.h> +#include <sys/sem.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_siginfo.h> +#include <sys/lx_futex.h> +#include <lx_errno.h> +#include <sys/cmn_err.h> +#include <sys/siginfo.h> +#include <sys/contract/process_impl.h> +#include <sys/x86_archext.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <net/if.h> +#include <inet/ip6.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/sysmacros.h> + +/* Linux specific functions and definitions */ +static void lx_save(klwp_t *); +static void lx_restore(klwp_t *); + +/* + * Set the return code for the forked child, always zero + */ +/*ARGSUSED*/ +void +lx_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_r0 = 0; +} + +/* + * Reset process state on exec(2) + */ +void +lx_exec() +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = ttoproc(curthread); + 
lx_proc_data_t *pd = ptolxproc(p); + struct regs *rp = lwptoregs(lwp); + + /* b_exec is called without p_lock held */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Any l_handler handlers set as a result of B_REGISTER are now + * invalid; clear them. + */ + pd->l_handler = NULL; + + /* + * If this was a multi-threaded Linux process and this lwp wasn't the + * main lwp, then we need to make its Illumos and Linux PIDs match. + */ + if (curthread->t_tid != 1) { + lx_pid_reassign(curthread); + } + + /* + * Inform ptrace(2) that we are processing an execve(2) call so that if + * we are traced we can post either the PTRACE_EVENT_EXEC event or the + * legacy SIGTRAP. + */ + (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0); + + /* clear the fs/gsbase values until the app. can reinitialize them */ + lwpd->br_lx_fsbase = NULL; + lwpd->br_ntv_fsbase = NULL; + lwpd->br_lx_gsbase = NULL; + lwpd->br_ntv_gsbase = NULL; + + /* + * Clear the native stack flags. This will be reinitialised by + * lx_init() in the new process image. + */ + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + lwpd->br_ntv_stack = 0; + lwpd->br_ntv_stack_current = 0; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, + NULL); + + /* + * clear out the tls array + */ + bzero(lwpd->br_tls, sizeof (lwpd->br_tls)); + + /* + * reset the tls entries in the gdt + */ + kpreempt_disable(); + lx_restore(lwp); + kpreempt_enable(); + + /* Grab the updated argv bounds */ + mutex_enter(&p->p_lock); + lx_read_argv_bounds(p); + mutex_exit(&p->p_lock); + + /* + * The exec syscall doesn't return (so we don't call lx_syscall_return) + * but for our ptrace emulation we need to do this so that a tracer + * does not get out of sync. We know that by the time this lx_exec + * function is called that the exec has succeeded. 
+ */ + rp->r_r0 = 0; + lx_ptrace_stop(LX_PR_SYSEXIT); +} + +static void +lx_cleanlwp(klwp_t *lwp, proc_t *p) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + void *rb_list = NULL; + + VERIFY(lwpd != NULL); + + mutex_enter(&p->p_lock); + if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) { + lx_ptrace_exit(p, lwp); + } + + /* + * While we have p_lock, safely grab any robust_list references and + * clear the lwp field. + */ + sprlock_proc(p); + rb_list = lwpd->br_robust_list; + lwpd->br_robust_list = NULL; + sprunlock(p); + + if (rb_list != NULL) { + lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid); + } +} + +void +lx_exitlwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + kthread_t *t; + sigqueue_t *sqp = NULL; + pid_t ppid; + id_t ptid; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + if (lwpd == NULL) { + /* second time thru' */ + return; + } + + lx_cleanlwp(lwp, p); + + if (lwpd->br_clear_ctidp != NULL) { + (void) suword32(lwpd->br_clear_ctidp, 0); + (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1, + NULL, NULL, 0); + lwpd->br_clear_ctidp = NULL; + } + + if (lwpd->br_signal != 0) { + /* + * The first thread in a process doesn't cause a signal to + * be sent when it exits. It was created by a fork(), not + * a clone(), so the parent should get signalled when the + * process exits. + */ + if (lwpd->br_ptid == -1) + goto free; + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * If br_ppid is 0, it means this is a CLONE_PARENT thread, + * so the signal goes to the parent process - not to a + * specific thread in this process. + */ + p = lwptoproc(lwp); + if (lwpd->br_ppid == 0) { + mutex_enter(&p->p_lock); + ppid = p->p_ppid; + t = NULL; + } else { + /* + * If we have been reparented to init or if our + * parent thread is gone, then nobody gets + * signaled. 
+ */ + if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) || + (ptid == -1)) + goto free; + + mutex_enter(&pidlock); + if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + goto free; + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if ((t = idtot(p, ptid)) == NULL) { + mutex_exit(&p->p_lock); + goto free; + } + } + + sqp->sq_info.si_signo = lwpd->br_signal; + sqp->sq_info.si_code = lwpd->br_exitwhy; + sqp->sq_info.si_status = lwpd->br_exitwhat; + sqp->sq_info.si_pid = lwpd->br_pid; + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + mutex_exit(&p->p_lock); + sqp = NULL; + } + +free: + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, lwpd->br_args_size); + } + if (sqp) + kmem_free(sqp, sizeof (sigqueue_t)); +} + +void +lx_freelwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; + vfs_t *cgrp; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + if (lwpd == NULL) { + /* + * There is one case where an LX branded process will possess + * LWPs which lack their own brand data. During the course of + * executing native binary, the process will be preemptively + * branded to allow hooks such as b_native_exec to function. + * If that process possesses multiple LWPS, they will _not_ be + * branded since they will exit if the exec succeeds. It's + * during this LWP exit that lx_freelwp would be called on an + * unbranded LWP. When that is the case, it is acceptable to + * bypass the hook. 
+ */ + return; + } + + /* cgroup integration */ + lxzdata = ztolxzd(p->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + cgrp = lxzdata->lxzd_cgroup; + if (cgrp != NULL) { + VFS_HOLD(cgrp); + mutex_exit(&lxzdata->lxzd_lock); + ASSERT(lx_cgrp_freelwp != NULL); + (*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid, + lwpd->br_pid); + VFS_RELE(cgrp); + } else { + mutex_exit(&lxzdata->lxzd_lock); + } + + /* + * It is possible for the lx_freelwp hook to be called without a prior + * call to lx_exitlwp being made. This happens as part of lwp + * de-branding when a native binary is executed from a branded process. + * + * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well + * here in lx_freelwp. When the second call is redundant, the + * resources will already be freed and no work will be needed. + */ + lx_cleanlwp(lwp, p); + + /* + * Remove our system call interposer. + */ + lwp->lwp_brand_syscall = NULL; + lwp->lwp_brand_syscall_fast = NULL; + + (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + if (lwpd->br_pid != 0) { + lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid); + } + + /* + * Ensure that lx_ptrace_exit() has been called to detach + * ptrace(2) tracers and tracees. + */ + VERIFY(lwpd->br_ptrace_tracer == NULL); + VERIFY(lwpd->br_ptrace_accord == NULL); + + lwp->lwp_brand = NULL; + kmem_free(lwpd, sizeof (struct lx_lwp_data)); +} + +void * +lx_lwpdata_alloc(proc_t *p) +{ + lx_lwp_data_t *lwpd; + struct lx_pid *lpidp; + pid_t newpid = 0; + struct pid *pidp = NULL; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * LWPs beyond the first will require a pid to be allocated to emulate + * Linux's goofy thread model. While this allocation may be + * unnecessary when a single-lwp process undergoes branding, it cannot + * be performed during b_initlwp due to p_lock being held. 
+ */ + if (p->p_lwpcnt > 0) { + if ((newpid = pid_allocate(p, 0, 0)) < 0) { + return (NULL); + } + pidp = pid_find(newpid); + } + + lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP); + lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP); + + lpidp->l_pid = newpid; + lpidp->l_pidp = pidp; + lwpd->br_lpid = lpidp; + return (lwpd); +} + +/* + * Free lwp brand data if an error occurred during lwp_create. + * Otherwise, lx_freelwp will be used to free the resources after they're + * associated with the lwp via lx_initlwp. + */ +void +lx_lwpdata_free(void *lwpbd) +{ + lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd; + VERIFY(lwpd != NULL); + VERIFY(lwpd->br_lpid != NULL); + + if (lwpd->br_lpid->l_pidp != NULL) { + (void) pid_rele(lwpd->br_lpid->l_pidp); + } + kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid)); + kmem_free(lwpd, sizeof (*lwpd)); +} + +void +lx_initlwp(klwp_t *lwp, void *lwpbd) +{ + lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd; + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + kthread_t *tp = lwptot(lwp); + proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; + vfs_t *cgrp; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwp->lwp_brand == NULL); + + lwpd->br_exitwhy = CLD_EXITED; + lwpd->br_lwp = lwp; + lwpd->br_clear_ctidp = NULL; + lwpd->br_set_ctidp = NULL; + lwpd->br_signal = 0; + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + /* + * lwpd->br_affinitymask was zeroed by kmem_zalloc() + * as was lwpd->br_scall_args and lwpd->br_args_size. + */ + + /* + * The first thread in a process has ppid set to the parent + * process's pid, and ptid set to -1. Subsequent threads in the + * process have their ppid set to the pid of the thread that + * created them, and their ptid to that thread's tid. 
+ */ + if (tp->t_next == tp) { + lwpd->br_ppid = tp->t_procp->p_ppid; + lwpd->br_ptid = -1; + } else if (plwpd != NULL) { + bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls)); + lwpd->br_ppid = plwpd->br_pid; + lwpd->br_ptid = curthread->t_tid; + /* The child inherits the fs/gsbase values from the parent */ + lwpd->br_lx_fsbase = plwpd->br_lx_fsbase; + lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase; + lwpd->br_lx_gsbase = plwpd->br_lx_gsbase; + lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase; + } else { + /* + * Oddball case: the parent thread isn't a Linux process. + */ + lwpd->br_ppid = 0; + lwpd->br_ptid = -1; + } + lwp->lwp_brand = lwpd; + + /* + * During lx_lwpdata_alloc, we must decide whether or not to + * allocate a new pid to associate with the lwp. Since p_lock is not + * held at that point, the only time we can guarantee a new pid isn't + * needed is when p_lwpcnt == 0. This is because other lwps won't be + * present to race with us with regards to pid allocation. + * + * This means that in all other cases (where p_lwpcnt > 0), we expect + * that lx_lwpdata_alloc will allocate a pid for us to use here, even + * if it is unneeded. If this process is undergoing an exec, for + * example, the single existing lwp will not need a new pid when it is + * rebranded. In that case, lx_pid_assign will free the unneeded pid. + */ + VERIFY(lwpd->br_lpid->l_pidp != NULL || p->p_lwpcnt == 0); + + lx_pid_assign(tp, lwpd->br_lpid); + lwpd->br_tgid = lwpd->br_pid; + /* + * Having performed the lx pid assignment, the lpid reference is no + * longer needed. The underlying data will be freed during lx_freelwp. + */ + lwpd->br_lpid = NULL; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + + /* + * Install branded system call hooks for this LWP: + */ + lwp->lwp_brand_syscall = lx_syscall_enter; + lwp->lwp_brand_syscall_fast = lx_syscall_fast_enter; + + /* + * The new LWP inherits the parent LWP cgroup ID. 
+ */ + if (plwpd != NULL) { + lwpd->br_cgroupid = plwpd->br_cgroupid; + } + lxzdata = ztolxzd(p->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + cgrp = lxzdata->lxzd_cgroup; + if (cgrp != NULL) { + VFS_HOLD(cgrp); + mutex_exit(&lxzdata->lxzd_lock); + ASSERT(lx_cgrp_initlwp != NULL); + (*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid, + lwpd->br_pid); + VFS_RELE(cgrp); + } else { + mutex_exit(&lxzdata->lxzd_lock); + } +} + +void +lx_initlwp_post(klwp_t *lwp) +{ + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + /* + * If the parent LWP has a ptrace(2) tracer, the new LWP may + * need to inherit that same tracer. + */ + if (plwpd != NULL) { + lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp)); + } +} + +/* + * There is no need to have any locking for either the source or + * destination struct lx_lwp_data structs. This is always run in the + * thread context of the source thread, and the destination thread is + * always newly created and not referred to from anywhere else. + */ +void +lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + struct lx_lwp_data *src = srclwp->lwp_brand; + struct lx_lwp_data *dst = dstlwp->lwp_brand; + + dst->br_ppid = src->br_pid; + dst->br_ptid = lwptot(srclwp)->t_tid; + bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls)); + + switch (src->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * The parent LWP has an alternate stack installed. + * The child LWP should have the same stack base and extent. + */ + dst->br_stack_mode = src->br_stack_mode; + dst->br_ntv_stack = src->br_ntv_stack; + dst->br_ntv_stack_current = src->br_ntv_stack_current; + break; + + default: + /* + * Otherwise, clear the stack data for this LWP. 
+ */ + dst->br_stack_mode = LX_STACK_MODE_PREINIT; + dst->br_ntv_stack = 0; + dst->br_ntv_stack_current = 0; + } + + /* + * copy only these flags + */ + dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND; + dst->br_scall_args = NULL; +} + +/* + * When switching a Linux process off the CPU, clear its GDT entries. + */ +/* ARGSUSED */ +static void +lx_save(klwp_t *t) +{ + int i; + +#if defined(__amd64) + reset_sregs(); +#endif + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &null_udesc); +} + +/* + * When switching a Linux process on the CPU, set its GDT entries. + * + * For 64-bit code we don't have to worry about explicitly setting the + * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen + * automatically in update_sregs if we are executing in user-land. If this + * is the case then pcb_rupdate should be set. + */ +static void +lx_restore(klwp_t *t) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(t); + user_desc_t *tls; + int i; + + ASSERT(lwpd); + + tls = lwpd->br_tls; + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &tls[i]); +} + +void +lx_set_gdt(int entry, user_desc_t *descrp) +{ + + gdt_update_usegd(entry, descrp); +} + +void +lx_clear_gdt(int entry) +{ + gdt_update_usegd(entry, &null_udesc); +} + +longlong_t +lx_nosys() +{ + return (set_errno(ENOSYS)); +} + +/* + * Brand-specific routine to check if given non-Solaris standard segment + * register values should be modified to other values. + */ +/*ARGSUSED*/ +greg_t +lx_fixsegreg(greg_t sr, model_t datamodel) +{ + uint16_t idx = SELTOIDX(sr); + + ASSERT(sr == (sr & 0xffff)); + + /* + * If the segment selector is a valid TLS selector, just return it. + */ + if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX) + return (sr | SEL_UPL); + + /* + * Force the SR into the LDT in ring 3 for 32-bit processes. + * + * 64-bit processes get the null GDT selector since they are not + * allowed to have a private LDT. 
+ */ +#if defined(__amd64) + return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0); +#elif defined(__i386) + datamodel = datamodel; /* datamodel currently unused for 32-bit */ + return (sr | SEL_TI_LDT | SEL_UPL); +#endif /* __amd64 */ +} + +/* + * Brand-specific function to convert the fsbase as pulled from the register + * into a native fsbase suitable for locating the ulwp_t from the kernel. + */ +uintptr_t +lx_fsbase(klwp_t *lwp, uintptr_t fsbase) +{ + lx_lwp_data_t *lwpd = lwp->lwp_brand; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND || + lwpd->br_ntv_fsbase == NULL) { + return (fsbase); + } + + return (lwpd->br_ntv_fsbase); +} + +/* + * These two functions simulate winfo and post_sigcld for the lx brand. The + * difference is delivering a designated signal as opposed to always SIGCLD. + */ +static void +lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat) +{ + ASSERT(MUTEX_HELD(&pidlock)); + bzero(ip, sizeof (k_siginfo_t)); + ip->si_signo = ltos_signo[dat->l_signal]; + ip->si_code = pp->p_wcode; + ip->si_pid = pp->p_pid; + ip->si_ctid = PRCTID(pp); + ip->si_zoneid = pp->p_zone->zone_id; + ip->si_status = pp->p_wdata; + ip->si_stime = pp->p_stime; + ip->si_utime = pp->p_utime; +} + +static void +lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat) +{ + proc_t *pp = cp->p_parent; + + ASSERT(MUTEX_HELD(&pidlock)); + mutex_enter(&pp->p_lock); + /* + * Since Linux doesn't queue SIGCHLD, or any other non RT + * signals, we just blindly deliver whatever signal we can. + */ + ASSERT(sqp != NULL); + lx_winfo(cp, &sqp->sq_info, dat); + sigaddqa(pp, NULL, sqp); + sqp = NULL; + mutex_exit(&pp->p_lock); +} + + +/* + * Brand specific code for exiting and sending a signal to the parent, as + * opposed to sigcld(). 
+ */ +void +lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp) +{ + proc_t *pp = cp->p_parent; + lx_proc_data_t *lx_brand_data = ptolxproc(cp); + ASSERT(MUTEX_HELD(&pidlock)); + + switch (cp->p_wcode) { + case CLD_EXITED: + case CLD_DUMPED: + case CLD_KILLED: + ASSERT(cp->p_stat == SZOMB); + /* + * The broadcast on p_srwchan_cv is a kludge to + * wakeup a possible thread in uadmin(A_SHUTDOWN). + */ + cv_broadcast(&cp->p_srwchan_cv); + + /* + * Add to newstate list of the parent + */ + add_ns(pp, cp); + + cv_broadcast(&pp->p_cv); + if ((pp->p_flag & SNOWAIT) || + PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) { + if (!(cp->p_pidflag & CLDWAITPID)) + freeproc(cp); + } else if (!(cp->p_pidflag & CLDNOSIGCHLD) && + lx_brand_data->l_signal != 0) { + lx_post_exit_sig(cp, sqp, lx_brand_data); + sqp = NULL; + } + break; + + case CLD_STOPPED: + case CLD_CONTINUED: + case CLD_TRAPPED: + panic("Should not be called in this case"); + } + + if (sqp) + siginfofree(sqp); +} + +/* + * Filters based on arguments that have been passed in by a separate syscall + * using the B_STORE_ARGS mechanism. if the __WALL flag is set, no filter is + * applied, otherwise we look at the difference between a clone and non-clone + * process. + * The definition of a clone process in Linux is a thread that does not deliver + * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone + * processes. Without that option, a process should only wait on normal + * children. The following table shows the cases. + * + * default __WCLONE + * no SIGCHLD - X + * SIGCHLD X - + * + * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on + * process exit. + * + * More information on wait in lx brands can be found at + * usr/src/lib/brand/lx/lx_brand/common/wait.c. 
+ */ +boolean_t +lx_wait_filter(proc_t *pp, proc_t *cp) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int flags = lwpd->br_waitid_flags; + boolean_t ret; + + if (!lwpd->br_waitid_emulate) { + return (B_TRUE); + } + + mutex_enter(&cp->p_lock); + if (flags & LX_WALL) { + ret = B_TRUE; + } else { + lx_proc_data_t *pd = ptolxproc(cp); + boolean_t is_sigchld = B_TRUE; + boolean_t match_wclone = B_FALSE; + + /* + * When calling clone, an alternate signal can be chosen to + * deliver to the parent when the child exits. + */ + if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) { + is_sigchld = B_FALSE; + } + if ((flags & LX_WCLONE) != 0) { + match_wclone = B_TRUE; + } + + ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE; + } + mutex_exit(&cp->p_lock); + + return (ret); +} + +void +lx_ifname_convert(char *ifname, lx_if_action_t act) +{ + if (act == LX_IF_TONATIVE) { + if (strncmp(ifname, "lo", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo0", IFNAMSIZ); + } else { + if (strncmp(ifname, "lo0", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo", IFNAMSIZ); + } +} + +void +lx_ifflags_convert(uint64_t *flags, lx_if_action_t act) +{ + uint64_t buf; + + buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG | + IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS | + IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI); + + /* Linux has different shift for multicast flag */ + if (act == LX_IF_TONATIVE) { + if (*flags & 0x1000) + buf |= IFF_MULTICAST; + } else { + if (*flags & IFF_MULTICAST) + buf |= 0x1000; + } + *flags = buf; +} + +/* + * Convert an IPv6 address into the numbers used by /proc/net/if_inet6 + */ +unsigned int +lx_ipv6_scope_convert(const in6_addr_t *addr) +{ + if (IN6_IS_ADDR_V4COMPAT(addr)) { + return (LX_IPV6_ADDR_COMPATv4); + } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) { + return (LX_IPV6_ADDR_LOOPBACK); + } else if (IN6_IS_ADDR_LINKLOCAL(addr)) { + return (LX_IPV6_ADDR_LINKLOCAL); + } else if (IN6_IS_ADDR_SITELOCAL(addr)) { + return 
(LX_IPV6_ADDR_SITELOCAL); + } else { + return (0x0000U); + } +} + + +void +lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size) +{ + int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data)); + + switch (src->sdl_type) { + case DL_ETHER: + dst->sa_family = LX_ARPHRD_ETHER; + break; + case DL_LOOP: + dst->sa_family = LX_ARPHRD_LOOPBACK; + break; + default: + dst->sa_family = LX_ARPHRD_VOID; + } + + bcopy(LLADDR(src), dst->sa_data, copy_size); + *size = copy_size; +} + +/* + * Brand hook to convert native kernel siginfo signal number, errno, code, pid + * and si_status to Linux values. Similar to the stol_ksiginfo function but + * this one converts in-place, converts the pid, and does not copyout. + */ +void +lx_sigfd_translate(k_siginfo_t *infop) +{ + infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL); + + infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL); + + infop->si_code = lx_stol_sigcode(infop->si_code); + + infop->si_errno = lx_errno(infop->si_errno, EINVAL); + + if (infop->si_pid == curproc->p_zone->zone_proc_initpid) { + infop->si_pid = 1; + } else if (infop->si_pid == curproc->p_zone->zone_zsched->p_pid) { + infop->si_pid = 0; + } +} + +int +stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip) +{ + lx_siginfo_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = sip->si_addr; + 
break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +int +stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip) +{ + lx_siginfo32_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif + +/* + * Linux uses the original bounds of the argv array when determining the + * contents of /proc/<pid>/cmdline. We mimic those bounds using argv[0] and + * envp[0] as the beginning and end, respectively. + */ +void +lx_read_argv_bounds(proc_t *p) +{ + user_t *up = PTOU(p); + lx_proc_data_t *pd = ptolxproc(p); + uintptr_t addr_arg = up->u_argv; + uintptr_t addr_env = up->u_envp; + uintptr_t arg_start = 0, env_start = 0, env_end = 0; + int i = 0; + + VERIFY(pd != NULL); + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Use AT_SUN_PLATFORM in the aux vector to find the end of the envp + * strings. 
+ */ + for (i = 0; i < __KERN_NAUXV_IMPL; i++) { + if (up->u_auxv[i].a_type == AT_SUN_PLATFORM) { + env_end = (uintptr_t)up->u_auxv[i].a_un.a_val; + } + } + + mutex_exit(&p->p_lock); +#if defined(_LP64) + if (p->p_model != DATAMODEL_NATIVE) { + uint32_t buf32; + if (copyin((void *)addr_arg, &buf32, sizeof (buf32)) == 0) { + arg_start = (uintptr_t)buf32; + } + if (copyin((void *)addr_env, &buf32, sizeof (buf32)) == 0) { + env_start = (uintptr_t)buf32; + } + } else +#endif /* defined(_LP64) */ + { + uintptr_t buf; + if (copyin((void *)addr_arg, &buf, sizeof (buf)) == 0) { + arg_start = buf; + } + if (copyin((void *)addr_env, &buf, sizeof (buf)) == 0) { + env_start = buf; + } + } + mutex_enter(&p->p_lock); + pd->l_args_start = arg_start; + pd->l_envs_start = env_start; + pd->l_envs_end = env_end; +} + +/* Given an LX LWP, determine where user register state is stored. */ +lx_regs_location_t +lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write) +{ + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + /* + * The LWP was stopped with the brand stack and register state + * loaded, e.g. during a syscall emulated within the kernel. + */ + return (LX_REG_LOC_LWP); + + case LX_STACK_MODE_PREINIT: + if (for_write) { + /* setting registers not allowed in this state */ + break; + } + if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED || + lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) { + /* The LWP was stopped by tracing on exec. */ + return (LX_REG_LOC_LWP); + } + break; + + case LX_STACK_MODE_NATIVE: + if (for_write) { + /* setting registers not allowed in this state */ + break; + } + if (lwpd->br_ptrace_whystop == PR_BRAND && + lwpd->br_ptrace_whatstop == LX_PR_EVENT) { + /* Called while ptrace-event-stopped by lx_exec. 
*/ + return (LX_REG_LOC_LWP); + } + break; + default: + break; + } + + if (lwpd->br_ptrace_stopucp != NULL) { + /* + * The LWP was stopped in the usermode emulation library + * but a ucontext_t for the preserved brand stack and + * register state was provided. Return the register state + * from that ucontext_t. + */ + VERIFY(ucp != NULL); + *ucp = (void *)lwpd->br_ptrace_stopucp; + return (LX_REG_LOC_UCP); + } + + return (LX_REG_LOC_UNAVAIL); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c new file mode 100644 index 0000000000..40179bbdaf --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_pid.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> + +#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */ +static int hash_len = 4; /* desired average hash chain length */ +static int hash_size; /* no of buckets in the hash table */ + +static struct lx_pid **stol_pid_hash; +static struct lx_pid **ltos_pid_hash; + +#define LTOS_HASH(pid) ((pid) & (hash_size - 1)) +#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1)) + +static kmutex_t hash_lock; + +static void +lx_pid_insert_hash(struct lx_pid *lpidp) +{ + int shash = STOL_HASH(lpidp->s_pid, lpidp->s_tid); + int lhash = LTOS_HASH(lpidp->l_pid); + + ASSERT(MUTEX_HELD(&hash_lock)); + + lpidp->stol_next = stol_pid_hash[shash]; + stol_pid_hash[shash] = lpidp; + + lpidp->ltos_next = ltos_pid_hash[lhash]; + ltos_pid_hash[lhash] = lpidp; +} + +static struct lx_pid * +lx_pid_remove_hash(pid_t pid, id_t tid) +{ + struct lx_pid **hpp; + struct lx_pid *lpidp = NULL; + + ASSERT(MUTEX_HELD(&hash_lock)); + + hpp = &stol_pid_hash[STOL_HASH(pid, tid)]; + while (*hpp) { + if ((*hpp)->s_pid == pid && (*hpp)->s_tid == tid) { + lpidp = *hpp; + *hpp = (*hpp)->stol_next; + break; + } + hpp = &(*hpp)->stol_next; + } + + /* + * when called during error recovery the pid may already + * be released + */ + if (lpidp == NULL) + return (NULL); + + hpp = <os_pid_hash[LTOS_HASH(lpidp->l_pid)]; + while (*hpp) { + if (*hpp == lpidp) { + *hpp = lpidp->ltos_next; + break; + } + hpp = &(*hpp)->ltos_next; + } + + return (lpidp); +} + +/* + * given a solaris pid/tid pair, create a linux pid + */ +void +lx_pid_assign(kthread_t *t, struct lx_pid *lpidp) +{ + proc_t *p = ttoproc(t); + lx_lwp_data_t *lwpd = ttolxlwp(t); + pid_t s_pid = p->p_pid; + id_t s_tid = t->t_tid; + + /* + * When lx_initlwp is called from lx_setbrand, p_lwpcnt 
will already be + * equal to 1. Since lx_initlwp is being called against an lwp that + * already exists, an additional pid allocation is not necessary. + * + * We check for this by testing br_ppid == 0. + */ + if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) { + /* + * Assign allocated pid to any thread other than the first. + * The l_pid and l_pidp fields should be populated. + */ + VERIFY(lpidp->l_pidp != NULL); + VERIFY(lpidp->l_pid != 0); + } else { + /* + * There are cases where a pid is speculatively allocated but + * is not needed. We are obligated to free it here. + */ + if (lpidp->l_pidp != NULL) { + (void) pid_rele(lpidp->l_pidp); + } + lpidp->l_pidp = NULL; + lpidp->l_pid = s_pid; + } + + lpidp->s_pid = s_pid; + lpidp->s_tid = s_tid; + lpidp->l_start = t->t_start; + + /* + * now put the pid into the linux-solaris and solaris-linux + * conversion hash tables + */ + mutex_enter(&hash_lock); + lx_pid_insert_hash(lpidp); + mutex_exit(&hash_lock); + + lwpd->br_pid = lpidp->l_pid; +} + +/* + * If we are exec()ing the process, this thread's tid is about to be reset + * to 1. Make sure the Linux PID bookkeeping reflects that change. + */ +void +lx_pid_reassign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + struct pid *old_pidp; + struct lx_pid *lpidp; + + ASSERT(p->p_lwpcnt == 1); + + mutex_enter(&hash_lock); + + /* + * Clean up all the traces of this thread's 'fake' Linux PID. + */ + lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid); + ASSERT(lpidp != NULL); + old_pidp = lpidp->l_pidp; + lpidp->l_pidp = NULL; + + /* + * Now register this thread as (pid, 1). 
+ */ + lpidp->l_pid = p->p_pid; + lpidp->s_pid = p->p_pid; + lpidp->s_tid = 1; + lx_pid_insert_hash(lpidp); + + mutex_exit(&hash_lock); + + if (old_pidp) + (void) pid_rele(old_pidp); +} + +/* + * release a solaris pid/tid pair + */ +void +lx_pid_rele(pid_t pid, id_t tid) +{ + struct lx_pid *lpidp; + + mutex_enter(&hash_lock); + lpidp = lx_pid_remove_hash(pid, tid); + mutex_exit(&hash_lock); + + if (lpidp) { + if (lpidp->l_pidp) + (void) pid_rele(lpidp->l_pidp); + + kmem_free(lpidp, sizeof (*lpidp)); + } +} + +/* + * given a linux pid, return the solaris pid/tid pair + */ +int +lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid) +{ + struct lx_pid *hp; + + if (l_pid == 1) { + pid_t initpid; + + /* + * We are trying to look up the Linux init process for the + * current zone, which we pretend has pid 1. + */ + if ((initpid = curzone->zone_proc_initpid) == -1) { + /* + * We could not find the init process for this zone. + */ + return (-1); + } + + if (s_pid != NULL) + *s_pid = initpid; + if (s_tid != NULL) + *s_tid = 1; + + return (0); + } + + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(l_pid)]; hp; hp = hp->ltos_next) { + if (l_pid == hp->l_pid) { + if (s_pid) + *s_pid = hp->s_pid; + if (s_tid) + *s_tid = hp->s_tid; + break; + } + } + mutex_exit(&hash_lock); + if (hp != NULL) + return (0); + + /* + * We didn't find this pid in our translation table. + * But this still could be the pid of a native process + * running in the current zone so check for that here. + * + * Note that prfind() only searches for processes in the current zone. + */ + mutex_enter(&pidlock); + if (prfind(l_pid) != NULL) { + mutex_exit(&pidlock); + if (s_pid) + *s_pid = l_pid; + if (s_tid) + *s_tid = 0; + return (0); + } + mutex_exit(&pidlock); + + return (-1); +} + +/* + * Given an lwp, return the Linux pid of its parent. If the caller + * wants them, we return the Solaris (pid, tid) as well. 
+ */ +pid_t +lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + struct lx_pid *hp; + pid_t zoneinit = curproc->p_zone->zone_proc_initpid; + pid_t lppid, ppid; + + /* + * Be sure not to return a parent pid that should be invisible + * within this zone. + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * If the parent process's pid is the zone's init process, force it + * to the Linux init pid value of 1. + */ + if (ppid == zoneinit) + ppid = 1; + + /* + * There are two cases in which the Linux definition of a 'parent' + * matches that of Solaris: + * + * - if our tgid is the same as our PID, then we are either the + * first thread in the process or a CLONE_THREAD thread. + * + * - if the brand lwp value for ppid is 0, then we are either the + * child of a differently-branded process or a CLONE_PARENT thread. + */ + if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) { + if (ppidp != NULL) + *ppidp = ppid; + if (ptidp != NULL) + *ptidp = -1; + return (ppid); + } + + /* + * Set the default Linux parent pid to be the pid of the zone's init + * process; this will get converted back to the Linux default of 1 + * later. + */ + lppid = zoneinit; + + /* + * If the process's parent isn't init, try and look up the Linux "pid" + * corresponding to the process's parent. + */ + if (ppid != 1) { + /* + * In all other cases, we are looking for the parent of this + * specific thread, which in Linux refers to the thread that + * clone()d it. We stashed that thread's PID away when this + * thread was created. + */ + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; hp; + hp = hp->ltos_next) { + if (lwpd->br_ppid == hp->l_pid) { + /* + * We found the PID we were looking for, but + * since we cached its value in this LWP's brand + * structure, it has exited and been reused by + * another process. 
+ */ + if (hp->l_start > lwptot(lwp)->t_start) + break; + + lppid = lwpd->br_ppid; + if (ppidp != NULL) + *ppidp = hp->s_pid; + if (ptidp != NULL) + *ptidp = hp->s_tid; + + break; + } + } + mutex_exit(&hash_lock); + } + + if (lppid == zoneinit) { + lppid = 1; + + if (ppidp != NULL) + *ppidp = lppid; + if (ptidp != NULL) + *ptidp = -1; + } + + return (lppid); +} + +void +lx_pid_init(void) +{ + hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR)); + + stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + + mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +lx_pid_fini(void) +{ + kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size); + kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c new file mode 100644 index 0000000000..0f521df61b --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c @@ -0,0 +1,2564 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Emulation of the Linux ptrace(2) interface. + * + * OVERVIEW + * + * The Linux process model is somewhat different from the illumos native + * model. One critical difference is that each Linux thread has a unique + * identifier in the pid namespace. The lx brand assigns a pid to each LWP + * within the emulated process, giving the pid of the process itself to the + * first LWP. 
+ * + * The Linux ptrace(2) interface allows for any LWP in a branded process to + * exert control over any other LWP within the same zone. Control is exerted + * by the use of the ptrace(2) system call itself, which accepts a number of + * request codes. Feedback on traced events is primarily received by the + * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system + * calls. Many of the possible ptrace(2) requests will only succeed if the + * target LWP is in a "ptrace-stop" condition. + * + * HISTORY + * + * The brand support for ptrace(2) was originally built on top of the rich + * support for debugging and tracing provided through the illumos /proc + * interfaces, mounted at /native/proc within the zone. The native legacy + * ptrace(3C) functionality was used as a starting point, but was generally + * insufficient for complete and precise emulation. The extant legacy + * interface, and indeed our native SIGCLD and waitid(2) facilities, are + * focused on _process_ level concerns -- the Linux interface has been + * extended to be aware of LWPs as well. + * + * In order to allow us to focus on providing more complete and accurate + * emulation without extensive and undesirable changes to the native + * facilities, this second generation ptrace(2) emulation is mostly separate + * from any other tracing or debugging framework in the system. + * + * ATTACHING TRACERS TO TRACEES + * + * There are several ways that a child LWP may become traced by a tracer. + * To determine which attach method caused a tracee to become attached, one + * may inspect the "br_ptrace_attach" member of the LWP-specific brand data + * with the debugger. + * + * The first attach methods to consider are the attaching ptrace(2) requests: + * + * PTRACE_TRACEME + * + * If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee + * to its parent LWP (br_ppid). Using PTRACE_TRACEME does _not_ cause the + * tracee to be held in a stop condition. 
It is common practice for + * consumers to raise(SIGSTOP) immediately afterward. + * + * PTRACE_ATTACH + * + * An LWP may attempt to trace any other LWP in this, or another, process. + * We currently allow any attach where the process containing the tracer + * LWP has permission to write to /proc for the process containing the + * intended tracee. This action also sends a SIGSTOP to the newly attached + * tracee. + * + * The second class of attach methods are the clone(2)/fork(2) inheritance + * options that may be set on a tracee with PTRACE_SETOPTIONS: + * + * PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE + * + * If these options have been set on a tracee, then a fork(2), vfork(2) or + * clone(2) respectively will cause the newly created LWP to be traced by + * the same tracer. The same set of ptrace(2) options will also be set on + * the new child. + * + * The third class of attach method is the CLONE_PTRACE flag to clone(2). + * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is + * passed by the tracee as an argument to clone(2). + * + * DETACHING TRACEES + * + * Tracees can be detached by the tracer with the PTRACE_DETACH request. + * This request is only valid when the tracee is in a ptrace(2) stop + * condition, and is itself a restarting action. + * + * If the tracer exits without detaching all of its tracees, then all of the + * tracees are automatically detached and restarted. If a tracee was in + * "signal-delivery-stop" at the time the tracer exited, the signal will be + * released to the child unless it is a SIGSTOP. We drop this instance of + * SIGSTOP in order to prevent the child from becoming stopped by job + * control. + * + * ACCORD ALLOCATION AND MANAGEMENT + * + * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP + * and zero or more tracee LWPs. It is explicitly illegal for a tracee to + * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME. 
+ * + * An LWP starts out without an accord. If a child of that LWP calls + * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses + * PTRACE_ATTACH, an accord will be allocated and stored on that LWP. The + * accord structure is not released from that LWP until it arrives in + * lx_exitlwp(), as called by lwp_exit(). A new accord will not be + * allocated, even if one does not exist, once an LWP arrives in lx_exitlwp() + * and sets the LX_PTF_EXITING flag. An LWP will have at most one accord + * structure throughout its entire lifecycle; once it has one, it has the + * same one until death. + * + * The accord is reference counted (lxpa_refcnt), starting at a count of one + * at creation to represent the link from the tracer LWP to its accord. The + * accord is not freed until the reference count falls to zero. + * + * To make mutual exclusion between a detaching tracer and various notifying + * tracees simpler, the tracer will hold "pidlock" while it clears the + * accord members that point back to the tracer LWP and CV. + * + * SIGNALS AND JOB CONTROL + * + * Various actions, either directly ptrace(2) related or commonly associated + * with tracing, cause process- or thread-directed SIGSTOP signals to be sent + * to tracees. These signals, and indeed any signal other than SIGKILL, can + * be suppressed by the tracer when using a restarting request (including + * PTRACE_DETACH) on a child. The signal may also be substituted for a + * different signal. + * + * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer, + * it will induce the regular illumos native job control stop of the entire + * traced process. This is at least passingly similar to the Linux "group + * stop" ptrace(2) condition. + * + * SYSTEM CALL TRACING + * + * The ptrace(2) interface enables the tracer to hold the tracee on entry and + * exit from system calls. 
When a stopped tracee is restarted through the + * PTRACE_SYSCALL request, the LX_PTF_SYSCALL flag is set until the next + * system call boundary. Whether this is a "syscall-entry-stop" or + * "syscall-exit-stop", the tracee is held and the tracer is notified via + * SIGCLD/waitpid(2) in the usual way. The LX_PTF_SYSCALL flag is + * cleared after each stop; for ongoing system call tracing the tracee must + * be continuously restarted with PTRACE_SYSCALL. + * + * EVENT STOPS + * + * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are + * enabled by the tracer through PTRACE_SETOPTIONS. Once enabled, the tracee + * will be stopped at the nominated points of interest and the tracer + * notified. The tracer may request additional information about the event, + * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG. + * + * LOCK ORDERING RULES + * + * It is not safe, in general, to hold p_lock for two different processes at + * the same time. This constraint is the primary reason for the existence + * (and complexity) of the ptrace(2) accord mechanism. + * + * In order to facilitate looking up accords by the "pid" of a tracer LWP, + * p_lock for the tracer process may be held while entering the accord mutex + * (lxpa_lock). This mutex protects the accord flags and reference count. + * The reference count is manipulated through lx_ptrace_accord_hold() and + * lx_ptrace_accord_rele(). + * + * DO NOT interact with the accord mutex (lxpa_lock) directly. The + * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various + * book-keeping and lock ordering enforcement and MUST be used. + * + * It is NOT legal to take ANY p_lock while holding the accord mutex + * (lxpa_lock). If the lxpa_tracees_lock is to be held concurrently with + * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock + * of any processes from the tracee list. 
+ * + * It is NOT legal to take a tracee p_lock and then attempt to enter the + * accord mutex (or tracee list mutex) of its tracer. When running as the + * tracee LWP, the tracee's hold will prevent the accord from being freed. + * Use of the LX_PTF_STOPPING or LX_PTF_CLONING flag in the LWP-specific brand + * data prevents an exiting tracer from altering the tracee until the tracee + * has come to an orderly stop, without requiring the tracee to hold its own + * p_lock the entire time it is stopping. + * + * It is not safe, in general, to enter "pidlock" while holding the p_lock of + * any process. It is similarly illegal to hold any accord locks (lxpa_lock + * or lxpa_sublock) while attempting to enter "pidlock". As "pidlock" is a + * global mutex, it should be held for the shortest possible time. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/procfs.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/wait.h> +#include <sys/prsystm.h> +#include <sys/note.h> + +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> +#include <lx_signum.h> + + +typedef enum lx_ptrace_cont_flags_t { + LX_PTC_NONE = 0x00, + LX_PTC_SYSCALL = 0x01, + LX_PTC_SINGLESTEP = 0x02 +} lx_ptrace_cont_flags_t; + + +extern int lx_user_regs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_regs_copyout(lx_lwp_data_t *, void *); +extern int lx_ptrace_peekuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_ptrace_pokeuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_user_fpregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpregs_copyout(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyout(lx_lwp_data_t *, void *); + +/* + * Macros for checking the state of an LWP via "br_ptrace_flags": + */ +#define LX_PTRACE_BUSY \ + 
(LX_PTF_EXITING | LX_PTF_STOPPING | LX_PTF_CLONING) + +#define VISIBLE(a) (((a)->br_ptrace_flags & LX_PTF_EXITING) == 0) +#define TRACEE_BUSY(a) (((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0) + +#define ACCORD_HELD(a) MUTEX_HELD(&(a)->lxpa_lock) + +static kcondvar_t lx_ptrace_busy_cv; +static kmem_cache_t *lx_ptrace_accord_cache; + +/* + * Enter the accord mutex. + */ +static void +lx_ptrace_accord_enter(lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + + mutex_enter(&accord->lxpa_lock); +} + +/* + * Exit the accord mutex. If the reference count has dropped to zero, + * free the accord. + */ +static void +lx_ptrace_accord_exit(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + if (accord->lxpa_refcnt > 0) { + mutex_exit(&accord->lxpa_lock); + return; + } + + /* + * When the reference count drops to zero we must free the accord. + */ + VERIFY(accord->lxpa_tracer == NULL); + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + VERIFY(list_is_empty(&accord->lxpa_tracees)); + VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE); + + mutex_destroy(&accord->lxpa_lock); + mutex_destroy(&accord->lxpa_tracees_lock); + + kmem_cache_free(lx_ptrace_accord_cache, accord); +} + +/* + * Drop our reference to this accord. If this drops the reference count + * to zero, the next lx_ptrace_accord_exit() will free the accord. + */ +static void +lx_ptrace_accord_rele(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + VERIFY(accord->lxpa_refcnt > 0); + accord->lxpa_refcnt--; +} + +/* + * Place an additional hold on an accord. + */ +static void +lx_ptrace_accord_hold(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + accord->lxpa_refcnt++; +} + +/* + * Fetch the accord for this LWP. If one has not yet been created, and the + * process is not exiting, allocate it now. Must be called with p_lock held + * for the process containing the target LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). 
+ */ +static int +lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp, + boolean_t allocate_one) +{ + lx_ptrace_accord_t *lxpa; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * If this LWP does not have an accord, we wish to allocate + * and install one. + */ + if ((lxpa = lwpd->br_ptrace_accord) == NULL) { + if (!allocate_one || !VISIBLE(lwpd)) { + /* + * Either we do not wish to allocate an accord, or this + * LWP has already begun exiting from a ptrace + * perspective. + */ + *accordp = NULL; + return (ESRCH); + } + + lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP); + bzero(lxpa, sizeof (*lxpa)); + + /* + * The initial reference count is 1 because we are referencing + * it in from the soon-to-be tracer LWP. + */ + lxpa->lxpa_refcnt = 1; + mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t), + offsetof(lx_lwp_data_t, br_ptrace_linkage)); + lxpa->lxpa_cvp = &p->p_cv; + + lxpa->lxpa_tracer = lwpd; + lwpd->br_ptrace_accord = lxpa; + } + + /* + * Lock the accord before returning it to the caller. + */ + lx_ptrace_accord_enter(lxpa); + + /* + * There should be at least one active reference to this accord, + * otherwise it should have been freed. + */ + VERIFY(lxpa->lxpa_refcnt > 0); + + *accordp = lxpa; + return (0); +} + +/* + * Accords belong to the tracer LWP. Get the accord for this tracer or return + * an error if it was not possible. To prevent deadlocks, the caller MUST NOT + * hold p_lock on its own or any other process. + * + * If successful, we return holding the accord lock (lxpa_lock). 
+ */ +static int +lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp) +{ + int ret = ESRCH; + pid_t apid; + id_t atid; + proc_t *aproc; + kthread_t *athr; + klwp_t *alwp; + lx_lwp_data_t *alwpd; + + VERIFY(MUTEX_NOT_HELD(&curproc->p_lock)); + + /* + * Locate the process containing the tracer LWP based on its Linux pid + * and lock it. + */ + if (lx_lpid_to_spair(lxpid, &apid, &atid) != 0 || + (aproc = sprlock(apid)) == NULL) { + return (ESRCH); + } + + /* + * Locate the tracer LWP itself and ensure that it is visible to + * ptrace(2). + */ + if ((athr = idtot(aproc, atid)) == NULL || + (alwp = ttolwp(athr)) == NULL || + (alwpd = lwptolxlwp(alwp)) == NULL || + !VISIBLE(alwpd)) { + sprunlock(aproc); + return (ESRCH); + } + + /* + * We should not fetch our own accord this way. + */ + if (athr == curthread) { + sprunlock(aproc); + return (EPERM); + } + + /* + * Fetch (or allocate) the accord owned by this tracer LWP: + */ + ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE); + + /* + * Unlock the process and return. + */ + sprunlock(aproc); + return (ret); +} + +/* + * Get (or allocate) the ptrace(2) accord for the current LWP, acting as a + * tracer. The caller MUST NOT currently hold p_lock on the process containing + * this LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). + */ +static int +lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + int ret; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Lock the tracer (this LWP). + */ + mutex_enter(&p->p_lock); + + /* + * Fetch (or allocate) the accord for this LWP: + */ + ret = lx_ptrace_accord_get_locked(lwp, accordp, allocate_one); + + mutex_exit(&p->p_lock); + + return (ret); +} + +/* + * Restart an LWP if it is in "ptrace-stop". This function may induce sleep, + * so the caller MUST NOT hold any mutexes other than p_lock for the process + * containing the LWP. 
+ */ +static void +lx_ptrace_restart_lwp(klwp_t *lwp) +{ + kthread_t *rt = lwptot(lwp); + proc_t *rproc = lwptoproc(lwp); + lx_lwp_data_t *rlwpd = lwptolxlwp(lwp); + + VERIFY(rt != curthread); + VERIFY(MUTEX_HELD(&rproc->p_lock)); + + /* + * Exclude potential meddling from procfs. + */ + prbarrier(rproc); + + /* + * Check that the LWP is still in "ptrace-stop" and, if so, restart it. + */ + thread_lock(rt); + if (BSTOPPED(rt) && rt->t_whystop == PR_BRAND) { + rt->t_schedflag |= TS_BSTART; + setrun_locked(rt); + + /* + * Clear stop reason. + */ + rlwpd->br_ptrace_whystop = 0; + rlwpd->br_ptrace_whatstop = 0; + rlwpd->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); + } + thread_unlock(rt); +} + +static void +lx_ptrace_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag, + pid_t *event_ppid, pid_t *event_pid) +{ + int signo; + + /* + * Populate our k_siginfo_t with data about this "ptrace-stop" + * condition: + */ + bzero(ip, sizeof (*ip)); + ip->si_signo = SIGCLD; + ip->si_pid = remote->br_pid; + ip->si_code = CLD_TRAPPED; + + switch (remote->br_ptrace_whatstop) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + ip->si_status = SIGTRAP; + if (remote->br_ptrace_options & LX_PTRACE_O_TRACESYSGOOD) { + ip->si_status |= 0x80; + } + break; + + case LX_PR_SIGNALLED: + signo = remote->br_ptrace_stopsig; + if (signo < 1 || signo >= LX_NSIG) { + /* + * If this signal number is not valid, pretend it + * was a SIGTRAP. + */ + ip->si_status = SIGTRAP; + } else { + ip->si_status = ltos_signo[signo]; + } + break; + + case LX_PR_EVENT: + ip->si_status = SIGTRAP | remote->br_ptrace_event; + /* + * Record the Linux pid of both this LWP and the create + * event we are dispatching. We will use this information + * to unblock any subsequent ptrace(2) events that depend + * on this one. 
+ */ + if (event_ppid != NULL) + *event_ppid = remote->br_pid; + if (event_pid != NULL) + *event_pid = (pid_t)remote->br_ptrace_eventmsg; + break; + + default: + cmn_err(CE_PANIC, "unxpected stop subreason: %d", + remote->br_ptrace_whatstop); + } + + /* + * If WNOWAIT was specified, do not mark the event as posted + * so that it may be re-fetched on another call to waitid(). + */ + if (waitflag) + remote->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); +} + +/* + * Receive notification from stop() of a PR_BRAND stop. + */ +void +lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + klwp_t *plwp = NULL; + proc_t *pp = NULL; + lx_lwp_data_t *parent; + boolean_t cldpend = B_TRUE; + boolean_t cldpost = B_FALSE; + sigqueue_t *sqp = NULL; + + /* + * We currently only care about LX-specific stop reasons. + */ + if (why != PR_BRAND) + return; + + switch (what) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + case LX_PR_SIGNALLED: + case LX_PR_EVENT: + break; + default: + cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND" + " stop: %d", (int)what); + } + + /* + * We should be holding the lock on our containing process. The + * STOPPING flag should have been set by lx_ptrace_stop() for all + * PR_BRAND stops. + */ + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwpd->br_ptrace_flags & LX_PTF_STOPPING); + VERIFY((accord = lwpd->br_ptrace_tracer) != NULL); + + /* + * We must drop our process lock to take "pidlock". The + * LX_PTF_STOPPING flag protects us from an exiting tracer. + */ + mutex_exit(&p->p_lock); + + /* + * Allocate before we enter any mutexes. + */ + sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP); + + /* + * We take pidlock now, which excludes all callers of waitid() and + * prevents a detaching tracer from clearing critical accord members. 
+ */ + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + + /* + * Get the ptrace(2) "parent" process, to which we may send + * a SIGCLD signal later. + */ + if ((parent = accord->lxpa_tracer) != NULL && + (plwp = parent->br_lwp) != NULL) { + pp = lwptoproc(plwp); + } + + /* + * Our tracer should not have been modified in our absence; the + * LX_PTF_STOPPING flag prevents it. + */ + VERIFY(lwpd->br_ptrace_tracer == accord); + + /* + * Stash data for this stop condition in the LWP data while we hold + * both pidlock and our p_lock. + */ + lwpd->br_ptrace_whystop = why; + lwpd->br_ptrace_whatstop = what; + lwpd->br_ptrace_flags |= LX_PTF_WAITPEND; + + /* + * If this event does not depend on an event from the parent LWP, + * populate the siginfo_t for the event pending on this tracee LWP. + */ + if (!(lwpd->br_ptrace_flags & LX_PTF_PARENT_WAIT) && pp != NULL) { + cldpost = B_TRUE; + lx_ptrace_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL); + } + + /* + * Drop our p_lock so that we may lock the tracer. + */ + mutex_exit(&p->p_lock); + if (cldpost && pp != NULL) { + /* + * Post the SIGCLD to the tracer. + */ + mutex_enter(&pp->p_lock); + if (!sigismember(&pp->p_sig, SIGCLD)) { + sigaddqa(pp, plwp->lwp_thread, sqp); + cldpend = B_FALSE; + sqp = NULL; + } + mutex_exit(&pp->p_lock); + } + + /* + * We re-take our process lock now. The lock will be held until + * the thread is actually marked stopped, so we will not race with + * lx_ptrace_lock_if_stopped() or lx_waitid_helper(). + */ + mutex_enter(&p->p_lock); + + /* + * We clear the STOPPING flag; stop() continues to hold our p_lock + * until our thread stop state is visible. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_STOPPING; + lwpd->br_ptrace_flags |= LX_PTF_STOPPED; + if (cldpend) { + /* + * We sent the SIGCLD for this new wait condition already. 
+ */ + lwpd->br_ptrace_flags |= LX_PTF_CLDPEND; + } + + /* + * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will + * be sleeping on this CV until LX_PTF_STOPPING is clear. Wake it + * now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * While still holding pidlock, we attempt to wake our tracer from a + * potential waitid() slumber. + */ + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + + /* + * We release pidlock and return as we were called: with our p_lock + * held. + */ + mutex_exit(&pidlock); + + if (sqp != NULL) { + kmem_free(sqp, sizeof (*sqp)); + } +} + +/* + * For any restarting action (e.g. PTRACE_CONT, PTRACE_SYSCALL or + * PTRACE_DETACH) to be allowed, the tracee LWP must be in "ptrace-stop". This + * check must ONLY be run on tracees of the current LWP. If the check is + * successful, we return with the tracee p_lock held. + */ +static int +lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote) +{ + klwp_t *rlwp = remote->br_lwp; + proc_t *rproc = lwptoproc(rlwp); + kthread_t *rt = lwptot(rlwp); + + /* + * We must never check that we, ourselves, are stopped. We must also + * have the accord tracee list locked while we lock our tracees. + */ + VERIFY(curthread != rt); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + VERIFY(accord->lxpa_tracer == ttolxlwp(curthread)); + + /* + * Lock the process containing the tracee LWP. + */ + mutex_enter(&rproc->p_lock); + if (!VISIBLE(remote)) { + /* + * The tracee LWP is currently detaching itself as it exits. + * It is no longer visible to ptrace(2). + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * We must only check whether tracees of the current LWP are stopped. + * We check this condition after confirming visibility as an exiting + * tracee may no longer be completely consistent. 
+ */ + VERIFY(remote->br_ptrace_tracer == accord); + + if (!(remote->br_ptrace_flags & LX_PTF_STOPPED)) { + /* + * The tracee is not in "ptrace-stop", so we release the + * process. + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * The tracee is stopped. We return holding its process lock so that + * the caller may manipulate it. + */ + return (0); +} + +static int +lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options) +{ + /* + * Check for valid options. + */ + if ((options & ~LX_PTRACE_O_ALL) != 0) { + return (EINVAL); + } + + /* + * Set ptrace options on the target LWP. + */ + remote->br_ptrace_options = (lx_ptrace_options_t)options; + + return (0); +} + +static int +lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp) +{ + int error; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + uint32_t tmp = remote->br_ptrace_eventmsg; + + error = copyout(&tmp, umsgp, sizeof (uint32_t)); + } else +#endif + { + error = copyout(&remote->br_ptrace_eventmsg, umsgp, + sizeof (ulong_t)); + } + + return (error); +} + +static int +lx_ptrace_getsiginfo(lx_lwp_data_t *remote, void *usiginfo) +{ + klwp_t *lwp = remote->br_lwp; + int lx_sig; + + lx_sig = lx_stol_signo(lwp->lwp_cursig, 0); + if (lx_sig < 1 || lwp->lwp_curinfo == NULL) { + return (EINVAL); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + if (stol_ksiginfo32_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } else +#endif + { + if (stol_ksiginfo_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } + + return (0); +} + + +/* + * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface. + */ +static int +lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo) +{ + klwp_t *lwp = remote->br_lwp; + + if (flags & LX_PTC_SINGLESTEP) { + /* + * We do not currently support single-stepping. 
+		 */
+		lx_unsupported("PTRACE_SINGLESTEP not currently implemented");
+		return (EINVAL);
+	}
+
+	/*
+	 * The tracer may choose to suppress the delivery of a signal, or
+	 * select an alternative signal for delivery. If this is an
+	 * appropriate ptrace(2) "signal-delivery-stop", br_ptrace_stopsig
+	 * will be used as the new signal number.
+	 *
+	 * As with so many other aspects of the Linux ptrace(2) interface, this
+	 * may fail silently if the state machine is not aligned correctly.
+	 */
+	remote->br_ptrace_stopsig = signo;
+	remote->br_ptrace_donesig = 0;
+
+	/*
+	 * Handle the syscall-stop flag if this is a PTRACE_SYSCALL restart:
+	 */
+	if (flags & LX_PTC_SYSCALL) {
+		remote->br_ptrace_flags |= LX_PTF_SYSCALL;
+	} else {
+		remote->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+	}
+
+	lx_ptrace_restart_lwp(lwp);
+
+	return (0);
+}
+
+/*
+ * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface.
+ *
+ * The tracee LWP "remote" is removed from the tracee list of "accord", its
+ * ptrace attachment state and flags are cleared, and it is set running via
+ * lx_ptrace_restart_lwp(). As in lx_ptrace_cont(), "signo" is recorded in
+ * br_ptrace_stopsig so the tracer can suppress or replace the signal.
+ *
+ * The caller must hold both accord->lxpa_tracees_lock and the p_lock of the
+ * process containing the tracee, and the tracee must not be TRACEE_BUSY().
+ * Checking that the target is a tracee of the current LWP in "ptrace-stop"
+ * (and failing with the error from lx_ptrace_lock_if_stopped() otherwise) is
+ * done by the caller, lx_ptrace_kernel(), before this routine is reached.
+ *
+ * On return "*release_hold" is B_TRUE, telling the caller to drop the hold
+ * the tracee had on the accord. The routine currently always returns 0.
+ */
+static int
+lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo,
+    boolean_t *release_hold)
+{
+	klwp_t *rlwp = remote->br_lwp;
+
+	/*
+	 * The tracee LWP was in "ptrace-stop" and we now hold its p_lock.
+	 * Detach the LWP from the accord and set it running.
+	 *
+	 * NOTE(review): br_ptrace_flags is set to 0 outright a few lines
+	 * below, which makes the targeted flag clearing here redundant.
+	 */
+	VERIFY(!TRACEE_BUSY(remote));
+	VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock));
+	remote->br_ptrace_flags &= ~(LX_PTF_SYSCALL | LX_PTF_INHERIT);
+	VERIFY(list_link_active(&remote->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, remote);
+
+	remote->br_ptrace_attach = LX_PTA_NONE;
+	remote->br_ptrace_tracer = NULL;
+	remote->br_ptrace_flags = 0;
+	*release_hold = B_TRUE;
+
+	/*
+	 * Decrement traced-lwp count for the process.
+	 */
+	ASSERT(MUTEX_HELD(&rlwp->lwp_procp->p_lock));
+	VERIFY(ptolxproc(rlwp->lwp_procp)->l_ptrace-- >= 1);
+
+	/*
+	 * The tracer may, as described in lx_ptrace_cont(), choose to suppress
+	 * or modify the delivered signal.
+	 */
+	remote->br_ptrace_stopsig = signo;
+	remote->br_ptrace_donesig = 0;
+
+	lx_ptrace_restart_lwp(rlwp);
+
+	return (0);
+}
+
+/*
+ * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2)
+ * interface.
+ *
+ * This LWP is requesting to be attached as a tracer to another LWP -- the
+ * tracee. If a ptrace accord to track the list of tracees has not yet been
+ * allocated, one will be allocated and attached to this LWP now.
+ *
+ * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the
+ * tracee LWP is then added to the "lxpa_tracees" list in the accord. We drop
+ * locks between these two phases; the only consumer of trace events from this
+ * accord is this LWP, which obviously cannot be running waitpid(2) at the same
+ * time as this call to ptrace(2).
+ *
+ * Returns 0 on success. EPERM is returned when "lx_pid" is the calling LWP,
+ * when the caller lacks permission over the target process, or when the
+ * target already has a tracer. ESRCH is returned when the target process or
+ * LWP cannot be found, is not branded, or is not visible to ptrace(2).
+ */
+static int
+lx_ptrace_attach(pid_t lx_pid)
+{
+	int error = ESRCH;
+	/*
+	 * Our (Tracer) LWP:
+	 */
+	lx_ptrace_accord_t *accord;
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+	/*
+	 * Remote (Tracee) LWP:
+	 */
+	pid_t rpid;
+	id_t rtid;
+	proc_t *rproc;
+	kthread_t *rthr;
+	klwp_t *rlwp;
+	lx_lwp_data_t *rlwpd;
+
+	if (lwpd->br_pid == lx_pid) {
+		/*
+		 * We cannot trace ourselves.
+		 */
+		return (EPERM);
+	}
+
+	/*
+	 * Ensure that we have an accord and obtain a lock on it. This
+	 * routine should not fail because the LWP cannot make ptrace(2) system
+	 * calls after it has begun exiting.
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+	VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0);
+
+	/*
+	 * Place speculative hold in case the attach is successful.
+	 */
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Locate the process containing the tracee LWP based on its Linux pid
+	 * and lock it.
+	 */
+	if (lx_lpid_to_spair(lx_pid, &rpid, &rtid) != 0 ||
+	    (rproc = sprlock(rpid)) == NULL) {
+		/*
+		 * We could not find the target process. There is nothing to
+		 * unlock, so skip straight to dropping the speculative hold.
+		 */
+		goto errout;
+	}
+
+	/*
+	 * Locate the tracee LWP.
+	 */
+	if ((rthr = idtot(rproc, rtid)) == NULL ||
+	    (rlwp = ttolwp(rthr)) == NULL ||
+	    (rlwpd = lwptolxlwp(rlwp)) == NULL ||
+	    !VISIBLE(rlwpd)) {
+		/*
+		 * The LWP could not be found, was not branded, or is not
+		 * visible to ptrace(2) at this time. "error" is still ESRCH.
+		 */
+		goto unlock_errout;
+	}
+
+	/*
+	 * We now hold the lock on the tracee. Attempt to install ourselves
+	 * as the tracer.
+	 */
+	if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc,
+	    NULL, VWRITE) != 0) {
+		/*
+		 * This process does not have permission to trace the remote
+		 * process.
+		 */
+		error = EPERM;
+	} else if (rlwpd->br_ptrace_tracer != NULL) {
+		/*
+		 * This LWP is already being traced.
+		 */
+		VERIFY(list_link_active(&rlwpd->br_ptrace_linkage));
+		VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE);
+		error = EPERM;
+	} else {
+		lx_proc_data_t *rprocd = ptolxproc(rproc);
+
+		/*
+		 * Bond the tracee to the accord.
+		 */
+		VERIFY0(rlwpd->br_ptrace_flags & LX_PTF_EXITING);
+		VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE);
+		rlwpd->br_ptrace_attach = LX_PTA_ATTACH;
+		rlwpd->br_ptrace_tracer = accord;
+
+		/*
+		 * We had no tracer, and are thus not in the tracees list.
+		 * It is safe to take the tracee list lock while we insert
+		 * ourselves.
+		 */
+		mutex_enter(&accord->lxpa_tracees_lock);
+		VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage));
+		list_insert_tail(&accord->lxpa_tracees, rlwpd);
+		/*
+		 * Bump traced-lwp count for the remote process.
+		 */
+		rprocd->l_ptrace++;
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * Send a thread-directed SIGSTOP.
+		 */
+		sigtoproc(rproc, rthr, SIGSTOP);
+
+
+		error = 0;
+	}
+
+unlock_errout:
+	/*
+	 * Unlock the process containing the tracee LWP. The speculative hold
+	 * on the accord is handled separately, below.
+	 */
+	sprunlock(rproc);
+
+errout:
+	if (error != 0) {
+		/*
+		 * The attach was not successful.
Remove our speculative + * hold. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + return (error); +} + +int +lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + mutex_enter(&p->p_lock); + + switch (option) { + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + case LX_PTRACE_O_TRACECLONE: + lwpd->br_ptrace_clone_option = option; + break; + + default: + return (EINVAL); + } + + if (inherit_flag) { + lwpd->br_ptrace_flags |= LX_PTF_INHERIT; + } else { + lwpd->br_ptrace_flags &= ~LX_PTF_INHERIT; + } + + mutex_exit(&p->p_lock); + return (0); +} + +/* + * If the parent LWP is being traced, we want to attach ourselves to the + * same accord. + */ +void +lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst) +{ + proc_t *srcp = lwptoproc(src->br_lwp); + proc_t *dstp = lwptoproc(dst->br_lwp); + lx_ptrace_accord_t *accord; + boolean_t is_fork = B_FALSE; + + VERIFY(MUTEX_HELD(&dstp->p_lock)); + if (srcp != dstp) { + /* + * In the case of being called via forklwp, some lock shuffling + * is required. The destination p_lock must be dropped to + * avoid deadlocks when locking the source and manipulating + * ptrace accord resources. + */ + is_fork = B_TRUE; + sprlock_proc(dstp); + mutex_exit(&dstp->p_lock); + mutex_enter(&srcp->p_lock); + } + + if ((accord = src->br_ptrace_tracer) == NULL) { + /* + * The source LWP does not have a tracer to inherit. + */ + goto out; + } + + /* + * There are two conditions to check when determining if the new + * child should inherit the same tracer (and tracing options) as its + * parent. Either condition is sufficient to trigger inheritance. 
+	 */
+	dst->br_ptrace_attach = LX_PTA_NONE;
+	if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) {
+		/*
+		 * Condition 1:
+		 * The clone(2), fork(2) and vfork(2) emulated system calls
+		 * populate "br_ptrace_clone_option" with the specific
+		 * ptrace(2) SETOPTIONS option that applies to this
+		 * operation. If the relevant option has been enabled by the
+		 * tracer then we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS;
+
+	} else if ((src->br_ptrace_flags & LX_PTF_INHERIT) != 0) {
+		/*
+		 * Condition 2:
+		 * If the caller opted in to inheritance with the
+		 * PTRACE_CLONE flag to clone(2), the LX_PTF_INHERIT flag
+		 * will be set and we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE;
+	}
+
+	/*
+	 * These values only apply for the duration of a single clone(2), et
+	 * al, system call.
+	 */
+	src->br_ptrace_flags &= ~LX_PTF_INHERIT;
+	src->br_ptrace_clone_option = 0;
+
+	if (dst->br_ptrace_attach == LX_PTA_NONE) {
+		/*
+		 * No condition triggered inheritance.
+		 */
+		goto out;
+	}
+
+	/*
+	 * Set the LX_PTF_CLONING flag to prevent us from being detached
+	 * while our p_lock is dropped.
+	 */
+	src->br_ptrace_flags |= LX_PTF_CLONING;
+	mutex_exit(&srcp->p_lock);
+
+	/*
+	 * Hold the accord for the new LWP.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Install the tracer and copy the current PTRACE_SETOPTIONS options.
+	 */
+	dst->br_ptrace_tracer = accord;
+	dst->br_ptrace_options = src->br_ptrace_options;
+
+	/*
+	 * This flag prevents waitid() from seeing events for the new child
+	 * until the parent is able to post the relevant ptrace event to
+	 * the tracer.
+	 */
+	dst->br_ptrace_flags |= LX_PTF_PARENT_WAIT;
+
+	/*
+	 * The source must already be on the tracee list (it has a tracer);
+	 * the new LWP must not be. Append the new LWP to the same list.
+	 */
+	mutex_enter(&accord->lxpa_tracees_lock);
+	VERIFY(list_link_active(&src->br_ptrace_linkage));
+	VERIFY(!list_link_active(&dst->br_ptrace_linkage));
+	list_insert_tail(&accord->lxpa_tracees, dst);
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Relock our process and clear our busy flag.
+	 */
+	mutex_enter(&srcp->p_lock);
+	src->br_ptrace_flags &= ~LX_PTF_CLONING;
+
+	/*
+	 * Bump traced-lwp count for the process.
+	 */
+	ptolxproc(dstp)->l_ptrace++;
+
+	/*
+	 * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will
+	 * be sleeping on this CV until LX_PTF_CLONING is clear. Wake it
+	 * now.
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+
+out:
+	/*
+	 * In the fork case, swap back from the source p_lock to the
+	 * destination p_lock, which the caller held on entry. The
+	 * sprunprlock() presumably pairs with the sprlock_proc() taken when
+	 * the destination p_lock was dropped -- confirm against its
+	 * definition.
+	 */
+	if (is_fork) {
+		mutex_exit(&srcp->p_lock);
+		mutex_enter(&dstp->p_lock);
+		sprunprlock(dstp);
+	}
+}
+
+/*
+ * Implements the PTRACE_TRACEME operation: the calling LWP asks to become a
+ * tracee of the accord obtained for its parent (looked up by the pid that
+ * lx_lwp_ppid() reports).
+ *
+ * Returns 0 once the LWP is on the accord's tracee list. Returns the error
+ * from lx_ptrace_accord_get_by_pid() if the accord cannot be obtained, ESRCH
+ * if the accord is marked LX_ACC_TOMBSTONE, or EPERM if this LWP already has
+ * a tracer.
+ */
+static int
+lx_ptrace_traceme(void)
+{
+	int error;
+	boolean_t did_attach = B_FALSE;
+	/*
+	 * Our (Tracee) LWP:
+	 */
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	/*
+	 * Remote (Tracer) LWP:
+	 */
+	lx_ptrace_accord_t *accord;
+
+	/*
+	 * We are intending to be the tracee. Fetch (or allocate) the accord
+	 * for our parent LWP.
+	 */
+	if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL,
+	    NULL), &accord)) != 0) {
+		/*
+		 * Could not determine the Linux pid of the parent LWP, or
+		 * could not get the accord for that LWP.
+		 */
+		return (error);
+	}
+
+	/*
+	 * We now hold the accord lock.
+	 */
+	if (accord->lxpa_flags & LX_ACC_TOMBSTONE) {
+		/*
+		 * The accord is marked for death; give up now.
+		 */
+		lx_ptrace_accord_exit(accord);
+		return (ESRCH);
+	}
+
+	/*
+	 * Bump the reference count so that the accord is not freed. We need
+	 * to drop the accord lock before we take our own p_lock.
+	 */
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * We now lock _our_ process and determine if we can install our parent
+	 * as our tracer.
+	 */
+	mutex_enter(&p->p_lock);
+	if (lwpd->br_ptrace_tracer != NULL) {
+		/*
+		 * This LWP is already being traced.
+		 */
+		VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE);
+		error = EPERM;
+	} else {
+		/*
+		 * Bond ourselves to the accord. We already bumped the accord
+		 * reference count.
+		 */
+		VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE);
+		lwpd->br_ptrace_attach = LX_PTA_TRACEME;
+		lwpd->br_ptrace_tracer = accord;
+		did_attach = B_TRUE;
+		error = 0;
+
+		/*
+		 * Speculatively bump l_ptrace now before dropping p_lock.
+		 * It will be reverted if the tracee attachment fails.
+		 */
+		ptolxproc(p)->l_ptrace++;
+	}
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Lock the accord tracee list and add this LWP. Once we are in the
+	 * tracee list, it is the responsibility of the tracer to detach us.
+	 */
+	if (error == 0) {
+		lx_ptrace_accord_enter(accord);
+		mutex_enter(&accord->lxpa_tracees_lock);
+
+		/*
+		 * The accord lock was dropped above, so the tombstone flag
+		 * has to be checked again before we join the list.
+		 */
+		if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) {
+			/*
+			 * Put ourselves in the tracee list for this accord.
+			 */
+			VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+			list_insert_tail(&accord->lxpa_tracees, lwpd);
+			mutex_exit(&accord->lxpa_tracees_lock);
+			lx_ptrace_accord_exit(accord);
+
+			return (0);
+		}
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * The accord has been marked for death. We must
+		 * untrace ourselves.
+		 */
+		error = ESRCH;
+		lx_ptrace_accord_exit(accord);
+
+		/*
+		 * Undo speculative increment of ptracer count.
+		 */
+		mutex_enter(&p->p_lock);
+		ptolxproc(p)->l_ptrace--;
+		mutex_exit(&p->p_lock);
+	}
+
+	/*
+	 * Our optimism was unjustified: We were unable to attach. We need to
+	 * lock the process containing this LWP again in order to remove the
+	 * tracer.
+	 */
+	VERIFY(error != 0);
+	mutex_enter(&p->p_lock);
+	if (did_attach) {
+		/*
+		 * Verify that things were as we left them:
+		 */
+		VERIFY(!list_link_active(&lwpd->br_ptrace_linkage));
+		VERIFY(lwpd->br_ptrace_tracer == accord);
+
+		lwpd->br_ptrace_attach = LX_PTA_NONE;
+		lwpd->br_ptrace_tracer = NULL;
+	}
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Remove our speculative hold on the accord, possibly causing it to be
+	 * freed in the process.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+
+	return (error);
+}
+
+/*
+ * Common path for entering "ptrace-stop": flag the LWP as LX_PTF_STOPPING and
+ * call stop(PR_BRAND, what).
+ *
+ * The caller must hold p->p_lock. The lock is released before this routine
+ * returns. The routine currently always returns B_TRUE.
+ */
+static boolean_t
+lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what)
+{
+	boolean_t reset_nostop = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * Mark this LWP as stopping and call stop() to enter "ptrace-stop".
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_STOPPING);
+	lwpd->br_ptrace_flags |= LX_PTF_STOPPING;
+
+	if (lwpd->br_lwp->lwp_nostop == 1 &&
+	    lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC) {
+		/* We need to clear this to get the signal delivered. */
+		lwpd->br_lwp->lwp_nostop = 0;
+		reset_nostop = B_TRUE;
+	}
+
+	stop(PR_BRAND, what);
+
+	/* Put lwp_nostop back the way we found it for the exec event case. */
+	if (reset_nostop) {
+		VERIFY(lwpd->br_lwp->lwp_nostop == 0);
+		lwpd->br_lwp->lwp_nostop = 1;
+	}
+
+	/*
+	 * We are back from "ptrace-stop" with our process lock held.
+	 */
+	lwpd->br_ptrace_flags &= ~(LX_PTF_STOPPING | LX_PTF_STOPPED |
+	    LX_PTF_CLDPEND);
+	lwpd->br_ptrace_stopucp = NULL;
+	cv_broadcast(&lx_ptrace_busy_cv);
+	mutex_exit(&p->p_lock);
+
+	return (B_TRUE);
+}
+
+/*
+ * Called on the current LWP at a point where a PTRACE_O_TRACE* option may
+ * require a ptrace event stop.
+ *
+ *   option	the LX_PTRACE_O_* option describing the event
+ *   child	B_TRUE when called on behalf of the newly created child
+ *   msg	value stored in br_ptrace_eventmsg (forced to 0 for the exec
+ *		and vfork-done events)
+ *   ucp	stored in br_ptrace_stopucp for the duration of the stop
+ *
+ * Returns 0 if the LWP stopped for the event, or if a signal was posted in
+ * its place (SIGTRAP for an exec without LX_PTRACE_O_TRACEEXEC enabled,
+ * SIGSTOP for a child clone/fork/vfork event). Returns ESRCH if the LWP has
+ * no tracer, the option is not enabled, or the option is not one handled
+ * here.
+ */
+int
+lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg,
+    uintptr_t ucp)
+{
+	kthread_t *t = curthread;
+	klwp_t *lwp = ttolwp(t);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	mutex_enter(&p->p_lock);
+	if (lwpd->br_ptrace_tracer == NULL) {
+		mutex_exit(&p->p_lock);
+		return (ESRCH);
+	}
+
+	if (!child) {
+		/*
+		 * Only the first event posted by a new process is to be held
+		 * until the matching parent event is dispatched, and only if
+		 * it is a "child" event. This is not a child event, so we
+		 * clear the wait flag.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+	} else if (option == LX_PTRACE_O_TRACEVFORK) {
+		/*
+		 * For a child, we have to handle vfork as a special case. In
+		 * lx_ptrace_inherit_tracer() we set LX_PTF_PARENT_WAIT to
+		 * force events to be delayed until the parent posts its event.
+		 * This flag is cleared in lx_waitid_helper() to enforce a
+		 * "happens after" relationship. However, this obviously cannot
+		 * work for the vfork case. Thus, we clear our flag now so that
+		 * we can deliver the signal in lx_stop_notify(), if necessary.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+	}
+
+	if (!(lwpd->br_ptrace_options & option)) {
+		if (option == LX_PTRACE_O_TRACEEXEC) {
+			/*
+			 * Without PTRACE_O_TRACEEXEC, the Linux kernel will
+			 * send SIGTRAP to the process.
+			 */
+			sigtoproc(p, t, SIGTRAP);
+			mutex_exit(&p->p_lock);
+			return (0);
+		}
+
+		/*
+		 * The flag for this trace event is not enabled, so we will not
+		 * stop.
+		 */
+		mutex_exit(&p->p_lock);
+		return (ESRCH);
+	}
+
+	if (child) {
+		switch (option) {
+		case LX_PTRACE_O_TRACECLONE:
+		case LX_PTRACE_O_TRACEFORK:
+		case LX_PTRACE_O_TRACEVFORK:
+			/*
+			 * Send the child LWP a directed SIGSTOP.
+			 */
+			sigtoproc(p, t, SIGSTOP);
+			mutex_exit(&p->p_lock);
+			return (0);
+		default:
+			goto nostop;
+		}
+	}
+
+	lwpd->br_ptrace_eventmsg = msg;
+
+	/* Translate the option into the event code reported to the tracer. */
+	switch (option) {
+	case LX_PTRACE_O_TRACECLONE:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE;
+		break;
+	case LX_PTRACE_O_TRACEEXEC:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC;
+		lwpd->br_ptrace_eventmsg = 0;
+		break;
+	case LX_PTRACE_O_TRACEEXIT:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT;
+		break;
+	case LX_PTRACE_O_TRACEFORK:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK;
+		break;
+	case LX_PTRACE_O_TRACEVFORK:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK;
+		break;
+	case LX_PTRACE_O_TRACEVFORKDONE:
+		lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE;
+		lwpd->br_ptrace_eventmsg = 0;
+		break;
+	default:
+		goto nostop;
+	}
+
+	/*
+	 * Userland may have passed in a ucontext_t pointer for
+	 * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped.
+	 */
+	lwpd->br_ptrace_stopucp = ucp;
+
+	/*
+	 * p_lock for the process containing the tracee will be dropped by
+	 * lx_ptrace_stop_common().
+	 */
+	return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH);
+
+nostop:
+	/* Unhandled option: clear any event state and report no stop. */
+	lwpd->br_ptrace_event = 0;
+	lwpd->br_ptrace_eventmsg = 0;
+	mutex_exit(&p->p_lock);
+	return (ESRCH);
+}
+
+/*
+ * Enter "ptrace-stop" on the current LWP for a system call entry
+ * (LX_PR_SYSENTRY), system call exit (LX_PR_SYSEXIT) or LX_PR_SIGNALLED stop
+ * point, if the LWP has a tracer.
+ *
+ * Returns B_FALSE without stopping if the LWP has no tracer, or if this is a
+ * system call boundary and LX_PTF_SYSCALL is not set. Otherwise returns the
+ * result of lx_ptrace_stop_common().
+ */
+boolean_t
+lx_ptrace_stop(ushort_t what)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT ||
+	    what == LX_PR_SIGNALLED);
+
+	/*
+	 * If we do not have an accord, bail out early.
+	 */
+	if (lwpd->br_ptrace_tracer == NULL)
+		return (B_FALSE);
+
+	/*
+	 * Lock this process and re-check the condition.
+	 */
+	mutex_enter(&p->p_lock);
+	if (lwpd->br_ptrace_tracer == NULL) {
+		VERIFY0(lwpd->br_ptrace_flags & LX_PTF_SYSCALL);
+		mutex_exit(&p->p_lock);
+		return (B_FALSE);
+	}
+
+	if (what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT) {
+		/*
+		 * This is a syscall-entry-stop or syscall-exit-stop point.
+		 */
+		if (!(lwpd->br_ptrace_flags & LX_PTF_SYSCALL)) {
+			/*
+			 * A system call stop has not been requested.
+			 */
+			mutex_exit(&p->p_lock);
+			return (B_FALSE);
+		}
+
+		/*
+		 * The PTRACE_SYSCALL restart command applies only to the next
+		 * system call entry or exit. The tracer must restart us with
+		 * PTRACE_SYSCALL while we are in ptrace-stop for us to fire
+		 * again at the next system call boundary.
+		 */
+		lwpd->br_ptrace_flags &= ~LX_PTF_SYSCALL;
+	}
+
+	/*
+	 * p_lock for the process containing the tracee will be dropped by
+	 * lx_ptrace_stop_common().
+	 */
+	return (lx_ptrace_stop_common(p, lwpd, what));
+}
+
+/*
+ * Give the tracer (if any) a "signal-delivery-stop" for the signal currently
+ * being dispatched to "lwp" (lwp_cursig), then apply whatever replacement
+ * signal the tracer left in br_ptrace_stopsig.
+ *
+ * The caller must hold p->p_lock. The lock is dropped while the LWP is
+ * stopped and is held again on return.
+ *
+ * Returns 0 in the normal case; lwp_cursig may have been changed or cleared.
+ * Returns -1 if the tracer replaced the signal with one that maps to SIGKILL,
+ * after setting SKILLED on the process, to request that signal processing be
+ * restarted.
+ */
+int
+lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	int lx_sig;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * In very rare circumstances, a process which is almost completely
+	 * through proc_exit() may incur issig checks in the current thread via
+	 * clean-up actions. The process will still be branded, but the thread
+	 * will have already been stripped of any LX-specific data on its way
+	 * to the grave. Bail early if the brand data is missing.
+	 */
+	if (lwpd == NULL) {
+		return (0);
+	}
+
+	/*
+	 * If we do not have an accord, bail out now. Additionally, if there
+	 * is no valid signal then we have no reason to stop.
+	 *
+	 * NOTE(review): the range check admits lwp_cursig == NSIG as an index
+	 * into stol_signo[]; this assumes the table has at least NSIG + 1
+	 * entries -- TODO confirm against its definition.
+	 */
+	if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL ||
+	    (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) ||
+	    (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) {
+		if (lwp->lwp_cursig == 0) {
+			/*
+			 * If this lwp has no current signal, it means that any
+			 * signal ignorance enabled by br_ptrace_donesig has
+			 * already taken place (the signal was consumed).
+			 * By clearing donesig, we declare desire to ignore no
+			 * signals for accurate ptracing.
+			 */
+			lwpd->br_ptrace_donesig = 0;
+		}
+		return (0);
+	}
+
+	/*
+	 * We stash the signal on the LWP where our waitid_helper will find it
+	 * and enter the ptrace "signal-delivery-stop" condition.
+	 * lx_ptrace_stop_common() returns with p_lock dropped, so we retake
+	 * it straight away.
+	 */
+	lwpd->br_ptrace_stopsig = lx_sig;
+	lwpd->br_ptrace_donesig = 0;
+	(void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * When we return, the signal may have been altered or suppressed.
+	 */
+	if (lwpd->br_ptrace_stopsig != lx_sig) {
+		int native_sig;
+		lx_sig = lwpd->br_ptrace_stopsig;
+
+		if (lx_sig >= LX_NSIG) {
+			lx_sig = 0;
+		}
+
+		/*
+		 * Translate signal from Linux signal number back to
+		 * an illumos native signal.
+		 *
+		 * NOTE(review): lx_sig was clamped just above, so the
+		 * ">= LX_NSIG" test below cannot be true here; only the
+		 * negative check and the table lookup are effective.
+		 */
+		if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig =
+		    ltos_signo[lx_sig]) < 1) {
+			/*
+			 * The signal is not deliverable.
+			 */
+			lwp->lwp_cursig = 0;
+			lwp->lwp_extsig = 0;
+			if (lwp->lwp_curinfo) {
+				siginfofree(lwp->lwp_curinfo);
+				lwp->lwp_curinfo = NULL;
+			}
+		} else {
+			/*
+			 * Alter the currently dispatching signal.
+			 */
+			if (native_sig == SIGKILL) {
+				/*
+				 * We mark ourselves the victim and request
+				 * a restart of signal processing.
+				 */
+				p->p_flag |= SKILLED;
+				p->p_flag &= ~SEXTKILLED;
+				return (-1);
+			}
+			lwp->lwp_cursig = native_sig;
+			lwp->lwp_extsig = 0;
+			if (lwp->lwp_curinfo != NULL) {
+				lwp->lwp_curinfo->sq_info.si_signo = native_sig;
+			}
+		}
+	}
+
+	/*
+	 * Remember which signal the tracer has now seen; this is the value
+	 * lx_ptrace_sig_ignorable() compares against.
+	 */
+	lwpd->br_ptrace_donesig = lwp->lwp_cursig;
+	lwpd->br_ptrace_stopsig = 0;
+	return (0);
+}
+
+/*
+ * Report whether the normal "ignored signal" handling may be applied to
+ * "sig" for process "p" (B_TRUE), or must be bypassed because the process is
+ * being ptraced and the tracer has not yet seen the signal (B_FALSE). "lwp"
+ * may be NULL.
+ */
+boolean_t
+lx_ptrace_sig_ignorable(proc_t *p, klwp_t *lwp, int sig)
+{
+	lx_proc_data_t *lxpd = ptolxproc(p);
+
+	/*
+	 * Ignored signals and ptrace:
+	 *
+	 * When a process is being ptraced by another, special care is needed
+	 * while handling signals. Since the tracer is interested in all
+	 * signals sent to the tracee, an effort must be made to initially
+	 * bypass signal ignorance logic. This allows the signal to be placed
+	 * in the tracee's sigqueue to be inspected and potentially altered by
+	 * the tracer.
+	 *
+	 * A critical detail in this procedure is how a signal is handled after
+	 * tracer has completed processing for the event.
If the signal would
+	 * have been ignored, were it not for the initial ptrace override, then
+	 * lx_ptrace_sig_ignorable must report B_TRUE when the tracee is
+	 * restarted and resumes signal processing. This is done by recording
+	 * the most recent tracee signal consumed by ptrace.
+	 */
+
+	if (lxpd->l_ptrace != 0 && lx_stol_signo(sig, 0) != 0) {
+		/*
+		 * This process is being ptraced. Bypass signal ignorance for
+		 * anything that maps to a valid Linux signal...
+		 */
+		if (lwp != NULL && lwptolxlwp(lwp)->br_ptrace_donesig == sig) {
+			/*
+			 * ...Unless it is a signal which has already been
+			 * processed by the tracer.
+			 */
+			return (B_TRUE);
+		}
+		return (B_FALSE);
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Tear down the accord owned by an exiting tracer LWP: mark it
+ * LX_ACC_TOMBSTONE, detach and restart every tracee on its list, clear
+ * "br_ptrace_accord" on the tracer, and drop the tracer's hold.
+ *
+ * The caller must not hold p->p_lock.
+ */
+static void
+lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd,
+    lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	lx_ptrace_accord_enter(accord);
+	/*
+	 * Mark this accord for death. This means no new tracees can be
+	 * attached to this accord.
+	 */
+	VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE);
+	accord->lxpa_flags |= LX_ACC_TOMBSTONE;
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Walk the list of tracees, detaching them and setting them runnable
+	 * if they are stopped. Each pass handles the current list head, so
+	 * the loop ends once the list is empty.
+	 */
+	for (;;) {
+		klwp_t *rlwp;
+		proc_t *rproc;
+		lx_lwp_data_t *remote;
+		kmutex_t *rmp;
+
+		mutex_enter(&accord->lxpa_tracees_lock);
+		if (list_is_empty(&accord->lxpa_tracees)) {
+			mutex_exit(&accord->lxpa_tracees_lock);
+			break;
+		}
+
+		/*
+		 * Fetch the first tracee LWP in the list and lock the process
+		 * which contains it.
+		 */
+		remote = list_head(&accord->lxpa_tracees);
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+		/*
+		 * The p_lock mutex persists beyond the life of the process
+		 * itself. We save the address, here, to prevent the need to
+		 * dereference the proc_t after awaking from sleep.
+		 */
+		rmp = &rproc->p_lock;
+		mutex_enter(rmp);
+
+		if (TRACEE_BUSY(remote)) {
+			/*
+			 * This LWP is currently detaching itself on exit, or
+			 * mid-way through stop(). We must wait for this
+			 * action to be completed. While we wait on the CV, we
+			 * must drop the accord tracee list lock.
+			 */
+			mutex_exit(&accord->lxpa_tracees_lock);
+			cv_wait(&lx_ptrace_busy_cv, rmp);
+
+			/*
+			 * While we were waiting, some state may have changed.
+			 * Restart the walk to be sure we don't miss anything.
+			 */
+			mutex_exit(rmp);
+			continue;
+		}
+
+		/*
+		 * We now hold p_lock on the process. Remove the tracee from
+		 * the list.
+		 */
+		VERIFY(list_link_active(&remote->br_ptrace_linkage));
+		list_remove(&accord->lxpa_tracees, remote);
+
+		/*
+		 * Unlink the accord and clear our trace flags.
+		 */
+		remote->br_ptrace_attach = LX_PTA_NONE;
+		remote->br_ptrace_tracer = NULL;
+		remote->br_ptrace_flags = 0;
+
+		/*
+		 * Let go of the list lock before we restart the LWP. We must
+		 * not hold any locks other than the process p_lock when
+		 * we call lx_ptrace_restart_lwp() as it will thread_lock
+		 * the tracee.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+
+		/*
+		 * Decrement traced-lwp count for the remote process.
+		 */
+		VERIFY(ptolxproc(rproc)->l_ptrace-- >= 1);
+
+		/*
+		 * Ensure that the LWP is not stopped on our account.
+		 */
+		lx_ptrace_restart_lwp(rlwp);
+
+		/*
+		 * Unlock the former tracee.
+		 */
+		mutex_exit(rmp);
+
+		/*
+		 * Drop the hold this tracee had on the accord.
+		 */
+		lx_ptrace_accord_enter(accord);
+		lx_ptrace_accord_rele(accord);
+		lx_ptrace_accord_exit(accord);
+	}
+
+	mutex_enter(&p->p_lock);
+	lwpd->br_ptrace_accord = NULL;
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Clean up and release our hold on the accord. If we completely
+	 * detached all tracee LWPs, this will free the accord. Otherwise, it
+	 * will be freed when they complete their cleanup.
+	 *
+	 * We hold "pidlock" while clearing these members for easy exclusion of
+	 * waitid(), etc.
+	 */
+	mutex_enter(&pidlock);
+	lx_ptrace_accord_enter(accord);
+	accord->lxpa_cvp = NULL;
+	accord->lxpa_tracer = NULL;
+	mutex_exit(&pidlock);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+/*
+ * Detach an exiting tracee LWP from its tracer's accord: clear the LWP's
+ * tracer linkage, remove it from the accord's tracee list, wake anyone
+ * waiting on lx_ptrace_busy_cv or on the accord's "lxpa_cvp", and drop the
+ * hold the tracee had on the accord.
+ *
+ * The caller must not hold p->p_lock.
+ */
+static void
+lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd,
+    lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * We are the tracee LWP. Lock the accord tracee list and then our
+	 * containing process.
+	 */
+	mutex_enter(&accord->lxpa_tracees_lock);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * Remove our reference to the accord. We will release our hold
+	 * later.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == accord);
+	lwpd->br_ptrace_attach = LX_PTA_NONE;
+	lwpd->br_ptrace_tracer = NULL;
+
+	/*
+	 * Remove this LWP from the accord tracee list:
+	 */
+	VERIFY(list_link_active(&lwpd->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, lwpd);
+
+	/*
+	 * Wake up any tracers waiting for us to detach from the accord.
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+
+	/*
+	 * Decrement traced-lwp count for the process.
+	 */
+	VERIFY(ptolxproc(p)->l_ptrace-- >= 1);
+
+	mutex_exit(&p->p_lock);
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Grab "pidlock" and wake the tracer if it is blocked in waitid().
+	 */
+	mutex_enter(&pidlock);
+	if (accord->lxpa_cvp != NULL) {
+		cv_broadcast(accord->lxpa_cvp);
+	}
+	mutex_exit(&pidlock);
+
+	/*
+	 * Release our hold on the accord.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+/*
+ * This routine is called from lx_exitlwp() when an LWP is ready to exit. If
+ * this LWP is being traced, it will be detached from the tracer's accord. The
+ * routine will also detach any LWPs being traced by this LWP.
+ *
+ * The caller must hold p->p_lock. The lock is dropped around each of the two
+ * detach steps and is held again on return.
+ */
+void
+lx_ptrace_exit(proc_t *p, klwp_t *lwp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * Mark our LWP as exiting from a ptrace perspective.
This will
+	 * prevent a new accord from being allocated if one does not exist
+	 * already, and will make us invisible to PTRACE_ATTACH/PTRACE_TRACEME.
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING);
+	lwpd->br_ptrace_flags |= LX_PTF_EXITING;
+
+	if ((accord = lwpd->br_ptrace_tracer) != NULL) {
+		/*
+		 * We are traced by another LWP and must detach ourselves.
+		 * lx_ptrace_exit_tracee() requires that p_lock not be held.
+		 */
+		mutex_exit(&p->p_lock);
+		lx_ptrace_exit_tracee(p, lwpd, accord);
+		mutex_enter(&p->p_lock);
+	}
+
+	if ((accord = lwpd->br_ptrace_accord) != NULL) {
+		/*
+		 * We have been tracing other LWPs, and must detach from
+		 * them and clean up our accord. lx_ptrace_exit_tracer()
+		 * requires that p_lock not be held.
+		 */
+		mutex_exit(&p->p_lock);
+		lx_ptrace_exit_tracer(p, lwpd, accord);
+		mutex_enter(&p->p_lock);
+	}
+}
+
+/*
+ * Called when a SIGCLD signal is dispatched so that we may enqueue another.
+ * Return 0 if we enqueued a signal, or -1 if not.
+ *
+ * The caller must hold "pidlock" and must not hold pp->p_lock; "pp" must be
+ * the process of the current LWP. On success the siginfo in "sqp" is filled
+ * in from the selected tracee and "sqp" is queued with sigaddqa().
+ */
+int
+lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+	lx_lwp_data_t *remote;
+	klwp_t *rlwp;
+	proc_t *rproc;
+	boolean_t found = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&pidlock));
+	VERIFY(MUTEX_NOT_HELD(&pp->p_lock));
+	VERIFY(lwptoproc(lwp) == pp);
+
+	mutex_enter(&pp->p_lock);
+	if ((accord = lwpd->br_ptrace_accord) == NULL) {
+		/*
+		 * This LWP is not a tracer LWP, so there will be no
+		 * SIGCLD.
+		 */
+		mutex_exit(&pp->p_lock);
+		return (-1);
+	}
+	mutex_exit(&pp->p_lock);
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+
+		/*
+		 * Check if this LWP is in "ptrace-stop". If in the correct
+		 * stop condition, lock the process containing the tracee LWP.
+		 */
+		if (lx_ptrace_lock_if_stopped(accord, remote) != 0) {
+			continue;
+		}
+
+		if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+			/*
+			 * This event depends on waitid() clearing out the
+			 * event of another LWP. Skip it for now.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_CLDPEND)) {
+			/*
+			 * No SIGCLD is required for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+		    remote->br_ptrace_whystop == 0 ||
+		    remote->br_ptrace_whatstop == 0) {
+			/*
+			 * No (new) stop reason to post for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		/*
+		 * We found a process of interest. Leave the process
+		 * containing the tracee LWP locked and break out of the loop.
+		 */
+		found = B_TRUE;
+		break;
+	}
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	if (!found) {
+		return (-1);
+	}
+
+	/*
+	 * Generate siginfo for this tracee LWP.
+	 */
+	lx_ptrace_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL);
+	remote->br_ptrace_flags &= ~LX_PTF_CLDPEND;
+	mutex_exit(&rproc->p_lock);
+
+	mutex_enter(&pp->p_lock);
+	if (sigismember(&pp->p_sig, SIGCLD)) {
+		/*
+		 * A SIGCLD is already pending for this process, so nothing
+		 * is queued. Restore LX_PTF_CLDPEND on the tracee, which was
+		 * cleared above.
+		 */
+		mutex_exit(&pp->p_lock);
+
+		mutex_enter(&rproc->p_lock);
+		remote->br_ptrace_flags |= LX_PTF_CLDPEND;
+		mutex_exit(&rproc->p_lock);
+
+		return (-1);
+	}
+	sigaddqa(pp, curthread, sqp);
+	mutex_exit(&pp->p_lock);
+
+	return (0);
+}
+
+/*
+ * Consume the next available ptrace(2) event queued against the accord for
+ * this LWP. The event will be emitted as if through waitid(), and converted
+ * by lx_waitpid() and friends before the return to usermode.
+ *
+ * The caller must hold "pidlock" and must not hold the p_lock of the current
+ * process. Only the P_ALL, P_PID and P_PGID id types are handled.
+ *
+ * Returns 0, with "ip" populated and "*rval" set to 0, when an event was
+ * found. Returns -1 when there is nothing to report; in that case
+ * "*brand_wants_wait" is B_TRUE only if at least one tracee matched the
+ * requested id, and B_FALSE otherwise.
+ */
+int
+lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
+    boolean_t *brand_wants_wait, int *rval)
+{
+	lx_ptrace_accord_t *accord;
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *local = lwptolxlwp(lwp);
+	lx_lwp_data_t *remote;
+	boolean_t found = B_FALSE;
+	klwp_t *rlwp = NULL;
+	proc_t *rproc = NULL;
+	pid_t event_pid = 0, event_ppid = 0;
+	boolean_t waitflag = !(options & WNOWAIT);
+	boolean_t target_found = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&pidlock));
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * By default, we do not expect waitid() to block on our account.
+	 */
+	*brand_wants_wait = B_FALSE;
+
+	if (!local->br_waitid_emulate) {
+		/*
+		 * This waitid() call is not expecting emulated results.
+		 */
+		return (-1);
+	}
+
+	switch (idtype) {
+	case P_ALL:
+	case P_PID:
+	case P_PGID:
+		break;
+	default:
+		/*
+		 * This idtype has no power here.
+		 */
+		return (-1);
+	}
+
+	if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) {
+		/*
+		 * This LWP does not have an accord; it cannot be tracing.
+		 */
+		return (-1);
+	}
+
+	/*
+	 * We do not need an additional hold on the accord as it belongs to
+	 * the running, tracer, LWP.
+	 */
+	lx_ptrace_accord_exit(accord);
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	if (list_is_empty(&accord->lxpa_tracees)) {
+		/*
+		 * Though it has an accord, there are currently no tracees in
+		 * the list for this LWP.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+		return (-1);
+	}
+
+	/*
+	 * Walk the list of tracees and determine if any of them have events to
+	 * report.
+	 */
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+
+		/*
+		 * We check to see if this LWP matches an id we are waiting for.
+		 */
+		switch (idtype) {
+		case P_ALL:
+			break;
+		case P_PID:
+			if (remote->br_pid != id)
+				continue;
+			break;
+		case P_PGID:
+			if (rproc->p_pgrp != id)
+				continue;
+			break;
+		default:
+			/* Other id types were rejected before the walk. */
+			cmn_err(CE_PANIC, "unexpected idtype: %d", idtype);
+		}
+
+		/* This tracee matches provided idtype and id */
+		target_found = B_TRUE;
+
+		/*
+		 * Check if this LWP is in "ptrace-stop". If in the correct
+		 * stop condition, lock the process containing the tracee LWP.
+		 */
+		if (lx_ptrace_lock_if_stopped(accord, remote) != 0) {
+			continue;
+		}
+
+		if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+			/*
+			 * This event depends on waitid() clearing out the
+			 * event of another LWP. Skip it for now.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+		    remote->br_ptrace_whystop == 0 ||
+		    remote->br_ptrace_whatstop == 0) {
+			/*
+			 * No (new) stop reason to post for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		/*
+		 * We found a process of interest. Leave the process
+		 * containing the tracee LWP locked and break out of the loop.
+		 */
+		found = B_TRUE;
+		break;
+	}
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	if (!found) {
+		/*
+		 * There were no events of interest, but we have tracees.
+		 * If any of the tracees matched the specified criteria, signal
+		 * to waitid() that it should block if the provided flags allow
+		 * for it.
+		 */
+		if (target_found) {
+			*brand_wants_wait = B_TRUE;
+		}
+
+		return (-1);
+	}
+
+	/*
+	 * Populate the signal information.
+	 */
+	lx_ptrace_winfo(remote, ip, waitflag, &event_ppid, &event_pid);
+
+	/*
+	 * Unlock the tracee.
+	 */
+	mutex_exit(&rproc->p_lock);
+
+	if (event_pid != 0 && event_ppid != 0) {
+		/*
+		 * We need to do another pass around the tracee list and
+		 * unblock any events that have a "happens after" relationship
+		 * with this event.
+		 */
+		mutex_enter(&accord->lxpa_tracees_lock);
+		for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+		    remote = list_next(&accord->lxpa_tracees, remote)) {
+			rlwp = remote->br_lwp;
+			rproc = lwptoproc(rlwp);
+
+			mutex_enter(&rproc->p_lock);
+
+			if (remote->br_pid != event_pid ||
+			    remote->br_ppid != event_ppid) {
+				mutex_exit(&rproc->p_lock);
+				continue;
+			}
+
+			remote->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT;
+
+			mutex_exit(&rproc->p_lock);
+		}
+		mutex_exit(&accord->lxpa_tracees_lock);
+	}
+
+	/*
+	 * If we are consuming this wait state, we remove the SIGCLD from
+	 * the queue and post another. "pidlock" is dropped around those two
+	 * calls and retaken before returning.
+	 */
+	if (waitflag) {
+		mutex_exit(&pidlock);
+		sigcld_delete(ip);
+		sigcld_repost();
+		mutex_enter(&pidlock);
+	}
+
+	*rval = 0;
+	return (0);
+}
+
+/*
+ * Read one word from the tracee's address space at "addr" using uread() and
+ * copy it out to the tracer at "data". The word is sizeof (long) bytes, or
+ * sizeof (uint32_t) bytes on _LP64 kernels when get_udatamodel() is not
+ * DATAMODEL_NATIVE.
+ *
+ * The tracee's p_lock is dropped around the uread() and retaken afterwards,
+ * so the caller is expected to hold it on entry.
+ *
+ * Returns 0 on success, EINVAL if "addr" is not aligned to the word size, EIO
+ * if the read fails, or EFAULT if the copyout fails.
+ */
+static int
+lx_ptrace_peek(lx_lwp_data_t *lwpd, uintptr_t addr, void *data)
+{
+	proc_t *p = lwptoproc(lwpd->br_lwp);
+	long buf;
+	int error = 0, size = sizeof (buf);
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		size = sizeof (uint32_t);
+	}
+#endif
+	if ((addr & (size - 1)) != 0) {
+		/* unaligned access */
+		return (EINVAL);
+	}
+
+	mutex_exit(&p->p_lock);
+	error = uread(p, &buf, size, addr);
+	mutex_enter(&p->p_lock);
+
+	if (error != 0) {
+		return (EIO);
+	}
+	if (copyout(&buf, data, size) != 0) {
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * Write the word "data" into the tracee's address space at "addr" using
+ * uwrite(). The word size is chosen the same way as in lx_ptrace_peek().
+ *
+ * The tracee's p_lock is dropped around the uwrite() and retaken afterwards,
+ * so the caller is expected to hold it on entry.
+ *
+ * Returns 0 on success, EINVAL if "addr" is not aligned to the word size, or
+ * EIO if the write fails.
+ */
+static int
+lx_ptrace_poke(lx_lwp_data_t *lwpd, uintptr_t addr, uintptr_t data)
+{
+	proc_t *p = lwptoproc(lwpd->br_lwp);
+	int error = 0, size = sizeof (data);
+
+#if defined(_LP64)
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		size = sizeof (uint32_t);
+	}
+#endif
+	if ((addr & (size - 1)) != 0) {
+		/* unaligned access */
+		return (EINVAL);
+	}
+
+	mutex_exit(&p->p_lock);
+	error = uwrite(p, &data, size, addr);
+	mutex_enter(&p->p_lock);
+
+	if (error != 0) {
+		return (EIO);
+	}
+	return (0);
+}
+
+/*
+ * Post SIGKILL, via sigtoproc() with no specific target thread, to the
+ * process containing the tracee. Always returns 0.
+ */
+static int
+lx_ptrace_kill(lx_lwp_data_t *lwpd)
+{
+	sigtoproc(lwptoproc(lwpd->br_lwp), NULL, SIGKILL);
+
+	return (0);
+}
+
+static int
+lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + lx_lwp_data_t *local = ttolxlwp(curthread); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + int error; + boolean_t found = B_FALSE; + + /* + * PTRACE_TRACEME and PTRACE_ATTACH operations induce the tracing of + * one LWP by another. The target LWP must not be traced already. + */ + switch (ptrace_op) { + case LX_PTRACE_TRACEME: + return (lx_ptrace_traceme()); + + case LX_PTRACE_ATTACH: + return (lx_ptrace_attach(lxpid)); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This routine + * should not fail because the LWP cannot make ptrace(2) system calls + * after it has begun exiting. + */ + VERIFY0(local->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * The accord belongs to this (the tracer) LWP, and we have a hold on + * it. We drop the lock so that we can take other locks. + */ + lx_ptrace_accord_exit(accord); + + /* + * Does the tracee list contain the pid in question? + */ + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + if (remote->br_pid == lxpid) { + found = B_TRUE; + break; + } + } + if (!found) { + /* + * The requested pid does not appear in the tracee list. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (ESRCH); + } + + /* + * Attempt to lock the target LWP. + */ + if ((error = lx_ptrace_lock_if_stopped(accord, remote)) != 0) { + /* + * The LWP was not in "ptrace-stop". + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (error); + } + + /* + * The target LWP is in "ptrace-stop". We have the containing process + * locked. 
+ */ + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + + if (ptrace_op == LX_PTRACE_DETACH) { + boolean_t release_hold = B_FALSE; + error = lx_ptrace_detach(accord, remote, (int)data, + &release_hold); + /* + * Drop the lock on both the tracee process and the tracee list. + */ + mutex_exit(&rproc->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + if (release_hold) { + /* + * Release a hold from the accord. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + return (error); + } + + /* + * The tracees lock is not needed for any of the other operations. + * Drop it so further actions can avoid deadlock. + */ + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Process the ptrace(2) request: + */ + switch (ptrace_op) { + case LX_PTRACE_CONT: + error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data); + break; + + case LX_PTRACE_SYSCALL: + error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data); + break; + + case LX_PTRACE_SINGLESTEP: + error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data); + break; + + case LX_PTRACE_SETOPTIONS: + error = lx_ptrace_setoptions(remote, data); + break; + + case LX_PTRACE_GETEVENTMSG: + error = lx_ptrace_geteventmsg(remote, (void *)data); + break; + + case LX_PTRACE_GETREGS: + error = lx_user_regs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETREGS: + error = lx_user_regs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETSIGINFO: + error = lx_ptrace_getsiginfo(remote, (void *)data); + break; + + case LX_PTRACE_PEEKTEXT: + case LX_PTRACE_PEEKDATA: + error = lx_ptrace_peek(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKETEXT: + case LX_PTRACE_POKEDATA: + error = lx_ptrace_poke(remote, addr, data); + break; + + case LX_PTRACE_PEEKUSER: + error = lx_ptrace_peekuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKEUSER: + error = lx_ptrace_pokeuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_GETFPREGS: + 
error = lx_user_fpregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPREGS: + error = lx_user_fpregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETFPXREGS: + error = lx_user_fpxregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPXREGS: + error = lx_user_fpxregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_KILL: + error = lx_ptrace_kill(remote); + break; + + default: + error = EINVAL; + } + + /* + * Drop the tracee process lock; the tracees lock was dropped earlier. + */ + mutex_exit(&rproc->p_lock); + + return (error); +} + +int +lx_ptrace(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + int error; + + error = lx_ptrace_kernel(ptrace_op, lxpid, addr, data); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +void +lx_ptrace_init(void) +{ + cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL); + + lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord", + sizeof (lx_ptrace_accord_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lx_ptrace_fini(void) +{ + cv_destroy(&lx_ptrace_busy_cv); + + kmem_cache_destroy(lx_ptrace_accord_cache); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_signal.c b/usr/src/uts/common/brand/lx/os/lx_signal.c new file mode 100644 index 0000000000..53e0cecc14 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_signal.c @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/signal.h> +#include <sys/sunddi.h> +#include <lx_signum.h> + +void +lx_ltos_sigset(lx_sigset_t *lsigp, k_sigset_t *ssigp) +{ + int lx_sig, sig; + + sigemptyset(ssigp); + for (lx_sig = 1; lx_sig <= LX_NSIG; lx_sig++) { + if (lx_sigismember(lsigp, lx_sig) && + ((sig = ltos_signo[lx_sig]) > 0)) + sigaddset(ssigp, sig); + } + + /* Emulate sigutok() restrictions */ + ssigp->__sigbits[0] &= (FILLSET0 & ~CANTMASK0); + ssigp->__sigbits[1] &= (FILLSET1 & ~CANTMASK1); + ssigp->__sigbits[2] &= (FILLSET2 & ~CANTMASK2); +} + +void +lx_stol_sigset(k_sigset_t *ssigp, lx_sigset_t *lsigp) +{ + int sig, lx_sig; + + bzero(lsigp, sizeof (lx_sigset_t)); + for (sig = 1; sig < NSIG; sig++) { + if (sigismember(ssigp, sig) && + ((lx_sig = stol_signo[sig]) > 0)) + lx_sigaddset(lsigp, lx_sig); + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c new file mode 100644 index 0000000000..f48b043aa3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -0,0 +1,1316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/privregs.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/sdt.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + + +/* + * Flags for sysent entries: + */ +#define LX_SYS_NOSYS_REASON 0x07 +#define LX_SYS_EBPARG6 0x08 + +/* + * Flags that denote the specific reason we do not have a particular system + * call. These reasons are only valid if the function is NULL. + */ +#define NOSYS_USERMODE 0 +#define NOSYS_NULL 1 +#define NOSYS_NONE 2 +#define NOSYS_NO_EQUIV 3 +#define NOSYS_KERNEL 4 +#define NOSYS_UNDOC 5 +#define NOSYS_OBSOLETE 6 +#define NOSYS_MAX NOSYS_OBSOLETE + +#if NOSYS_MAX > LX_SYS_NOSYS_REASON +#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON +#endif + +/* + * Strings describing the reason we do not emulate a particular system call + * in the kernel. 
+ */ +static char *nosys_reasons[] = { + NULL, /* NOSYS_USERMODE means this call is emulated in usermode */ + "Not done yet", + "No such Linux system call", + "No equivalent illumos functionality", + "Reads/modifies Linux kernel state", + "Undocumented and/or rarely used system call", + "Unsupported, obsolete system call" +}; + + +#if defined(_LP64) +/* + * System call handler table and entry count for Linux x86_64 (amd64): + */ +lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +int lx_nsysent64; +#endif +/* + * System call handler table and entry count for Linux x86 (i386): + */ +lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +int lx_nsysent32; + +#if defined(_LP64) +struct lx_vsyscall +{ + uintptr_t lv_addr; + uintptr_t lv_scnum; +} lx_vsyscalls[] = { + { LX_VSYS_gettimeofday, LX_SYS_gettimeofday }, + { LX_VSYS_time, LX_SYS_time }, + { LX_VSYS_getcpu, LX_SYS_getcpu }, + { 0, 0 } /* terminator: a zero lv_addr ends the table */ +}; +#endif + +#if defined(__amd64) +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Note: Syscall argument passing is different from function + * call argument passing on amd64. For function calls, the + * fourth arg is passed via %rcx, but for system calls the 4th + * arg is passed via %r10. This is because in amd64, the + * syscall instruction puts the lower 32 bits of %rflags in + * %r11 and puts the %rip value to %rcx. + * + * Appendix A of the amd64 ABI (Linux conventions) states that + * syscalls are limited to 6 args and no arg is passed on the + * stack. + */ + args[0] = rp->r_rdi; + args[1] = rp->r_rsi; + args[2] = rp->r_rdx; + args[3] = rp->r_r10; + args[4] = rp->r_r8; + args[5] = rp->r_r9; + } else { + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. 
+ */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + uint32_t args32[6]; + + if (copyin((void *)rp->r_rbx, &args32, + sizeof (args32)) != 0) { + /* + * Clear the argument vector so that the + * trace probe does not expose kernel + * memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + + args[0] = args32[0]; + args[1] = args32[1]; + args[2] = args32[2]; + args[3] = args32[3]; + args[4] = args32[4]; + args[5] = args32[5]; + } else { + args[0] = rp->r_rbx; + args[1] = rp->r_rcx; + args[2] = rp->r_rdx; + args[3] = rp->r_rsi; + args[4] = rp->r_rdi; + args[5] = rp->r_rbp; + } + } + + return (0); +} + +#else /* !__amd64 */ + +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) != + 0) { + /* + * Clear the argument vector so that the trace probe + * does not expose kernel memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + } else { + args[0] = rp->r_ebx; + args[1] = rp->r_ecx; + args[2] = rp->r_edx; + args[3] = rp->r_esi; + args[4] = rp->r_edi; + args[5] = rp->r_ebp; + } + + return (0); +} +#endif + +int +lx_syscall_return(klwp_t *lwp, int syscall_num, long ret) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int error = lwp->lwp_errno; + + if (error != EINTR) { + /* + * If this system call was not interrupted, clear the system + * call restart flag before lx_setcontext() can pass it to + * usermode. 
+ */ + lwpd->br_syscall_restart = B_FALSE; + } + + if (error != 0) { + /* + * Convert from illumos to Linux errno: + */ + ret = -lx_errno(error, EINVAL); + } + + /* + * 32-bit Linux system calls return via %eax; 64-bit calls return via + * %rax. + */ + rp->r_r0 = ret; + + /* + * Hold for the ptrace(2) "syscall-exit-stop" condition if required by + * PTRACE_SYSCALL. Note that the register state may be modified by + * the tracer. + */ + lx_ptrace_stop(LX_PR_SYSEXIT); + + /* + * Fire the DTrace "lx-syscall:::return" probe: + */ + lx_trace_sysreturn(syscall_num, ret); + + /* + * Clear errno for next time. We do not clear "br_syscall_restart" or + * "br_syscall_num" as they are potentially used by "lx_savecontext()" + * in the signal delivery path. + */ + lwp->lwp_errno = 0; + + lx_check_strict_failure(lwpd); + + /* + * We want complete control of the registers on return from this + * emulated Linux system call: + */ + lwp->lwp_eosys = JUSTRETURN; + curthread->t_post_sys = 1; + aston(curthread); + + return (0); +} + +static void +lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason) +{ + char buf[100]; + + if (s == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds", + syscall_num); + } else { + VERIFY(unsup_reason < (sizeof (nosys_reasons) / + sizeof (*nosys_reasons))); + + if (s->sy_name == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s", + syscall_num, nosys_reasons[unsup_reason]); + } else { + (void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s", + s->sy_name, nosys_reasons[unsup_reason]); + } + } + + lx_unsupported(buf); +} + +/* + * This function is used to override the processing of arguments and + * invocation of a handler for emulated system calls, installed on each + * branded LWP as "lwp_brand_syscall". If this system call should use the + * native path, we return 1. If we handled this system call (and have made + * arrangements with respect to post-return usermode register state) we + * return 0. 
+ */ +int +lx_syscall_enter(void) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int syscall_num; + int error; + long ret = 0; + lx_sysent_t *s; + uintptr_t args[6]; + unsigned int unsup_reason; + + /* + * If we got here, we should have an LWP-specific brand data + * structure. + */ + VERIFY(lwpd != NULL); + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * The lwp is not in BRAND execution mode, so we return + * to the regular native system call path. + */ + DTRACE_PROBE(brand__lx__syscall__hook__skip); + return (1); + } + + /* + * Clear the restartable system call flag. This flag will be set + * in the system call handler if the call is a candidate for + * a restart. It will be saved by lx_setcontext() in the event + * that we take a signal, and used in the signal handling path + * to restart the system call iff SA_RESTART was set for this + * signal. Save the system call number so that we can store it + * in the saved context if required. + */ + lwpd->br_syscall_restart = B_FALSE; + lwpd->br_syscall_num = (int)rp->r_r0; + + /* + * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by + * PTRACE_SYSCALL. The system call number and arguments may be + * modified by the tracer. + */ + lx_ptrace_stop(LX_PR_SYSENTRY); + + /* + * Check that the system call number is within the bounds we expect. 
+ */ + syscall_num = lwpd->br_syscall_num; + if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) { + lx_syscall_unsup_msg(NULL, syscall_num, 0); + + set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } + +#if defined(_LP64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + s = &lx_sysent64[syscall_num]; + } else +#endif + { + s = &lx_sysent32[syscall_num]; + } + + /* + * Process the arguments for this system call and fire the DTrace + * "lx-syscall:::entry" probe: + */ + error = lx_emulate_args(lwp, s, args); + lx_trace_sysenter(syscall_num, args); + if (error != 0) { + /* + * Could not read and process the arguments. Return the error + * to the process. + */ + set_errno(error); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } + + if (s->sy_callc != NULL) { + /* + * Call the in-kernel handler for this Linux system call: + */ + lwpd->br_eosys = NORMALRETURN; + ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4], + args[5]); + if (lwpd->br_eosys == NORMALRETURN) { + lx_syscall_return(lwp, syscall_num, ret); + } + return (0); + } + + /* + * There is no in-kernel handler. + */ + switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) { + case NOSYS_USERMODE: + /* + * Pass to the usermode emulation routine. + */ +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(lwp, syscall_num, args); + } else +#endif + { + lx_emulate_user(lwp, syscall_num, args); + } + return (0); + + default: + /* + * We are not emulating this system call at all. + */ + lx_syscall_unsup_msg(s, syscall_num, unsup_reason); + + set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } +} + +#if defined(_LP64) +/* + * Emulate vsyscall support. + * + * Linux magically maps a single page into the address space of each process, + * allowing them to make 'vsyscalls'. 
Originally designed to counteract the + * perceived overhead of regular system calls, vsyscalls were implemented as + * code residing in userspace which could be called directly. The userspace + * implementations of these vsyscalls have since been replaced by + * instructions which vector into the normal syscall path. + * + * Implementing vsyscalls on illumos is complicated by the fact that the + * required static address region resides inside the kernel address space. + * Rather than mapping a user-accessible page into the KAS, a different + * approach is taken. The vsyscall gate is emulated by interposing on + * pagefaults in trap(). An attempt to execute a known vsyscall address will + * result in emulating the appropriate system call rather than inducing a + * SIGSEGV. + */ +void +lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum) +{ + struct regs *rp = lwptoregs(lwp); + uintptr_t raddr; + + /* + * Fetch the return address from the process stack. + */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) { +#if DEBUG + printf("lx_vsyscall_call: bad brand stack at vsyscall " + "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm, + p->p_pid, (void *)rp->r_rsp); +#endif + + /* + * The process jumped to the vsyscall address without a + * correctly configured stack. Terminate the process. + */ + exit(CLD_KILLED, SIGSEGV); + return; + } + + DTRACE_PROBE1(brand__lx__vsyscall, int, scnum); + + /* Simulate vectoring into the syscall */ + rp->r_rax = scnum; + rp->r_rip = raddr; + rp->r_rsp += sizeof (uintptr_t); + + lx_syscall_enter(); +} + +boolean_t +lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int i; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * We only handle vsyscalls when running Linux code. 
+ */ + return (B_FALSE); + } + + if (addr < LX_VSYSCALL_ADDR || + addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) { + /* + * Ignore faults outside the vsyscall page. + */ + return (B_FALSE); + } + + for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) { + if (addr == lx_vsyscalls[i].lv_addr) { + /* + * This is a valid vsyscall address. + */ + *scnum = lx_vsyscalls[i].lv_scnum; + return (B_TRUE); + } + } + + lx_unsupported("bad vsyscall access"); + return (B_FALSE); +} +#endif + +/* + * This function is used to provide a fasttrap-like interface for emulated + * syscalls. By skipping housekeeping such as mstate transitions, it should + * cut down on overhead for syscalls which would normally be fasttraps in a + * native process. + */ +int +lx_syscall_fast_enter(void) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int syscall_num, error; + lx_sysent_t *s; + uintptr_t args[6]; + long ret = 0; + + /* + * If we got here, we should have an LWP-specific brand data structure. + */ + VERIFY(lwpd != NULL); + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * The lwp is not in in BRAND execution mode, so we return to + * the regular native system call path. + */ + DTRACE_PROBE(brand__lx__syscall__hook__skip); + return (1); + } + if (lwpd->br_ptrace_tracer != NULL) { + /* + * Given that ptrace is the antithesis of "fast", return to the + * regular system call path if we are being traced. 
+ */ + return (1); + } + + syscall_num = (int)rp->r_r0; +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + switch (syscall_num) { + case LX_SYS32_gettimeofday: + case LX_SYS32_time: + case LX_SYS32_clock_gettime: + case LX_SYS32_getcpu: + s = &lx_sysent32[syscall_num]; + break; + default: + return (1); + } + } else +#endif + { + switch (syscall_num) { + case LX_SYS_gettimeofday: + case LX_SYS_time: + case LX_SYS_clock_gettime: + case LX_SYS_getcpu: +#if defined(_LP64) + s = &lx_sysent64[syscall_num]; +#else + s = &lx_sysent32[syscall_num]; +#endif + break; + default: + return (1); + } + } + + /* + * The above syscall restrictions should ensure that we do not arrive + * at this point without a suitable syscall planned. Since the + * lx_emulate_args routine can only fail for 6-arg syscalls, none of + * which would be performed as a fasttrap, it is assumed to succeed. + */ + VERIFY(s->sy_callc != NULL); + VERIFY(s->sy_narg < 6); + (void) lx_emulate_args(lwp, s, args); + lx_trace_sysenter(syscall_num, args); + ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4], + args[5]); + + if ((error = lwp->lwp_errno) != 0) { + ret = -lx_errno(error, EINVAL); + lwp->lwp_errno = 0; + } + rp->r_r0 = ret; + lx_trace_sysreturn(syscall_num, ret); + lwp->lwp_eosys = JUSTRETURN; + return (0); +} + +/* + * Linux defines system call numbers for 32-bit x86 in the file: + * arch/x86/syscalls/syscall_32.tbl + */ +lx_sysent_t lx_sysent32[] = { + {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ + {"exit", NULL, 0, 1}, /* 1 */ + {"fork", NULL, 0, 0}, /* 2 */ + {"read", lx_read, 0, 3}, /* 3 */ + {"write", lx_write, 0, 3}, /* 4 */ + {"open", lx_open, 0, 3}, /* 5 */ + {"close", lx_close, 0, 1}, /* 6 */ + {"waitpid", lx_waitpid, 0, 3}, /* 7 */ + {"creat", NULL, 0, 2}, /* 8 */ + {"link", lx_link, 0, 2}, /* 9 */ + {"unlink", NULL, 0, 1}, /* 10 */ + {"execve", NULL, 0, 3}, /* 11 */ + {"chdir", NULL, 0, 1}, /* 12 */ + {"time", lx_time, 0, 1}, /* 13 */ + {"mknod", NULL, 0, 3}, /* 14 */ 
+ {"chmod", lx_chmod, 0, 2}, /* 15 */ + {"lchown16", lx_lchown16, 0, 3}, /* 16 */ + {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ + {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ + {"lseek", NULL, 0, 3}, /* 19 */ + {"getpid", lx_getpid, 0, 0}, /* 20 */ + {"mount", NULL, 0, 5}, /* 21 */ + {"umount", NULL, 0, 1}, /* 22 */ + {"setuid16", NULL, 0, 1}, /* 23 */ + {"getuid16", NULL, 0, 0}, /* 24 */ + {"stime", NULL, 0, 1}, /* 25 */ + {"ptrace", lx_ptrace, 0, 4}, /* 26 */ + {"alarm", NULL, 0, 1}, /* 27 */ + {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ + {"pause", NULL, 0, 0}, /* 29 */ + {"utime", NULL, 0, 2}, /* 30 */ + {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ + {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ + {"access", lx_access, 0, 2}, /* 33 */ + {"nice", NULL, 0, 1}, /* 34 */ + {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */ + {"sync", NULL, 0, 0}, /* 36 */ + {"kill", lx_kill, 0, 2}, /* 37 */ + {"rename", NULL, 0, 2}, /* 38 */ + {"mkdir", lx_mkdir, 0, 2}, /* 39 */ + {"rmdir", NULL, 0, 1}, /* 40 */ + {"dup", NULL, 0, 1}, /* 41 */ + {"pipe", lx_pipe, 0, 1}, /* 42 */ + {"times", NULL, 0, 1}, /* 43 */ + {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ + {"brk", lx_brk, 0, 1}, /* 45 */ + {"setgid16", NULL, 0, 1}, /* 46 */ + {"getgid16", NULL, 0, 0}, /* 47 */ + {"signal", NULL, 0, 2}, /* 48 */ + {"geteuid16", NULL, 0, 0}, /* 49 */ + {"getegid16", NULL, 0, 0}, /* 50 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */ + {"umount2", NULL, 0, 2}, /* 52 */ + {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ + {"ioctl", lx_ioctl, 0, 3}, /* 54 */ + {"fcntl", lx_fcntl, 0, 3}, /* 55 */ + {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ + {"setpgid", NULL, 0, 2}, /* 57 */ + {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ + {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ + {"umask", NULL, 0, 1}, /* 60 */ + {"chroot", NULL, 0, 1}, /* 61 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ + {"dup2", NULL, 0, 2}, /* 63 */ + {"getppid", lx_getppid, 0, 0}, /* 64 */ + {"getpgrp", NULL, 0, 0}, /* 65 */ + {"setsid", NULL, 
0, 0}, /* 66 */ + {"sigaction", NULL, 0, 3}, /* 67 */ + {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ + {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ + {"setreuid16", NULL, 0, 2}, /* 70 */ + {"setregid16", NULL, 0, 2}, /* 71 */ + {"sigsuspend", NULL, 0, 1}, /* 72 */ + {"sigpending", NULL, 0, 1}, /* 73 */ + {"sethostname", NULL, 0, 2}, /* 74 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */ + {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */ + {"getrusage", NULL, 0, 2}, /* 77 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */ + {"settimeofday", NULL, 0, 2}, /* 79 */ + {"getgroups16", NULL, 0, 2}, /* 80 */ + {"setgroups16", NULL, 0, 2}, /* 81 */ + {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */ + {"symlink", NULL, 0, 2}, /* 83 */ + {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ + {"readlink", NULL, 0, 3}, /* 85 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */ + {"reboot", NULL, 0, 4}, /* 88 */ + {"readdir", NULL, 0, 3}, /* 89 */ + {"mmap", NULL, 0, 6}, /* 90 */ + {"munmap", NULL, 0, 2}, /* 91 */ + {"truncate", NULL, 0, 2}, /* 92 */ + {"ftruncate", NULL, 0, 2}, /* 93 */ + {"fchmod", lx_fchmod, 0, 2}, /* 94 */ + {"fchown16", lx_fchown16, 0, 3}, /* 95 */ + {"getpriority", NULL, 0, 2}, /* 96 */ + {"setpriority", NULL, 0, 3}, /* 97 */ + {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ + {"statfs", NULL, 0, 2}, /* 99 */ + {"fstatfs", NULL, 0, 2}, /* 100 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ + {"socketcall", lx_socketcall, 0, 2}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"setitimer", NULL, 0, 3}, /* 104 */ + {"getitimer", NULL, 0, 2}, /* 105 */ + {"stat", lx_stat32, 0, 2}, /* 106 */ + {"lstat", lx_lstat32, 0, 2}, /* 107 */ + {"fstat", lx_fstat32, 0, 2}, /* 108 */ + {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ + {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ + {"vhangup", NULL, 0, 0}, /* 111 */ + {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ + {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ + {"wait4", 
lx_wait4, 0, 4}, /* 114 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */ + {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ + {"ipc", NULL, 0, 5}, /* 117 */ + {"fsync", NULL, 0, 1}, /* 118 */ + {"sigreturn", NULL, 0, 1}, /* 119 */ + {"clone", NULL, 0, 5}, /* 120 */ + {"setdomainname", NULL, 0, 2}, /* 121 */ + {"uname", lx_uname, 0, 1}, /* 122 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */ + {"adjtimex", NULL, 0, 1}, /* 124 */ + {"mprotect", NULL, 0, 3}, /* 125 */ + {"sigprocmask", NULL, 0, 3}, /* 126 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ + {"getpgid", NULL, 0, 1}, /* 132 */ + {"fchdir", NULL, 0, 1}, /* 133 */ + {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"sysfs", NULL, 0, 3}, /* 135 */ + {"personality", lx_personality, 0, 1}, /* 136 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ + {"setfsuid16", NULL, 0, 1}, /* 138 */ + {"setfsgid16", NULL, 0, 1}, /* 139 */ + {"llseek", NULL, 0, 5}, /* 140 */ + {"getdents", lx_getdents_32, 0, 3}, /* 141 */ + {"select", lx_select, 0, 5}, /* 142 */ + {"flock", NULL, 0, 2}, /* 143 */ + {"msync", NULL, 0, 3}, /* 144 */ + {"readv", lx_readv, 0, 3}, /* 145 */ + {"writev", lx_writev, 0, 3}, /* 146 */ + {"getsid", NULL, 0, 1}, /* 147 */ + {"fdatasync", NULL, 0, 1}, /* 148 */ + {"sysctl", NULL, 0, 1}, /* 149 */ + {"mlock", NULL, 0, 2}, /* 150 */ + {"munlock", NULL, 0, 2}, /* 151 */ + {"mlockall", NULL, 0, 1}, /* 152 */ + {"munlockall", NULL, 0, 0}, /* 153 */ + {"sched_setparam", NULL, 0, 2}, /* 154 */ + {"sched_getparam", NULL, 0, 2}, /* 155 */ + {"sched_setscheduler", NULL, 0, 3}, /* 156 */ + {"sched_getscheduler", NULL, 0, 1}, /* 157 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 159 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 160 */ + 
{"sched_rr_get_interval", NULL, 0, 2}, /* 161 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */ + {"mremap", NULL, 0, 5}, /* 163 */ + {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */ + {"getresuid16", NULL, 0, 3}, /* 165 */ + {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ + {"query_module", NULL, 0, 5}, /* 167 */ + {"poll", lx_poll, 0, 3}, /* 168 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ + {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */ + {"getresgid16", NULL, 0, 3}, /* 171 */ + {"prctl", lx_prctl, 0, 5}, /* 172 */ + {"rt_sigreturn", NULL, 0, 0}, /* 173 */ + {"rt_sigaction", NULL, 0, 4}, /* 174 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 175 */ + {"rt_sigpending", NULL, 0, 2}, /* 176 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 179 */ + {"pread64", lx_pread32, 0, 5}, /* 180 */ + {"pwrite64", lx_pwrite32, 0, 5}, /* 181 */ + {"chown16", lx_chown16, 0, 3}, /* 182 */ + {"getcwd", lx_getcwd, 0, 2}, /* 183 */ + {"capget", NULL, 0, 2}, /* 184 */ + {"capset", NULL, 0, 2}, /* 185 */ + {"sigaltstack", NULL, 0, 2}, /* 186 */ + {"sendfile", NULL, 0, 4}, /* 187 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ + {"vfork", NULL, 0, 0}, /* 190 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */ + {"mmap2", NULL, LX_SYS_EBPARG6, 6}, /* 192 */ + {"truncate64", NULL, 0, 3}, /* 193 */ + {"ftruncate64", NULL, 0, 3}, /* 194 */ + {"stat64", lx_stat64, 0, 2}, /* 195 */ + {"lstat64", lx_lstat64, 0, 2}, /* 196 */ + {"fstat64", lx_fstat64, 0, 2}, /* 197 */ + {"lchown", lx_lchown, 0, 3}, /* 198 */ + {"getuid", NULL, 0, 0}, /* 199 */ + {"getgid", NULL, 0, 0}, /* 200 */ + {"geteuid", NULL, 0, 0}, /* 201 */ + {"getegid", NULL, 0, 0}, /* 202 */ + {"setreuid", NULL, 0, 0}, /* 203 */ + {"setregid", NULL, 0, 0}, /* 204 */ + {"getgroups", NULL, 0, 2}, /* 205 */ + {"setgroups", NULL, 0, 2}, /* 206 */ + {"fchown", lx_fchown, 0, 3}, /* 207 */ + {"setresuid", 
lx_setresuid, 0, 3}, /* 208 */ + {"getresuid", NULL, 0, 3}, /* 209 */ + {"setresgid", lx_setresgid, 0, 3}, /* 210 */ + {"getresgid", NULL, 0, 3}, /* 211 */ + {"chown", lx_chown, 0, 3}, /* 212 */ + {"setuid", NULL, 0, 1}, /* 213 */ + {"setgid", NULL, 0, 1}, /* 214 */ + {"setfsuid", NULL, 0, 1}, /* 215 */ + {"setfsgid", NULL, 0, 1}, /* 216 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ + {"mincore", NULL, 0, 3}, /* 218 */ + {"madvise", NULL, 0, 3}, /* 219 */ + {"getdents64", lx_getdents64, 0, 3}, /* 220 */ + {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ + {"gettid", lx_gettid, 0, 0}, /* 224 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ + {"setxattr", lx_setxattr, 0, 5}, /* 226 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 227 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 228 */ + {"getxattr", lx_getxattr, 0, 4}, /* 229 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 230 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 231 */ + {"listxattr", lx_listxattr, 0, 3}, /* 232 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 233 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 234 */ + {"removexattr", lx_removexattr, 0, 2}, /* 235 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 236 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 237 */ + {"tkill", lx_tkill, 0, 2}, /* 238 */ + {"sendfile64", NULL, 0, 4}, /* 239 */ + {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */ + {"sched_setaffinity", NULL, 0, 3}, /* 241 */ + {"sched_getaffinity", NULL, 0, 3}, /* 242 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */ + {"io_setup", lx_io_setup, 0, 2}, /* 245 */ + {"io_destroy", NULL, 0, 1}, /* 246 */ + {"io_getevents", NULL, 0, 5}, /* 247 */ + {"io_submit", NULL, 0, 3}, /* 248 */ + {"io_cancel", NULL, 0, 3}, /* 249 */ + {"fadvise64", NULL, 0, 4}, /* 250 */ + {"nosys", NULL, 0, 0}, /* 251 */ + {"group_exit", NULL, 0, 1}, /* 252 */ + 
{"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */ + {"timer_create", NULL, 0, 3}, /* 259 */ + {"timer_settime", NULL, 0, 4}, /* 260 */ + {"timer_gettime", NULL, 0, 2}, /* 261 */ + {"timer_getoverrun", NULL, 0, 1}, /* 262 */ + {"timer_delete", NULL, 0, 1}, /* 263 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */ + {"clock_nanosleep", NULL, 0, 4}, /* 267 */ + {"statfs64", NULL, 0, 2}, /* 268 */ + {"fstatfs64", NULL, 0, 2}, /* 269 */ + {"tgkill", lx_tgkill, 0, 3}, /* 270 */ + +/* + * The following system calls only exist in kernel 2.6 and greater: + */ + {"utimes", NULL, 0, 2}, /* 271 */ + {"fadvise64_64", NULL, 0, 4}, /* 272 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ + {"waitid", lx_waitid, 0, 4}, /* 284 */ + {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 289 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 290 */ + {"inotify_init", NULL, 0, 0}, /* 291 */ + {"inotify_add_watch", NULL, 0, 3}, /* 292 */ + 
{"inotify_rm_watch", NULL, 0, 2}, /* 293 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ + {"openat", lx_openat, 0, 4}, /* 295 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */ + {"mknodat", NULL, 0, 4}, /* 297 */ + {"fchownat", lx_fchownat, 0, 5}, /* 298 */ + {"futimesat", NULL, 0, 3}, /* 299 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */ + {"unlinkat", NULL, 0, 3}, /* 301 */ + {"renameat", NULL, 0, 4}, /* 302 */ + {"linkat", lx_linkat, 0, 5}, /* 303 */ + {"symlinkat", NULL, 0, 3}, /* 304 */ + {"readlinkat", NULL, 0, 4}, /* 305 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 306 */ + {"faccessat", lx_faccessat, 0, 4}, /* 307 */ + {"pselect6", lx_pselect, LX_SYS_EBPARG6, 6}, /* 308 */ + {"ppoll", lx_ppoll, 0, 5}, /* 309 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 311 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 312 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 314 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getcpu", lx_getcpu, 0, 3}, /* 318 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */ + {"utimensat", NULL, 0, 4}, /* 320 */ + {"signalfd", NULL, 0, 3}, /* 321 */ + {"timerfd_create", NULL, 0, 2}, /* 322 */ + {"eventfd", NULL, 0, 1}, /* 323 */ + {"fallocate", lx_fallocate32, LX_SYS_EBPARG6, 6}, /* 324 */ + {"timerfd_settime", NULL, 0, 4}, /* 325 */ + {"timerfd_gettime", NULL, 0, 2}, /* 326 */ + {"signalfd4", NULL, 0, 4}, /* 327 */ + {"eventfd2", NULL, 0, 2}, /* 328 */ + {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */ + {"dup3", NULL, 0, 3}, /* 330 */ + {"pipe2", lx_pipe2, 0, 2}, /* 331 */ + {"inotify_init1", NULL, 0, 1}, /* 332 */ + {"preadv", lx_preadv32, 0, 5}, /* 333 */ + {"pwritev", lx_pwritev32, 0, 5}, /* 334 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ + {"recvmmsg", NULL, 
NOSYS_NULL, 0}, /* 337 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ + {"syncfs", lx_syncfs, 0, 1}, /* 344 */ + {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ + {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */ + {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */ + {"getrandom", lx_getrandom, 0, 3}, /* 355 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */ +}; + +#if defined(_LP64) +/* + * Linux defines system call numbers for 64-bit x86 in the file: + * arch/x86/syscalls/syscall_64.tbl + */ +lx_sysent_t lx_sysent64[] = { + {"read", lx_read, 0, 3}, /* 0 */ + {"write", lx_write, 0, 3}, /* 1 */ + {"open", lx_open, 0, 3}, /* 2 */ + {"close", lx_close, 0, 1}, /* 3 */ + {"stat", lx_stat64, 0, 2}, /* 4 */ + {"fstat", lx_fstat64, 0, 2}, /* 5 */ + {"lstat", lx_lstat64, 0, 2}, /* 6 */ + {"poll", lx_poll, 0, 3}, /* 7 */ + {"lseek", NULL, 0, 3}, /* 8 */ + {"mmap", NULL, 0, 6}, /* 9 */ + {"mprotect", NULL, 0, 3}, /* 10 */ + {"munmap", NULL, 0, 2}, /* 11 */ + {"brk", lx_brk, 0, 1}, /* 12 */ + {"rt_sigaction", NULL, 0, 4}, /* 13 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 14 */ + {"rt_sigreturn", NULL, 0, 0}, /* 15 */ + {"ioctl", lx_ioctl, 0, 3}, /* 16 */ + {"pread64", lx_pread, 0, 4}, /* 17 */ + {"pwrite64", lx_pwrite, 0, 4}, /* 18 */ + {"readv", lx_readv, 0, 3}, /* 19 */ + {"writev", 
lx_writev, 0, 3}, /* 20 */ + {"access", lx_access, 0, 2}, /* 21 */ + {"pipe", lx_pipe, 0, 1}, /* 22 */ + {"select", lx_select, 0, 5}, /* 23 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */ + {"mremap", NULL, 0, 5}, /* 25 */ + {"msync", NULL, 0, 3}, /* 26 */ + {"mincore", NULL, 0, 3}, /* 27 */ + {"madvise", NULL, 0, 3}, /* 28 */ + {"shmget", NULL, 0, 3}, /* 29 */ + {"shmat", NULL, 0, 4}, /* 30 */ + {"shmctl", NULL, 0, 3}, /* 31 */ + {"dup", NULL, 0, 1}, /* 32 */ + {"dup2", NULL, 0, 2}, /* 33 */ + {"pause", NULL, 0, 0}, /* 34 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */ + {"getitimer", NULL, 0, 2}, /* 36 */ + {"alarm", NULL, 0, 1}, /* 37 */ + {"setitimer", NULL, 0, 3}, /* 38 */ + {"getpid", lx_getpid, 0, 0}, /* 39 */ + {"sendfile", NULL, 0, 4}, /* 40 */ + {"socket", lx_socket, 0, 3}, /* 41 */ + {"connect", lx_connect, 0, 3}, /* 42 */ + {"accept", lx_accept, 0, 3}, /* 43 */ + {"sendto", lx_sendto, 0, 6}, /* 44 */ + {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */ + {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */ + {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */ + {"shutdown", NULL, 0, 2}, /* 48 */ + {"bind", lx_bind, 0, 3}, /* 49 */ + {"listen", NULL, 0, 2}, /* 50 */ + {"getsockname", lx_getsockname, 0, 3}, /* 51 */ + {"getpeername", lx_getpeername, 0, 3}, /* 52 */ + {"socketpair", NULL, 0, 4}, /* 53 */ + {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */ + {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */ + {"clone", NULL, 0, 5}, /* 56 */ + {"fork", NULL, 0, 0}, /* 57 */ + {"vfork", NULL, 0, 0}, /* 58 */ + {"execve", NULL, 0, 3}, /* 59 */ + {"exit", NULL, 0, 1}, /* 60 */ + {"wait4", lx_wait4, 0, 4}, /* 61 */ + {"kill", lx_kill, 0, 2}, /* 62 */ + {"uname", lx_uname, 0, 1}, /* 63 */ + {"semget", NULL, 0, 3}, /* 64 */ + {"semop", NULL, 0, 3}, /* 65 */ + {"semctl", NULL, 0, 4}, /* 66 */ + {"shmdt", NULL, 0, 1}, /* 67 */ + {"msgget", NULL, 0, 2}, /* 68 */ + {"msgsnd", NULL, 0, 4}, /* 69 */ + {"msgrcv", NULL, 0, 5}, /* 70 */ + {"msgctl", NULL, 0, 3}, /* 71 */ + {"fcntl", lx_fcntl64, 0, 3}, /* 72 
*/ + {"flock", NULL, 0, 2}, /* 73 */ + {"fsync", NULL, 0, 1}, /* 74 */ + {"fdatasync", NULL, 0, 1}, /* 75 */ + {"truncate", NULL, 0, 2}, /* 76 */ + {"ftruncate", NULL, 0, 2}, /* 77 */ + {"getdents", lx_getdents_64, 0, 3}, /* 78 */ + {"getcwd", lx_getcwd, 0, 2}, /* 79 */ + {"chdir", NULL, 0, 1}, /* 80 */ + {"fchdir", NULL, 0, 1}, /* 81 */ + {"rename", NULL, 0, 2}, /* 82 */ + {"mkdir", lx_mkdir, 0, 2}, /* 83 */ + {"rmdir", NULL, 0, 1}, /* 84 */ + {"creat", NULL, 0, 2}, /* 85 */ + {"link", lx_link, 0, 2}, /* 86 */ + {"unlink", NULL, 0, 1}, /* 87 */ + {"symlink", NULL, 0, 2}, /* 88 */ + {"readlink", NULL, 0, 3}, /* 89 */ + {"chmod", lx_chmod, 0, 2}, /* 90 */ + {"fchmod", lx_fchmod, 0, 2}, /* 91 */ + {"chown", lx_chown, 0, 3}, /* 92 */ + {"fchown", lx_fchown, 0, 3}, /* 93 */ + {"lchown", lx_lchown, 0, 3}, /* 94 */ + {"umask", NULL, 0, 1}, /* 95 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */ + {"getrusage", NULL, 0, 2}, /* 98 */ + {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */ + {"times", NULL, 0, 1}, /* 100 */ + {"ptrace", lx_ptrace, 0, 4}, /* 101 */ + {"getuid", NULL, 0, 0}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"getgid", NULL, 0, 0}, /* 104 */ + {"setuid", NULL, 0, 1}, /* 105 */ + {"setgid", NULL, 0, 1}, /* 106 */ + {"geteuid", NULL, 0, 0}, /* 107 */ + {"getegid", NULL, 0, 0}, /* 108 */ + {"setpgid", NULL, 0, 2}, /* 109 */ + {"getppid", lx_getppid, 0, 0}, /* 110 */ + {"getpgrp", NULL, 0, 0}, /* 111 */ + {"setsid", NULL, 0, 0}, /* 112 */ + {"setreuid", NULL, 0, 0}, /* 113 */ + {"setregid", NULL, 0, 0}, /* 114 */ + {"getgroups", NULL, 0, 2}, /* 115 */ + {"setgroups", NULL, 0, 2}, /* 116 */ + {"setresuid", lx_setresuid, 0, 3}, /* 117 */ + {"getresuid", NULL, 0, 3}, /* 118 */ + {"setresgid", lx_setresgid, 0, 3}, /* 119 */ + {"getresgid", NULL, 0, 3}, /* 120 */ + {"getpgid", NULL, 0, 1}, /* 121 */ + {"setfsuid", NULL, 0, 1}, /* 122 */ + {"setfsgid", NULL, 0, 1}, /* 123 */ + {"getsid", NULL, 0, 1}, /* 124 */ + 
{"capget", NULL, 0, 2}, /* 125 */ + {"capset", NULL, 0, 2}, /* 126 */ + {"rt_sigpending", NULL, 0, 2}, /* 127 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 130 */ + {"sigaltstack", NULL, 0, 2}, /* 131 */ + {"utime", NULL, 0, 2}, /* 132 */ + {"mknod", NULL, 0, 3}, /* 133 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"personality", lx_personality, 0, 1}, /* 135 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ + {"statfs", NULL, 0, 2}, /* 137 */ + {"fstatfs", NULL, 0, 2}, /* 138 */ + {"sysfs", NULL, 0, 3}, /* 139 */ + {"getpriority", NULL, 0, 2}, /* 140 */ + {"setpriority", NULL, 0, 3}, /* 141 */ + {"sched_setparam", NULL, 0, 2}, /* 142 */ + {"sched_getparam", NULL, 0, 2}, /* 143 */ + {"sched_setscheduler", NULL, 0, 3}, /* 144 */ + {"sched_getscheduler", NULL, 0, 1}, /* 145 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 146 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 147 */ + {"sched_rr_get_interval", NULL, 0, 2}, /* 148 */ + {"mlock", NULL, 0, 2}, /* 149 */ + {"munlock", NULL, 0, 2}, /* 150 */ + {"mlockall", NULL, 0, 1}, /* 151 */ + {"munlockall", NULL, 0, 0}, /* 152 */ + {"vhangup", NULL, 0, 0}, /* 153 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ + {"sysctl", NULL, 0, 1}, /* 156 */ + {"prctl", lx_prctl, 0, 5}, /* 157 */ + {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ + {"adjtimex", NULL, 0, 1}, /* 159 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */ + {"chroot", NULL, 0, 1}, /* 161 */ + {"sync", NULL, 0, 0}, /* 162 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */ + {"settimeofday", NULL, 0, 2}, /* 164 */ + {"mount", NULL, 0, 5}, /* 165 */ + {"umount2", NULL, 0, 2}, /* 166 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */ + {"reboot", NULL, 0, 4}, /* 169 */ + {"sethostname", NULL, 0, 2}, /* 170 */ + {"setdomainname", NULL, 0, 2}, /* 171 */ + {"iopl", NULL, 
NOSYS_NO_EQUIV, 0}, /* 172 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ + {"query_module", NULL, 0, 5}, /* 178 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ + {"gettid", lx_gettid, 0, 0}, /* 186 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ + {"setxattr", lx_setxattr, 0, 5}, /* 188 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 189 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 190 */ + {"getxattr", lx_getxattr, 0, 4}, /* 191 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 192 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 193 */ + {"listxattr", lx_listxattr, 0, 3}, /* 194 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 195 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 196 */ + {"removexattr", lx_removexattr, 0, 2}, /* 197 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 198 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 199 */ + {"tkill", lx_tkill, 0, 2}, /* 200 */ + {"time", lx_time, 0, 1}, /* 201 */ + {"futex", lx_futex, 0, 6}, /* 202 */ + {"sched_setaffinity", NULL, 0, 3}, /* 203 */ + {"sched_getaffinity", NULL, 0, 3}, /* 204 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */ + {"io_setup", lx_io_setup, 0, 2}, /* 206 */ + {"io_destroy", NULL, 0, 1}, /* 207 */ + {"io_getevents", NULL, 0, 5}, /* 208 */ + {"io_submit", NULL, 0, 3}, /* 209 */ + {"io_cancel", NULL, 0, 3}, /* 210 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 213 
*/ + {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ + {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ + {"getdents64", lx_getdents64, 0, 3}, /* 217 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */ + {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ + {"semtimedop", NULL, 0, 4}, /* 220 */ + {"fadvise64", NULL, 0, 4}, /* 221 */ + {"timer_create", NULL, 0, 3}, /* 222 */ + {"timer_settime", NULL, 0, 4}, /* 223 */ + {"timer_gettime", NULL, 0, 2}, /* 224 */ + {"timer_getoverrun", NULL, 0, 1}, /* 225 */ + {"timer_delete", NULL, 0, 1}, /* 226 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */ + {"clock_nanosleep", NULL, 0, 4}, /* 230 */ + {"exit_group", NULL, 0, 1}, /* 231 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */ + {"tgkill", lx_tgkill, 0, 3}, /* 234 */ + {"utimes", NULL, 0, 2}, /* 235 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ + {"waitid", lx_waitid, 0, 4}, /* 247 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 251 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 252 */ + {"inotify_init", NULL, 0, 0}, /* 253 */ + {"inotify_add_watch", NULL, 0, 3}, /* 254 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 255 
*/ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ + {"openat", lx_openat, 0, 4}, /* 257 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */ + {"mknodat", NULL, 0, 4}, /* 259 */ + {"fchownat", lx_fchownat, 0, 5}, /* 260 */ + {"futimesat", NULL, 0, 3}, /* 261 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */ + {"unlinkat", NULL, 0, 3}, /* 263 */ + {"renameat", NULL, 0, 4}, /* 264 */ + {"linkat", lx_linkat, 0, 5}, /* 265 */ + {"symlinkat", NULL, 0, 3}, /* 266 */ + {"readlinkat", NULL, 0, 4}, /* 267 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 268 */ + {"faccessat", lx_faccessat, 0, 4}, /* 269 */ + {"pselect6", lx_pselect, 0, 6}, /* 270 */ + {"ppoll", lx_ppoll, 0, 5}, /* 271 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 273 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 274 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 275 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 277 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */ + {"utimensat", NULL, 0, 4}, /* 280 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */ + {"signalfd", NULL, 0, 3}, /* 282 */ + {"timerfd_create", NULL, 0, 2}, /* 283 */ + {"eventfd", NULL, 0, 1}, /* 284 */ + {"fallocate", lx_fallocate, 0, 4}, /* 285 */ + {"timerfd_settime", NULL, 0, 4}, /* 286 */ + {"timerfd_gettime", NULL, 0, 2}, /* 287 */ + {"accept4", lx_accept4, 0, 4}, /* 288 */ + {"signalfd4", NULL, 0, 4}, /* 289 */ + {"eventfd2", NULL, 0, 2}, /* 290 */ + {"epoll_create1", lx_epoll_create1, 0, 1}, /* 291 */ + {"dup3", NULL, 0, 3}, /* 292 */ + {"pipe2", lx_pipe2, 0, 2}, /* 293 */ + {"inotify_init1", NULL, 0, 1}, /* 294 */ + {"preadv", lx_preadv, 0, 4}, /* 295 */ + {"pwritev", lx_pwritev, 0, 4}, /* 296 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */ + {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 
300 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */ + {"syncfs", lx_syncfs, 0, 1}, /* 306 */ + {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 308 */ + {"getcpu", lx_getcpu, 0, 3}, /* 309 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */ + {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getrandom", lx_getrandom, 0, 3}, /* 318 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */ + {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */ + + /* XXX TBD gap then x32 syscalls from 512 - 544 */ +}; +#endif diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h new file mode 100644 index 0000000000..131a061062 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h @@ -0,0 +1,350 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_PROC_H +#define _LX_PROC_H + +#ifdef _LXPROC_NATIVE_H +#error Attempted to include branded lx_proc.h after native lxproc.h +#endif + +#define _LXPROC_BRANDED_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/zfs_ioctl.h> +#include <sys/nvpair.h> +#include <vm/as.h> +#include <vm/anon.h> + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a 
directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). + */ +typedef enum lxpr_nodetype { + LXPR_INVALID, /* nodes start at 1 */ + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_AUXV, /* /proc/<pid>/auxv */ + LXPR_PID_CGROUP, /* /proc/<pid>/cgroup */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_COMM, /* /proc/<pid>/comm */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_LIMITS, /* /proc/<pid>/limits */ + LXPR_PID_LOGINUID, /* /proc/<pid>/loginuid */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_MOUNTINFO, /* /proc/<pid>/mountinfo */ + LXPR_PID_OOM_SCR_ADJ, /* /proc/<pid>/oom_score_adj */ + LXPR_PID_PERSONALITY, /* /proc/<pid>/personality */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_TASKDIR, /* /proc/<pid>/task */ + LXPR_PID_TASK_IDDIR, /* /proc/<pid>/task/<tid> */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_PID_TID_AUXV, /* /proc/<pid>/task/<tid>/auxv */ + LXPR_PID_TID_CGROUP, /* /proc/<pid>/task/<tid>/cgroup */ + LXPR_PID_TID_CMDLINE, /* /proc/<pid>/task/<tid>/cmdline */ + LXPR_PID_TID_COMM, /* /proc/<pid>/task/<tid>/comm */ + LXPR_PID_TID_CPU, /* /proc/<pid>/task/<tid>/cpu */ + LXPR_PID_TID_CURDIR, /* /proc/<pid>/task/<tid>/cwd */ + LXPR_PID_TID_ENV, /* /proc/<pid>/task/<tid>/environ */ + LXPR_PID_TID_EXE, /* /proc/<pid>/task/<tid>/exe */ + LXPR_PID_TID_LIMITS, /* /proc/<pid>/task/<tid>/limits */ + LXPR_PID_TID_LOGINUID, /* /proc/<pid>/task/<tid>/loginuid */ + LXPR_PID_TID_MAPS, /* /proc/<pid>/task/<tid>/maps */ + LXPR_PID_TID_MEM, /* /proc/<pid>/task/<tid>/mem */ + LXPR_PID_TID_MOUNTINFO, /* 
/proc/<pid>/task/<tid>/mountinfo */ + LXPR_PID_TID_OOM_SCR_ADJ, /* /proc/<pid>/task/<tid>/oom_score_adj */ + LXPR_PID_TID_PERSONALITY, /* /proc/<pid>/task/<tid>/personality */ + LXPR_PID_TID_ROOTDIR, /* /proc/<pid>/task/<tid>/root */ + LXPR_PID_TID_STAT, /* /proc/<pid>/task/<tid>/stat */ + LXPR_PID_TID_STATM, /* /proc/<pid>/task/<tid>/statm */ + LXPR_PID_TID_STATUS, /* /proc/<pid>/task/<tid>/status */ + LXPR_PID_TID_FDDIR, /* /proc/<pid>/task/<tid>/fd */ + LXPR_PID_TID_FD_FD, /* /proc/<pid>/task/<tid>/fd/nn */ + LXPR_CGROUPS, /* /proc/cgroups */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DISKSTATS, /* /proc/diskstats */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MODULES, /* /proc/modules */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IF_INET6, /* /proc/net/if_inet6 */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_IPV6_ROUTE, /* /proc/net/ipv6_route */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_TCP6, /* /proc/net/tcp6 */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UDP6, /* /proc/net/udp6 */ + LXPR_NET_UNIX, /* /proc/net/unix */ + 
LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_SWAPS, /* /proc/swaps */ + LXPR_SYSDIR, /* /proc/sys/ */ + LXPR_SYS_FSDIR, /* /proc/sys/fs/ */ + LXPR_SYS_FS_INOTIFYDIR, /* /proc/sys/fs/inotify */ + LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, /* inotify/max_queued_events */ + LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, /* inotify/max_user_instances */ + LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, /* inotify/max_user_watches */ + LXPR_SYS_KERNELDIR, /* /proc/sys/kernel/ */ + LXPR_SYS_KERNEL_CAPLCAP, /* /proc/sys/kernel/cap_last_cap */ + LXPR_SYS_KERNEL_COREPATT, /* /proc/sys/kernel/core_pattern */ + LXPR_SYS_KERNEL_HOSTNAME, /* /proc/sys/kernel/hostname */ + LXPR_SYS_KERNEL_MSGMNI, /* /proc/sys/kernel/msgmni */ + LXPR_SYS_KERNEL_NGROUPS_MAX, /* /proc/sys/kernel/ngroups_max */ + LXPR_SYS_KERNEL_OSREL, /* /proc/sys/kernel/osrelease */ + LXPR_SYS_KERNEL_PID_MAX, /* /proc/sys/kernel/pid_max */ + LXPR_SYS_KERNEL_RANDDIR, /* /proc/sys/kernel/random */ + LXPR_SYS_KERNEL_RAND_BOOTID, /* /proc/sys/kernel/random/boot_id */ + LXPR_SYS_KERNEL_SEM, /* /proc/sys/kernel/sem */ + LXPR_SYS_KERNEL_SHMALL, /* /proc/sys/kernel/shmall */ + LXPR_SYS_KERNEL_SHMMAX, /* /proc/sys/kernel/shmmax */ + LXPR_SYS_KERNEL_SHMMNI, /* /proc/sys/kernel/shmmni */ + LXPR_SYS_KERNEL_THREADS_MAX, /* /proc/sys/kernel/threads-max */ + LXPR_SYS_NETDIR, /* /proc/sys/net */ + LXPR_SYS_NET_COREDIR, /* /proc/sys/net/core */ + LXPR_SYS_NET_CORE_SOMAXCON, /* /proc/sys/net/core/somaxconn */ + LXPR_SYS_NET_IPV4DIR, /* /proc/sys/net/ipv4 */ + LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, /* .../net/ipv4/ip_local_port_range */ + LXPR_SYS_NET_IPV4_TCP_FIN_TO, /* /proc/sys/net/ipv4/tcp_fin_timeout */ + LXPR_SYS_NET_IPV4_TCP_KA_INT, /* .../net/ipv4/tcp_keepalive_intvl */ + LXPR_SYS_NET_IPV4_TCP_KA_TIM, /* .../net/ipv4/tcp_keepalive_time */ + LXPR_SYS_NET_IPV4_TCP_SACK, /* /proc/sys/net/ipv4/tcp_sack */ + LXPR_SYS_NET_IPV4_TCP_WINSCALE, /* .../net/ipv4/tcp_window_scaling */ + LXPR_SYS_VMDIR, 
/* /proc/sys/vm */ + LXPR_SYS_VM_MAX_MAP_CNT, /* /proc/sys/vm/max_map_count */ + LXPR_SYS_VM_MINFR_KB, /* /proc/sys/vm/min_free_kbytes */ + LXPR_SYS_VM_NHUGEP, /* /proc/sys/vm/nr_hugepages */ + LXPR_SYS_VM_OVERCOMMIT_MEM, /* /proc/sys/vm/overcommit_memory */ + LXPR_SYS_VM_SWAPPINESS, /* /proc/sys/vm/swappiness */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_VMSTAT, /* /proc/vmstat */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * Linux sector size for /proc/diskstats + */ +#define LXPR_SECTOR_SIZE 512 + +/* + * external dirent characteristics + */ +typedef struct { + lxpr_nodetype_t d_type; + char *d_name; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + uint_t lxpr_desc; /* addl. 
descriptor (fd or tid) */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern boolean_t lxpr_is_writable(lxpr_nodetype_t); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); +extern vnode_t *lxpr_lookup_fdnode(vnode_t *, const char *); +extern int lxpr_readlink_fdnode(lxpr_node_t *, char *, size_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +} lxpr_uiobuf_t; + +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +extern int lxpr_core_path_l2s(const char *, char *, size_t); +extern int lxpr_core_path_s2l(const char *, char *, size_t); + +typedef enum lxpr_zombok { + NO_ZOMB = 0, + ZOMB_OK +} zombok_t; + +proc_t *lxpr_lock(pid_t, zombok_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#ifndef islower 
+#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +/* + * NOTE: this fallback toupper() evaluates its argument more than once + * (inside islower() and again in the selected branch), so do not pass an + * expression with side effects. + */ +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LX_PROC_H */ diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c new file mode 100644 index 0000000000..c12118a3ea --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c @@ -0,0 +1,851 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * lxprsubr.c: Various functions for the /lxproc vnodeops. 
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> +#include <sys/zfs_ioctl.h> + +#include "lx_proc.h" + +#define LXPRCACHE_NAME "lxbpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +int lx_pr_bufsize = 4000; + +struct lxpr_zfs_ds { + list_node_t ds_link; + char ds_name[MAXPATHLEN]; + uint64_t ds_cookie; +}; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lx_pr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +boolean_t +lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf) +{ + if ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0) + return (B_TRUE); + return (B_FALSE); +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += 
size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid, zombok_t zombie_ok) +{ + proc_t *p; + kmutex_t *mp; + pid_t find_pid; + uint_t flags; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process; + * if 0 we want zsched. 
+ */ + if (pid == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (pid == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = pid; + } + p = prfind(find_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + /* + * Filter out exiting or zombie processes, if requested. + */ + if (zombie_ok == NO_ZOMB && + ((p->p_flag & SEXITING) || p->p_stat == SZOMB)) { + mutex_exit(mp); + return (NULL); + } + + flags = p->p_proc_flag & (P_PR_LOCK | P_PR_EXEC); + if (flags == 0) { + break; + } else if (flags == P_PR_EXEC && p == curproc) { + /* + * Forward progress with (only) the PR_EXEC flag is + * safe if a process is accessing resources in its own + * piddir. Executing its own /proc/<pid>/exe symlink + * is one potential example. + * + * For all other processes, it is necessary to wait + * until the exec is completed. 
+ */ + break; + } + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc) +{ + if (pid == 1) { + pid = curproc->p_zone->zone_proc_initpid; + } else if (pid == 0) { + pid = curproc->p_zone->zone_zsched->p_pid; + } + + switch (type) { + case LXPR_PIDDIR: + return (maxpid + pid + 1); + case LXPR_PID_TASK_IDDIR: + return (maxpid + (desc * 10)); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + desc); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) 
/*
 * Allocate a new lxproc node
 *
 * This also allocates the vnode associated with it
 *
 * Arguments:
 *	dp	parent directory vnode; a hold is taken on it and it is
 *		recorded as the node's parent.  The new vnode inherits
 *		dp's v_vfsp.
 *	type	kind of node to create; selects the vnode type and mode.
 *	p	owning process, or NULL for nodes not tied to a process
 *		(these are presented as belonging to pid 0, uid/gid 0).
 *	desc	descriptor (fd or tid) stored in the node and fed to
 *		lxpr_inode() when p is non-NULL.
 *
 * The native pids of the current zone's init and zsched are reported as
 * pid 1 and pid 0 respectively.  For the symlink-style node types a hold is
 * taken on the underlying vnode stored in lxpr_realvp (when there is one).
 *
 * Returns the initialized node; the allocation uses KM_SLEEP.
 */
lxpr_node_t *
lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc)
{
	lxpr_node_t *lxpnp;
	vnode_t *vp;
	user_t *up;
	timestruc_t now;

	/*
	 * Allocate a new node. It is deallocated in vop_inactive
	 */
	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);

	/*
	 * Set defaults (may be overridden below)
	 */
	gethrestime(&now);
	lxpnp->lxpr_type = type;
	lxpnp->lxpr_realvp = NULL;
	lxpnp->lxpr_parent = dp;
	lxpnp->lxpr_desc = desc;
	VN_HOLD(dp);
	if (p != NULL) {
		/* Present the zone's init and zsched as pids 1 and 0. */
		if (p->p_pid == curproc->p_zone->zone_proc_initpid) {
			lxpnp->lxpr_pid = 1;
		} else if (p->p_pid == curproc->p_zone->zone_zsched->p_pid) {
			lxpnp->lxpr_pid = 0;
		} else {
			lxpnp->lxpr_pid = p->p_pid;
		}

		lxpnp->lxpr_time = PTOU(p)->u_start;
		lxpnp->lxpr_uid = crgetruid(p->p_cred);
		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc);
	} else {
		/* Pretend files without a proc belong to sched */
		lxpnp->lxpr_pid = 0;
		lxpnp->lxpr_time = now;
		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
	}

	/* initialize the vnode data */
	vp = lxpnp->lxpr_vnode;
	vn_reinit(vp);
	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
	vp->v_vfsp = dp->v_vfsp;

	/*
	 * Do node specific stuff
	 */
	if (lxpr_is_writable(type)) {
		/* These two have different modes; handled later. */
		if (type != LXPR_PID_FD_FD && type != LXPR_PID_TID_FD_FD) {
			/* All other writable nodes are plain 0644 files. */
			vp->v_type = VREG;
			lxpnp->lxpr_mode = 0644;
			return (lxpnp);
		}
	}

	switch (type) {
	case LXPR_PROCDIR:
		vp->v_flag |= VROOT;
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
		break;

	case LXPR_PID_CURDIR:
		ASSERT(p != NULL);

		/*
		 * Zombie check.  p_stat is officially protected by pidlock,
		 * but we can't grab pidlock here because we already hold
		 * p_lock.  Luckily if we look at the process exit code
		 * we see that p_stat only transitions from SRUN to SZOMB
		 * while p_lock is held.  Aside from this, the only other
		 * p_stat transition that we need to be aware about is
		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
		 * ignores nodes in the SIDL state so we'll never get a node
		 * that isn't already in the SRUN state.
		 */
		if (p->p_stat == SZOMB) {
			lxpnp->lxpr_realvp = NULL;
		} else {
			up = PTOU(p);
			lxpnp->lxpr_realvp = up->u_cdir;
			ASSERT(lxpnp->lxpr_realvp != NULL);
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_ROOTDIR:
		ASSERT(p != NULL);
		/* Zombie check.  see locking comment above */
		if (p->p_stat == SZOMB) {
			lxpnp->lxpr_realvp = NULL;
		} else {
			up = PTOU(p);
			/* No per-process root set: use the global rootdir. */
			lxpnp->lxpr_realvp =
			    up->u_rdir != NULL ? up->u_rdir : rootdir;
			ASSERT(lxpnp->lxpr_realvp != NULL);
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_EXE:
		ASSERT(p != NULL);
		/* p_exec may be NULL, in which case the link has no target */
		lxpnp->lxpr_realvp = p->p_exec;
		if (lxpnp->lxpr_realvp != NULL) {
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;
		break;

	case LXPR_SELF:
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_TASKDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
		break;

	case LXPR_PID_TASK_IDDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
		break;

	case LXPR_PID_FD_FD:
	case LXPR_PID_TID_FD_FD:
		ASSERT(p != NULL);
		/* lxpr_realvp is set after we return */
		lxpnp->lxpr_mode = 0700;	/* read-write-exe owner only */
		vp->v_type = VLNK;
		break;

	case LXPR_PID_FDDIR:
	case LXPR_PID_TID_FDDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0500;	/* read-search by owner only */
		break;

	case LXPR_PIDDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0511;
		break;

	case LXPR_NETDIR:
	case LXPR_SYSDIR:
	case LXPR_SYS_FSDIR:
	case LXPR_SYS_FS_INOTIFYDIR:
	case LXPR_SYS_KERNELDIR:
	case LXPR_SYS_KERNEL_RANDDIR:
	case LXPR_SYS_NETDIR:
	case LXPR_SYS_NET_COREDIR:
	case LXPR_SYS_NET_IPV4DIR:
	case LXPR_SYS_VMDIR:
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by all */
		break;

	case LXPR_PID_ENV:
	case LXPR_PID_MEM:
		ASSERT(p != NULL);
		/*FALLTHRU*/
	case LXPR_KCORE:
		vp->v_type = VREG;
		lxpnp->lxpr_mode = 0400;	/* read-only by owner only */
		break;

	default:
		vp->v_type = VREG;
		lxpnp->lxpr_mode = 0444;	/* read-only by all */
		break;
	}

	return (lxpnp);
}
+ */ +vnode_t * +lxpr_lookup_fdnode(vnode_t *dvp, const char *name) +{ + lxpr_node_t *lxdp = VTOLXP(dvp); + lxpr_node_t *lxfp; + char *endptr = NULL; + long num; + int fd; + proc_t *p; + vnode_t *vp = NULL; + file_t *fp; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(lxdp->lxpr_type == LXPR_PID_FDDIR || + lxdp->lxpr_type == LXPR_PID_TID_FDDIR); + + if (ddi_strtol(name, &endptr, 10, &num) != 0) { + return (NULL); + } else if (name[0] < '0' || name[0] > '9' || *endptr != '\0') { + /* + * ddi_strtol allows leading spaces and trailing garbage + * We do not tolerate such foolishness. + */ + return (NULL); + } else if ((fd = (int)num) < 0) { + return (NULL); + } + + /* Lock the owner process */ + p = lxpr_lock(lxdp->lxpr_pid, NO_ZOMB); + if ((p == NULL)) + return (NULL); + + /* Not applicable to processes which are system-owned. */ + if ((p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + lxfp = lxpr_getnode(dvp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + fip = P_FINFO(p); + mutex_exit(&p->p_lock); + mutex_enter(&fip->fi_lock); + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + if ((fp = ufp->uf_file) != NULL) { + vp = fp->f_vnode; + VN_HOLD(vp); + } + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + + if (vp == NULL) { + lxpr_unlock(p); + lxpr_freenode(lxfp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxfp->lxpr_realvp = vp; + + /* + * For certain entries (sockets, pipes, etc), Linux expects a + * bogus-named symlink. If that's the case, report the type as + * VNON to bypass link-following elsewhere in the vfs system. + * + * See lxpr_readlink for more details. 
+ */ + if (lxpr_readlink_fdnode(lxfp, NULL, 0) == 0) + LXPTOV(lxfp)->v_type = VNON; + } + + lxpr_unlock(p); + ASSERT(LXPTOV(lxfp) != NULL); + return (LXPTOV(lxfp)); +} + +/* + * Attempt to create Linux-proc-style fake symlinks contents for supported + * /proc/<pid>/fd/<#> entries. + */ +int +lxpr_readlink_fdnode(lxpr_node_t *lxpnp, char *bp, size_t len) +{ + const char *format; + vnode_t *rvp = lxpnp->lxpr_realvp; + vattr_t attr; + + switch (rvp->v_type) { + case VSOCK: + format = "socket:[%lu]"; + break; + case VFIFO: + format = "pipe:[%lu]"; + break; + default: + return (-1); + } + + /* Fetch the inode of the underlying vnode */ + if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0) + return (-1); + + if (bp != NULL) + (void) snprintf(bp, len, format, (ino_t)attr.va_nodeid); + return (0); +} + +/* + * Translate a Linux core_pattern path to a native Illumos one, by replacing + * the appropriate % escape sequences. + * + * Any % escape sequences that are not recognised are double-escaped so that + * they will be inserted literally into the path (to mimic Linux). + */ +int +lxpr_core_path_l2s(const char *inp, char *outp, size_t outsz) +{ + int i = 0, j = 0; + char x; + + while (j < outsz - 1) { + x = inp[i++]; + if (x == '\0') + break; + if (x != '%') { + outp[j++] = x; + continue; + } + + x = inp[i++]; + if (x == '\0') + break; + + /* Make sure we have enough space in the output buffer. */ + if (j + 2 >= outsz - 1) + return (EINVAL); + + switch (x) { + case 'E': + if (j + 4 >= outsz - 1) + return (EINVAL); + outp[j++] = '%'; + outp[j++] = 'd'; + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'e': + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'p': + case 'g': + case 'u': + case 't': + case '%': + outp[j++] = '%'; + outp[j++] = x; + break; + case 'h': + outp[j++] = '%'; + outp[j++] = 'n'; + break; + default: + /* No translation, make it literal. 
/*
 * Translate an Illumos core pattern path back to Linux format.
 *
 * Escape sequences are mapped as follows:
 *
 *	%d%f		-> %E
 *	%f		-> %e
 *	%n		-> %h
 *	%P		-> %p
 *	%p %g %u %t %%	-> unchanged
 *
 * A %d that is not immediately followed by %f, and any other escape, has
 * no Linux equivalent and is dropped from the output.  A lone '%' at the
 * end of the input is dropped as well.
 *
 * The result is always NUL-terminated when outsz is non-zero.  Literal
 * characters that do not fit are silently truncated.
 *
 * Returns 0 on success, or EINVAL if outsz is zero or the output buffer
 * has too little room left to hold a translated escape sequence.
 */
int
lxpr_core_path_s2l(const char *inp, char *outp, size_t outsz)
{
	size_t i = 0, j = 0;
	char x;

	/*
	 * With no room for even the terminator, "outsz - 1" below would
	 * wrap around and every bound check would pass.
	 */
	if (outsz == 0)
		return (EINVAL);

	while (j < outsz - 1) {
		x = inp[i++];
		if (x == '\0')
			break;
		if (x != '%') {
			outp[j++] = x;
			continue;
		}

		x = inp[i++];
		if (x == '\0')
			break;

		/* Make sure we have enough space in the output buffer. */
		if (j + 2 >= outsz - 1)
			return (EINVAL);

		switch (x) {
		case 'd':
			/* No Linux equivalent unless it's %d%f. */
			if (inp[i] == '%' && inp[i + 1] == 'f') {
				i += 2;
				outp[j++] = '%';
				outp[j++] = 'E';
			}
			break;
		case 'f':
			outp[j++] = '%';
			outp[j++] = 'e';
			break;
		case 'p':
		case 'P':
		case 'g':
		case 'u':
		case 't':
		case '%':
			outp[j++] = '%';
			outp[j++] = (x == 'P' ? 'p' : x);
			break;
		case 'n':
			outp[j++] = '%';
			outp[j++] = 'h';
			break;
		default:
			/* No translation. */
			break;
		}
	}

	outp[j] = '\0';
	return (0);
}
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * lxprvfsops.c: vfs operations for /lxprocfs. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#include "lx_proc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_proc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialise cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Mounting lx_proc is not allowed outside an LX zone. + */ + if (zone->zone_brand != &lx_brand) { + return (ENOTSUP); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Hold a zone reference for access to the lxzd structure. 
+ */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* + * Allocate the first vnode and arbitrarily set the parent vnode to the + * mounted over directory + */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + + ldi_ident_release(lxpr_mnt->lxprm_li); + + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/proc"); + (void) strcpy(&sp->f_fstr[6], "/proc"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c new file mode 100644 index 0000000000..262339c31c --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -0,0 +1,7085 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and 
Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * lx_proc -- a Linux-compatible /proc for the LX brand + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support native (but Linux-borne) programs that wish to view the native + * system through the Linux /proc model; the other -- this one -- is to + * support Linux binaries via the LX brand. These two implementations differ + * greatly in their aspirations (and their willingness to bend the truth + * of the system to accommodate those aspirations); they should not be unified. 
+ */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <lx_auxv.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/fcntl.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/rctl.h> +#include <sys/kstat.h> +#include <sys/lx_misc.h> +#include <sys/lx_types.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/corectl.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> +#include <inet/ipclassifier.h> +#include <sys/socketvar.h> +#include <fs/sockfs/socktpi.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); +extern int prreadbuf(proc_t *, uintptr_t, uint8_t *, size_t, size_t *); + +#include "lx_proc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_create(struct vnode *, char *, struct vattr *, enum vcexcl, + int, struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_write(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_space(vnode_t *, int, flock64_t *, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_setattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *); +static vnode_t 
*lxpr_lookup_sys_kdir_randdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_coredir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_ipv4dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_vmdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kdir_randdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_coredir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_vmdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cgroups(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_devices(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, 
lxpr_uiobuf_t *); +static void lxpr_read_filesystems(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_vmstat(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_auxv(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_cgroup(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_comm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_loginuid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_oom_scr_adj(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_personality(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, 
lxpr_uiobuf_t *); +static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_caplcap(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_corepatt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_osrel(lxpr_node_t *, 
lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_sem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmall(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_core_somaxc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_minfr_kb(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_nhpages(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_swappiness(lxpr_node_t *, lxpr_uiobuf_t *); + +static int lxpr_write_pid_loginuid(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_core_somaxc(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int 
lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_kernel_corepatt(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) + +extern rctl_hndl_t rc_process_semmsl; +extern rctl_hndl_t rc_process_semopm; +extern rctl_hndl_t rc_zone_semmni; + +extern rctl_hndl_t rc_zone_msgmni; +extern rctl_hndl_t rc_zone_shmmax; +extern rctl_hndl_t rc_zone_shmmni; +#define FOURGB 4294967295 + +/* + * The maximum length of the concatenation of argument vector strings we + * will return to the user via the branded procfs. Likewise for the env vector. 
+ */ +int lxpr_maxargvlen = 4096; +int lxpr_maxenvvlen = 4096; + +/* + * The lx /proc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_WRITE, { .vop_read = lxpr_write }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_CREATE, { .vop_create = lxpr_create }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_SPACE, { .vop_space = lxpr_space }, + VOPNAME_SETATTR, { .vop_setattr = lxpr_setattr }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + + +/* + * file contents of an lx /proc directory. + */ +static lxpr_dirent_t lx_procdir[] = { + { LXPR_CGROUPS, "cgroups" }, + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DISKSTATS, "diskstats" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MODULES, "modules" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_SWAPS, "swaps" }, + { LXPR_SYSDIR, "sys" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" }, + { LXPR_VMSTAT, "vmstat" } +}; + +#define PROCDIRFILES (sizeof (lx_procdir) / sizeof (lx_procdir[0])) + +/* + * Contents of an lx /proc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_AUXV, "auxv" }, + { LXPR_PID_CGROUP, "cgroup" }, + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_COMM, "comm" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_LOGINUID, "loginuid" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_OOM_SCR_ADJ, "oom_score_adj" }, + { LXPR_PID_PERSONALITY, "personality" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_TASKDIR, "task" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * Contents of an lx /proc/<pid>/task/<tid> directory. + */ +static lxpr_dirent_t tiddir[] = { + { LXPR_PID_TID_AUXV, "auxv" }, + { LXPR_PID_CGROUP, "cgroup" }, + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_TID_COMM, "comm" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_LOGINUID, "loginuid" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_TID_OOM_SCR_ADJ, "oom_score_adj" }, + { LXPR_PID_PERSONALITY, "personality" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_TID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_TID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define TIDDIRFILES (sizeof (tiddir) / sizeof (tiddir[0])) + +#define LX_RLIM_INFINITY 0xFFFFFFFFFFFFFFFF + +/* + * Evaluates to true when the rcv_flagaction field of x has both + * RCTL_LOCAL_MAXIMAL and RCTL_GLOBAL_INFINITE set. The argument is + * parenthesized so that expressions such as *p expand correctly. + */ +#define RCTL_INFINITE(x) \ + (((x).rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + ((x).rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct lxpr_rlimtab { + char *rlim_name; /* limit name */ + char *rlim_unit; /* limit unit */ + char *rlim_rctl; /* rctl source */ +} lxpr_rlimtab_t; + +static lxpr_rlimtab_t lxpr_rlimtab[] = { + {
"Max cpu time", "seconds", "process.max-cpu-time" }, + { "Max file size", "bytes", "process.max-file-size" }, + { "Max data size", "bytes", "process.max-data-size" }, + { "Max stack size", "bytes", "process.max-stack-size" }, + { "Max core file size", "bytes", "process.max-core-size" }, + { "Max resident set", "bytes", "zone.max-physical-memory" }, + { "Max processes", "processes", "zone.max-lwps" }, + { "Max open files", "files", "process.max-file-descriptor" }, + { "Max locked memory", "bytes", "zone.max-locked-memory" }, + { "Max address space", "bytes", "process.max-address-space" }, + { "Max file locks", "locks", NULL }, + { "Max pending signals", "signals", + "process.max-sigqueue-size" }, + { "Max msgqueue size", "bytes", "process.max-msg-messages" } +}; + +#define LX_RLIM_TAB_LEN (sizeof (lxpr_rlimtab) / sizeof (lxpr_rlimtab[0])) + + +/* + * contents of lx /proc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IF_INET6, "if_inet6" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_IPV6_ROUTE, "ipv6_route" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_TCP6, "tcp6" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UDP6, "udp6" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * contents of /proc/sys directory + */ +static lxpr_dirent_t sysdir[] = { + { LXPR_SYS_FSDIR, "fs" }, + { LXPR_SYS_KERNELDIR, "kernel" }, + { LXPR_SYS_NETDIR, "net" }, + { LXPR_SYS_VMDIR, "vm" }, +}; + +#define SYSDIRFILES (sizeof (sysdir) / sizeof (sysdir[0])) + +/* + * 
contents of /proc/sys/fs directory + */ +static lxpr_dirent_t sys_fsdir[] = { + { LXPR_SYS_FS_INOTIFYDIR, "inotify" }, +}; + +#define SYS_FSDIRFILES (sizeof (sys_fsdir) / sizeof (sys_fsdir[0])) + +/* + * contents of /proc/sys/fs/inotify directory + */ +static lxpr_dirent_t sys_fs_inotifydir[] = { + { LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, +}; + +#define SYS_FS_INOTIFYDIRFILES \ + (sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0])) + +/* + * contents of /proc/sys/kernel directory + */ +static lxpr_dirent_t sys_kerneldir[] = { + { LXPR_SYS_KERNEL_CAPLCAP, "cap_last_cap" }, + { LXPR_SYS_KERNEL_COREPATT, "core_pattern" }, + { LXPR_SYS_KERNEL_HOSTNAME, "hostname" }, + { LXPR_SYS_KERNEL_MSGMNI, "msgmni" }, + { LXPR_SYS_KERNEL_NGROUPS_MAX, "ngroups_max" }, + { LXPR_SYS_KERNEL_OSREL, "osrelease" }, + { LXPR_SYS_KERNEL_PID_MAX, "pid_max" }, + { LXPR_SYS_KERNEL_RANDDIR, "random" }, + { LXPR_SYS_KERNEL_SEM, "sem" }, + { LXPR_SYS_KERNEL_SHMALL, "shmall" }, + { LXPR_SYS_KERNEL_SHMMAX, "shmmax" }, + { LXPR_SYS_KERNEL_SHMMNI, "shmmni" }, + { LXPR_SYS_KERNEL_THREADS_MAX, "threads-max" }, +}; + +#define SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0])) + +/* + * contents of /proc/sys/kernel/random directory + */ +static lxpr_dirent_t sys_randdir[] = { + { LXPR_SYS_KERNEL_RAND_BOOTID, "boot_id" }, +}; + +#define SYS_RANDDIRFILES (sizeof (sys_randdir) / sizeof (sys_randdir[0])) + +/* + * contents of /proc/sys/net directory + */ +static lxpr_dirent_t sys_netdir[] = { + { LXPR_SYS_NET_COREDIR, "core" }, + { LXPR_SYS_NET_IPV4DIR, "ipv4" }, +}; + +#define SYS_NETDIRFILES (sizeof (sys_netdir) / sizeof (sys_netdir[0])) + +/* + * contents of /proc/sys/net/core directory + */ +static lxpr_dirent_t sys_net_coredir[] = { + { LXPR_SYS_NET_CORE_SOMAXCON, "somaxconn" }, +}; + +#define SYS_NET_COREDIRFILES \ + 
(sizeof (sys_net_coredir) / sizeof (sys_net_coredir[0])) + +/* + * contents of /proc/sys/net/ipv4 directory + * See the Linux ip(7) & tcp(7) man pages for descriptions and the illumos + * ip(7p) & tcp(7p) man pages for the native descriptions. + */ +static lxpr_dirent_t sys_net_ipv4dir[] = { + { LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, "ip_local_port_range" }, + { LXPR_SYS_NET_IPV4_TCP_FIN_TO, "tcp_fin_timeout" }, + { LXPR_SYS_NET_IPV4_TCP_KA_INT, "tcp_keepalive_intvl" }, + { LXPR_SYS_NET_IPV4_TCP_KA_TIM, "tcp_keepalive_time" }, + { LXPR_SYS_NET_IPV4_TCP_SACK, "tcp_sack" }, + { LXPR_SYS_NET_IPV4_TCP_WINSCALE, "tcp_window_scaling" }, +}; + +#define SYS_NET_IPV4DIRFILES \ + (sizeof (sys_net_ipv4dir) / sizeof (sys_net_ipv4dir[0])) + +/* + * contents of /proc/sys/vm directory + */ +static lxpr_dirent_t sys_vmdir[] = { + { LXPR_SYS_VM_MAX_MAP_CNT, "max_map_count" }, + { LXPR_SYS_VM_MINFR_KB, "min_free_kbytes" }, + { LXPR_SYS_VM_NHUGEP, "nr_hugepages" }, + { LXPR_SYS_VM_OVERCOMMIT_MEM, "overcommit_memory" }, + { LXPR_SYS_VM_SWAPPINESS, "swappiness" }, +}; + +#define SYS_VMDIRFILES (sizeof (sys_vmdir) / sizeof (sys_vmdir[0])) + +/* + * Table for standard writable files. Non-standard writable files not in this + * table can be handled explicitly as special cases. + * This table drives lxpr_is_writable, lxpr_write, and lxpr_create. + * Note that the entries LXPR_PID_FD_FD and LXPR_PID_TID_FD_FD exist in the + * table both to verify writability and to satisfy opening with O_CREAT.
+ */ +typedef struct wftab { + lxpr_nodetype_t wft_type; /* file entry type */ + int (*wft_wrf)(lxpr_node_t *, struct uio *, cred_t *, + caller_context_t *); /* write function */ +} wftab_t; + +static wftab_t wr_tab[] = { + {LXPR_PID_FD_FD, NULL}, + {LXPR_PID_LOGINUID, lxpr_write_pid_loginuid}, + {LXPR_PID_OOM_SCR_ADJ, NULL}, + {LXPR_PID_TID_FD_FD, NULL}, + {LXPR_PID_TID_OOM_SCR_ADJ, NULL}, + {LXPR_SYS_KERNEL_COREPATT, lxpr_write_sys_kernel_corepatt}, + {LXPR_SYS_KERNEL_SHMALL, NULL}, + {LXPR_SYS_KERNEL_SHMMAX, NULL}, + {LXPR_SYS_NET_CORE_SOMAXCON, lxpr_write_sys_net_core_somaxc}, + {LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, + lxpr_write_sys_net_ipv4_ip_lport_range}, + {LXPR_SYS_NET_IPV4_TCP_FIN_TO, lxpr_write_sys_net_ipv4_tcp_fin_to}, + {LXPR_SYS_NET_IPV4_TCP_KA_INT, lxpr_write_sys_net_ipv4_tcp_ka_int}, + {LXPR_SYS_NET_IPV4_TCP_KA_TIM, lxpr_write_sys_net_ipv4_tcp_ka_tim}, + {LXPR_SYS_NET_IPV4_TCP_SACK, lxpr_write_sys_net_ipv4_tcp_sack}, + {LXPR_SYS_NET_IPV4_TCP_WINSCALE, lxpr_write_sys_net_ipv4_tcp_winscale}, + {LXPR_SYS_VM_OVERCOMMIT_MEM, NULL}, + {LXPR_SYS_VM_SWAPPINESS, NULL}, + {LXPR_INVALID, NULL} +}; + +/* + * Centralized test for the standard writable proc files. Other non-standard + * writable files might be handled separately. + */ +boolean_t +lxpr_is_writable(lxpr_nodetype_t type) +{ + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* Restrict writes to certain files */ + if ((flag & FWRITE) && !lxpr_is_writable(type)) { + return (EPERM); + } + + /* + * If we are opening an underlying file only allow regular files, + * fifos or sockets; reject the open for anything else. 
+ * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG && + rvp->v_type != VFIFO && rvp->v_type != VSOCK) { + error = EACCES; + } else { + if (type == LXPR_PID_FD_FD && rvp->v_type == VFIFO) { + /* + * This flag lets the fifo open know that + * we're using proc/fd to open a fd which we + * already have open. Otherwise, the fifo might + * reject an open if the other end has closed. + */ + flag |= FKLYR; + } + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_auxv, /* /proc/<pid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_pid_comm, /* /proc/<pid>/comm */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_limits, /* /proc/<pid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + 
lxpr_read_pid_mountinfo, /* /proc/<pid>/mountinfo */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/oom_score_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task */ + lxpr_read_isdir, /* /proc/<pid>/task/nn */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_pid_auxv, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_read_pid_comm, /* /proc/<pid>/task/<tid>/comm */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/task/<tid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/exe */ + lxpr_read_pid_limits, /* /proc/<pid>/task/<tid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/task/<tid>/maps */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/mem */ + lxpr_read_pid_mountinfo, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/task/<tid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/root */ + lxpr_read_pid_tid_stat, /* /proc/<pid>/task/<tid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/task/<tid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/task/<tid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_read_fd, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_read_cgroups, /* /proc/cgroups */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_devices, /* /proc/devices */ + lxpr_read_diskstats, /* /proc/diskstats */ + lxpr_read_empty, /* /proc/dma */ + 
lxpr_read_filesystems, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_empty, /* /proc/modules */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_if_inet6, /* /proc/net/if_inet6 */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_ipv6_route, /* /proc/net/ipv6_route */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_tcp6, /* /proc/net/tcp6 */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_udp6, /* /proc/net/udp6 */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_swaps, /* /proc/swaps */ + lxpr_read_invalid, /* /proc/sys */ + lxpr_read_invalid, /* /proc/sys/fs */ + lxpr_read_invalid, /* /proc/sys/fs/inotify */ + lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */ + lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */ + lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */ + lxpr_read_invalid, /* /proc/sys/kernel */ + 
lxpr_read_sys_kernel_caplcap, /* /proc/sys/kernel/cap_last_cap */ + lxpr_read_sys_kernel_corepatt, /* /proc/sys/kernel/core_pattern */ + lxpr_read_sys_kernel_hostname, /* /proc/sys/kernel/hostname */ + lxpr_read_sys_kernel_msgmni, /* /proc/sys/kernel/msgmni */ + lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */ + lxpr_read_sys_kernel_osrel, /* /proc/sys/kernel/osrelease */ + lxpr_read_sys_kernel_pid_max, /* /proc/sys/kernel/pid_max */ + lxpr_read_invalid, /* /proc/sys/kernel/random */ + lxpr_read_sys_kernel_rand_bootid, /* /proc/sys/kernel/random/boot_id */ + lxpr_read_sys_kernel_sem, /* /proc/sys/kernel/sem */ + lxpr_read_sys_kernel_shmall, /* /proc/sys/kernel/shmall */ + lxpr_read_sys_kernel_shmmax, /* /proc/sys/kernel/shmmax */ + lxpr_read_sys_kernel_shmmni, /* /proc/sys/kernel/shmmni */ + lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */ + lxpr_read_invalid, /* /proc/sys/net */ + lxpr_read_invalid, /* /proc/sys/net/core */ + lxpr_read_sys_net_core_somaxc, /* /proc/sys/net/core/somaxconn */ + lxpr_read_invalid, /* /proc/sys/net/ipv4 */ + lxpr_read_sys_net_ipv4_ip_lport_range, /* ../ipv4/ip_local_port_range */ + lxpr_read_sys_net_ipv4_tcp_fin_to, /* .../ipv4/tcp_fin_timeout */ + lxpr_read_sys_net_ipv4_tcp_ka_int, /* .../ipv4/tcp_keepalive_intvl */ + lxpr_read_sys_net_ipv4_tcp_ka_tim, /* .../ipv4/tcp_keepalive_time */ + lxpr_read_sys_net_ipv4_tcp_sack, /* .../ipv4/tcp_sack */ + lxpr_read_sys_net_ipv4_tcp_winscale, /* .../ipv4/tcp_window_scaling */ + lxpr_read_invalid, /* /proc/sys/vm */ + lxpr_read_sys_vm_max_map_cnt, /* /proc/sys/vm/max_map_count */ + lxpr_read_sys_vm_minfr_kb, /* /proc/sys/vm/min_free_kbytes */ + lxpr_read_sys_vm_nhpages, /* /proc/sys/vm/nr_hugepages */ + lxpr_read_sys_vm_overcommit_mem, /* /proc/sys/vm/overcommit_memory */ + lxpr_read_sys_vm_swappiness, /* /proc/sys/vm/swappiness */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ + lxpr_read_vmstat, /* /proc/vmstat */ +}; + 
+/* + * Array of lookup functions, indexed by lx /proc file type. + */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_taskdir, /* /proc/<pid>/task */ + lxpr_lookup_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_lookup_not_a_dir, /* 
/proc/<pid>/task/<tid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cgroups */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/diskstats */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/modules */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + 
lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/swaps */ + lxpr_lookup_sysdir, /* /proc/sys */ + lxpr_lookup_sys_fsdir, /* /proc/sys/fs */ + lxpr_lookup_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_lookup_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_lookup_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/core_pattern */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_lookup_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_lookup_sys_netdir, /* /proc/sys/net */ + lxpr_lookup_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_lookup_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_lookup_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + 
lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_lookup_sys_vmdir, /* /proc/sys/vm */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ + lxpr_lookup_not_a_dir, /* /proc/vmstat */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + 
lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_taskdir, /* /proc/<pid>/task */ + lxpr_readdir_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid/oom_scr_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cgroups */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/diskstats */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + 
lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/modules */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/swaps */ + lxpr_readdir_sysdir, /* /proc/sys */ + lxpr_readdir_sys_fsdir, /* /proc/sys/fs */ + lxpr_readdir_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_readdir_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_readdir_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_readdir_not_a_dir, /* 
/proc/sys/kernel/core_pattern */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmni */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_readdir_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_readdir_sys_netdir, /* /proc/sys/net */ + lxpr_readdir_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_readdir_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_readdir_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_readdir_sys_vmdir, /* /proc/sys/vm */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ + lxpr_readdir_not_a_dir, /* /proc/vmstat */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in the lx procfs is human + * readable and not binary structures there do not 
have to be different + * read variants depending on whether the reading process model is 32 or 64 bits + * (at least in general, and certainly the difference is unlikely to be enough + * to justify have different routines for 32 and 64 bit reads + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. + */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, 
EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_auxv(): read process aux vector + */ +static void +lxpr_read_pid_auxv(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + lx_elf_data_t *edp = NULL; + int i, cnt; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_AUXV || + lxpnp->lxpr_type == LXPR_PID_TID_AUXV); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + + if (p == NULL) { + return; + } + if ((pd = ptolxproc(p)) == NULL) { + /* Emit a single AT_NULL record for non-branded processes */ + auxv_t buf; + + bzero(&buf, sizeof (buf)); + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)&buf, sizeof (buf)); + return; + } else { + edp = &pd->l_elf_data; + } + + if (p->p_model == DATAMODEL_NATIVE) { + auxv_t buf[__KERN_NAUXV_IMPL]; + + /* + * Because a_type is only of size int (not long), the buffer + * contents must be zeroed first to ensure cleanliness. + */ + bzero(buf, sizeof (buf)); + for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) { + if (lx_auxv_stol(&p->p_user.u_auxv[i], + &buf[cnt], edp) == 0) { + cnt++; + } + if (p->p_user.u_auxv[i].a_type == AT_NULL) { + break; + } + } + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0])); + } +#if defined(_SYSCALL32_IMPL) + else { + auxv32_t buf[__KERN_NAUXV_IMPL]; + + for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) { + auxv_t temp; + + if (lx_auxv_stol(&p->p_user.u_auxv[i], + &temp, edp) == 0) { + buf[cnt].a_type = (int)temp.a_type; + buf[cnt].a_un.a_val = (int)temp.a_un.a_val; + cnt++; + } + if (p->p_user.u_auxv[i].a_type == AT_NULL) { + break; + } + } + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0])); + } +#endif /* defined(_SYSCALL32_IMPL) */ +} + +/* + * lxpr_read_pid_cgroup(): read cgroups for process + */ +static void +lxpr_read_pid_cgroup(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CGROUP || + 
lxpnp->lxpr_type == LXPR_PID_TID_CGROUP); + + p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + lxpr_unlock(p); + + /* basic stub, 3rd field will need to be populated */ + lxpr_uiobuf_printf(uiobuf, "1:name=systemd:/\n"); +} + +static void +lxpr_copy_cmdline(proc_t *p, lx_proc_data_t *pd, lxpr_uiobuf_t *uiobuf) +{ + uio_t *uiop = uiobuf->uiop; + char *buf = uiobuf->buffer; + int bsz = uiobuf->buffsize; + boolean_t env_overflow = B_FALSE; + uintptr_t pos = pd->l_args_start + uiop->uio_offset; + uintptr_t estart = pd->l_envs_start; + uintptr_t eend = pd->l_envs_end; + size_t chunk, copied; + int err = 0; + + /* Do not bother with data beyond the end of the envp strings area. */ + if (pos > eend) { + return; + } + mutex_exit(&p->p_lock); + + /* + * If the starting or ending bounds are outside the argv strings area, + * check to see if the process has overwritten the terminating NULL. + * If not, no data needs to be copied from oustide the argv area. + */ + if (pos >= estart || (pos + uiop->uio_resid) >= estart) { + uint8_t term; + if (uread(p, &term, sizeof (term), estart - 1) != 0) { + err = EFAULT; + } else if (term != 0) { + env_overflow = B_TRUE; + } + } + + /* Data between astart and estart-1 can be copied freely. */ + while (pos < estart && uiop->uio_resid > 0 && err == 0) { + chunk = MIN(estart - pos, uiop->uio_resid); + chunk = MIN(chunk, bsz); + + if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) != 0 || + copied != chunk) { + err = EFAULT; + break; + } + err = uiomove(buf, copied, UIO_READ, uiop); + pos += copied; + } + + /* + * Onward from estart, data is copied as a contiguous string. To + * protect env data from potential snooping, only one buffer-sized copy + * is allowed to avoid complex seek logic. 
+ */ + if (err == 0 && env_overflow && pos == estart && uiop->uio_resid > 0) { + chunk = MIN(eend - pos, uiop->uio_resid); + chunk = MIN(chunk, bsz); + if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) == 0) { + int len = strnlen(buf, copied); + if (len > 0) { + err = uiomove(buf, len, UIO_READ, uiop); + } + } + } + + uiobuf->error = err; + /* reset any uiobuf state */ + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + + mutex_enter(&p->p_lock); +} + +/* + * lxpr_read_pid_cmdline(): read argument vector from process + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxargvlen, sz; + lx_proc_data_t *pd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE || + lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + if ((pd = ptolxproc(p)) != NULL && pd->l_args_start != 0 && + pd->l_envs_start != 0 && pd->l_envs_end != 0) { + /* Use Linux-style argv bounds if possible. */ + lxpr_copy_cmdline(p, pd, uiobuf); + lxpr_unlock(p); + } else { + int r; + + r = prreadargv(p, buf, asz, &sz); + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + } + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_comm(): read command from process + */ +static void +lxpr_read_pid_comm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char buf[MAXCOMLEN + 1]; + + VERIFY(lxpnp->lxpr_type == LXPR_PID_COMM || + lxpnp->lxpr_type == LXPR_PID_TID_COMM); + + /* + * Because prctl(PR_SET_NAME) does not set custom names for threads + * (vs processes), there is no need for special handling here. 
+ */ + if ((p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + strlcpy(buf, p->p_user.u_comm, sizeof (buf)); + lxpr_unlock(p); + lxpr_uiobuf_printf(uiobuf, "%s\n", buf); +} + +/* + * lxpr_read_pid_env(): read env vector from process + */ +static void +lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxenvvlen, sz; + int r; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + r = prreadenvv(p, buf, asz, &sz); + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_limits(): ulimit file + */ +static void +lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + rctl_qty_t cur[LX_RLIM_TAB_LEN], max[LX_RLIM_TAB_LEN]; + int i; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS || + lxpnp->lxpr_type == LXPR_PID_TID_LIMITS); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + if (p == NULL) { + return; + } + + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + char *kname = lxpr_rlimtab[i].rlim_rctl; + rctl_val_t nval, *oval = NULL; + rctl_hndl_t hndl; + + /* default to unlimited for resources without an analog */ + cur[i] = RLIM_INFINITY; + max[i] = RLIM_INFINITY; + if (kname == NULL || (hndl = rctl_hndl_lookup(kname)) == -1) { + continue; + } + while (rctl_local_get(hndl, oval, &nval, p) == 0) { + oval = &nval; + switch (nval.rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + cur[i] = nval.rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + max[i] = nval.rcv_value; + break; + } + } + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + 
lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name); + if (cur[i] == RLIM_INFINITY || cur[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", cur[i]); + } + if (max[i] == RLIM_INFINITY || max[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", max[i]); + } + lxpr_uiobuf_printf(uiobuf, " %-10s\n", + lxpr_rlimtab[i].rlim_unit); + } +} + +/* + * lxpr_read_pid_loginuid(): loginuid file + */ +static void +lxpr_read_pid_loginuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + uid_t lu = 0; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID || + lxpnp->lxpr_type == LXPR_PID_TID_LOGINUID); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + if ((pd = ptolxproc(p)) != NULL) { + lu = pd->l_loginuid; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%d", lu); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + uintptr_t saddr; + uintptr_t eaddr; + int type; + char prot[5]; + uintptr_t offset; + vnode_t *vp; + char *name_override; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS || + lxpnp->lxpr_type == LXPR_PID_TID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB); + if (p == NULL) { + return; + } + + as = p->p_as; + lxpd = ptolxproc(p); + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) 
{ + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = (uintptr_t)seg->s_base; + pbuf->eaddr = pbuf->saddr + seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr); + + pbuf->name_override = NULL; + if (lxpd != NULL) { + if (pbuf->saddr == lxpd->l_vdso) { + pbuf->name_override = "[vdso]"; + } else if (pbuf->saddr == p->p_user.u_commpagep) { + pbuf->name_override = "[vvar]"; + } + } + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + ino_t inode = 0; + + *buf = '\0'; + if (pbuf->name_override != NULL) { + (void) strncpy(buf, pbuf->name_override, buflen); + } else if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (p->p_model == DATAMODEL_LP64) { + 
lxpr_uiobuf_printf(uiobuf, + "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, *buf != '\0' ? " " : "", buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02x:%02x %llu%s%s\n", + (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr, + pbuf->prot, (uint32_t)pbuf->offset, maj, min, + inode, *buf != '\0' ? " " : "", buf); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * Make mount entry look more like Linux. Non-zero return to skip it. + */ +static int +lxpr_clean_mntent(char **mntpt, char **fstype, char **resource) +{ + if (strcmp(*mntpt, "/var/ld") == 0 || + strcmp(*fstype, "objfs") == 0 || + strcmp(*fstype, "mntfs") == 0 || + strcmp(*fstype, "ctfs") == 0 || + strncmp(*mntpt, "/native/", 8) == 0) { + return (1); + } + + if (strcmp(*fstype, "tmpfs") == 0) { + *resource = "tmpfs"; + } else if (strcmp(*fstype, "lx_proc") == 0) { + *resource = *fstype = "proc"; + } else if (strcmp(*fstype, "lx_sysfs") == 0) { + *resource = *fstype = "sysfs"; + } else if (strcmp(*fstype, "lx_devfs") == 0) { + *resource = *fstype = "devtmpfs"; + } else if (strcmp(*fstype, "lx_cgroup") == 0) { + *resource = *fstype = "cgroup"; + } else if (strcmp(*fstype, "lxautofs") == 0) { + *fstype = "autofs"; + } + + return (0); +} + + +typedef struct lxpr_mount_entry { + list_node_t lme_link; + uint_t lme_id; + uint_t lme_parent_id; + refstr_t *lme_mntpt; + refstr_t *lme_resource; + uint_t lme_flag; + int lme_fstype; + dev_t lme_dev; + boolean_t lme_force; +} lxpr_mount_entry_t; + +static int lxpr_zfs_fstype = -1; + +#define LXPR_ROOT_MOUNT_ID 15 + +static list_t * +lxpr_enumerate_mounts(zone_t *zone) +{ + vfs_t *vfsp, *rvfsp, *vfslist; + lx_zone_data_t *lxzd = ztolxzd(zone); + list_t *result; + lxpr_mount_entry_t *lme; + lx_virt_disk_t *vd; + uint_t root_id, mount_id; + char tmppath[MAXPATHLEN]; + + result = kmem_alloc(sizeof (list_t), 
KM_SLEEP); + list_create(result, sizeof (lxpr_mount_entry_t), + offsetof(lxpr_mount_entry_t, lme_link)); + /* use an arbitrary start value for the root mount_id */ + root_id = 15; + mount_id = root_id + 1; + + ASSERT(zone != global_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + vfs_list_read_lock(); + vfsp = vfslist = zone->zone_vfslist; + + /* + * If the zone has a root entry, it will be the first in the list. + * Conjure one up if needed. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + rvfsp = zone->zone_rootvp->v_vfsp; + } else { + rvfsp = vfslist; + vfsp = vfslist->vfs_zone_next; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = root_id; + lme->lme_parent_id = 0; + lme->lme_mntpt = refstr_alloc(zone->zone_rootpath); + lme->lme_flag = rvfsp->vfs_flag; + lme->lme_fstype = rvfsp->vfs_fstype; + lme->lme_force = B_TRUE; + + lme->lme_resource = NULL; + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == rvfsp->vfs_dev) { + (void) snprintf(tmppath, sizeof (tmppath), + "%sdev/%s", zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(tmppath); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if (lme->lme_resource == NULL) { + lme->lme_resource = refstr_alloc(zone->zone_rootpath); + lme->lme_dev = rvfsp->vfs_dev; + } + list_insert_head(result, lme); + + do { + if (vfsp == NULL) { + break; + } + /* Skip mounts we shouldn't show */ + if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_mntpt); + lme->lme_flag = vfsp->vfs_flag; + lme->lme_fstype = vfsp->vfs_fstype; + lme->lme_force = B_FALSE; + + lme->lme_resource = NULL; 
+ vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == vfsp->vfs_dev) { + char vdev[MAXPATHLEN]; + + (void) snprintf(vdev, sizeof (vdev), + "%sdev/%s", + zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(vdev); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if (lme->lme_resource == NULL) { + lme->lme_resource = vfsp->vfs_resource; + refstr_hold(vfsp->vfs_resource); + lme->lme_dev = vfsp->vfs_dev; + } + list_insert_tail(result, lme); + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* Add a single dummy entry for /native/usr */ + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_flag = VFS_RDONLY; + lme->lme_dev = makedevice(0, 1); + (void) snprintf(tmppath, sizeof (tmppath), + "%snative/usr", zone->zone_rootpath); + lme->lme_mntpt = refstr_alloc(tmppath); + lme->lme_resource = lme->lme_mntpt; + refstr_hold(lme->lme_mntpt); + if (lxpr_zfs_fstype == -1) { + vfssw_t *zfssw = vfs_getvfssw("zfs"); + VERIFY(zfssw != NULL); + lxpr_zfs_fstype = ((uintptr_t)zfssw - (uintptr_t)vfssw) / + sizeof (vfssw[0]); + VERIFY(&vfssw[lxpr_zfs_fstype] == zfssw); + } + lme->lme_fstype = lxpr_zfs_fstype; + lme->lme_force = B_TRUE; + list_insert_tail(result, lme); + + return (result); +} + +/* + * lxpr_read_pid_mountinfo(): information about process mount points. 
+ */ +static void +lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + list_t *mounts; + lxpr_mount_entry_t *lme; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO || + lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO); + + mounts = lxpr_enumerate_mounts(zone); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + while (lme != NULL) { + char *resource, *mntpt, *fstype, *rwflag; + vnode_t *vp; + int error; + + mntpt = (char *)refstr_value(lme->lme_mntpt); + resource = (char *)refstr_value(lme->lme_resource); + + if (mntpt == NULL || mntpt[0] == '\0') { + goto nextp; + } + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + goto nextp; + } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* Make things look more like Linux. */ + fstype = vfssw[lme->lme_fstype].vsw_name; + if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 && + !lme->lme_force) { + goto nextp; + } + rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro"; + + /* + * XXX parent ID is not tracked correctly here. Currently we + * always assume the parent ID is the root ID. 
+ */ + lxpr_uiobuf_printf(uiobuf, + "%d %d %d:%d / %s %s - %s %s %s\n", + lme->lme_id, lme->lme_parent_id, + getmajor(lme->lme_dev), getminor(lme->lme_dev), + mntpt, rwflag, fstype, resource, rwflag); + +nextp: + refstr_rele(lme->lme_mntpt); + refstr_rele(lme->lme_resource); + kmem_free(lme, sizeof (lxpr_mount_entry_t)); + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + } + + list_destroy(mounts); + kmem_free(mounts, sizeof (list_t)); +} + +/* + * lxpr_read_pid_oom_scr_adj(): read oom_score_adj for process + */ +static void +lxpr_read_pid_oom_scr_adj(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_OOM_SCR_ADJ || + lxpnp->lxpr_type == LXPR_PID_TID_OOM_SCR_ADJ); + + p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + lxpr_unlock(p); + + /* always 0 */ + lxpr_uiobuf_printf(uiobuf, "0\n"); +} + +/* + * lxpr_read_pid_personality(): read personality for process + */ +static void +lxpr_read_pid_personality(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + unsigned int personality; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_PERSONALITY); + + p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + if ((lxpd = ptolxproc(p)) != NULL) { + personality = lxpd->l_personality; + } else { + /* Report native processes as having the SunOS personality */ + personality = LX_PER_SUNOS; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%08x\n", personality); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize, rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM || + lxpnp->lxpr_type == LXPR_PID_TID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + 
mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * Look for either the main thread (lookup_id is 0) or the specified thread. + * If we're looking for the main thread but the proc does not have one, we + * fallback to using prchoose to get any thread available. + */ +static kthread_t * +lxpr_get_thread(proc_t *p, uint_t lookup_id) +{ + kthread_t *t; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + pid_t pid = p->p_pid; + pid_t init_pid = curproc->p_zone->zone_proc_initpid; + boolean_t branded = (p->p_brand == &lx_brand); + + /* get specified thread */ + if ((t = p->p_tlist) == NULL) + return (NULL); + + do { + if (lookup_id == 0 && t->t_tid == 1) { + thread_lock(t); + return (t); + } + + lwpd = ttolxlwp(t); + if (branded && lwpd != NULL) { + if (pid == init_pid && lookup_id == 1) { + emul_tid = t->t_tid; + } else { + emul_tid = lwpd->br_pid; + } + } else { + /* + * Make only the first (assumed to be main) thread + * visible for non-branded processes. + */ + emul_tid = p->p_pid; + } + if (emul_tid == lookup_id) { + thread_lock(t); + return (t); + } + } while ((t = t->t_forw) != p->p_tlist); + + if (lookup_id == 0) + return (prchoose(p)); + return (NULL); +} + +/* + * Lookup the real pid for procs 0 or 1. 
+ */ +static pid_t +get_real_pid(pid_t p) +{ + pid_t find_pid; + + if (p == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (p == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = p; + } + + return (find_pid); +} + +/* + * pid/tid common code to read status file + */ +static void +lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + struct as *as; + char *status; + pid_t pid, ppid; + k_sigset_t current, ignore, handle; + int i, lx_sig, lwpcnt, ngroups; + pid_t real_pid; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t fdlim; + size_t vsize = 0, nlocked = 0, rss = 0, stksize = 0; + boolean_t printsz = B_FALSE; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process or if we're the zone's zsched the pid is 0. + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else if (pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; /* zsched is pid 0 */ + ppid = 0; /* parent pid for zsched is itself */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? 
curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = lxpr_get_thread(p, lookup_id); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + strlcpy(buf_comm, up->u_comm, sizeof (buf_comm)); + fdlim = p->p_fno_ctl; + lwpcnt = p->p_lwpcnt; + + /* + * Gather memory information + */ + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + nlocked = p->p_locked_mem; + stksize = p->p_stksize; + printsz = B_TRUE; + } + + /* + * Gather signal information + */ + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + for (i = 1; i < NSIG; i++) { + lx_sig = stol_signo[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + 
"Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + buf_comm, + status, + pid, /* thread group id - same as pid */ + (lookup_id == 0) ? pid : lxpnp->lxpr_desc, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + fdlim); + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + if (printsz) { + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + btok(nlocked), + ptok(rss), + 0l, + btok(stksize), + ptok(rss), + 0l); + } + lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u\n", lwpcnt); + lxpr_uiobuf_printf(uiobuf, + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0]); + /* Report only the full bounding set for now */ + lxpr_uiobuf_printf(uiobuf, + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n" + "CapBnd:\t%016llx\n", + 0, 0, 0, 0x1fffffffffLL); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + lxpr_read_status_common(lxpnp, uiobuf, 0); +} + +/* + * lxpr_read_pid_tid_status(): status file + */ +static void +lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STATUS); + lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc); +} + +/* + * Same logic as the lx devfs 
lxd_pts_devt_translator. + */ +static dev_t +lxpr_xlate_pts_dev(dev_t dev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + + return (LX_MAKEDEVICE(lx_maj, lx_min)); +} + +/* + * pid/tid common code to read stat file + */ +static void +lxpr_read_stat_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri, lwpcnt; + caddr_t wchan, stackbase; + processorid_t cpu; + pid_t real_pid; + clock_t utime, stime, cutime, cstime, ticks, boottime; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t vmem_ctl; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + /* + * Set Linux defaults if we're the zone's init process + */ + pid = p->p_pid; + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else if (pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; /* PID for zsched */ + ppid = 0; /* parent PID for zsched is 0 */ + pgpid = 0; /* process group for zsched is 0 */ + psgid = (gid_t)-1; /* credential GID for zsched is -1 */ + spid = 0; /* session id for zsched is 0 */ + psdev = 0; /* session device for zsched is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = lxpr_xlate_pts_dev(p->p_sessp->s_dev); + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + stackbase = 0; + } else { + /* from prgetstackbase() */ + stackbase = p->p_usrstack - p->p_stksize; + } + + utime = stime = 0; + t = lxpr_get_thread(p, lookup_id); + if (t != NULL) { + klwp_t *lwp = ttolwp(t); + struct mstate *ms = &lwp->lwp_mstate; + hrtime_t utm, stm; + + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; + break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; + break; + case TS_ZOMB: + stat = 'Z'; + break; + case TS_STOPPED: + stat = 'T'; + break; + default: + stat = '!'; + break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + + utm = ms->ms_acct[LMS_USER]; + stm = ms->ms_acct[LMS_SYSTEM]; + + thread_unlock(t); + + /* convert unscaled high-res time to nanoseconds */ + scalehrtime(&utm); + scalehrtime(&stm); + + /* Linux /proc expects these values in ticks */ + utime = (clock_t)NSEC_TO_TICK(utm); + stime = (clock_t)NSEC_TO_TICK(stm); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + 
AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + + if (lookup_id == 0) { + /* process */ + utime = p->p_utime; + stime = p->p_stime; + } else { + /* tid: utime & stime for the thread set in block above */ + } + cutime = p->p_cutime; + cstime = p->p_cstime; + lwpcnt = p->p_lwpcnt; + vmem_ctl = p->p_vmem_ctl; + strlcpy(buf_comm, p->p_user.u_comm, sizeof (buf_comm)); + ticks = p->p_user.u_ticks; /* lbolt at process start */ + /* adjust ticks to account for zone boot time */ + boottime = LXPTOZ(lxpnp)->zone_zsched->p_user.u_ticks; + ticks -= boottime; + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%d " /* 1 */ + "(%s) %c %d %d %d %d %d " /* 2-8 */ + "%lu %lu %lu %lu %lu " /* 9-13 */ + "%lu %lu %ld %ld " /* 14-17 */ + "%d %d %d " /* 18-20 */ + "%lu " /* 21 */ + "%lu " /* 22 */ + "%lu %ld %llu " /* 23-25 */ + "%lu %lu %llu " /* 26-28 */ + "%lu %lu " /* 29-30 */ + "%lu %lu %lu %lu " /* 31-34 */ + "%lu " /* 35 */ + "%lu %lu " /* 36-37 */ + "%d " /* 38 */ + "%d" /* 39 */ + "\n", + (lookup_id == 0) ? 
pid : lxpnp->lxpr_desc, /* 1 */ + buf_comm, stat, ppid, pgpid, spid, psdev, psgid, /* 2-8 */ + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + utime, stime, cutime, cstime, /* 14-17 */ + pri, nice, lwpcnt, /* 18-20 */ + 0l, /* itrealvalue (time before next SIGALRM) 21 */ + ticks, /* 22 */ + vsize, rss, vmem_ctl, /* 23-25 */ + 0l, 0l, stackbase, /* startcode, endcode, startstack 26-28 */ + 0l, 0l, /* kstkesp, kstkeip 29-30 */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch 31-34 */ + wchan, /* 35 */ + 0l, 0l, /* nswap,cnswap 36-37 */ + 0, /* exit_signal 38 */ + cpu /* 39 */); +} + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + lxpr_read_stat_common(lxpnp, uiobuf, 0); +} + +/* + * lxpr_read_pid_tid_stat(): pid stat file + */ +static void +lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STAT); + lxpr_read_stat_common(lxpnp, uiobuf, lxpnp->lxpr_desc); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +struct lxpr_ifstat { + uint64_t rx_bytes; + uint64_t rx_packets; + uint64_t rx_errors; + uint64_t rx_drop; + uint64_t tx_bytes; + uint64_t tx_packets; + uint64_t tx_errors; + uint64_t tx_drop; + uint64_t collisions; + uint64_t rx_multicast; +}; + +static void * +lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num) +{ + kstat_t *kp; + int i, nrec = 0; + size_t bufsize; + void *buf = NULL; + + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, getzoneid()); + } else { + kp = kstat_hold_bykid(kn->ks_kid, getzoneid()); + } + if (kp == NULL) { + return (NULL); + } + if (kp->ks_flags & KSTAT_FLAG_INVALID) { + kstat_rele(kp); + return (NULL); + } + + bufsize = kp->ks_data_size + 1; + kstat_rele(kp); + + /* + * The kstat in question is released 
so that kmem_alloc(KM_SLEEP) is + * performed without it held. After the alloc, the kstat is reacquired + * and its size is checked again. If the buffer is no longer large + * enough, the alloc and check are repeated up to three times. + */ + for (i = 0; i < 2; i++) { + buf = kmem_alloc(bufsize, KM_SLEEP); + + /* Check if bufsize still appropriate */ + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, getzoneid()); + } else { + kp = kstat_hold_bykid(kn->ks_kid, getzoneid()); + } + if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) { + if (kp != NULL) { + kstat_rele(kp); + } + kmem_free(buf, bufsize); + return (NULL); + } + KSTAT_ENTER(kp); + (void) KSTAT_UPDATE(kp, KSTAT_READ); + if (bufsize < kp->ks_data_size) { + kmem_free(buf, bufsize); + buf = NULL; + bufsize = kp->ks_data_size + 1; + KSTAT_EXIT(kp); + kstat_rele(kp); + continue; + } else { + if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) { + kmem_free(buf, bufsize); + buf = NULL; + } + nrec = kp->ks_ndata; + KSTAT_EXIT(kp); + kstat_rele(kp); + break; + } + } + + if (buf != NULL) { + *size = bufsize; + *num = nrec; + } + return (buf); +} + +static int +lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs) +{ + kstat_named_t *kp; + int i, num; + size_t size; + + /* + * Search by name instead of by kid since there's a small window to + * race against kstats being added/removed. 
+ */ + bzero(ifs, sizeof (*ifs)); + kp = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num); + if (kp == NULL) + return (-1); + for (i = 0; i < num; i++) { + if (strncmp(kp[i].name, "rbytes64", KSTAT_STRLEN) == 0) + ifs->rx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ipackets64", KSTAT_STRLEN) == 0) + ifs->rx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ierrors", KSTAT_STRLEN) == 0) + ifs->rx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "norcvbuf", KSTAT_STRLEN) == 0) + ifs->rx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "multircv", KSTAT_STRLEN) == 0) + ifs->rx_multicast = kp[i].value.ui32; + else if (strncmp(kp[i].name, "obytes64", KSTAT_STRLEN) == 0) + ifs->tx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "opackets64", KSTAT_STRLEN) == 0) + ifs->tx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "oerrors", KSTAT_STRLEN) == 0) + ifs->tx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "noxmtbuf", KSTAT_STRLEN) == 0) + ifs->tx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "collisions", KSTAT_STRLEN) == 0) + ifs->collisions = kp[i].value.ui32; + } + kmem_free(kp, size); + return (0); +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + int i, nidx; + size_t sidx; + struct lxpr_ifstat ifs; + + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + if (ksr == NULL) + return; + + for (i = 1; i < nidx; i++) { + if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 || + strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) { + if (lxpr_kstat_ifstat(&ksr[i], &ifs) != 0) + continue; + + /* Overwriting the name is ok in the local snapshot 
*/ + lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE); + lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu " + "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u " + "%5lu %7u %10u\n", + ksr[i].ks_name, + ifs.rx_bytes, ifs.rx_packets, + ifs.rx_errors, ifs.rx_drop, + 0, 0, 0, ifs.rx_multicast, + ifs.tx_bytes, ifs.tx_packets, + ifs.tx_errors, ifs.tx_drop, + 0, ifs.collisions, 0, 0); + } + } + + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_inet6_out(const in6_addr_t *addr, char buf[33]) +{ + const uint8_t *ip = addr->s6_addr; + char digits[] = "0123456789abcdef"; + int i; + for (i = 0; i < 16; i++) { + buf[2 * i] = digits[ip[i] >> 4]; + buf[2 * i + 1] = digits[ip[i] & 0xf]; + } + buf[32] = '\0'; +} + +/* ARGSUSED */ +static void +lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + ipif_t *ipif; + ill_walk_context_t ctx; + char ifname[LIFNAMSIZ], ip6out[33]; + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V6(&ctx, ipst); + + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + uint_t index = ill->ill_phyint->phyint_ifindex; + int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); + unsigned int scope = lx_ipv6_scope_convert( + &ipif->ipif_v6lcl_addr); + /* Always report PERMANENT flag */ + int flag = 0x80; + + (void) snprintf(ifname, LIFNAMSIZ, "%s", ill->ill_name); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out); + + lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x" + " %8s\n", ip6out, index, plen, scope, flag, ifname); + } + } + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, 
lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + char ipv6addr[33]; + + lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr, + ip_mask_to_plen_v6(&ire->ire_mask_v6)); + + /* punt on this for now */ + lxpr_uiobuf_printf(uiobuf, "%s %02x ", + "00000000000000000000000000000000", 0); + + lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr); + + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + /* Linux's RTF_LOCAL equivalent */ + if (ire->ire_metrics.iulp_local) + flags |= 0x80000000; + + if (ire->ire_ill != NULL) { + ill_get_name(ire->ire_ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n", + 0, /* metric */ + ire->ire_refcnt, + 0, + flags, + name); +} + +/* ARGSUSED */ +static void +lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. 
+ */ + ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +#define LXPR_SKIP_ROUTE(type) \ + (((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \ + IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & type) != 0) + +static void +lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + ill_t *ill; + ire_t *nire; + ipif_t *ipif; + ipaddr_t gateway; + + if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0) + return; + + /* These route flags have direct Linux equivalents */ + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + + /* + * Search for a suitable IRE for naming purposes. + * On Linux, the default route is typically associated with the + * interface used to access gateway. The default IRE on Illumos + * typically lacks an ill reference but its parent might have one. + */ + nire = ire; + do { + ill = nire->ire_ill; + nire = nire->ire_dep_parent; + } while (ill == NULL && nire != NULL); + if (ill != NULL) { + ill_get_name(ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '*'; + name[1] = '\0'; + } + + /* + * Linux suppresses the gateway address for directly connected + * interface networks. To emulate this behavior, we walk all addresses + * of a given route interface. If one matches the gateway, it is + * displayed as NULL. 
+ */ + gateway = ire->ire_gateway_addr; + if ((ill = ire->ire_ill) != NULL) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (ipif->ipif_lcl_addr == gateway) { + gateway = 0; + break; + } + } + } + + lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u\n", + name, + ire->ire_addr, + gateway, + flags, 0, 0, + 0, /* priority */ + ire->ire_mask, + 0, 0, /* mss, window */ + ire->ire_metrics.iulp_rtt); +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t" + "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n"); + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +typedef struct lxpr_snmp_table { + const char *lst_proto; + const char *lst_fields[]; +} lxpr_snmp_table_t; + +static lxpr_snmp_table_t lxpr_snmp_ip = { "ip", + { + "forwarding", "defaultTTL", "inReceives", "inHdrErrors", + "inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards", + "inDelivers", "outRequests", "outDiscards", "outNoRoutes", + "reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs", + "fragFails", "fragCreates", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp", + { + "inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds", + "inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", 
"inEchoReps", + "inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps", + "outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds", + "outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos", + "outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks", + "outAddrMaskReps", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp", + { + "rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens", + "passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs", + "outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_udp = { "udp", + { + "inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors", + "sndbufErrors", "inCsumErrors", + NULL + } +}; + +static lxpr_snmp_table_t *lxpr_net_snmptab[] = { + &lxpr_snmp_ip, + &lxpr_snmp_icmp, + &lxpr_snmp_tcp, + &lxpr_snmp_udp, + NULL +}; + +static void +lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table, + kstat_t *kn) +{ + kstat_named_t *klist; + char upname[KSTAT_STRLEN], upfield[KSTAT_STRLEN]; + int i, j, num; + size_t size; + + klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num); + if (klist == NULL) + return; + + /* Print the header line, fields capitalized */ + (void) strncpy(upname, table->lst_proto, KSTAT_STRLEN); + upname[0] = toupper(upname[0]); + lxpr_uiobuf_printf(uiobuf, "%s:", upname); + for (i = 0; table->lst_fields[i] != NULL; i++) { + (void) strncpy(upfield, table->lst_fields[i], KSTAT_STRLEN); + upfield[0] = toupper(upfield[0]); + lxpr_uiobuf_printf(uiobuf, " %s", upfield); + } + lxpr_uiobuf_printf(uiobuf, "\n%s:", upname); + + /* Then loop back through to print the value line. 
*/ + for (i = 0; table->lst_fields[i] != NULL; i++) { + kstat_named_t *kpoint = NULL; + for (j = 0; j < num; j++) { + if (strncmp(klist[j].name, table->lst_fields[i], + KSTAT_STRLEN) == 0) { + kpoint = &klist[j]; + break; + } + } + if (kpoint == NULL) { + /* Output 0 for unknown fields */ + lxpr_uiobuf_printf(uiobuf, " 0"); + } else { + switch (kpoint->data_type) { + case KSTAT_DATA_INT32: + lxpr_uiobuf_printf(uiobuf, " %d", + kpoint->value.i32); + break; + case KSTAT_DATA_UINT32: + lxpr_uiobuf_printf(uiobuf, " %u", + kpoint->value.ui32); + break; + case KSTAT_DATA_INT64: + lxpr_uiobuf_printf(uiobuf, " %ld", + kpoint->value.l); + break; + case KSTAT_DATA_UINT64: + lxpr_uiobuf_printf(uiobuf, " %lu", + kpoint->value.ul); + break; + } + } + } + lxpr_uiobuf_printf(uiobuf, "\n"); + kmem_free(klist, size); +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + lxpr_snmp_table_t **table = lxpr_net_snmptab; + int i, t, nidx; + size_t sidx; + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + if (ksr == NULL) + return; + + for (t = 0; table[t] != NULL; t++) { + for (i = 0; i < nidx; i++) { + if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0) + continue; + if (strncmp(ksr[i].ks_name, table[t]->lst_proto, + KSTAT_STRLEN) == 0) { + lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i]); + break; + } + } + } + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static int +lxpr_convert_tcp_state(int st) +{ + /* + * Derived from the enum located in the Linux kernel sources: + * include/net/tcp_states.h + */ + switch (st) { + case TCPS_ESTABLISHED: + return (1); + case TCPS_SYN_SENT: + return (2); + case TCPS_SYN_RCVD: + return (3); + case TCPS_FIN_WAIT_1: + return (4); + case TCPS_FIN_WAIT_2: + return (5); + case TCPS_TIME_WAIT: + return (6); + case TCPS_CLOSED: + return (7); + case 
TCPS_CLOSE_WAIT: + return (8); + case TCPS_LAST_ACK: + return (9); + case TCPS_LISTEN: + return (10); + case TCPS_CLOSING: + return (11); + default: + /* No translation for TCPS_IDLE, TCPS_BOUND or anything else */ + return (0); + } +} + +static void +lxpr_format_tcp(lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address " + "st tx_queue rx_queue tr tm->when retrnsmt uid timeout " + "inode\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode\n"); + } + /* + * Due to differences between the Linux and illumos TCP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st + * - tx_queue + * - rx_queue + * - uid + * - inode + * + * Omitted/invalid fields + * - tr + * - tm->when + * - retrnsmt + * - timeout + */ + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp_t *tcp; + vattr_t attr; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + vnode_t *vp = (so != NULL) ? 
so->so_vnode : NULL; + if (connp->conn_ipversion != ipver) + continue; + tcp = connp->conn_tcp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + /* fetch the simulated inode for the socket */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %u %u %u %u %d\n", + lxpr_convert_tcp_state(tcp->tcp_state), + tcp->tcp_rcv_cnt, tcp->tcp_unsent, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* per-connection rexmits aren't tracked today */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode + more */ + (ino_t)attr.va_nodeid, 0, NULL, 0, 0, 0, 0, 0); + } + } + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(uiobuf, IPV4_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(uiobuf, IPV6_VERSION); +} + +static void +lxpr_format_udp(lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address" + " st tx_queue rx_queue tr tm->when retrnsmt uid" + " 
timeout inode ref pointer drops\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode ref pointer drops\n"); + } + /* + * Due to differences between the Linux and illumos UDP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st: limited + * - uid + * + * Omitted/invalid fields + * - tx_queue + * - rx_queue + * - tr + * - tm->when + * - retrnsmt + * - timeout + * - inode + */ + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) { + udp_t *udp; + int state = 0; + vattr_t attr; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + vnode_t *vp = (so != NULL) ? so->so_vnode : NULL; + if (connp->conn_ipversion != ipver) + continue; + udp = connp->conn_udp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + switch (udp->udp_state) { + case TS_UNBND: + case TS_IDLE: + state = 7; + break; + case TS_DATA_XFER: + state = 1; + break; + } + + /* fetch the simulated inode for the socket */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) 
+ attr.va_nodeid = 0; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %d\n", + state, + 0, 0, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* retrans */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode, ref, pointer, drops */ + (ino_t)attr.va_nodeid, 0, NULL, 0); + } + } + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(uiobuf, IPV4_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(uiobuf, IPV6_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + sonode_t *so; + zoneid_t zoneid = getzoneid(); + + lxpr_uiobuf_printf(uiobuf, "Num RefCount Protocol Flags Type " + "St Inode Path\n"); + + mutex_enter(&socklist.sl_lock); + for (so = socklist.sl_list; so != NULL; + so = _SOTOTPI(so)->sti_next_so) { + vnode_t *vp = so->so_vnode; + vattr_t attr; + sotpi_info_t *sti; + const char *name = NULL; + int status = 0; + int type = 0; + int flags = 0; + + /* Only process active sonodes in this zone */ + if (so->so_count == 0 || so->so_zoneid != zoneid) + continue; + + /* + * Grab the inode, if possible. + * This must be done before entering so_lock. 
+ */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + mutex_enter(&so->so_lock); + sti = _SOTOTPI(so); + + if (sti->sti_laddr_sa != NULL && + sti->sti_laddr_len > 0) { + name = sti->sti_laddr_sa->sa_data; + } else if (sti->sti_faddr_sa != NULL && + sti->sti_faddr_len > 0) { + name = sti->sti_faddr_sa->sa_data; + } + + /* + * Derived from enum values in Linux kernel source: + * include/uapi/linux/net.h + */ + if ((so->so_state & SS_ISDISCONNECTING) != 0) { + status = 4; + } else if ((so->so_state & SS_ISCONNECTING) != 0) { + status = 2; + } else if ((so->so_state & SS_ISCONNECTED) != 0) { + status = 3; + } else { + status = 1; + /* Add ACC flag for stream-type server sockets */ + if (so->so_type != SOCK_DGRAM && + sti->sti_laddr_sa != NULL) + flags |= 0x10000; + } + + /* Convert to Linux type */ + switch (so->so_type) { + case SOCK_DGRAM: + type = 2; + break; + case SOCK_SEQPACKET: + type = 5; + break; + default: + type = 1; + } + + lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu", + so, + so->so_count, + 0, /* proto, always 0 */ + flags, + type, + status, + (ino_t)attr.va_nodeid); + + /* + * Due to shortcomings in the abstract socket emulation, they + * cannot be properly represented here (as @<path>). + * + * This will be the case until they are better implemented. + */ + if (name != NULL) + lxpr_uiobuf_printf(uiobuf, " %s\n", name); + else + lxpr_uiobuf_printf(uiobuf, "\n"); + mutex_exit(&so->so_lock); + } + mutex_exit(&socklist.sl_lock); +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced, unless we're open non-blocking, in which case we return after + * 1ms. 
+ */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + timestruc_t to; + timestruc_t *tp = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (lxpr_uiobuf_nonblock(uiobuf)) { + to.tv_sec = 0; + to.tv_nsec = 1000000; /* 1msec */ + tp = &to; + } + + if (ldi_getmsg(lh, &mp, tp) == 0) { + /* + * lx procfs doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. + */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? 
+ &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + mutex_exit(&zone->zone_mem_lock); + } + + /* + * SwapFree + * On illumos we reserve swap up front, whereas on Linux they just + * wing it and kill a random process if they run out of backing store + * for virtual memory. Our swap reservation doesn't translate to that + * model, so just inform the caller that no swap is being used. 
+ */ + lxpr_uiobuf_printf(uiobuf, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + list_t *mounts; + lxpr_mount_entry_t *lme; + + mounts = lxpr_enumerate_mounts(zone); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + lme = list_remove_head(mounts); + while (lme != NULL) { + char *resource, *mntpt, *fstype, *rwflag; + vnode_t *vp; + int error; + + mntpt = (char *)refstr_value(lme->lme_mntpt); + resource = (char *)refstr_value(lme->lme_resource); + + if (mntpt == NULL || mntpt[0] == '\0') { + goto nextp; + } + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + goto nextp; + } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* Make things look more like Linux. 
*/ + fstype = vfssw[lme->lme_fstype].vsw_name; + if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 && + !lme->lme_force) { + goto nextp; + } + rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro"; + + lxpr_uiobuf_printf(uiobuf, "%s %s %s %s 0 0\n", + resource, mntpt, fstype, rwflag); + +nextp: + refstr_rele(lme->lme_mntpt); + refstr_rele(lme->lme_resource); + kmem_free(lme, sizeof (lxpr_mount_entry_t)); + lme = list_remove_head(mounts); + } + + list_destroy(mounts); + kmem_free(mounts, sizeof (list_t)); +} + +/* + * lxpr_read_partitions(): + * + * Over the years, /proc/partitions has been made considerably smaller -- to + * the point that it really is only major number, minor number, number of + * blocks (which we report as 0), and partition name. + * + * We support this because some things want to see it to make sense of + * /proc/diskstats, and also because "fdisk -l" and a few other things look + * here to find all disks on the system. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS); + + lxpr_uiobuf_printf(uiobuf, "major minor #blocks name\n\n"); + + lxzd = ztolxzd(curproc->p_zone); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + lxpr_uiobuf_printf(uiobuf, "%4d %7d %10d %s\n", + getmajor(vd->lxvd_emul_dev), getminor(vd->lxvd_emul_dev), + 0, vd->lxvd_name); + vd = list_next(lxzd->lxzd_vdisks, vd); + } +} + +/* + * There aren't many actual devices inside a zone but we want to provide the + * major numbers for the pseudo devices that do exist, including our pts/ptm + * device, as well as the zvol virtual disk device. We simply hardcode the + * emulated major numbers that are used elsewhere in the code and that match + * the expected Linux major numbers. See lx devfs where some of the major + * numbers have no defined constants. 
+ */ +/* ARGSUSED */ +static void +lxpr_read_devices(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_DEVICES); + + lxpr_uiobuf_printf(uiobuf, "Character devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/tty\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/console\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/ptmx\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d ptm\n", LX_PTM_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d pts\n", LX_PTS_MAJOR_MIN); + + lxpr_uiobuf_printf(uiobuf, "\nBlock devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d zvol\n", LX_MAJOR_DISK); +} + +/* + * lxpr_read_diskstats(): + * + * See the block comment above the per-device output-generating line for the + * details of the format. + */ +/* ARGSUSED */ +static void +lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd; + kstat_t kn; + int num; + zone_vfs_kstat_t *kip; + size_t size; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS); + + lxzd = ztolxzd(zone); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + /* + * Use the zone_vfs kstat, which is a superset of a kstat_io_t, since + * it tracks IO at the zone level. + */ + strlcpy(kn.ks_module, "zone_vfs", sizeof (kn.ks_module)); + strlcpy(kn.ks_name, zone->zone_name, sizeof (kn.ks_name)); + kn.ks_instance = getzoneid(); + + kip = (zone_vfs_kstat_t *)lxpr_kstat_read(&kn, B_TRUE, &size, &num); + if (kip == NULL) + return; + + if (size < sizeof (kstat_io_t)) { + kmem_free(kip, size); + return; + } + + /* + * Because the zone vfs stats are tracked at the zone level we use + * the same kstat for the zone's virtual disk (the zpool) and any + * zvols that might also visible within the zone. 
+ */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + /* + * /proc/diskstats is defined to have one line of output for + * each block device, with each line containing the following + * 14 fields: + * + * 1 - major number + * 2 - minor mumber + * 3 - device name + * 4 - reads completed successfully + * 5 - reads merged + * 6 - sectors read + * 7 - time spent reading (ms) + * 8 - writes completed + * 9 - writes merged + * 10 - sectors written + * 11 - time spent writing (ms) + * 12 - I/Os currently in progress + * 13 - time spent doing I/Os (ms) + * 14 - weighted time spent doing I/Os (ms) + * + * One small hiccup: we don't actually keep track of time + * spent reading vs. time spent writing -- we keep track of + * time waiting vs. time actually performing I/O. While we + * could divide the total time by the I/O mix (making the + * obviously wrong assumption that I/O operations all take the + * same amount of time), this has the undesirable side-effect + * of moving backwards. Instead, we report the total time + * (read + write) for all three stats (read, write, total). + * This is also a lie of sorts, but it should be more + * immediately clear to the user that reads and writes are + * each being double-counted as the other. + * + * Since certain consumers interpret the major/minor numbers to + * infer device names, some translation is required to avoid + * output which results in totally unexpected results. + */ + + lxpr_uiobuf_printf(uiobuf, "%4d %7d %s ", + getmajor(vd->lxvd_emul_dev), + getminor(vd->lxvd_emul_dev), + vd->lxvd_name); + + if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Use the zone-wide vfs stats for any zfs datasets + * represented via virtual devices. 
+ */ +#define KV(N) kip->zv_ ## N.value.ui64 +#define NS_PER_MS (uint64_t)(NANOSEC / MILLISEC) + lxpr_uiobuf_printf(uiobuf, + "%llu %llu %llu %llu " + "%llu %llu %llu %llu " + "%llu %llu %llu\n", + (uint64_t)KV(reads), 0LL, + KV(nread) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)KV(writes), 0LL, + KV(nwritten) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)(KV(rcnt) + KV(wcnt)), + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (KV(rlentime) + KV(wlentime)) / NS_PER_MS); +#undef KV +#undef NS_PER_MS + } else { + /* + * Report nearly-zeroed statistics for other devices. + * + * Since iostat will ignore devices which report no + * succesful reads or writes, a single read of one + * sector, taking 1ms, is reported. + */ + lxpr_uiobuf_printf(uiobuf, + "1 0 1 1 0 0 0 0 0 0 0\n"); + } + + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + kmem_free(kip, size); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp)); + lx_proc_data_t *lxpd = ptolxproc(curproc); + char release[LX_KERN_RELEASE_MAX]; + char version[LX_KERN_VERSION_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(release, lxzd->lxzd_kernel_release, sizeof (release)); + (void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version)); + mutex_exit(&lxzd->lxzd_lock); + + /* Use per-process overrides, if specified */ + if (lxpd != NULL && lxpd->l_uname_release[0] != '\0') { + (void) strlcpy(release, lxpd->l_uname_release, + sizeof (release)); + } + if (lxpd != NULL && lxpd->l_uname_version[0] != '\0') { + (void) strlcpy(version, lxpd->l_uname_version, + sizeof (version)); + } + + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) %s\n", + LX_UNAME_SYSNAME, release, +#if defined(__GNUC__) + "gcc", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, +#else + "cc", 1, 0, 
0, +#endif + version); +} + +/* ARGSUSED */ +static void +lxpr_read_vmstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* Only count CPUs which are present and active. */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* + * Needless to say, the metrics presented by vmstat are very specific + * to the internals of the Linux kernel. There is little per-zone + * information which can be translated in a meaningful way to fit the + * expected fields. For the time being, the output is kept sparse. + */ + lxpr_uiobuf_printf(uiobuf, + "pgpgin %lu\n" + "pgpgout %lu\n" + "pswpin %lu\n" + "pswpout %lu\n", + pgpgin_cum, + pgpgout_cum, + pgswapin_cum, + pgswapout_cum); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. 
+ * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + /* is the emulated release > 2.4 */ + boolean_t newer_than24 = lx_kern_release_cmp(LXPTOZ(lxpnp), "2.4") > 0; + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + + if (newer_than24) { + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + if (newer_than24) { + lxpr_uiobuf_printf(uiobuf, + "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + } else { + lxpr_uiobuf_printf(uiobuf, + "cpu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum); + } + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + if (newer_than24) { + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + } else { + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu\n", + cp->cpu_id, + user_ticks, 0L, sys_ticks, idle_ticks); + } + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + if (newer_than24) { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); + } else { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum); + } +} + +/* + * lxpr_read_swaps(): + * + * We don't support swap files or partitions, but some programs like to look + * here just to check we have some swap on the system, so we lie and show + * our entire swap cap as one swap partition. See lxpr_read_meminfo for an + * explanation on why we report 0 used swap. + * + * It is important to use formatting identical to the Linux implementation + * so that consumers do not break. See swap_show() in mm/swapfile.c. 
+ */ +/* ARGSUSED */ +static void +lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = curzone; + uint64_t totswap, usedswap; + + if (zone == global_zone || zone->zone_max_swap_ctl == UINT64_MAX) { + totswap = (k_anoninfo.ani_max * PAGESIZE) >> 10; + } else { + mutex_enter(&zone->zone_mem_lock); + /* Uses units of 1 kb (2^10). */ + totswap = zone->zone_max_swap_ctl >> 10; + mutex_exit(&zone->zone_mem_lock); + } + usedswap = 0; + + lxpr_uiobuf_printf(uiobuf, + "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + lxpr_uiobuf_printf(uiobuf, "%-40s%s\t%llu\t%llu\t%d\n", + "/dev/swap", "partition", totswap, usedswap, -1); +} + +/* + * inotify tunables exported via /proc. + */ +extern int inotify_maxevents; +extern int inotify_maxinstances; +extern int inotify_maxwatches; + +static void +lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents); +} + +static void +lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances); +} + +static void +lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches); +} + +static void +lxpr_read_sys_kernel_caplcap(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_CAPLCAP); + lxpr_uiobuf_printf(uiobuf, "%d\n", LX_CAP_MAX_VALID); +} + +static void +lxpr_read_sys_kernel_corepatt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = curproc->p_zone; + struct core_globals *cg; + refstr_t *rp; + corectl_path_t *ccp; + char tr[MAXPATHLEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT); + + 
cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + /* If core dumps are disabled, return an empty string. */ + if ((cg->core_options & CC_PROCESS_PATH) == 0) { + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + ccp = cg->core_default_path; + mutex_enter(&ccp->ccp_mtx); + if ((rp = ccp->ccp_path) != NULL) + refstr_hold(rp); + mutex_exit(&ccp->ccp_mtx); + + if (rp == NULL) { + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + bzero(tr, sizeof (tr)); + if (lxpr_core_path_s2l(refstr_value(rp), tr, sizeof (tr)) != 0) { + refstr_rele(rp); + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + refstr_rele(rp); + lxpr_uiobuf_printf(uiobuf, "%s\n", tr); +} + +static void +lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME); + lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename()); +} + +static void +lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_msgmni, + curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +static void +lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max); +} + +static void +lxpr_read_sys_kernel_osrel(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *br_data; + char version[LX_KERN_VERSION_MAX]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_OSREL); + br_data = ztolxzd(curproc->p_zone); + if (curproc->p_zone->zone_brand == &lx_brand) { + mutex_enter(&br_data->lxzd_lock); + (void) strlcpy(version, br_data->lxzd_kernel_version, + sizeof (version)); + mutex_exit(&br_data->lxzd_lock); + + lxpr_uiobuf_printf(uiobuf, "%s\n", version); + } else { + lxpr_uiobuf_printf(uiobuf, 
"\n"); + } +} + +static void +lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid); +} + +static void +lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + /* + * This file isn't documented on the Linux proc(5) man page but + * according to the blog of the author of systemd/journald (the + * consumer), he says: + * boot_id: A random ID that is regenerated on each boot. As such it + * can be used to identify the local machine's current boot. It's + * universally available on any recent Linux kernel. It's a good and + * safe choice if you need to identify a specific boot on a specific + * booted kernel. + * + * We'll just generate a random ID if necessary. On Linux the format + * appears to resemble a uuid but since it is not documented to be a + * uuid, we don't worry about that. + */ + lx_zone_data_t *br_data; + char bootid[LX_BOOTID_LEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_BOOTID); + + if (curproc->p_zone->zone_brand != &lx_brand) { + lxpr_uiobuf_printf(uiobuf, "0\n"); + return; + } + + br_data = ztolxzd(curproc->p_zone); + mutex_enter(&br_data->lxzd_lock); + if (br_data->lxzd_bootid[0] == '\0') { + extern int getrandom(void *, size_t, int); + int i; + + for (i = 0; i < 5; i++) { + u_longlong_t n; + char s[32]; + + (void) random_get_bytes((uint8_t *)&n, sizeof (n)); + switch (i) { + case 0: (void) snprintf(s, sizeof (s), "%08llx", n); + s[8] = '\0'; + break; + case 4: (void) snprintf(s, sizeof (s), "%012llx", n); + s[12] = '\0'; + break; + default: (void) snprintf(s, sizeof (s), "%04llx", n); + s[4] = '\0'; + break; + } + if (i > 0) + strlcat(br_data->lxzd_bootid, "-", + sizeof (br_data->lxzd_bootid)); + strlcat(br_data->lxzd_bootid, s, + sizeof (br_data->lxzd_bootid)); + } + } + (void) strlcpy(bootid, br_data->lxzd_bootid, sizeof (bootid)); + mutex_exit(&br_data->lxzd_lock); + + 
lxpr_uiobuf_printf(uiobuf, "%s\n", bootid); + +} + +static void +lxpr_read_sys_kernel_sem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *pp = curproc; + rctl_qty_t vmsl, vopm, vmni, vmns; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SEM); + + mutex_enter(&pp->p_lock); + vmsl = rctl_enforced_value(rc_process_semmsl, pp->p_rctls, pp); + vopm = rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp); + vmni = rctl_enforced_value(rc_zone_semmni, pp->p_zone->zone_rctls, pp); + mutex_exit(&pp->p_lock); + vmns = vmsl * vmni; + if (vmns < vmsl || vmns < vmni) { + vmns = ULLONG_MAX; + } + /* + * Format: semmsl semmns semopm semmni + * - semmsl: Limit semaphores in a sempahore set. + * - semmns: Limit semaphores in all semaphore sets + * - semopm: Limit operations in a single semop call + * - semmni: Limit number of semaphore sets + */ + lxpr_uiobuf_printf(uiobuf, "%llu\t%llu\t%llu\t%llu\n", + vmsl, vmns, vopm, vmni); +} + +static void +lxpr_read_sys_kernel_shmall(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMALL); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmax, + curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + /* value is in pages */ + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)btop(val)); +} + +static void +lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmax, + curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + if (val > FOURGB) + val = FOURGB; + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +static void +lxpr_read_sys_kernel_shmmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMNI); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmni, + 
curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + if (val > FOURGB) + val = FOURGB; + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +static void +lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", curproc->p_zone->zone_nlwps_ctl); +} + +static void +lxpr_read_sys_net_core_somaxc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_printf(uiobuf, "%d\n", SOMAXCONN); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q); + netstack_rele(ns); +} + +/* + * ip_local_port_range + * + * The low & high port number range. + * integers; default: 32768 61000 + * + * illumos: tcp_smallest_anon_port & tcp_largest_anon_port + * Not in tcp(7p) man page. + */ +static void +lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\t%d\n", + tcps->tcps_smallest_anon_port, tcps->tcps_largest_anon_port); + netstack_rele(ns); +} + +/* + * tcp_fin_timeout + * + * This specifies how many seconds to wait for a final FIN packet before the + * socket is forcibly closed. This is strictly a violation of the TCP + * specification, but required to prevent denial-of-service attacks. + * integer; default: 60; + * + * illumos: tcp_fin_wait_2_flush_interval + * Not in tcp(7p) man page but see comment in uts/common/inet/tcp/tcp_input.c + * in the tcp_input_data() function on the use of tcp_fin_wait_2_flush_interval. 
+ * The value is in milliseconds. + */ +static void +lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + tcps->tcps_fin_wait_2_flush_interval / 1000); + netstack_rele(ns); +} + +/* + * tcp_keepalive_intvl + * + * The number of seconds between TCP keep-alive probes. default: 75 + * Linux retries tcp_keepalive_probes (9) times before timing out. + * + * illumos: + * We have tcp_ka_rinterval but there is no corresponding tcps_* tunable for + * this. The closest is tcps_keepalive_abort_interval which specifies the + * time threshold for aborting a TCP connection in milliseconds. Linux retries + * 9 times (giving a total of 11.25 minutes) so we emulate this by dividing out + * tcps_keepalive_abort_interval by 9. + */ +static void +lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_keepalive_abort_interval / 1000) / 9); + netstack_rele(ns); +} + +/* + * tcp_keepalive_time + * + * The number of seconds a connection needs to be idle before TCP begins + * sending out keep-alive probes. The default value is 7200 seconds (2 hours). + * + * illumos: tcp_keepalive_interval + * The interval for sending out the first probe in milliseconds. The default is + * two hours. 
+ */ +static void +lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_keepalive_interval / 1000)); + netstack_rele(ns); +} + +/* + * tcp_sack + * + * Enable RFC 2018 TCP Selective Acknowledgements. Boolean, default: enabled + * + * illumos: tcp_sack_permitted + * tcp_sack_permitted 0 == disabled, 1 == no initiate but accept, + * 2 == initiate and accept. default is 2. + */ +static void +lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_sack_permitted == 0 ? 0 : 1)); + netstack_rele(ns); +} + +/* + * tcp_window_scaling + * + * RFC 1323 TCP window scaling. This feature allows the use of a large window + * (> 64K) on a TCP connection. Boolean; default: enabled + * + * illumos: tcp_wscale_always + * tcp_wscale_always is set to 1, the window scale option will always be + * set when connecting to a remote system. If tcp_wscale_always is 0, the + * window scale option will be set only if the user has requested a send or + * receive window larger than 64K. The default value of is 1. 
+ */ +static void +lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE); + + ns = netstack_get_current(); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_wscale_always); + netstack_rele(ns); +} + +static void +lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MAX_MAP_CNT); + /* We don't limit mappings, just say we have a large limit. */ + lxpr_uiobuf_printf(uiobuf, "%d\n", 16777215); +} + +static void +lxpr_read_sys_vm_minfr_kb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MINFR_KB); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_nhpages(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_NHUGEP); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_OVERCOMMIT_MEM); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_swappiness(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_SWAPPINESS); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. 
+ * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +/* + * Report a list of each cgroup subsystem supported by our emulated cgroup fs. + * This needs to exist for systemd to run but for now we don't report any + * cgroup subsystems as being installed. The commented example below shows + * how to print a subsystem entry. + */ +static void +lxpr_read_cgroups(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n", + "#subsys_name", "hierarchy", "num_cgroups", "enabled"); + + /* + * lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n", + * "cpu,cpuacct", "2", "1", "1"); + */ +} + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. 
+ */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * Report a list of file systems loaded in the kernel. We only report the ones + * which we support and which may be checked by various components to see if + * they are loaded. 
+ */ +static void +lxpr_read_filesystems(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "autofs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "cgroup"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "nfs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "proc"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "sysfs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "tmpfs"); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink, fifo or socket + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = lxpnp->lxpr_realvp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + 
PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_PID_TASK_IDDIR: + vap->va_nlink = TIDDIRFILES; + vap->va_size = TIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FD_FD: + /* + * Restore VLNK type for lstat-type activity. + * See lxpr_readlink for more details. + */ + if ((flags & FOLLOW) == 0) + vap->va_type = VLNK; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int shift = 0; + proc_t *tp; + + /* lx /proc is primarily a read only file system */ + if ((mode & VWRITE) && !lxpr_is_writable(type)) { + return (EROFS); + } + + /* + * If this is a restricted file, check access permissions. + */ + switch (type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_LIMITS: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FDDIR: + case LXPR_PID_TID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. 
+ */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid, ZOMB_OK); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_taskdir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + pid_t real_pid; + uint_t tid; + int c; + kthread_t *t; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR); + + /* + * convert the string rendition of the filename to a thread ID + */ + tid = 0; + while ((c = *comp++) != '\0') { + int otid; + if (c < '0' || c > '9') + return (NULL); + + otid = tid; + tid = 10 * tid + c - '0'; + /* integer overflow */ + if (tid / 10 != otid) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + real_pid = get_real_pid(dlxpnp->lxpr_pid); + p = lxpr_lock(real_pid, NO_ZOMB); + if ((p == NULL)) + return (NULL); + + /* + * Bail if this is a system process. + */ + if ((p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + if (p->p_brand == &lx_brand) { + t = lxpr_get_thread(p, tid); + } else { + /* + * Only the main thread is visible for non-branded processes. 
+ */ + t = p->p_tlist; + if (tid != p->p_pid || t == NULL) { + t = NULL; + } else { + thread_lock(t); + } + } + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + thread_unlock(t); + + /* + * Allocate and fill in a new lx /proc taskid node. + * Instead of the last arg being a fd, it is a tid. + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + pid_t real_pid; + kthread_t *t; + int i; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* + * get the proc to work with and lock it + */ + real_pid = get_real_pid(dlxpnp->lxpr_pid); + p = lxpr_lock(real_pid, NO_ZOMB); + if ((p == NULL)) + return (NULL); + + /* + * Bail if this is a system process. + */ + if ((p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* need to confirm tid is still there */ + t = lxpr_get_thread(p, dlxpnp->lxpr_desc); + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + thread_unlock(t); + + /* + * allocate and fill in the new lx /proc taskid dir node + */ + for (i = 0; i < TIDDIRFILES; i++) { + if (strcmp(tiddir[i].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p, + dlxpnp->lxpr_desc); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); + } + } + + lxpr_unlock(p); + return (NULL); +} + +/* + * Lookup one of the process's open files. 
+ */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR || + dlxpnp->lxpr_type == LXPR_PID_TID_FDDIR); + + return (lxpr_lookup_fdnode(dp, comp)); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) 
+ */ + p = lxpr_lock(pid, ZOMB_OK); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lx /proc node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sysdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kdir_randdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_randdir, + SYS_RANDDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_netdir, + SYS_NETDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_coredir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_ipv4dir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_vmdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_vmdir, + SYS_VMDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fsdir(vnode_t 
*dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir, + SYS_FSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. 
+ */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, or 0 if zsched, otherwise use the value from + * the proc structure + */ + if (p->p_pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + } else if (p->p_pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; + } else { + pid = p->p_pid; + } + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 
1 : 0; + } + + return (0); +} +/* + * lxpr_readdir_piddir(): readdir for an LXPR_PIDDIR node. The pid stored in + * the node is mapped back to a native pid first (1 selects the zone's init + * process, 0 selects zsched). ENOENT is returned when prfind() finds no + * such process or the process is still in SIDL; otherwise only the fixed + * piddir table is emitted via lxpr_readdir_common(). + */ +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + pid_t find_pid; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + if (lxpnp->lxpr_pid == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (lxpnp->lxpr_pid == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = lxpnp->lxpr_pid; + } + p = prfind(find_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} +/* + * lxpr_readdir_netdir(): readdir for an LXPR_NETDIR node; emits only the + * fixed netdir table. + */ +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} +/* + * lxpr_readdir_taskdir(): readdir for an LXPR_PID_TASKDIR node. System + * processes, zombies and exiting processes get only the fixed entries; + * otherwise the fixed entries are followed by entries generated from the + * target process's thread list. + */ +static int +lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error, ceof, tiddirsize, tasknum; + proc_t *p; + pid_t real_pid; + kthread_t *t; + boolean_t branded; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR); + + oresid = uiop->uio_resid; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid, ZOMB_OK); + if (p == NULL) { + return (ENOENT); + } + if (p->p_stat == SIDL) { + lxpr_unlock(p); + return (ENOENT); + } + + /* + * Just emit static entries for system processes and zombies. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its threads. 
+ */ + tiddirsize = p->p_lwpcnt; + branded = (p->p_brand == &lx_brand); + mutex_exit(&p->p_lock); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + if ((t = p->p_tlist) == NULL) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until all thread's have + * been returned. + */ + for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) { + int i, reclen, len; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the thread list + */ + i = (uoffset / LXPR_SDSIZE) - 2; + if (i < 0 || i >= tiddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (i != tasknum) + goto next; + + if (!branded) { + /* + * Emulating the goofy linux task model is impossible + * to do for native processes. We can compromise by + * presenting only the main thread to the consumer. + */ + emul_tid = p->p_pid; + } else { + if ((lwpd = ttolxlwp(t)) == NULL) { + goto next; + } + emul_tid = lwpd->br_pid; + /* + * Convert pid to Linux default of 1 if we're the + * zone's init. + */ + if (emul_tid == curproc->p_zone->zone_proc_initpid) + emul_tid = 1; + } + + dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, lxpnp->lxpr_pid, + emul_tid); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. 
But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + + if ((t = t->t_forw) == p->p_tlist || !branded) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + } + + if (eofp != NULL) + *eofp = 0; + +out: + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + pid_t real_pid; + kthread_t *t; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + mutex_enter(&pidlock); + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = prfind(real_pid); + + /* can't read its contents if it died */ + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + + mutex_exit(&pidlock); + + /* need to confirm tid is still there */ + t = lxpr_get_thread(p, lxpnp->lxpr_desc); + if (t == NULL) { + /* we can't find this specific thread */ + return (NULL); + } + thread_unlock(t); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error, ceof, fddirsize; + proc_t *p; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR || + lxpnp->lxpr_type == LXPR_PID_TID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid, ZOMB_OK); + if (p == NULL) + return (ENOENT); + + /* + * For exiting/exited processes 
or those belonging to the system, only + * emit the fixed entries. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * Get open file info. fi_lock is taken here and is not released + * until the out: label at the end of the function. + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. + */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list. The descriptor number comes + * straight from the directory offset: every entry occupies + * LXPR_SDSIZE of offset space and the first two slots are the + * fixed entries. + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + /* + * Unused descriptor slot: skip it. The for-increment still + * moves uio_offset on to the next slot. + */ + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 
1 : 0; + } + /* + * Common exit: release fi_lock, then retake p_lock (dropped earlier in + * this function) before calling lxpr_unlock(). + */ +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} +/* + * The lxpr_readdir_sys*() handlers that follow are all of the same shape: + * assert the expected node type and emit that directory's fixed table via + * lxpr_readdir_common(). + */ +static int +lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES)); +} + +static int +lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir, + SYS_FSDIRFILES)); +} + +static int +lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +static int +lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static int +lxpr_readdir_sys_kdir_randdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_randdir, + SYS_RANDDIRFILES)); +} + +static int +lxpr_readdir_sys_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_netdir, + SYS_NETDIRFILES)); +} + +static int +lxpr_readdir_sys_net_coredir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static int +lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static int +lxpr_readdir_sys_vmdir(lxpr_node_t *lxpnp, uio_t 
*uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_vmdir, + SYS_VMDIRFILES)); +} + +#define isdigit(c) ((c) >= '0' && (c) <= '9') +#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') + +/* + * Obtain a numeric value from the null-terminated input string. + * We don't have strtok in the kernel, so tokenize this ourselves and + * validate the input. + */ +static int +lxpr_tokenize_num(char *str, long *pv, char **ep) +{ + char *pstart, *pc, c, *endptr; + long v; + + for (pc = str; isspace(*pc); pc++) + ; + + for (pstart = pc; isdigit(*pc); pc++) + ; + if (pc == pstart || (!isspace(*pc) && *pc != '\0')) + return (EINVAL); + c = *pc; + *pc = '\0'; + + if (ddi_strtol(pstart, &endptr, 10, &v) != 0) { + *pc = c; + return (EINVAL); + } + if (*endptr != '\0') { + *pc = c; + return (EINVAL); + } + + if (pv != NULL) + *pv = v; + if (ep != NULL) + *ep = ++pc; + + return (0); +} + +/* ARGSUSED */ +static int +lxpr_write_tcp_property(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct, char *prop, + int (*xlate)(char *, int)) +{ + int error; + int res = 0; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + netstack_t *ns; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '\0') /* no input */ + return (EINVAL); + + ns = netstack_get_current(); + if (ns == NULL) + return (EINVAL); + + if (xlate != NULL && xlate(val, sizeof (val)) != 0) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + pinfo = mod_prop_lookup(ptbl, prop, MOD_PROTO_TCP); + if (pinfo 
== NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, val, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} +/* + * Write handler for LXPR_SYS_NET_CORE_SOMAXCON: sets the TCP + * "_conn_req_max_q" property with no value translation. + */ +static int +lxpr_write_sys_net_core_somaxc(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_conn_req_max_q", NULL)); +} +/* + * Value translator: val must hold exactly one decimal number, taken as + * seconds. It is rewritten in place as that number multiplied by 1000. + * Returns EINVAL if the input is malformed or the result does not fit in + * size bytes. + */ +static int +lxpr_xlate_sec2ms(char *val, int size) +{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000) >= size) + return (EINVAL); + return (0); +} +/* + * Value translator: like lxpr_xlate_sec2ms() but the number is multiplied + * by 1000 * 9. NOTE(review): the factor of 9 is presumably a fixed probe + * count used to turn a per-probe interval into a total abort interval -- + * confirm against the matching read handler. + */ +static int +lxpr_xlate_ka_intvl(char *val, int size) +{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000 * 9) >= size) + return (EINVAL); + return (0); +} +/* + * Value translator: accepts only 0 or 1 and rewrites val in place as 0 or 2 + * respectively. Anything else yields EINVAL. + */ +static int +lxpr_xlate_sack(char *val, int size) +{ + long flag; + char *ep; + + if (lxpr_tokenize_num(val, &flag, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (flag != 0 && flag != 1) + return (EINVAL); + /* see comment on lxpr_read_sys_net_ipv4_tcp_sack */ + if (snprintf(val, size, "%d", (flag == 0 ? 0 : 2)) >= size) + return (EINVAL); + return (0); +} + +/* + * We expect two port numbers on a line as input for the range, and we have to + * set two properties on the netstack_tcp, so we can't reuse + * lxpr_write_tcp_property. 
+ */ +static int +lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int res; + size_t olen; + char vals[32]; /* big enough for a line w/ 2 16-bit numeric strings */ + char *ep; + long low, high; + netstack_t *ns; + tcp_stack_t *tcps; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (vals) - 1) + return (EINVAL); + + bzero(vals, sizeof (vals)); + res = uiomove(vals, olen, UIO_WRITE, uio); + if (res != 0) + return (res); + + if (lxpr_tokenize_num(vals, &low, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &high, &ep) != 0) + return (EINVAL); + + if (*ep != '\0') { + /* + * make sure no other tokens on the line: only blanks may + * follow the second number. Every remaining character is + * examined; none is discarded unchecked. + */ + for (; isspace(*ep); ep++) + ; + if (*ep != '\0') + return (EINVAL); + } + + if (low > high || high > 65535) + return (EINVAL); + + ns = netstack_get_current(); + if (ns == NULL) + return (EINVAL); + + tcps = ns->netstack_tcp; + if (low < tcps->tcps_smallest_nonpriv_port) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + /* + * The two bounds are applied independently: a failure to set the + * first does not prevent the attempt to set the second, and either + * failure makes the call return EINVAL. + */ + (void) snprintf(vals, sizeof (vals), "%ld", low); + pinfo = mod_prop_lookup(ptbl, "smallest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + (void) snprintf(vals, sizeof (vals), "%ld", high); + pinfo = mod_prop_lookup(ptbl, "largest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} + +static int +lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO); + return 
(lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_fin_wait_2_flush_interval", lxpr_xlate_sec2ms)); +} +/* + * The write handlers that follow each assert the node type and forward to + * lxpr_write_tcp_property() with the TCP property name and, where the + * value needs converting, a translator. + */ +static int +lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_abort_interval", lxpr_xlate_ka_intvl)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_interval", lxpr_xlate_sec2ms)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "sack", + lxpr_xlate_sack)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "_wscale_always", + NULL)); +} +/* + * lxpr_write_sys_kernel_corepatt(): write handler for the + * LXPR_SYS_KERNEL_COREPATT node. The caller must pass secpolicy_coreadm(). + * Accepts a single write at offset 0 of fewer than MAXPATHLEN bytes, + * translates the pattern with lxpr_core_path_l2s() and installs the result + * as the zone's default core path. + */ +/* ARGSUSED */ +static int +lxpr_write_sys_kernel_corepatt(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + zone_t *zone = curproc->p_zone; + struct core_globals *cg; + refstr_t *rp, *nrp; + corectl_path_t *ccp; + char val[MAXPATHLEN]; + char valtr[MAXPATHLEN]; + size_t olen; + int error; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT); + + cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + if (secpolicy_coreadm(cr) != 0) + return (EPERM); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, 
uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + /* + * On Linux a leading '|' asks for the core to be piped to a program + * (see core(5)); that form is not accepted here. + */ + if (val[0] == '|') + return (EINVAL); + + if ((error = lxpr_core_path_l2s(val, valtr, sizeof (valtr))) != 0) + return (error); + + nrp = refstr_alloc(valtr); + /* + * Swap the new path in under ccp_mtx and release the old one after + * the lock is dropped. + * NOTE(review): if refstr_alloc() already returns a held reference, + * the refstr_hold() below adds a second one that nothing in this + * function releases -- confirm against refstr.c. + */ + ccp = cg->core_default_path; + mutex_enter(&ccp->ccp_mtx); + rp = ccp->ccp_path; + refstr_hold((ccp->ccp_path = nrp)); + cg->core_options |= CC_PROCESS_PATH; + mutex_exit(&ccp->ccp_mtx); + + if (rp != NULL) + refstr_rele(rp); + + return (0); +} +/* + * lxpr_write_pid_loginuid(): write handler for the LXPR_PID_LOGINUID node. + * Accepts a single write at offset 0 holding exactly one decimal number and + * stores it, cast to uid_t, in l_loginuid of the target process's lx proc + * data. Returns ENXIO if the process cannot be locked. If the process has + * no lx proc data the write is silently accepted and nothing is stored. + * NOTE(review): no privilege check is visible in this function; presumably + * access is enforced by the caller -- confirm. + */ +/* ARGSUSED */ +static int +lxpr_write_pid_loginuid(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + char *ep; + long u; + proc_t *p; + lx_proc_data_t *pd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (lxpr_tokenize_num(val, &u, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + if ((p = lxpr_lock(lxpnp->lxpr_pid, NO_ZOMB)) == NULL) + return (ENXIO); + + if ((pd = ptolxproc(p)) != NULL) { + pd->l_loginuid = (uid_t)u; + } + lxpr_unlock(p); + + return (0); +} + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* + * Linux does something very "clever" for /proc/<pid>/fd/<num> entries. + * Open FDs are represented as symlinks, the link contents + * corresponding to the open resource. 
For plain files or devices, + * this isn't absurd since one can dereference the symlink to query + * the underlying resource. For sockets or pipes, it becomes ugly in a + * hurry. To maintain this human-readable output, those FD symlinks + * point to bogus targets such as "socket:[<inodenum>]". This requires + * circumventing vfs since the stat/lstat behavior on those FD entries + * will be unusual. (A stat must retrieve information about the open + * socket or pipe. It cannot fail because the link contents point to + * an absent file.) + * + * To accomplish this, lxpr_getnode returns a vnode typed VNON for FD + * entries. This bypasses code paths which would normally + * short-circuit on symlinks and allows us to emulate the vfs behavior + * expected by /proc consumers. + */ + if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) { + /* + * Special handling possible for /proc/<pid>/fd/<num> + * Generate <type>:[<inode>] links, if allowed. + * If that succeeds, bp holds the generated target + * and control falls through to the uiomove() at the + * end of the function; otherwise the vnodetopath() + * error is returned. + */ + if (lxpnp->lxpr_type != LXPR_PID_FD_FD || + lxpr_readlink_fdnode(lxpnp, bp, buflen) != 0) { + return (error); + } + } + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process or 0 if zsched. + */ + if (curproc->p_pid == + curproc->p_zone->zone_proc_initpid) { + pid = 1; + } else if (curproc->p_pid == + curproc->p_zone->zone_zsched->p_pid) { + pid = 0; + } else { + pid = curproc->p_pid; + } + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. 
+ */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + * + * Each vnode that belongs to lxproc is first replaced by its lxpr_realvp, + * repeatedly, until it is either not an lxproc vnode or an lxproc vnode + * with no realvp. If either result is still an lxproc vnode the two are + * compared by pointer identity; otherwise the comparison is handed to + * VOP_CMP(). + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + * + * Stores in *vpp the node's lxpr_realvp, further resolved through + * VOP_REALVP() when that call succeeds, or vp itself when the node has no + * realvp. Always returns 0. + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} +/* + * lxpr_write(): looks the node type up in wr_tab and, if the matching entry + * has a write function, returns that function's result. For any other + * node the whole write is reported as consumed without doing anything. + */ +static int +lxpr_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) { + if (wr_tab[i].wft_wrf != NULL) { + return (wr_tab[i].wft_wrf(lxpnp, uiop, cr, 
ct)); + } + break; + } + } + + /* + * No table entry for this node type, or an entry without a write + * function: pretend we wrote the whole thing + */ + uiop->uio_offset += uiop->uio_resid; + uiop->uio_resid = 0; + return (0); +} + +/* + * Needed for writable files which are first "truncated". Only F_FREESP is + * accepted; after the write-access check the call succeeds without changing + * anything. + */ +static int +lxpr_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + cred_t *cred, caller_context_t *ct) +{ + int error; + + if (cmd != F_FREESP) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cred, ct)) != 0) + return (error); + + return (0); +} + +/* + * Needed for writable files which are first "truncated". We only support + * truncation: the attribute mask must be exactly AT_SIZE, and after the + * write-access check the call succeeds without changing anything. + */ +static int +lxpr_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (vap->va_mask != AT_SIZE) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cr, ct)) != 0) + return (error); + + return (0); +} + +/* + * We need to allow open with O_CREAT for the writable files. + * Nothing is ever created: the call only succeeds when nm already names a + * writable node in dvp, in which case that existing vnode is returned. + */ +/*ARGSUSED7*/ +static int +lxpr_create(vnode_t *dvp, char *nm, vattr_t *vap, enum vcexcl exclusive, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + lxpr_node_t *lxpnp = VTOLXP(dvp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *vp = NULL; + int error; + + ASSERT(type < LXPR_NFILES); + + /* + * restrict create permission to owner or root + */ + if ((error = lxpr_access(dvp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + if (*nm == '\0') + return (EPERM); + + if (dvp->v_type != VDIR) + return (EPERM); + + if (exclusive == EXCL) + return (EEXIST); + + /* + * No writable files in top-level proc dir. We check this to avoid + * getting a non-proc node via "..". 
+ */ + if (type != LXPR_PROCDIR && + lxpr_lookup(dvp, nm, &vp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + lxpr_nodetype_t ftype = VTOLXP(vp)->lxpr_type; + if (!lxpr_is_writable(ftype)) { + VN_RELE(vp); + vp = NULL; + } + } + + if (vp != NULL) { + ASSERT(vp->v_type != VDIR); + + /* confirm permissions against existing file */ + if ((error = lxpr_access(vp, mode, 0, cr, ct)) != 0) { + VN_RELE(vp); + return (error); + } + + *vpp = vp; + return (0); + } + + /* + * Linux proc does not allow creation of additional, non-subsystem + * specific files inside the hierarchy. ENOENT is tossed when such + * actions are attempted. This is also the result when the lookup + * above found a node that is not writable, since that vnode has + * already been released. + */ + return (ENOENT); +} diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h new file mode 100644 index 0000000000..17b19895f4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h @@ -0,0 +1,511 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _LX_AUTOFS_H +#define _LX_AUTOFS_H + +/* + * The lxautofs filesystem and driver exist to emulate the Linux autofs + * filesystem and /dev/autofs device (this code emulates both). The + * purpose is to provide support for the Linux "automount" automounter. + * + * The device ioctls map fairly closely to the filesystem ioctls. The device + * ioctls have superseded the filesystem ioctls and the automounter will + * use the device ioctls if the device exists. + * + * The device ioctls are used by the automounter to perform recovery + * in cases where the automounter is restarted while mounts are present. It + * also allows for better management operations when a filesystem is mounted + * on top of an autofs mountpoint, as in the case of an NFS direct mount on + * top of an autofs mount. + * + * + * +++ Linux automounter background. + * + * Linux has two automounters: "amd" (not used in any popular, modern distro) + * and "automount". + * + * "automount" is the normal Linux automounter. It utilizes a kernel + * filesystem (autofs) and device (/dev/autofs) to provide its functionality. + * Basically, it mounts the autofs filesystem at any automounter controlled + * mountpoint. This filesystem then intercepts and redirects lookup operations + * to the userland automounter process via a pipe. The pipe to the automounter + * is established via a mount option when the autofs filesystem is mounted or + * via the setpipefd ioctl if the automounter restarts. When the automounter + * receives a request via this pipe, it does lookups (or unmounts) to whatever + * backing store it's configured to use, does mkdir operations on the autofs + * filesystem, mounts remote NFS filesystems on any directories it manages or + * just created, and signals the autofs device via an ioctl to let it know + * that the lookup (or expire) can continue. Other management operations (such + * as querying expiration for unmounting) are performed using the autofs device. 
+ * + * + * +++ Linux autofs documentation. + * + * Within the Linux src tree, see the file: + * Documentation/filesystems/autofs4-mount-control.txt + * This documents some of the autofs behavior and the device driver ioctls. + * + * The following URL (https://lwn.net/Articles/606960/) documents autofs in + * general. This patch was targeted for Documentation/filesystems/autofs4.txt, + * but seems never to have been integrated into the Linux src tree. + * + * + * +++ Linux autofs (and automount daemon) notes + * + * Since we're mimicking the behavior of the Linux autofs filesystem and + * device, we document some of the observed behavior here. + * + * There are multiple versions of the autofs filesystem kernel API protocol + * and modern implementations of the user-land automount daemon would depend + * on v5, although the filesystem API has been superseded by the driver ioctl + * API, which is roughly similar. + * + * We'll describe the filesystem ioctls first, since support for those was + * implemented first. The device ioctls roughly correspond to the filesystem + * ioctls and were implemented last, but the automounter will use those + * ioctls, instead of the filesystem ioctls, when the device is present. + * + * Our original autofs implementation was developed in the mid-2000s around the + * v2 protocol, but that is currently obsolete. Our current implementation is + * based around the v5 protocol API. There was no autofs device support at that + * time. + * + * The automounter supports 3 different, mutually exclusive, mount options for + * each mountpoint: + * - indirect (this was all you got with the v2 support) + * - direct + * - offset + * + * An 'indirect' mountpoint is managed with dynamic mounts below that + * mountpoint. 
For example, if '/home' were an indirect autofs mount, then + * accessing a username under /home would traverse the 'lookup' code described + * below, cause a local subdirectory to be created, and a mount, usually NFS, + * onto that username subdirectory. + * + * A 'direct' mountpoint is an autofs mountpoint which will trigger the + * mounting of another filesystem on top of that mountpoint when accessed. + * + * An 'offset' mountpoint behaves like a 'direct' mountpoint but it is + * created dynamically by the automounter underneath an 'indirect' mountpoint. + * For example, if '/net' were an indirect autofs mountpoint and the host + * 'jurassic' exported two NFS filesystems; '/var/crash' and '/var/core', then + * accessing '/net/jurassic' would trigger the automounter to create two + * subdirectories; '/net/jurassic/var/crash' and '/net/jurassic/var/core'. The + * automounter would then mount an autofs offset mount onto each one of these + * directories. Accessing either of those directories would then trigger + * the automounter to perform another mount on top, as is done with a 'direct' + * mount. + * + * General behavior + * + * A) Autofs allows root owned, non-automounter processes to create + * directories in the autofs filesystem. The autofs filesystem treats the + * automounter's process group as special, but it doesn't prevent root + * processes outside of the automounter's process group from creating new + * directories in the autofs filesystem. + * + * B) Autofs doesn't allow creation of any non-directory entries in the + * autofs filesystem. No entity can create files (e.g. /bin/touch or + * VOP_CREATE/VOP_SYMLINK/etc.) The only entries that can exist within + * the autofs filesystem are directories. + * + * C) Autofs only intercepts vop lookup operations. Notably, it does _not_ + * intercept and re-direct vop readdir operations. 
This means that the + * observed behavior of the Linux automounter can be considerably different + * from that of the illumos automounter. Specifically, on illumos if an autofs + * mountpoint is mounted _without_ the -nobrowse option then if a user does + * an ls operation (which translates into a vop readdir operation) then the + * automounter will intercept that operation and list all the possible + * directories and mountpoints without actually mounting any filesystems. + * Essentially, all automounter managed mountpoints on Linux will behave + * like "-nobrowse" mountpoints on illumos. Here's an example to illustrate + * this. If /ws was mounted on illumos without the -nobrowse option and an + * auto_ws yp map was set up as the backing store for this mountpoint, then an + * "ls /ws" would list all the keys in the map as valid directories, but an + * "ls /ws" on Linux would list an empty directory. + * + * D) NFS mounts are performed by the automount process. When the automount + * process gets a redirected lookup request, it determines _all_ the + * possible remote mountpoints for that request, creates directory paths + * via mkdir, and mounts the remote filesystems on the newly created paths. + * This is described in the offset mount example above. Once the automounter + * completed the mounts it would signal the autofs filesystem (via an ioctl) + * that the lookup could continue. + * + * E.1) Autofs only redirects vop lookup operations for path entries that + * don't already exist in the autofs filesystem. So for the example above, + * an initial (after the start of the automounter) "ls /net/jurassic" would + * result in a request to the automounter. A subsequent "ls /net/jurassic" + * would not result in a request to the automounter. Even if + * /net/jurassic/var/crash and /net/jurassic/var/core were manually unmounted + * after the initial "ls /net/jurassic", a subsequent "ls /net/jurassic" + * would not result in a new request to the automounter.
+ * + * E.2) Autofs lookup requests that are sent to the automounter only include + * the root directory path component. So for example, after starting up + * the automounter if a user were to do a "ls /net/jurassic/var/crash", the + * initial lookup request actually sent to the automounter would just be for + * "jurassic" (the same request as if the user had done "ls /net/jurassic"). + * After the initial mounting of the two offset mounts onto crash and core the + * lookup would continue and a final lookup request would be sent to the + * automounter for "crash" (but this would be on a different vfs from the + * /net vfs). + * + * E.3) The two statements above aren't entirely true. The Linux + * autofs filesystem will also redirect lookup operations for leaf + * directories that don't have a filesystem mounted on them. Using the + * example above, if a user did a "ls /net/jurassic", then manually + * unmounted /net/jurassic/var/crash, and then did an "ls + * /net/jurassic/var/crash", this would result in a request for + * "jurassic/var/crash" being sent to the automounter. The strange thing + * (a Linux bug perhaps) is that the automounter won't do anything with this + * request and the lookup will fail. + * + * F) The autofs filesystem communication protocol (what ioctls it supports + * and what data it passes to the automount process) is versioned. The + * userland automount daemon (as of version v5.0.7) expects v5 of the protocol + * (by running the AUTOFS_IOC_PROTOSUBVER ioctl), and exits if that is not + * supported. For v2-v5 the structure passed through the pipe always begins + * with a common header followed by different fields depending on the packet + * type. In addition the different versions support additional ioctls. + * + * v2 - basic lookup request + * v3 - adds expiring (umounting) + * v4 - adds expire multi + * v5 - adds missing indirect, expire indirect, missing direct & expire direct. + * Defines a new protocol structure layout.
+ * The v5 'missing indirect' and 'missing direct' ioctls are analogous to + * the v2 'missing' ioctl. These ioctls are used to initiate a mount via + * a lookup. The 'expire' ioctls are used by the automounter to query if + * it is possible to unmount the filesystem. 'direct' and 'indirect' + * refer to the mount option type that the automounter performed and + * correlate to an automounter direct or indirect map mountpoint. + * + * G) The automounter periodically issues an 'expire' ioctl to autofs to + * obtain the name of a mountpoint which the automounter can unmount. + * Unmounting is discussed in more detail below. + * + * H) The device ioctls roughly correspond to the filesystem ioctls, but + * instead of being tied to an autofs mountpoint vnode, they can be called any + * time. The argument structure uses either a path or an autofs pipe file + * descriptor to indicate what is being operated on. + * + * +++ lxautofs notes + * + * 1) In general, the lxautofs filesystem tries to mimic the behavior of the + * Linux autofs filesystem with the following exceptions: + * + * 1.1) We don't bother to implement the E.3 functionality listed above + * since it doesn't appear to be of any use. + * + * 1.2) We only fully implement v2 and v5 of the autofs protocol. + * + * 2) In general, the approach taken for lxautofs is to keep it as simple + * as possible and to minimize its memory usage. To do this all information + * about the contents of the lxautofs filesystem is mirrored in the + * underlying filesystem that lxautofs is mounted on and most vop operations + * are simply passed onto this underlying filesystem. This means we don't + * have to implement most of the complex operations that a full filesystem + * normally has to implement. It also means that most of our filesystem state + * (wrt the contents of the filesystem) doesn't actually have to be stored + * in memory, we can simply go to the underlying filesystem to get it when + * it's requested.
For the purposes of discussion, we'll call the underlying + * filesystem the "backing store." + * + * The backing store is actually a directory called ".lxautofs" which is created + * in the directory where the lxautofs filesystem is mounted. When the + * lxautofs filesystem is unmounted this backing store directory is deleted. + * If this directory exists at mount time (perhaps the system crashed while a + * previous lxautofs instance was mounted at the same location) it will be + * deleted. There are a few implications of using a backing store worth + * mentioning. + * + * 2.1) lxautofs can't be mounted on a read only filesystem. If this + * proves to be a problem we can probably move the location of the + * backing store. + * + * 2.2) If the backing store filesystem runs out of space then the + * automounter process won't be able to create more directories and mount + * new filesystems. Of course, strange failures usually happen when + * filesystems run out of space. + * + * 3) Why aren't we using gfs? gfs has two different usage models. + * + * 3.1) I'm my own filesystem but i'm using gfs to help with managing + * readdir operations. + * + * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes + * + * We're not using the 3.1 interfaces because we don't implement readdir + * ourselves. We pass all readdir operations onto the backing store + * filesystem and utilize its readdir implementation. + * + * We're not using the 3.2 interfaces because they are really designed for + * in memory filesystems where all of the filesystem state is stored in + * memory. They don't lend themselves to filesystems where part of the + * state is in memory and part of the state is on disk. + * + * For more information on gfs take a look at the block comments in the + * top of gfs.c + * + * 4) Unmounting + * + * The automounter has a timeout associated with each mount. 
It informs autofs + * of this timeout using the LX_AUTOFS_DEV_IOC_TIMEOUT_CMD ioctl after autofs + * has been mounted on the mountpoint. + * + * After the automounter has mounted something associated with the mountpoint + * then periodically (<timeout>/4 seconds) the automounter will issue the + * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl on the autofs mount. autofs is expected + * to respond with an underlying mountpoint entry which is a candidate for + * unmounting. The automounter will attempt to unmount the filesystem + * (which may fail if it is busy, since this is obviously racy) and then + * acknowledge the expire ioctl. The successful acknowledgement is independent + * of the success of unmounting the underlying filesystem. + * + * Unmount handling varies based on which type of mount the autofs was mounted + * with (indirect, direct or offset). + * + * To support 'indirect' mount expiration, the autofs vfs keeps track of the + * filesystems mounted immediately under the autofs mountpoint (in + * lav_mnt_list) after a lookup has completed successfully. Upon receipt of the + * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl, autofs removes the first element from + * the list, attempts to check if it is busy and if not, returns that mountpoint + * over the fifo (if busy the entry is added to the end of the list). When the + * ioctl is acknowledged, if the mountpoint still exists, that means the unmount + * failed and the entry is added at the back of the list. If there are no + * elements or the first one is busy, EAGAIN is returned for the 'expire' ioctl + * and the automounter will check again in <timeout>/4 seconds. + * + * For example, if /home is an autofs indirect mount, then there are typically + * many different {username}-specific NFS mounts under that /home autofs mount. + * autofs uses the lav_mnt_list to respond to 'expire' ioctls in a round-robin + * fashion so that the automounter can unmount user file systems that aren't in + * use.
+ * + * Expiring 'direct' mounts is similar, but since there is only a single mount, + * the lav_mnt_list only will have at most one entry if there is a filesystem + * mounted overtop of the autofs mount. + * + * Expiring 'offset' mounts is more complicated because there are at least + * two different autofs VFSs involved (the top-level and one for each offset + * mount underneath). The actual offset mount is handled exactly like a 'direct' + * mount. The top-level is an indirect mount and is handled in a similar way + * as described above for indirect mounts, but special handling is needed for + * each offset mount below. + * + * This can be explained using the same 'jurassic' example described earlier + * (/net is an autofs 'indirect' mount and the host 'jurassic' has two exported + * file systems; /var/crash and /var/core). If the user accesses + * /net/jurassic/var/crash then the automounter would set up the system so that + * the following mounts exist: + * - /net (the original autofs indirect mount which triggers everything) + * - /net/jurassic/var/crash (autofs offset mount) + * - /net/jurassic/var/crash (NFS mount on top of the autofs offset mount) + * - /net/jurassic/var/core (autofs offset mount) + * + * For expiration the automounter will issue the LX_AUTOFS_IOC_EXPIRE_MULTI + * ioctl on each autofs vfs for which something is mounted, so we would receive + * an expire ioctl on /net and another on /net/jurassic/var/crash. The vfs for + * /net will be tracking "jurassic", but we detect it is busy and won't do + * anything at first. The vfs for "crash" will work like a direct mount and + * acknowledge the expire ioctl to the automounter once that filesystem times + * out and is no longer busy. The automounter will then unmount the "crash" + * NFS mount. + * + * Once the "crash" NFS mount has been unmounted by the automounter, we're left + * with the two autofs offset mounts under jurassic.
The automounter will not + * try to unmount either of those, so we have to do that. Once we get another + * expire ioctl on /net and check "jurassic", we'll see there are only autofs + * mounts under /net/jurassic. We umount those using the lx_autofs_umount_offset + * function and respond to the automounter expire ioctl with "jurassic", in the + * same way as we would for any other indirect mount. + * + * 5) Recovery + * + * If the automounter is restarted for any reason, it needs to cope with + * pre-existing autofs mounts, as well as other automount-initiated mounts (e.g. + * a direct mount on top of an autofs mountpoint). The automounter uses the + * /proc/mounts file to correlate mounts to the managed mountpoints. It then + * uses the /dev/autofs device to openmount each of the autofs devices and + * reinitialize them using the various dev ioctls (timeout, requester, etc.). + * + * In general, the automounter will closemount the mountpoint once it's done, + * but it doesn't in the case of an offset mountpoint with nothing mounted + * on top. In this case the automounter expects autofs to expire that mountpoint + * before it will closemount (so things can subsequently clean up). We handle + * this special case in the expire code path. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that the name of the actual file system is lxautofs, not lx_autofs, but + * the code uses lx_autofs to prefix the various names. This is because file + * system names are limited to 8 characters. + */ +#define LX_AUTOFS_NAME "lxautofs" + +#define LX_AUTOFS_MINORNAME "autofs" + +/* + * Mount options supported. + */ +#define LX_MNTOPT_FD "fd" +#define LX_MNTOPT_PGRP "pgrp" +#define LX_MNTOPT_MINPROTO "minproto" +#define LX_MNTOPT_MAXPROTO "maxproto" +#define LX_MNTOPT_INDIRECT "indirect" +#define LX_MNTOPT_DIRECT "direct" +#define LX_MNTOPT_OFFSET "offset" + +/* + * Version/subversion of the Linux kernel automount protocol we support. + * + * We fully support v2 and v5.
We'll return ENOTSUP for all of the ioctls we + * don't yet handle. + */ +#define LX_AUTOFS_PROTO_VERS5 5 +#define LX_AUTOFS_PROTO_SUBVERSION 2 +#define LX_AUTOFS_PROTO_VERS2 2 + +/* packet types */ +typedef enum laph_ptype { + LX_AUTOFS_PTYPE_MISSING, /* 0 */ + LX_AUTOFS_PTYPE_EXPIRE, /* 1 */ + LX_AUTOFS_PTYPE_EXPIRE_MULTI, /* 2 */ + LX_AUTOFS_PTYPE_MISSING_INDIR, /* 3 */ + LX_AUTOFS_PTYPE_EXPIRE_INDIR, /* 4 */ + LX_AUTOFS_PTYPE_MISSING_DIRECT, /* 5 */ + LX_AUTOFS_PTYPE_EXPIRE_DIRECT /* 6 */ +} laph_ptype_t; + +/* + * Common header for all versions of the protocol. + */ +typedef struct lx_autofs_pkt_hdr { + int laph_protover; /* protocol version number */ + laph_ptype_t laph_type; + int laph_id; /* every pkt must have a unique id */ +} lx_autofs_pkt_hdr_t; + +/* + * Command structure sent to automount process from lxautofs via a pipe. + * This structure is the same for v2-v4 of the automount protocol + * (the communication pipe is established at mount time). + */ +typedef struct lx_autofs_v2_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component to lookup */ +} lx_autofs_v2_pkt_t; + +/* v4 multi-expire */ +typedef struct lx_autofs_v4_exp_pkt { + lx_autofs_pkt_hdr_t lape_hdr; + int lape_len; + char lape_name[MAXNAMELEN]; +} lx_autofs_v4_exp_pkt_t; + +/* v5 */ +typedef struct lx_autofs_v5_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + uint32_t lap_dev; + uint64_t lap_ino; + uint32_t lap_uid; + uint32_t lap_gid; + uint32_t lap_pid; + uint32_t lap_tgid; + uint32_t lap_name_len; + char lap_name[256]; +} lx_autofs_v5_pkt_t; + +union lx_autofs_pkt { + lx_autofs_v2_pkt_t lap_v2; + lx_autofs_v5_pkt_t lap_v5; +}; + +#define lap_protover lap_v2.lap_hdr.laph_protover +#define lap_type lap_v2.lap_hdr.laph_type +#define lap_id lap_v2.lap_hdr.laph_id + +/* + * Ioctls fully supported (v2 protocol). 
+ */ +#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */ +#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */ +#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */ + +/* + * Ioctls supported (v3/v4 protocol). + */ +#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */ +#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0089364 /* arg: ulong_t */ + +/* + * Ioctls not supported (v3/v4 protocol). + */ + /* arg: lx_autofs_v3_exp_pkt_t * */ +#define LX_AUTOFS_IOC_EXPIRE 0x81109365 + +/* + * Ioctls supported (v5 protocol). + */ +#define LX_AUTOFS_IOC_PROTOSUBVER 0x80049367 /* arg: int */ +#define LX_AUTOFS_IOC_ASKUMOUNT 0x80049370 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_MULTI 0x40049366 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_INDIRECT LX_AUTOFS_IOC_EXPIRE_MULTI +#define LX_AUTOFS_IOC_EXPIRE_DIRECT LX_AUTOFS_IOC_EXPIRE_MULTI + +/* + * autofs device ioctls + */ +#define LX_AUTOFS_DEV_IOC_VERSION_CMD 0xc0189371 +#define LX_AUTOFS_DEV_IOC_PROTOVER_CMD 0xc0189372 +#define LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD 0xc0189373 +#define LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD 0xc0189374 +#define LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD 0xc0189375 +#define LX_AUTOFS_DEV_IOC_READY_CMD 0xc0189376 +#define LX_AUTOFS_DEV_IOC_FAIL_CMD 0xc0189377 +#define LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD 0xc0189378 +#define LX_AUTOFS_DEV_IOC_CATATONIC_CMD 0xc0189379 +#define LX_AUTOFS_DEV_IOC_TIMEOUT_CMD 0xc018937a +#define LX_AUTOFS_DEV_IOC_REQUESTER_CMD 0xc018937b +#define LX_AUTOFS_DEV_IOC_EXPIRE_CMD 0xc018937c +#define LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD 0xc018937d +#define LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD 0xc018937e + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h new file mode 100644 index 0000000000..39ea96d1fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + 
* Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_IMPL_H +#define _LX_AUTOFS_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/modhash.h> +#include <sys/vnode.h> + +#include <sys/lx_autofs.h> + +/* + * Space key. + * Used to persist data across lx_autofs filesystem module unloads. + */ +#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev" + +/* + * Name of the backing store directory. + */ +#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME + +#define LX_AUTOFS_VFS_ID_HASH_SIZE 15 +#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15 +#define LX_AUTOFS_VFS_VN_HASH_SIZE 15 + +enum lx_autofs_mnttype { LXAMT_NONE, LXAMT_INDIR, LXAMT_DIRECT, LXAMT_OFFSET }; + +typedef struct lx_autofs_mntent { + list_node_t lxafme_lst; + uint64_t lxafme_ts; /* time stamp */ + uint_t lxafme_len; + char *lxafme_path; +} lx_autofs_mntent_t; + +/* + * VFS data object. + */ +typedef struct lx_autofs_vfs { + /* Info about the underlying filesystem and backing store. */ + vnode_t *lav_mvp; + char *lav_bs_name; + vnode_t *lav_bs_vp; + + /* Info about the automounter process managing this filesystem. 
*/ + int lav_fd; + pid_t lav_pgrp; + file_t *lav_fifo_wr; + file_t *lav_fifo_rd; + + /* The mount's dev and ino values for v5 protocol msg */ + uint64_t lav_dev; + u_longlong_t lav_ino; + + /* options from the mount */ + enum lx_autofs_mnttype lav_mnttype; + int lav_min_proto; + + /* + * ioctl-set timeout value. The automounter will perform an expire + * ioctl every timeout/4 seconds. We use this to expire a mount once + * it is inactive for the full timeout. + */ + ulong_t lav_timeout; + + /* ioctl-set catatonic value (prevents future mounts). */ + boolean_t lav_catatonic; + + /* Mount initiator's uid/gid for recovery handling. */ + uid_t lav_uid; + gid_t lav_gid; + + /* Each automount request needs a unique id. */ + id_space_t *lav_ids; + + /* All remaining structure members are protected by lav_lock. */ + kmutex_t lav_lock; + /* openmount counter */ + int lav_openmnt_cnt; + + + /* Hashes to keep track of outstanding automounter requests. */ + mod_hash_t *lav_path_hash; + mod_hash_t *lav_id_hash; + + /* We need to keep track of all our vnodes. */ + vnode_t *lav_root; + mod_hash_t *lav_vn_hash; + + /* list of current mounts */ + list_t lav_mnt_list; +} lx_autofs_vfs_t; + +enum lx_autofs_callres { LXACR_NONE, LXACR_READY, LXACR_FAIL }; + +/* + * Structure to keep track of automounter requests sent to user-land. + */ +typedef struct lx_autofs_automnt_req { + /* Packet that gets sent to the automounter. */ + union lx_autofs_pkt laar_pkt; + int laar_pkt_size; + + /* Reference count. Always updated atomically. */ + uint_t laar_ref; + + /* + * Fields to keep track and sync threads waiting on a lookup. + * Fields are protected by laar_lock. + */ + kmutex_t laar_lock; + kcondvar_t laar_cv; + int laar_complete; + + enum lx_autofs_callres laar_result; +} lx_autofs_automnt_req_t; + +/* + * Generic stack structure.
+ */ +typedef struct stack_elem { + list_node_t se_list; + caddr_t se_ptr1; + caddr_t se_ptr2; + caddr_t se_ptr3; +} stack_elem_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h new file mode 100644 index 0000000000..4906e444f1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -0,0 +1,680 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _LX_BRAND_H +#define _LX_BRAND_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/zone.h> +#include <sys/ksocket.h> +#include <sys/vfs.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_BRANDNAME "lx" + +/* + * Brand uname info + */ +#define LX_UNAME_SYSNAME "Linux" +#define LX_UNAME_RELEASE_2_6 "2.6.18" +#define LX_UNAME_RELEASE_2_4 "2.4.21" +#define LX_UNAME_VERSION "BrandZ virtual linux" +#define LX_UNAME_MACHINE32 "i686" +#define LX_UNAME_MACHINE64 "x86_64" + +#define LX_LIB_PATH32 "/native/usr/lib/lx_brand.so.1" +#define LX_LIB_PATH64 "/native/usr/lib/amd64/lx_brand.so.1" + +#define LX_VDSO_PATH32 "/native/usr/lib/brand/lx/lx_vdso.so.1" +#define LX_VDSO_PATH64 "/native/usr/lib/brand/lx/amd64/lx_vdso.so.1" + +#if defined(_LP64) +#define LX_LIB_PATH LX_LIB_PATH64 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE64 +#define LX_VDSO_PATH LX_VDSO_PATH64 +#else +#define LX_LIB_PATH LX_LIB_PATH32 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE32 +#define LX_VDSO_PATH LX_VDSO_PATH32 +#endif + +/* + * This must be large enough for both the 32-bit table and 64-bit table. + */ +#define LX_NSYSCALLS 358 + +/* Highest capability we know about */ +#define LX_CAP_MAX_VALID 36 + +/* + * brand(2) subcommands + * + * Everything >= 128 is a brand-specific subcommand. + * > 192 is reserved for in-kernel emulated system calls. 
+ */ +#define B_LPID_TO_SPAIR 128 +#define B_GET_CURRENT_CONTEXT 129 +#define B_EMULATION_DONE 130 +/* formerly B_PTRACE_KERNEL 131 */ +#define B_SET_AFFINITY_MASK 132 +#define B_GET_AFFINITY_MASK 133 +#define B_PTRACE_CLONE_BEGIN 134 +#define B_PTRACE_STOP_FOR_OPT 135 +#define B_UNSUPPORTED 136 +#define B_STORE_ARGS 137 +#define B_GETPID 138 +#define B_JUMP_TO_LINUX 139 +#define B_SET_THUNK_PID 140 +#define B_EXIT_AS_SIG 141 +#define B_HELPER_WAITID 142 +#define B_HELPER_CLONE 143 +#define B_HELPER_SETGROUPS 144 +#define B_HELPER_SIGQUEUE 145 +#define B_HELPER_TGSIGQUEUE 146 +#define B_SET_NATIVE_STACK 147 +#define B_SIGEV_THREAD_ID 148 +#define B_OVERRIDE_KERN_VER 149 +/* formerly B_NOTIFY_VDSO_LOC 150 */ +#define B_GET_PERSONALITY 151 + +#ifndef _ASM +/* + * Support for Linux PTRACE_SETOPTIONS handling. + */ +typedef enum lx_ptrace_options { + LX_PTRACE_O_TRACESYSGOOD = 0x0001, + LX_PTRACE_O_TRACEFORK = 0x0002, + LX_PTRACE_O_TRACEVFORK = 0x0004, + LX_PTRACE_O_TRACECLONE = 0x0008, + LX_PTRACE_O_TRACEEXEC = 0x0010, + LX_PTRACE_O_TRACEVFORKDONE = 0x0020, + LX_PTRACE_O_TRACEEXIT = 0x0040, + LX_PTRACE_O_TRACESECCOMP = 0x0080 +} lx_ptrace_options_t; + +#define LX_PTRACE_O_ALL \ + (LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | \ + LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | \ + LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE | \ + LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP) +#endif /* !_ASM */ + +/* siginfo si_status for traced events */ +#define LX_PTRACE_EVENT_FORK 0x100 +#define LX_PTRACE_EVENT_VFORK 0x200 +#define LX_PTRACE_EVENT_CLONE 0x300 +#define LX_PTRACE_EVENT_EXEC 0x400 +#define LX_PTRACE_EVENT_VFORK_DONE 0x500 +#define LX_PTRACE_EVENT_EXIT 0x600 +#define LX_PTRACE_EVENT_SECCOMP 0x700 + +/* + * Brand-private values for the "pr_what" member of lwpstatus, for use with the + * PR_BRAND stop reason. These reasons are validated in lx_stop_notify(); + * update it if you add new reasons here. 
+ */ +#define LX_PR_SYSENTRY 1 +#define LX_PR_SYSEXIT 2 +#define LX_PR_SIGNALLED 3 +#define LX_PR_EVENT 4 + + +#define LX_VERSION_1 1 +#define LX_VERSION LX_VERSION_1 + +#define LX_ATTR_KERN_RELEASE ZONE_ATTR_BRAND_ATTRS +#define LX_ATTR_KERN_VERSION (ZONE_ATTR_BRAND_ATTRS + 1) + +/* + * Aux vector containing phdr of Linux executable and ehdr of interpreter + * (if any), both of which are used by lx_librtld_db to ascertain r_debug. + * We repurpose the 4th brand-specific aux vector slot for the Linux + * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library). + */ +#define AT_SUN_BRAND_LX_PHDR AT_SUN_BRAND_AUX1 +#define AT_SUN_BRAND_LX_INTERP AT_SUN_BRAND_AUX2 +#define AT_SUN_BRAND_LX_CLKTCK AT_SUN_BRAND_AUX3 +#define AT_SUN_BRAND_LX_SYSINFO_EHDR AT_SUN_BRAND_AUX4 + +/* Aux vectors containing real/effective user/group IDs */ +#define AT_LX_UID 11 +#define AT_LX_EUID 12 +#define AT_LX_GID 13 +#define AT_LX_EGID 14 +/* Aux vector containing hz value */ +#define AT_CLKTCK 17 +/* Aux vector containing secure boolean */ +#define AT_SECURE 23 +/* Aux vector containing vDSO addr */ +#define AT_SYSINFO_EHDR 33 + +/* + * Usermode emulation routines are run on an alternate stack allocated by + * the brand library. Every LWP in a process will incur this overhead beyond + * the regular thread stack: + */ +#define LX_NATIVE_STACK_PAGE_COUNT 64 + +/* + * When returning in a new child process created with vfork(2) (or CLONE_VFORK) + * we discard some of the native stack to prevent corruption of the parent + * emulation state.
+ */ +#define LX_NATIVE_STACK_VFORK_GAP 0x3000 + +#ifndef _ASM + +extern struct brand lx_brand; + +typedef struct lx_brand_registration { + uint_t lxbr_version; /* version number */ + void *lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration_t; + +typedef struct lx_brand_registration32 { + uint_t lxbr_version; /* version number */ + uint32_t lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration32_t; + +#endif /* _ASM */ + +/* + * GDT usage + */ +#define GDT_TLSMIN (GDT_BRANDMIN) +#define GDT_TLSMAX (GDT_TLSMIN + 2) +#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN) + +#ifndef _ASM + +/* + * Stores information needed by the lx linker to launch the main + * lx executable. + */ +typedef struct lx_elf_data64 { + uintptr_t ed_phdr; + uintptr_t ed_phent; + uintptr_t ed_phnum; + uintptr_t ed_entry; + uintptr_t ed_base; + uintptr_t ed_ldentry; +} lx_elf_data64_t; + +typedef struct lx_elf_data32 { + uint32_t ed_phdr; + uint32_t ed_phent; + uint32_t ed_phnum; + uint32_t ed_entry; + uint32_t ed_base; + uint32_t ed_ldentry; +} lx_elf_data32_t; + +#if defined(_LP64) +typedef lx_elf_data64_t lx_elf_data_t; +#else +typedef lx_elf_data32_t lx_elf_data_t; +#endif + +typedef enum lx_proc_flags { + /* flags configurable via brandsys() and members of LX_PROC_ALL */ + LX_PROC_INSTALL_MODE = 0x01, + LX_PROC_STRICT_MODE = 0x02, + /* internal flags */ + LX_PROC_CHILD_DEATHSIG = 0x04, + LX_PROC_AIO_USED = 0x08 +} lx_proc_flags_t; + +#define LX_PROC_ALL (LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE) + +/* Maximum length for fields of LX uname */ +#define LX_SYS_UTS_LN 65 + +/* Max. length of kernel release string */ +#define LX_KERN_RELEASE_MAX LX_SYS_UTS_LN +#define LX_KERN_VERSION_MAX LX_SYS_UTS_LN + +#ifdef _KERNEL + +/* + * Entry points for cgroup integration. 
+ */ +extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +#define LX_RLFAKE_LOCKS 0 +#define LX_RLFAKE_NICE 1 +#define LX_RLFAKE_RTPRIO 2 +#define LX_RLFAKE_RTTIME 3 + +#define LX_RLFAKE_NLIMITS 4 + +#define LX_RLIM64_INFINITY (~0ULL) + +typedef struct { + uint64_t rlim_cur; + uint64_t rlim_max; +} lx_rlimit64_t; + +typedef struct lx_proc_data { + uintptr_t l_handler; /* address of user-space handler */ + pid_t l_ppid; /* pid of originating parent proc */ + uid_t l_loginuid; /* /proc/{pid}/loginuid */ + int64_t l_ptrace; /* count of process lwps observed by ptrace */ + lx_elf_data_t l_elf_data; /* ELF data for linux executable */ + /* signal to deliver to parent when this thread group dies */ + int l_signal; + /* native signal to deliver to process when parent dies */ + int l_parent_deathsig; + lx_proc_flags_t l_flags; + + lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS]; + + /* original start/end bounds of arg/env string data */ + uintptr_t l_args_start; + uintptr_t l_envs_start; + uintptr_t l_envs_end; + + /* Override zone-wide settings for uname release and version */ + char l_uname_release[LX_KERN_RELEASE_MAX]; + char l_uname_version[LX_KERN_VERSION_MAX]; + + /* Linux process personality */ + unsigned int l_personality; + + /* VDSO location */ + uintptr_t l_vdso; +} lx_proc_data_t; + +#endif /* _KERNEL */ + +/* + * Linux process personality(2) flags stored in l_personality + */ +#define LX_PER_UNAME26 0x0020000 +#define LX_PER_ADDR_NO_RANDOMIZE 0x0040000 +#define LX_PER_FDPIC_FUNCPTRS 0x0080000 +#define LX_PER_MMAP_PAGE_ZERO 0x0100000 +#define LX_PER_ADDR_COMPAT_LAYOUT 0x0200000 +#define LX_PER_READ_IMPLIES_EXEC 0x0400000 +#define LX_PER_ADDR_LIMIT_32BIT 0x0800000 +#define LX_PER_SHORT_INODE 0x1000000 +#define LX_PER_WHOLE_SECONDS 0x2000000 +#define LX_PER_STICKY_TIMEOUTS 0x4000000 +#define LX_PER_ADDR_LIMIT_3GB 0x8000000 + +#define LX_PER_LINUX 0x00 +#define LX_PER_SUNOS (0x06 | 
LX_PER_STICKY_TIMEOUTS) +#define LX_PER_MASK 0xff + +/* + * A data type big enough to bitmap all possible Linux cpus. + * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages + * for sched_setaffinity() and sched_getaffinity(). + */ +#define LX_NCPU (1024) +#define LX_AFF_ULONGS (LX_NCPU / (8 * sizeof (ulong_t))) +typedef ulong_t lx_affmask_t[LX_AFF_ULONGS]; + +/* Length of proc boot_id string */ +#define LX_BOOTID_LEN 37 + +/* + * Flag values for uc_brand_data[0] in the ucontext_t: + */ +#define LX_UC_STACK_NATIVE 0x00001 +#define LX_UC_STACK_BRAND 0x00002 +#define LX_UC_RESTORE_NATIVE_SP 0x00010 +#define LX_UC_FRAME_IS_SYSCALL 0x00100 +#define LX_UC_RESTART_SYSCALL 0x01000 +#define LX_UC_IGNORE_LINK 0x10000 + +#ifdef _KERNEL + +typedef struct lx_lwp_data lx_lwp_data_t; + +/* + * Flag values for "lxpa_flags" on a ptrace(2) accord. + */ +typedef enum lx_accord_flags { + LX_ACC_TOMBSTONE = 0x01 +} lx_accord_flags_t; + +/* + * Flag values for "br_ptrace_flags" in the LWP-specific data. + */ +typedef enum lx_ptrace_flags { + LX_PTF_SYSCALL = 0x01, + LX_PTF_EXITING = 0x02, + LX_PTF_STOPPING = 0x04, + LX_PTF_INHERIT = 0x08, + LX_PTF_STOPPED = 0x10, + LX_PTF_PARENT_WAIT = 0x20, + LX_PTF_CLDPEND = 0x40, + LX_PTF_CLONING = 0x80, + LX_PTF_WAITPEND = 0x100 +} lx_ptrace_flags_t; + +/* + * A ptrace(2) accord represents the relationship between a tracer LWP and the + * set of LWPs that it is tracing: the tracees. This data structure belongs + * primarily to the tracer, but is reference counted so that it may be freed by + * whoever references it last. + */ +typedef struct lx_ptrace_accord { + kmutex_t lxpa_lock; + uint_t lxpa_refcnt; + lx_accord_flags_t lxpa_flags; + + /* + * The tracer must hold "pidlock" while clearing these fields for + * exclusion of waitid(), etc. + */ + lx_lwp_data_t *lxpa_tracer; + kcondvar_t *lxpa_cvp; + + /* + * The "lxpa_tracees_lock" mutex protects the tracee list.
+ */ + kmutex_t lxpa_tracees_lock; + list_t lxpa_tracees; +} lx_ptrace_accord_t; + +/* + * These values are stored in the per-LWP data for a tracee when it is attached + * to a tracer. They record the method that was used to attach. + */ +typedef enum lx_ptrace_attach { + LX_PTA_NONE = 0x00, /* not attached */ + LX_PTA_ATTACH = 0x01, /* due to tracer using PTRACE_ATTACH */ + LX_PTA_TRACEME = 0x02, /* due to child using PTRACE_TRACEME */ + LX_PTA_INHERIT_CLONE = 0x04, /* due to PTRACE_CLONE clone(2) flag */ + LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */ +} lx_ptrace_attach_t; + +typedef enum lx_stack_mode { + LX_STACK_MODE_PREINIT = 0, + LX_STACK_MODE_INIT, + LX_STACK_MODE_NATIVE, + LX_STACK_MODE_BRAND +} lx_stack_mode_t; + +struct lx_pid { + pid_t s_pid; /* the SunOS pid and ... */ + id_t s_tid; /* ... tid pair */ + pid_t l_pid; /* the corresponding linux pid */ + time_t l_start; /* birthday of this pid */ + struct pid *l_pidp; + struct lx_pid *stol_next; /* link in stol hash table */ + struct lx_pid *ltos_next; /* link in ltos hash table */ +}; + +/* + * lx-specific data in the klwp_t + */ +struct lx_lwp_data { + uint_t br_lwp_flags; /* misc. 
flags */ + klwp_t *br_lwp; /* back pointer to container lwp */ + int br_signal; /* signal to send to parent when */ + /* clone()'ed child terminates */ + int br_exitwhy; /* reason for thread (process) exit */ + int br_exitwhat; /* exit code / killing signal */ + lx_affmask_t br_affinitymask; /* bitmask of CPU sched affinities */ + struct user_desc br_tls[LX_TLSNUM]; + /* descriptors used by libc for TLS */ + ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */ + ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */ + ulong_t br_lx_gsbase; /* lx user-land gsbase */ + ulong_t br_ntv_gsbase; /* native user-land gsbase */ + pid_t br_pid; /* converted pid for this thread */ + pid_t br_tgid; /* thread group ID for this thread */ + pid_t br_ppid; /* parent pid for this thread */ + id_t br_ptid; /* parent tid for this thread */ + void *br_clear_ctidp; /* clone thread id ptr */ + void *br_set_ctidp; /* clone thread id ptr */ + void *br_robust_list; /* robust lock list, if any */ + + /* + * The following struct is used by some system calls to pass extra + * flags into the kernel without impinging on the namespace for + * illumos. 
+ */ + void *br_scall_args; + int br_args_size; /* size in bytes of br_scall_args */ + + boolean_t br_waitid_emulate; + int br_waitid_flags; + + lx_ptrace_flags_t br_ptrace_flags; /* ptrace flags for this LWP */ + lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */ + lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */ + + lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */ + lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */ + lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */ + list_node_t br_ptrace_linkage; /* linkage for lxpa_tracees list */ + + ushort_t br_ptrace_whystop; /* stop reason, 0 for no stop */ + ushort_t br_ptrace_whatstop; /* stop sub-reason */ + + int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */ + /* + * Track the last (native) signal number processed by a ptrace. + * This allows the tracee to properly handle ignored signals after + * the tracer has been notified and the tracee restarted. + */ + int32_t br_ptrace_donesig; + uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */ + + uint_t br_ptrace_event; + ulong_t br_ptrace_eventmsg; + + int br_syscall_num; /* current system call number */ + boolean_t br_syscall_restart; /* should restart on EINTR */ + + /* + * Store the LX_STACK_MODE for this LWP, and the current extent of the + * native (emulation) stack. This is similar, in principle, to the + * sigaltstack mechanism for signal handling. We also use this mode + * flag to determine how to process system calls from this LWP. + */ + lx_stack_mode_t br_stack_mode; + uintptr_t br_ntv_stack; + uintptr_t br_ntv_stack_current; + + /* + * If this pid is set, we return it with getpid(). This allows the + * thunking server to interpose on the pid returned to the Linux + * syslog software. 
+ */ + pid_t br_lx_thunk_pid; + + /* + * If strict mode is enabled (via LX_STRICT in the environment), any + * call to lx_unsupported() will set this boolean to B_TRUE. This will + * cause us to drop SIGSYS on the LWP as it attempts to return to + * usermode. + */ + boolean_t br_strict_failure; + + /* + * Some syscalls emulated in-kernel still call back out to the + * userspace emulation for certain functions. When that is the case, + * the syscall_return logic must be bypassed at the end of the + * in-kernel syscall code. The NORMALRETURN and JUSTRETURN constants + * are used to choose the behavior. + */ + char br_eosys; + + /* + * Hold a pre-allocated lx_pid structure to be used during lx_initlwp. + */ + struct lx_pid *br_lpid; + + /* + * ID of the cgroup this thread belongs to. + */ + uint_t br_cgroupid; +}; + +/* + * Upper limit on br_args_size, low because this value can persist until + * overridden with another value, and the size is given from userland. + */ +#define LX_BR_ARGS_SIZE_MAX (1024) + +/* + * brand specific data + * + * We currently only support a single cgroup mount in an lx zone so we only have + * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever + * enhanced to support different mounts with different subsystem controllers. + */ +typedef struct lx_zone_data { + kmutex_t lxzd_lock; /* protects all members */ + char lxzd_kernel_release[LX_KERN_RELEASE_MAX]; + char lxzd_kernel_version[LX_KERN_VERSION_MAX]; + ksocket_t lxzd_ioctl_sock; + char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */ + vfs_t *lxzd_cgroup; /* cgroup for this zone */ + list_t *lxzd_vdisks; /* virtual disks (zvols) */ + dev_t lxzd_zfs_dev; /* major num for zfs */ +} lx_zone_data_t; + +#define BR_CPU_BOUND 0x0001 + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) +#define lwptolxlwp(l) ((struct lx_lwp_data *)lwptolwpbrand(l)) +#define ttolxproc(t) \ + (((t)->t_procp->p_brand == &lx_brand) ? 
\ + (struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL) +#define ptolxproc(p) \ + (((p)->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(p)->p_brand_data : NULL) +#define ztolxzd(z) \ + (((z)->zone_brand == &lx_brand) ? \ + (lx_zone_data_t *)(z)->zone_brand_data : NULL) + +/* Macro for converting to system call arguments. */ +#define LX_ARGS(scall) ((struct lx_##scall##_args *)\ + (ttolxlwp(curthread)->br_scall_args)) + +typedef enum lx_virt_disk_type { + LXVD_NONE, + LXVD_ZFS_DS, + LXVD_ZVOL +} lx_virt_disk_type_t; + +typedef struct lx_virt_disk { + list_node_t lxvd_link; + char lxvd_name[MAXNAMELEN]; + lx_virt_disk_type_t lxvd_type; + dev_t lxvd_emul_dev; + dev_t lxvd_real_dev; + uint64_t lxvd_volsize; + uint64_t lxvd_blksize; + char lxvd_real_name[MAXPATHLEN]; +} lx_virt_disk_t; + +/* + * Determine the upper bound on the system call number: + */ +#if defined(_LP64) +#define LX_MAX_SYSCALL(lwp) \ + ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \ + lx_nsysent64 : lx_nsysent32) +#else +#define LX_MAX_SYSCALL(lwp) lx_nsysent32 +#endif + +extern int lx_kern_release_cmp(zone_t *, const char *); + +extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t); +extern void lx_divert(klwp_t *, uintptr_t); +extern int lx_runexe(klwp_t *, void *); +extern void lx_switch_to_native(klwp_t *); + +extern int lx_syscall_enter(void); +extern int lx_syscall_return(klwp_t *, int, long); + +extern int lx_syscall_fast_enter(void); + +extern void lx_trace_sysenter(int, uintptr_t *); +extern void lx_trace_sysreturn(int, long); + +extern void lx_emulate_user(klwp_t *, int, uintptr_t *); +#if defined(_SYSCALL32_IMPL) +extern void lx_emulate_user32(klwp_t *, int, uintptr_t *); +#endif + +extern int lx_debug; +#define lx_print if (lx_debug) printf + +extern void lx_pid_assign(kthread_t *, struct lx_pid *); +extern void lx_pid_reassign(kthread_t *); +extern void lx_pid_rele(pid_t, id_t); +extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *); +extern pid_t 
lx_lwp_ppid(klwp_t *, pid_t *, id_t *); +extern void lx_pid_init(void); +extern void lx_pid_fini(void); + +/* + * In-Kernel Linux System Call Description. + */ +typedef struct lx_sysent { + char *sy_name; + long (*sy_callc)(); + char sy_flags; + char sy_narg; +} lx_sysent_t; + +#if defined(_LP64) +extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +extern int lx_nsysent64; +#endif +extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +extern int lx_nsysent32; + +#endif /* _KERNEL */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_BRAND_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h new file mode 100644 index 0000000000..f82c6b867d --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h @@ -0,0 +1,161 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_LX_FCNTL_H +#define _SYS_LX_FCNTL_H + +#include <sys/vnode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Lx open/fcntl flags + */ +#define LX_O_RDONLY 00 +#define LX_O_WRONLY 01 +#define LX_O_RDWR 02 +#define LX_O_ACCMODE (LX_O_RDONLY | LX_O_WRONLY | LX_O_RDWR) +#define LX_O_CREAT 0100 +#define LX_O_EXCL 0200 +#define LX_O_NOCTTY 0400 +#define LX_O_TRUNC 01000 +#define LX_O_APPEND 02000 +#define LX_O_NONBLOCK 04000 +#define LX_O_NDELAY LX_O_NONBLOCK +#define LX_O_SYNC 010000 +#define LX_O_FSYNC LX_O_SYNC +#define LX_O_ASYNC 020000 +#define LX_O_DIRECT 040000 +#define LX_O_LARGEFILE 0100000 +#define LX_O_DIRECTORY 0200000 +#define LX_O_NOFOLLOW 0400000 +#define LX_O_CLOEXEC 02000000 +#define LX_O_PATH 010000000 + +#define LX_F_DUPFD 0 +#define LX_F_GETFD 1 +#define LX_F_SETFD 2 +#define LX_F_GETFL 3 +#define LX_F_SETFL 4 +#define LX_F_GETLK 5 +#define LX_F_SETLK 6 +#define LX_F_SETLKW 7 +#define LX_F_SETOWN 8 +#define LX_F_GETOWN 9 +#define LX_F_SETSIG 10 +#define LX_F_GETSIG 11 + +#define LX_F_GETLK64 12 +#define LX_F_SETLK64 13 +#define LX_F_SETLKW64 14 + +#define LX_F_SETLEASE 1024 +#define LX_F_GETLEASE 1025 +#define LX_F_NOTIFY 1026 +#define LX_F_CANCELLK 1029 +#define LX_F_DUPFD_CLOEXEC 1030 +#define LX_F_SETPIPE_SZ 1031 +#define LX_F_GETPIPE_SZ 1032 + +#define LX_F_RDLCK 0 +#define LX_F_WRLCK 1 +#define LX_F_UNLCK 2 + +/* Test for emulated O_PATH setting in file_t flags */ +#define LX_IS_O_PATH(f) (((f)->f_flag & (FREAD|FWRITE)) == 0) + +extern int lx_vp_at(int, char *, vnode_t **, int); + +/* + * Lx flock codes. + */ +#define LX_NAME_MAX 255 +#define LX_LOCK_SH 1 /* shared */ +#define LX_LOCK_EX 2 /* exclusive */ +#define LX_LOCK_NB 4 /* non-blocking */ +#define LX_LOCK_UN 8 /* unlock */ + +/* + * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value. + * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by + * faccessat. 
+ */ +#define LX_AT_FDCWD (-100) +#define LX_AT_SYMLINK_NOFOLLOW 0x100 +#define LX_AT_REMOVEDIR 0x200 +#define LX_AT_EACCESS 0x200 +#define LX_AT_SYMLINK_FOLLOW 0x400 +#define LX_AT_NO_AUTOMOUNT 0x800 +#define LX_AT_EMPTY_PATH 0x1000 + +typedef struct lx_flock { + short l_type; + short l_whence; + long l_start; + long l_len; + int l_pid; +} lx_flock_t; + +typedef struct lx_flock64 { + short l_type; + short l_whence; + long long l_start; + long long l_len; + int l_pid; +} lx_flock64_t; + +#if defined(_KERNEL) + +/* + * 64-bit kernel view of 32-bit usermode structs. + */ +#pragma pack(4) +typedef struct lx_flock32 { + int16_t l_type; + int16_t l_whence; + int32_t l_start; + int32_t l_len; + int32_t l_pid; +} lx_flock32_t; + +typedef struct lx_flock64_32 { + int16_t l_type; + int16_t l_whence; + int64_t l_start; + int64_t l_len; + int32_t l_pid; +} lx_flock64_32_t; +#pragma pack() + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FCNTL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h new file mode 100644 index 0000000000..a400b3bd83 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LX_FUTEX_H +#define _SYS_LX_FUTEX_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 +#define FUTEX_CMP_REQUEUE 4 +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI + +/* + * Flags that can be OR'd into a futex operation. + */ +#define FUTEX_CMD_MASK 0x007f +#define FUTEX_PRIVATE_FLAG 0x0080 +#define FUTEX_CLOCK_REALTIME 0x0100 + +#define FUTEX_BITSET_MATCH_ANY 0xffffffff +/* + * FUTEX_WAKE_OP operations + */ +#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ +#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ +#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ +#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ +#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ + +/* + * FUTEX_WAKE_OP comparison operations + */ +#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ +#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ +#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ +#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ +#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ +#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ + +/* + * The encoding of the FUTEX_WAKE_OP operation in 32 bits: + * + * +--+-- - --+-- - --+-- - --+-- - --+ + * |S |OP |CMP |OPARG |CMPARG | + * +--+-- - 
--+-- - --+-- - --+-- - --+ + * |31|30 - 28|27 - 24|23 - 12|11 - 0| + * + * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG. + * (Yes, this whole thing is entirely absurd -- see the block comment in + * lx_futex.c for an explanation of this nonsense.) Macros to extract the + * various components from the operation, given the above encoding: + */ +#define FUTEX_OP_OP(x) (((x) >> 28) & 7) +#define FUTEX_OP_CMP(x) (((x) >> 24) & 15) +#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \ + ((((x) << 8) >> 20))) +#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20) + +#ifdef _KERNEL + +#define FUTEX_WAITERS 0x80000000 +#define FUTEX_OWNER_DIED 0x40000000 +#define FUTEX_TID_MASK 0x3fffffff + +#define FUTEX_ROBUST_LOCK_PI 1 +#define FUTEX_ROBUST_LIST_LIMIT 2048 + +extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val2); +extern void lx_futex_init(void); +extern int lx_futex_fini(void); +extern long lx_set_robust_list(void *listp, size_t len); +extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp); +extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FUTEX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h new file mode 100644 index 0000000000..03b9d43038 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _LX_IMPL_H +#define _LX_IMPL_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t, + ulong_t, ulong_t); + + +extern lx_systrace_f *lx_systrace_entry_ptr; +extern lx_systrace_f *lx_systrace_return_ptr; + +extern void lx_brand_systrace_enable(void); +extern void lx_brand_systrace_disable(void); + +extern void lx_unsupported(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h new file mode 100644 index 0000000000..825933e86c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LINUX_LDT_H +#define _SYS_LINUX_LDT_H + +#include <sys/segments.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct ldt_info { + uint_t entry_number; + uint_t base_addr; + uint_t limit; + uint_t seg_32bit:1, + contents:2, + read_exec_only:1, + limit_in_pages:1, + seg_not_present:1, + useable:1; +}; + +#define LDT_INFO_EMPTY(info) \ + ((info)->base_addr == 0 && (info)->limit == 0 && \ + (info)->contents == 0 && (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && (info)->useable == 0) + +#if defined(__amd64) +#define SETMODE(desc) (desc)->usd_long = SDP_SHORT; +#else +#define SETMODE(desc) +#endif + +#define LDT_INFO_TO_DESC(info, desc) { \ + USEGD_SETBASE(desc, (info)->base_addr); \ + USEGD_SETLIMIT(desc, (info)->limit); \ + (desc)->usd_type = ((info)->contents << 2) | \ + ((info)->read_exec_only ^ 1) << 1 | 0x10; \ + (desc)->usd_dpl = SEL_UPL; \ + (desc)->usd_p = (info)->seg_not_present ^ 1; \ + (desc)->usd_def32 = (info)->seg_32bit; \ + (desc)->usd_gran = (info)->limit_in_pages; \ + (desc)->usd_avl = (info)->useable; \ + SETMODE(desc); \ +} + +#define DESC_TO_LDT_INFO(desc, info) { \ + bzero((info), sizeof (*(info))); \ + (info)->base_addr = USEGD_GETBASE(desc); \ + (info)->limit = USEGD_GETLIMIT(desc); \ + (info)->seg_not_present = (desc)->usd_p ^ 1; \ + (info)->contents = ((desc)->usd_type >> 2) & 3; \ + (info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1; \ + (info)->seg_32bit = (desc)->usd_def32; \ + (info)->limit_in_pages = (desc)->usd_gran; \ + 
(info)->useable = (desc)->usd_avl; \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_LDT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h new file mode 100644 index 0000000000..7c1e50362c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -0,0 +1,117 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS__LX_MISC_H +#define _SYS__LX_MISC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/siginfo.h> +#include <sys/lx_brand.h> + +#ifdef _KERNEL + +extern void lx_setrval(klwp_t *, int, int); +extern void lx_exec(); +extern void lx_exitlwp(klwp_t *); +extern void lx_freelwp(klwp_t *); +extern void *lx_lwpdata_alloc(proc_t *); +extern void lx_lwpdata_free(void *); +extern void lx_initlwp(klwp_t *, void *); +extern void lx_initlwp_post(klwp_t *); +extern void lx_forklwp(klwp_t *, klwp_t *); + +extern void lx_set_gdt(int, user_desc_t *); +extern void lx_clear_gdt(int); + +extern longlong_t lx_nosys(); + +extern greg_t lx_fixsegreg(greg_t, model_t); +extern uintptr_t lx_fsbase(klwp_t *, uintptr_t); +extern void lx_exit_with_sig(proc_t *, sigqueue_t *); +extern boolean_t lx_wait_filter(proc_t *, proc_t *); +extern void lx_sigfd_translate(k_siginfo_t *); +extern int stol_ksiginfo_copyout(k_siginfo_t *, void *); +#if defined(_SYSCALL32_IMPL) +extern int stol_ksiginfo32_copyout(k_siginfo_t *, void *); +#endif +extern void lx_read_argv_bounds(proc_t *p); + +typedef enum lx_regs_location { + 
LX_REG_LOC_UNAVAIL, + LX_REG_LOC_LWP, + LX_REG_LOC_UCP +} lx_regs_location_t; + +extern lx_regs_location_t lx_regs_location(lx_lwp_data_t *, void **, boolean_t); + + +typedef enum lx_if_action { + LX_IF_FROMNATIVE, + LX_IF_TONATIVE +} lx_if_action_t; + +/* Linux ARP protocol hardware identifiers */ +#define LX_ARPHRD_ETHER 1 /* Ethernet */ +#define LX_ARPHRD_LOOPBACK 772 /* Loopback */ +#define LX_ARPHRD_VOID 0xffff /* Unknown */ + +/* IPv6 address scope values used in /proc/net/if_inet6 */ +#define LX_IPV6_ADDR_LOOPBACK 0x0010U +#define LX_IPV6_ADDR_LINKLOCAL 0x0020U +#define LX_IPV6_ADDR_SITELOCAL 0x0040U +#define LX_IPV6_ADDR_COMPATv4 0x0080U + +extern void lx_ifname_convert(char *, lx_if_action_t); +extern void lx_ifflags_convert(uint64_t *, lx_if_action_t); +extern unsigned int lx_ipv6_scope_convert(const in6_addr_t *); +extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *, + int *); + +extern boolean_t lx_ptrace_stop(ushort_t); +extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t); +extern void lx_ptrace_init(void); +extern void lx_ptrace_fini(void); +extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *, + int *); +extern void lx_ptrace_exit(proc_t *, klwp_t *); +extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *); +extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t); +extern int lx_ptrace_set_clone_inherit(int, boolean_t); +extern int lx_sigcld_repost(proc_t *, sigqueue_t *); +extern int lx_ptrace_issig_stop(proc_t *, klwp_t *); +extern boolean_t lx_ptrace_sig_ignorable(proc_t *, klwp_t *, int); + +extern int lx_helper_clone(int64_t *, int, void *, void *, void *); +extern int lx_helper_setgroups(int, gid_t *); +extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *); +extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *); + +extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *); +extern void lx_vsyscall_enter(proc_t *, klwp_t *, 
int); + +extern void lx_check_strict_failure(lx_lwp_data_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS__LX_MISC_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h new file mode 100644 index 0000000000..74bbc939a3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_PTM_LINUX_H +#define _SYS_PTM_LINUX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_PTM_DRV "lx_ptm" +#define LX_PTM_MINOR_NODE "lx_ptmajor" + +#define LX_PTM_DEV_TO_PTS(dev) (getminor(dev) - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PTM_LINUX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_sched.h b/usr/src/uts/common/brand/lx/sys/lx_sched.h new file mode 100644 index 0000000000..b0ae748f3c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_sched.h @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LINUX_SCHED_H +#define _SYS_LINUX_SCHED_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/procset.h> +#include <sys/priocntl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Linux scheduler policies. 
+ */ +#define LX_SCHED_OTHER 0 +#define LX_SCHED_FIFO 1 +#define LX_SCHED_RR 2 + +#define LX_PRI_MAX 99 + +typedef int l_pid_t; + +struct lx_sched_param { + int lx_sched_prio; +}; + +extern int sched_setprocset(procset_t *, l_pid_t); +extern long do_priocntlsys(int, procset_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SCHED_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h new file mode 100644 index 0000000000..9f606b614f --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _LX_SIGINFO_H +#define _LX_SIGINFO_H + +#include <sys/lx_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_siginfo_t lsi_code values + * + * LX_SI_ASYNCNL: Sent by asynch name lookup completion + * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads + * LX_SI_SIGIO: Sent by queued SIGIO + * LX_SI_ASYNCIO: Sent by asynchronous I/O completion + * LX_SI_MESGQ: Sent by real time message queue state change + * LX_SI_TIMER: Sent by timer expiration + * LX_SI_QUEUE: Sent by sigqueue + * LX_SI_USER: Sent by kill, sigsend, raise, etc. + * LX_SI_KERNEL: Sent by kernel + * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to + * illumos errors, if there is no translation available, this value + * should be used. This value should have no meaning as an si_code in + * illumos or Linux. + * + * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by + * BrandZ. + */ +#define LX_SI_CODE_NOT_EXIST (-61) +#define LX_SI_ASYNCNL (-60) +#define LX_SI_DETHREAD (-7) +#define LX_SI_TKILL (-6) +#define LX_SI_SIGIO (-5) +#define LX_SI_ASYNCIO (-4) +#define LX_SI_MESGQ (-3) +#define LX_SI_TIMER (-2) +#define LX_SI_QUEUE (-1) +#define LX_SI_USER (0) +#define LX_SI_KERNEL (0x80) + +#define LX_SI_MAX_SIZE 128 +#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3) +#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4) + +#if defined(_LP64) +/* + * Because of the odd number (3) of ints before the union, we need to account + * for the smaller padding needed on x64 due to the union being offset to an 8 + * byte boundary. 
+ */ +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64 +#else +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32 +#endif + +typedef struct lx_siginfo { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE]; + + struct { + pid_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid_t _pid; + lx_uid16_t _uid; + union sigval _sigval; + } _rt; + + struct { + pid_t _pid; + lx_uid16_t _uid; + int _status; + clock_t _utime; + clock_t _stime; + } _sigchld; + + struct { + void *_addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "lx_siginfo_t" object. + */ +#pragma pack(4) +typedef struct lx_siginfo32 { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE_32]; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + union sigval32 _sigval; + } _rt; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + int _status; + clock32_t _utime; + clock32_t _stime; + } _sigchld; + + struct { + caddr32_t _addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo32_t; +#pragma pack() +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#define lsi_pid _sifields._kill._pid +#define lsi_uid _sifields._kill._uid +#define lsi_status _sifields._sigchld._status +#define lsi_utime _sifields._sigchld._utime +#define lsi_stime _sifields._sigchld._stime +#define lsi_value _sifields._rt._sigval +#define lsi_int _sifields._rt._sigval.sivalx_int +#define lsi_ptr _sifields._rt._sigval.sivalx_ptr +#define lsi_addr _sifields._sigfault._addr +#define lsi_band _sifields._sigpoll._band +#define lsi_fd _sifields._sigpoll._fd + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGINFO_H */ diff --git 
a/usr/src/uts/common/brand/lx/sys/lx_signal.h b/usr/src/uts/common/brand/lx/sys/lx_signal.h new file mode 100644 index 0000000000..552c36238b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_signal.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGNAL_H +#define _LX_SIGNAL_H + +#include <lx_signum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void lx_ltos_sigset(lx_sigset_t *, k_sigset_t *); +extern void lx_stol_sigset(k_sigset_t *, lx_sigset_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGNAL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_socket.h b/usr/src/uts/common/brand/lx/sys/lx_socket.h new file mode 100644 index 0000000000..eb9826eebe --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_socket.h @@ -0,0 +1,434 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _SYS_LX_SOCKET_H +#define _SYS_LX_SOCKET_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Linux address family definitions + * Some of these are not supported + */ +#define LX_AF_UNSPEC 0 /* Unspecified */ +#define LX_AF_UNIX 1 /* local file/pipe name */ +#define LX_AF_INET 2 /* IP protocol family */ +#define LX_AF_AX25 3 /* Amateur Radio AX.25 */ +#define LX_AF_IPX 4 /* Novell Internet Protocol */ +#define LX_AF_APPLETALK 5 /* Appletalk */ +#define LX_AF_NETROM 6 /* Amateur radio */ +#define LX_AF_BRIDGE 7 /* Multiprotocol bridge */ +#define LX_AF_ATMPVC 8 /* ATM PVCs */ +#define LX_AF_X25 9 /* X.25 */ +#define LX_AF_INET6 10 /* IPV 6 */ +#define LX_AF_ROSE 11 /* Amateur Radio X.25 */ +#define LX_AF_DECNET 12 /* DECnet */ +#define LX_AF_NETBEUI 13 /* 802.2LLC */ +#define LX_AF_SECURITY 14 /* Security callback */ +#define LX_AF_KEY 15 /* key management */ +#define LX_AF_ROUTE 16 /* Alias to emulate 4.4BSD */ +#define LX_AF_NETLINK LX_AF_ROUTE +#define LX_AF_PACKET 17 /* Packet family */ +#define LX_AF_ASH 18 /* Ash ? 
*/ +#define LX_AF_ECONET 19 /* Acorn Econet */ +#define LX_AF_ATMSVC 20 /* ATM SVCs */ +#define LX_AF_SNA 22 /* Linux SNA */ +#define LX_AF_IRDA 23 /* IRDA sockets */ +#define LX_AF_PPPOX 24 /* PPPoX sockets */ +#define LX_AF_WANPIPE 25 /* Wanpipe API sockets */ +#define LX_AF_LLC 26 +/* gap in Linux defines for 27 and 28 */ +#define LX_AF_CAN 29 +#define LX_AF_TIPC 30 +#define LX_AF_BLUETOOTH 31 /* Bluetooth sockets */ +#define LX_AF_IUCV 32 +#define LX_AF_RXRPC 33 + +/* limit of AF mappings */ +#define LX_AF_MAX LX_AF_RXRPC + +#define AF_NOTSUPPORTED -1 +#define AF_INVAL -2 + +/* + * Options for use with [gs]etsockopt at the SOL_SOCKET level. + */ +#define LX_SOL_SOCKET 1 + +#define LX_SCM_RIGHTS 1 +#define LX_SCM_CRED 2 + +#define LX_SO_DEBUG 1 +#define LX_SO_REUSEADDR 2 +#define LX_SO_TYPE 3 +#define LX_SO_ERROR 4 +#define LX_SO_DONTROUTE 5 +#define LX_SO_BROADCAST 6 +#define LX_SO_SNDBUF 7 +#define LX_SO_RCVBUF 8 +#define LX_SO_KEEPALIVE 9 +#define LX_SO_OOBINLINE 10 +#define LX_SO_NO_CHECK 11 +#define LX_SO_PRIORITY 12 +#define LX_SO_LINGER 13 +#define LX_SO_BSDCOMPAT 14 +#define LX_SO_REUSEPORT 15 +/* + * For Linux see unix(7) man page SO_PASSCRED description. For Illumos see + * socket.h(3HEAD) man page SO_RECVUCRED description. 
+ */ +#define LX_SO_PASSCRED 16 +#define LX_SO_PEERCRED 17 +#define LX_SO_RCVLOWAT 18 +#define LX_SO_SNDLOWAT 19 +#define LX_SO_RCVTIMEO 20 +#define LX_SO_SNDTIMEO 21 +/* Security levels - as per NRL IPv6 - don't actually do anything */ +#define LX_SO_SECURITY_AUTHENTICATION 22 +#define LX_SO_SECURITY_ENCRYPTION_TRANSPORT 23 +#define LX_SO_SECURITY_ENCRYPTION_NETWORK 24 +#define LX_SO_BINDTODEVICE 25 +/* Socket filtering */ +#define LX_SO_ATTACH_FILTER 26 +#define LX_SO_DETACH_FILTER 27 +#define LX_SO_PEERNAME 28 +#define LX_SO_TIMESTAMP 29 +#define LX_SCM_TIMESTAMP LX_SO_TIMESTAMP +#define LX_SO_ACCEPTCONN 30 + +#define LX_SO_PEERSEC 31 +#define LX_SO_SNDBUFFORCE 32 +#define LX_SO_RCVBUFFORCE 33 +#define LX_SO_PASSSEC 34 +#define LX_SO_TIMESTAMPNS 35 +#define LX_SCM_TIMESTAMPNS LX_SO_TIMESTAMPNS +#define LX_SO_MARK 36 +#define LX_SO_TIMESTAMPING 37 +#define LX_SCM_TIMESTAMPING LX_SO_TIMESTAMPING +#define LX_SO_PROTOCOL 38 +#define LX_SO_DOMAIN 39 +#define LX_SO_RXQ_OVFL 40 +#define LX_SO_WIFI_STATUS 41 +#define LX_SCM_WIFI_STATUS LX_SO_WIFI_STATUS +#define LX_SO_PEEK_OFF 42 +#define LX_SO_NOFCS 43 +#define LX_SO_LOCK_FILTER 44 +#define LX_SO_SELECT_ERR_QUEUE 45 +#define LX_SO_BUSY_POLL 46 +#define LX_SO_MAX_PACING_RATE 47 +#define LX_SO_BPF_EXTENSIONS 48 + +/* + * Options for use with [gs]etsockopt at the RAW level. + * IPPROTO_RAW + */ +#define LX_ICMP_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the PACKET level. + * SOL_PACKET + */ +#define LX_SOL_PACKET 263 + +#define LX_PACKET_ADD_MEMBERSHIP 1 +#define LX_PACKET_DROP_MEMBERSHIP 2 +#define LX_PACKET_RECV_OUTPUT 3 +#define LX_PACKET_RX_RING 5 +#define LX_PACKET_STATISTICS 6 + +/* + * Options for use with [gs]etsockopt at the NETLINK level. 
+ * SOL_NETLINK + */ +#define LX_SOL_NETLINK 270 + +/* + * Linux socket type definitions + */ +#define LX_SOCK_STREAM 1 /* Connection-based byte streams */ +#define LX_SOCK_DGRAM 2 /* Connectionless, datagram */ +#define LX_SOCK_RAW 3 /* Raw protocol interface */ +#define LX_SOCK_RDM 4 /* Reliably-delivered message */ +#define LX_SOCK_SEQPACKET 5 /* Sequenced packet stream */ +#define LX_SOCK_PACKET 10 /* Linux specific */ +#define LX_SOCK_MAX 11 + +/* + * The Linux socket type can be or-ed with other flags (e.g. SOCK_CLOEXEC). + */ +#define LX_SOCK_TYPE_MASK 0xf + +/* + * Linux flags for socket, socketpair and accept4. These are or-ed into the + * socket type value. In the Linux net.h header these come from fcntl.h (note + * that they are in octal in the Linux header). + */ +#define LX_SOCK_CLOEXEC 0x80000 +#define LX_SOCK_NONBLOCK 0x800 + +#define SOCK_NOTSUPPORTED -1 +#define SOCK_INVAL -2 + +/* + * PF_PACKET protocol definitions. + */ +#define LX_ETH_P_802_3 0x0001 +#define LX_ETH_P_ALL 0x0003 +#define LX_ETH_P_802_2 0x0004 +#define LX_ETH_P_IP 0x0800 +#define LX_ETH_P_ARP 0x0806 +#define LX_ETH_P_IPV6 0x86DD + +/* + * IP Protocol levels. Some of these match the Illumos IPPROTO_* values. + */ +#define LX_IPPROTO_IP 0 +#define LX_IPPROTO_ICMP 1 +#define LX_IPPROTO_IGMP 2 +#define LX_IPPROTO_TCP 6 +#define LX_IPPROTO_UDP 17 +#define LX_IPPROTO_IPV6 41 +#define LX_IPPROTO_ICMPV6 58 +#define LX_IPPROTO_RAW 255 + +/* + * Options for use with [gs]etsockopt at the IP level. 
+ * IPPROTO_IP + */ +#define LX_IP_TOS 1 +#define LX_IP_TTL 2 +#define LX_IP_HDRINCL 3 +#define LX_IP_OPTIONS 4 +#define LX_IP_ROUTER_ALERT 5 +#define LX_IP_RECVOPTS 6 +#define LX_IP_RETOPTS 7 +#define LX_IP_PKTINFO 8 +#define LX_IP_PKTOPTIONS 9 +#define LX_IP_MTU_DISCOVER 10 +#define LX_IP_RECVERR 11 +#define LX_IP_RECVTTL 12 +#define LX_IP_RECVTOS 13 +#define LX_IP_MTU 14 +#define LX_IP_FREEBIND 15 +#define LX_IP_IPSEC_POLICY 16 +#define LX_IP_XFRM_POLICY 17 +#define LX_IP_PASSSEC 18 +#define LX_IP_TRANSPARENT 19 +#define LX_IP_ORIGDSTADDR 20 +#define LX_IP_MINTTL 21 +#define LX_IP_NODEFRAG 22 +/* Linux apparently leaves a gap here */ +#define LX_IP_MULTICAST_IF 32 +#define LX_IP_MULTICAST_TTL 33 +#define LX_IP_MULTICAST_LOOP 34 +#define LX_IP_ADD_MEMBERSHIP 35 +#define LX_IP_DROP_MEMBERSHIP 36 +#define LX_IP_UNBLOCK_SOURC 37 /* NOTE(review): name lacks the final E of Linux IP_UNBLOCK_SOURCE; confirm users before renaming */ +#define LX_IP_BLOCK_SOURCE 38 +#define LX_IP_ADD_SOURCE_MEMBERSHIP 39 +#define LX_IP_DROP_SOURCE_MEMBERSHIP 40 +#define LX_IP_MSFILTER 41 +#define LX_MCAST_JOIN_GROUP 42 +#define LX_MCAST_BLOCK_SOURCE 43 +#define LX_MCAST_UNBLOCK_SOURCE 44 +#define LX_MCAST_LEAVE_GROUP 45 +#define LX_MCAST_JOIN_SOURCE_GROUP 46 +#define LX_MCAST_LEAVE_SOURCE_GROUP 47 +#define LX_MCAST_MSFILTER 48 +#define LX_IP_MULTICAST_ALL 49 +#define LX_IP_UNICAST_IF 50 + +/* + * LX_IP_MTU_DISCOVER values + */ +#define LX_IP_PMTUDISC_DONT 0 +#define LX_IP_PMTUDISC_WANT 1 +#define LX_IP_PMTUDISC_DO 2 +#define LX_IP_PMTUDISC_PROBE 3 +#define LX_IP_PMTUDISC_INTERFACE 4 +#define LX_IP_PMTUDISC_OMIT 5 + +/* + * Options for use with [gs]etsockopt at the IP level. 
+ * IPPROTO_IPV6 + */ + +#define LX_IPV6_ADDRFORM 1 +#define LX_IPV6_2292PKTINFO 2 +#define LX_IPV6_2292HOPOPTS 3 +#define LX_IPV6_2292DSTOPTS 4 +#define LX_IPV6_2292RTHDR 5 +#define LX_IPV6_2292PKTOPTIONS 6 +#define LX_IPV6_CHECKSUM 7 +#define LX_IPV6_2292HOPLIMIT 8 +#define LX_IPV6_NEXTHOP 9 +#define LX_IPV6_AUTHHDR 10 +#define LX_IPV6_UNICAST_HOPS 16 +#define LX_IPV6_MULTICAST_IF 17 +#define LX_IPV6_MULTICAST_HOPS 18 +#define LX_IPV6_MULTICAST_LOOP 19 +#define LX_IPV6_JOIN_GROUP 20 +#define LX_IPV6_LEAVE_GROUP 21 +#define LX_IPV6_ROUTER_ALERT 22 +#define LX_IPV6_MTU_DISCOVER 23 +#define LX_IPV6_MTU 24 +#define LX_IPV6_RECVERR 25 +#define LX_IPV6_V6ONLY 26 +#define LX_IPV6_JOIN_ANYCAST 27 +#define LX_IPV6_LEAVE_ANYCAST 28 +#define LX_IPV6_IPSEC_POLICY 34 +#define LX_IPV6_XFRM_POLICY 35 + +#define LX_IPV6_RECVPKTINFO 49 +#define LX_IPV6_PKTINFO 50 +#define LX_IPV6_RECVHOPLIMIT 51 +#define LX_IPV6_HOPLIMIT 52 +#define LX_IPV6_RECVHOPOPTS 53 +#define LX_IPV6_HOPOPTS 54 +#define LX_IPV6_RTHDRDSTOPTS 55 +#define LX_IPV6_RECVRTHDR 56 +#define LX_IPV6_RTHDR 57 +#define LX_IPV6_RECVDSTOPTS 58 +#define LX_IPV6_DSTOPTS 59 +#define LX_IPV6_RECVTCLASS 66 +#define LX_IPV6_TCLASS 67 + +/* + * Options for use with [gs]etsockopt at the IP level. + * IPPROTO_ICMPV6 + */ + +#define LX_ICMP6_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the TCP level. 
+ * IPPROTO_TCP + */ +#define LX_TCP_NODELAY 1 /* Don't delay send to coalesce packets */ +#define LX_TCP_MAXSEG 2 /* Set maximum segment size */ +#define LX_TCP_CORK 3 /* Control sending of partial frames */ +#define LX_TCP_KEEPIDLE 4 /* Start keepalives after this period */ +#define LX_TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define LX_TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define LX_TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define LX_TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define LX_TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define LX_TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define LX_TCP_INFO 11 /* Information about this connection. */ +#define LX_TCP_QUICKACK 12 /* Block/reenable quick ACKs. */ +#define LX_TCP_CONGESTION 13 /* Congestion control algorithm */ +#define LX_TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define LX_TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts on thin streams */ +#define LX_TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define LX_TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define LX_TCP_REPAIR 19 /* TCP socket under repair */ +#define LX_TCP_REPAIR_QUEUE 20 +#define LX_TCP_QUEUE_SEQ 21 +#define LX_TCP_REPAIR_OPTIONS 22 +#define LX_TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define LX_TCP_TIMESTAMP 24 +#define LX_TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes */ + +/* + * Options for use with [gs]etsockopt at the IGMP level. 
+ * IPPROTO_IGMP + */ +#define LX_IGMP_MINLEN 8 +#define LX_IGMP_MAX_HOST_REPORT_DELAY 10 +#define LX_IGMP_HOST_MEMBERSHIP_QUERY 0x11 +#define LX_IGMP_HOST_MEMBERSHIP_REPORT 0x12 +#define LX_IGMP_DVMRP 0x13 +#define LX_IGMP_PIM 0x14 +#define LX_IGMP_TRACE 0x15 +#define LX_IGMP_HOST_NEW_MEMBERSHIP_REPORT 0x16 +#define LX_IGMP_HOST_LEAVE_MESSAGE 0x17 +#define LX_IGMP_MTRACE_RESP 0x1e +#define LX_IGMP_MTRACE 0x1f + +/* + * Linux socket flags for use with recv(2)/send(2)/recvmsg(2)/sendmsg(2) + */ +#define LX_MSG_OOB 0x1 +#define LX_MSG_PEEK 0x2 +#define LX_MSG_DONTROUTE 0x4 +#define LX_MSG_CTRUNC 0x8 +#define LX_MSG_PROXY 0x10 +#define LX_MSG_TRUNC 0x20 +#define LX_MSG_DONTWAIT 0x40 +#define LX_MSG_EOR 0x80 +#define LX_MSG_WAITALL 0x100 +#define LX_MSG_FIN 0x200 +#define LX_MSG_SYN 0x400 +#define LX_MSG_CONFIRM 0x800 +#define LX_MSG_RST 0x1000 +#define LX_MSG_ERRQUEUE 0x2000 +#define LX_MSG_NOSIGNAL 0x4000 +#define LX_MSG_MORE 0x8000 +#define LX_MSG_WAITFORONE 0x10000 +#define LX_MSG_FASTOPEN 0x20000000 +#define LX_MSG_CMSG_CLOEXEC 0x40000000 + +typedef struct lx_msghdr { + void *msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + size_t msg_iovlen; /* # elements in msg_iov */ + void *msg_control; /* ancillary data */ + size_t msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ +} lx_msghdr_t; + + +#if defined(_LP64) + +typedef struct lx_msghdr32 { + caddr32_t msg_name; /* optional address */ + uint32_t msg_namelen; /* size of address */ + caddr32_t msg_iov; /* scatter/gather array */ + int32_t msg_iovlen; /* # elements in msg_iov */ + caddr32_t msg_control; /* ancillary data */ + uint32_t msg_controllen; /* ancillary data buffer len */ + int32_t msg_flags; /* flags on received message */ +} lx_msghdr32_t; + +#endif + +typedef struct lx_sockaddr_in6 { + sa_family_t sin6_family; + in_port_t sin6_port; + uint32_t sin6_flowinfo; + struct in6_addr 
sin6_addr; + uint32_t sin6_scope_id; /* Depends on scope of sin6_addr */ + /* one 32-bit field shorter than illumos */ +} lx_sockaddr_in6_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_SOCKET_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h new file mode 100644 index 0000000000..64084b77f1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _SYS_LINUX_SYSCALLS_H +#define _SYS_LINUX_SYSCALLS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern long lx_accept(); +extern long lx_accept4(); +extern long lx_access(); +extern long lx_arch_prctl(); +extern long lx_bind(); +extern long lx_brk(); +extern long lx_chmod(); +extern long lx_chown(); +extern long lx_chown16(); +extern long lx_clock_getres(); +extern long lx_clock_gettime(); +extern long lx_clock_settime(); +extern long lx_close(); +extern long lx_connect(); +extern long lx_epoll_create(); +extern long lx_epoll_create1(); +extern long lx_epoll_ctl(); +extern long lx_epoll_pwait(); +extern long lx_epoll_wait(); +extern long lx_faccessat(); +extern long lx_fallocate(); +extern long lx_fallocate32(); +extern long lx_fchmod(); +extern long lx_fchmodat(); +extern long lx_fchown(); +extern long lx_fchown16(); +extern long lx_fchownat(); +extern long lx_fcntl(); +extern long lx_fcntl64(); +extern long lx_fgetxattr(); +extern long lx_flistxattr(); +extern long lx_fremovexattr(); +extern long lx_fsetxattr(); +extern long lx_fstat32(); +extern long lx_fstat64(); +extern long lx_fstatat64(); +extern long lx_futex(); +extern long lx_get_robust_list(); +extern long lx_get_thread_area(); +extern long lx_getcpu(); +extern long lx_getcwd(); +extern long lx_getdents_32(); +extern long lx_getdents_64(); +extern long lx_getdents64(); +extern long lx_getpeername(); +extern long lx_getsockname(); +extern long lx_getpid(); +extern long lx_getppid(); +extern long lx_getrandom(); +extern long lx_getrlimit(); +extern long lx_getsockopt(); +extern long lx_gettid(); +extern long lx_gettimeofday(); +extern long lx_getxattr(); +extern long lx_io_setup(); +extern long lx_ioctl(); +extern long lx_ioprio_get(); +extern long lx_ioprio_set(); +extern long lx_kill(); +extern long lx_lchown(); +extern long lx_lchown16(); +extern long lx_lgetxattr(); +extern long lx_link(); +extern long lx_linkat(); +extern long lx_llistxattr(); +extern long 
lx_lremovexattr(); +extern long lx_lsetxattr(); +extern long lx_lstat32(); +extern long lx_lstat64(); +extern long lx_listxattr(); +extern long lx_mkdir(); +extern long lx_mkdirat(); +extern long lx_modify_ldt(); +extern long lx_nanosleep(); +extern long lx_oldgetrlimit(); +extern long lx_open(); +extern long lx_openat(); +extern long lx_personality(); +extern long lx_pipe(); +extern long lx_pipe2(); +extern long lx_poll(); +extern long lx_ppoll(); +extern long lx_pread(); +extern long lx_pread32(); +extern long lx_preadv(); +extern long lx_preadv32(); +extern long lx_prctl(); +extern long lx_prlimit64(); +extern long lx_pselect(); +extern long lx_ptrace(); +extern long lx_pwrite(); +extern long lx_pwrite32(); +extern long lx_pwritev(); +extern long lx_pwritev32(); +extern long lx_read(); +extern long lx_readv(); +extern long lx_recv(); +extern long lx_recvmsg(); +extern long lx_recvfrom(); +extern long lx_sched_getparam(); +extern long lx_sched_getscheduler(); +extern long lx_sched_rr_get_interval(); +extern long lx_sched_setparam(); +extern long lx_sched_setscheduler(); +extern long lx_sched_yield(); +extern long lx_select(); +extern long lx_send(); +extern long lx_sendmsg(); +extern long lx_sendto(); +extern long lx_set_robust_list(); +extern long lx_set_thread_area(); +extern long lx_set_tid_address(); +extern long lx_setresgid(); +extern long lx_setresgid16(); +extern long lx_setresuid(); +extern long lx_setresuid16(); +extern long lx_setrlimit(); +extern long lx_setxattr(); +extern long lx_setsockopt(); +extern long lx_socket(); +extern long lx_socketcall(); +extern long lx_stat32(); +extern long lx_stat64(); +extern long lx_sync_file_range(); +extern long lx_syncfs(); +extern long lx_sysinfo32(); +extern long lx_sysinfo64(); +extern long lx_removexattr(); +extern long lx_tgkill(); +extern long lx_time(); +extern long lx_tkill(); +extern long lx_uname(); +extern long lx_wait4(); +extern long lx_waitid(); +extern long lx_waitpid(); +extern long lx_write(); 
+extern long lx_writev(); + +#if defined(_LP64) +/* + * Linux vsyscall addresses: + */ +#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000 +#define LX_VSYS_time (uintptr_t)0xffffffffff600400 +#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800 + +#define LX_VSYSCALL_ADDR (uintptr_t)0xffffffffff600000 +#define LX_VSYSCALL_SIZE (uintptr_t)0x1000 +#endif + +#endif /* _KERNEL */ + +/* + * System call numbers for revectoring: + */ + +#if defined(__amd64) +#define LX_SYS_close 3 +#define LX_SYS_gettimeofday 96 +#define LX_SYS_time 201 +#define LX_SYS_io_setup 206 +#define LX_SYS_clock_gettime 228 +#define LX_SYS_getcpu 309 + +#define LX_SYS32_close 6 +#define LX_SYS32_gettimeofday 78 +#define LX_SYS32_time 13 +#define LX_SYS32_clock_gettime 265 +#define LX_SYS32_io_setup 245 +#define LX_SYS32_getcpu 318 +#elif defined(__i386) +#define LX_SYS_close 6 +#define LX_SYS_gettimeofday 78 +#define LX_SYS_time 13 +#define LX_SYS_clock_gettime 265 +#define LX_SYS_io_setup 245 +#define LX_SYS_getcpu 318 +#else +#error "Architecture not supported" +#endif /* defined(__amd64) */ + +/* + * The current code in the VDSO operates under the expectation that it will be + * mapped at a fixed offset from the comm page. This simplifies the act of + * locating said page without any other reference. The VDSO must fit within + * this offset, matching the same value as COMM_PAGE_ALIGN. + * See: uts/i86pc/sys/comm_page.h + */ +#define LX_VDSO_SIZE 0x4000 +#define LX_VDSO_ADDR_MASK ~(LX_VDSO_SIZE - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SYSCALLS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h new file mode 100644 index 0000000000..90363c8939 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_types.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LX_TYPES_H +#define _SYS_LX_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#define SHRT_MIN (-32768) /* min value of a "short int" */ +#define SHRT_MAX 32767 /* max value of a "short int" */ +#define USHRT_MAX 65535 /* max of "unsigned short int" */ +#define INT_MIN (-2147483647-1) /* min value of an "int" */ +#define INT_MAX 2147483647 /* max value of an "int" */ +#define UINT_MAX 4294967295U /* max value of an "unsigned int" */ + +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + +#if defined(_LP64) +#define LONG_MAX 9223372036854775807L +#define ULONG_MAX 18446744073709551615UL +#else +#define LONG_MAX 2147483647L /* max value of a 32-bit "long int" */ +#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */ +#endif + +#endif /* !_KERNEL */ + + +typedef uint64_t lx_dev_t; +typedef uint16_t lx_dev16_t; +typedef uint32_t lx_ino_t; +typedef uint64_t lx_ino64_t; +typedef uint32_t lx_uid_t; +typedef uint16_t lx_uid16_t; +typedef uint32_t lx_gid_t; +typedef uint16_t lx_gid16_t; +typedef uint32_t lx_off_t; +typedef uint64_t lx_off64_t; +typedef uint32_t 
lx_blksize_t; +typedef uint32_t lx_blkcnt_t; +typedef uint64_t lx_blkcnt64_t; +typedef uint32_t lx_mode_t; +typedef uint16_t lx_mode16_t; + +/* + * Linux mangles major/minor numbers into dev_t differently than SunOS. + */ +#ifdef _LP64 +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | \ + ((uint64_t)((min) & ~0xff) << 12) | ((uint64_t)((maj) & ~0xfff) << 32)) + +#define LX_GETMAJOR(lx_dev) ((((lx_dev) >> 8) & 0xfff) | \ + ((((uint64_t)(lx_dev)) >> 32) & ~0xfff)) + +#else +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | (((min) & ~0xff) << 12)) + +#define LX_GETMAJOR(lx_dev) (((lx_dev) >> 8) & 0xfff) +#endif + +#define LX_GETMINOR(lx_dev) (((lx_dev) & 0xff) | (((lx_dev) >> 12) & ~0xff)) +/* Linux supports 20 bits for the minor, and 12 bits for the major number */ +#define LX_MAXMIN 0xfffff +#define LX_MAXMAJ 0xfff + +/* + * Certain Linux tools care deeply about major/minor number mapping. + * Map virtual disks (zfs datasets, zvols, etc) into a safe reserved range. + */ +#define LX_MAJOR_DISK 203 + +/* LX ptm driver major/minor number */ +#define LX_PTM_MAJOR 5 +#define LX_PTM_MINOR 2 + +/* LX pts driver major number range */ +#define LX_PTS_MAJOR_MIN 136 +#define LX_PTS_MAJOR_MAX 143 + +/* LX tty/cons driver major number */ +#define LX_TTY_MAJOR 5 + +#define LX_UID16_TO_UID32(uid16) \ + (((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16)) + +#define LX_GID16_TO_GID32(gid16) \ + (((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16)) + +/* Overflow values default to NFS nobody. */ + +#define UID16_OVERFLOW ((lx_uid16_t)65534) +#define GID16_OVERFLOW ((lx_gid16_t)65534) + +/* + * All IDs with high word non-zero are converted to default overflow values to + * avoid inadvertent truncation to zero (root) (!). + */ +#define LX_UID32_TO_UID16(uid32) \ + ((((uid32) & 0xffff0000) == 0) ? ((lx_uid16_t)(uid32)) : \ + (((uid32) == ((lx_uid_t)-1)) ? 
((lx_uid16_t)-1) : UID16_OVERFLOW)) + +#define LX_GID32_TO_GID16(gid32) \ + ((((gid32) & 0xffff0000) == 0) ? ((lx_gid16_t)(gid32)) : \ + (((gid32) == ((lx_gid_t)-1)) ? ((lx_gid16_t)-1) : GID16_OVERFLOW)) + +#define LX_32TO64(lo, hi) \ + ((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32))) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_TYPES_H */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_access.c b/usr/src/uts/common/brand/lx/syscall/lx_access.c new file mode 100644 index 0000000000..24805a5e96 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_access.c @@ -0,0 +1,224 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + * + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred_impl.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/file.h> +#include <fs/fs_subr.h> +#include <c2/audit.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +/* + * Determine accessibility of file. 
+ */ + +#define E_OK 010 /* use effective ids */ +#define R_OK 004 +#define W_OK 002 +#define X_OK 001 + +/* + * Convert Linux LX_AT_* flags to SunOS AT_* flags but skip verifying allowed + * flags have been passed. This also allows EACCESS/REMOVEDIR to be translated + * correctly since on Linux they have the same value. + * + * Some code can actually pass in other bits in the flag. We may have to simply + * ignore these, as indicated by the enforce parameter. See lx_fchmodat for + * another example of this type of behavior. Returns the translated flag set, or -EINVAL when enforce is set and untranslated bits remain in lflag. + */ +static int +ltos_at_flag(int lflag, int allow, boolean_t enforce) +{ + int sflag = 0; + + if ((lflag & LX_AT_EACCESS) && (allow & AT_EACCESS)) { + lflag &= ~LX_AT_EACCESS; + sflag |= AT_EACCESS; + } + + if ((lflag & LX_AT_REMOVEDIR) && (allow & AT_REMOVEDIR)) { + lflag &= ~LX_AT_REMOVEDIR; + sflag |= AT_REMOVEDIR; + } + + if ((lflag & LX_AT_SYMLINK_NOFOLLOW) && (allow & AT_SYMLINK_NOFOLLOW)) { + lflag &= ~LX_AT_SYMLINK_NOFOLLOW; + sflag |= AT_SYMLINK_NOFOLLOW; + } + + /* right now solaris doesn't have a _FOLLOW flag, so use a fake one */ + if ((lflag & LX_AT_SYMLINK_FOLLOW) && (allow & LX_AT_SYMLINK_FOLLOW)) { + lflag &= ~LX_AT_SYMLINK_FOLLOW; + sflag |= LX_AT_SYMLINK_FOLLOW; + } + + /* If lflag is not zero then some flags did not hit the above code. */ + if (enforce && lflag) + return (-EINVAL); /* negative, so callers can test the result for < 0 */ + + return (sflag); +} + +/* + * For illumos, access() does this: + * If the process has appropriate privileges, an implementation may indicate + * success for X_OK even if none of the execute file permission bits are set. + * + * But for Linux, access() does this: + * If the calling process is privileged (i.e., its real UID is zero), then + * an X_OK check is successful for a regular file if execute permission is + * enabled for any of the file owner, group, or other. + * + * Linux used to behave more like illumos on older kernels: + * In kernel 2.4 (and earlier) there is some strangeness in the handling 
If all categories of execute permission + * are disabled for a nondirectory file, then the only access() test that + * returns -1 is when mode is specified as just X_OK; if R_OK or W_OK is + * also specified in mode, then access() returns 0 for such files. + * + * So we need to handle the case where a privileged process is checking for + * X_OK but none of the execute bits are set on the file. We'll keep the old + * 2.4 behavior for 2.4 emulation but use the new behavior for any other + * kernel rev. + */ +static int +lx_common_access(char *fname, int fmode, vnode_t *startvp) +{ + vnode_t *vp; + cred_t *tmpcr; + int error; + int mode; + cred_t *cr; + int estale_retry = 0; + + if (fmode & ~(E_OK|R_OK|W_OK|X_OK)) + return (EINVAL); + + mode = ((fmode & (R_OK|W_OK|X_OK)) << 6); + + cr = CRED(); + + /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */ + if ((fmode & E_OK) != 0 || + (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid)) { + tmpcr = cr; + crhold(tmpcr); + } else { + tmpcr = crdup(cr); + tmpcr->cr_uid = cr->cr_ruid; + tmpcr->cr_gid = cr->cr_rgid; + tmpcr->cr_ruid = cr->cr_uid; + tmpcr->cr_rgid = cr->cr_gid; + } + +lookup: + if ((error = lookupnameatcred(fname, UIO_USERSPACE, FOLLOW, NULLVPP, + &vp, startvp, tmpcr)) != 0) { + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto lookup; + crfree(tmpcr); + return (error); + } + + if (mode != 0) { + error = VOP_ACCESS(vp, mode, 0, tmpcr, NULL); + if (error != 0) { + if ((error == ESTALE) && + fs_need_estale_retry(estale_retry++)) { + VN_RELE(vp); + goto lookup; + } + + } else if ((fmode & X_OK) != 0 && cr->cr_ruid == 0 && + lx_kern_release_cmp(curproc->p_zone, "2.4.0") > 0) { + /* check for incorrect execute success */ + vattr_t va; + + va.va_mask = AT_MODE; + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) == 0) { + mode_t m = VTTOIF(va.va_type) | va.va_mode; + + if ((m & S_IFMT) == S_IFREG && + !(m & (S_IXUSR | S_IXGRP | S_IXOTH))) { + /* no execute bits set in the mode */ 
+ error = EACCES; + } + } + } + } + + crfree(tmpcr); + VN_RELE(vp); + return (error); +} + +int +lx_faccessat(int atfd, char *fname, int fmode, int flag) +{ + vnode_t *startvp; + int error; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + if ((flag = ltos_at_flag(flag, AT_EACCESS, B_FALSE)) < 0) + return (set_errno(EINVAL)); + + if (fname == NULL) + return (set_errno(EFAULT)); + if ((error = fgetstartvp(atfd, fname, &startvp)) != 0) + return (set_errno(error)); + if (AU_AUDITING() && startvp != NULL) + audit_setfsat_path(1); + + /* Do not allow E_OK unless AT_EACCESS flag is set */ + if ((flag & AT_EACCESS) == 0) + fmode &= ~E_OK; + + error = lx_common_access(fname, fmode, startvp); + if (startvp != NULL) + VN_RELE(startvp); + if (error) + return (set_errno(error)); + return (0); +} + +int +lx_access(char *fname, int fmode) +{ + return (lx_faccessat(LX_AT_FDCWD, fname, fmode, 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_aio.c b/usr/src/uts/common/brand/lx/syscall/lx_aio.c new file mode 100644 index 0000000000..12f37ea4c7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_aio.c @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> + +/* + * lx_io_setup: kernel-side entry for the Linux io_setup call. It records + * that this process has used AIO by setting LX_PROC_AIO_USED in the + * process's l_flags (under p_lock), then forwards the call, with nr_events + * and ctxp as its two arguments, to the userspace emulation: + * lx_emulate_user32() when the caller's data model is not native on an + * _LP64 kernel, lx_emulate_user() otherwise. br_eosys is set to JUSTRETURN + * before the hand-off. The trailing return is marked NOTREACHED, so + * presumably the hand-off does not come back here; confirm against + * lx_emulate_user(). + */ +long +lx_io_setup(unsigned int nr_events, void **ctxp) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + uintptr_t uargs[2] = {(uintptr_t)nr_events, (uintptr_t)ctxp}; + + mutex_enter(&curproc->p_lock); + lxpd->l_flags |= LX_PROC_AIO_USED; + mutex_exit(&curproc->p_lock); + + ttolxlwp(curthread)->br_eosys = JUSTRETURN; +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(ttolwp(curthread), LX_SYS32_io_setup, uargs); + } else +#endif + { + lx_emulate_user(ttolwp(curthread), LX_SYS_io_setup, uargs); + } + /* NOTREACHED */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c new file mode 100644 index 0000000000..19a7577ac0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> + +/* + * The brk() system call needs to be in-kernel because Linux expects a call to + * brk(0) to return the current breakpoint. In Solaris, the process breakpoint + * is setup and managed by libc. Due to the way we link our libraries and the + * need for Linux to manage its own breakpoint, this has to remain in the + * kernel. + */ +extern int brk(caddr_t); +/* + * lx_brk: when nva is non-zero, ask the native brk() to move the break to + * nva, discard its return value and clear lwp_errno so that a failure is + * not reported as an error. In every case (including nva == 0) the return + * value is the current break, computed as p_brkbase + p_brksize of the + * calling process. Presumably a failed brk() leaves those fields untouched, + * so the caller sees failure only as an unchanged break address; confirm + * against the native brk() implementation. + */ +long +lx_brk(caddr_t nva) +{ + proc_t *p = curproc; + klwp_t *lwp = ttolwp(curthread); + + if (nva != 0) { + (void) brk(nva); + + /* + * Despite claims to the contrary in the manpage, when Linux + * brk() fails, errno is left unchanged. + */ + lwp->lwp_errno = 0; + } + return ((long)(p->p_brkbase + p->p_brksize)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c new file mode 100644 index 0000000000..7783b97cb0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/thread.h> +#include <sys/klwp.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +long +lx_vn_chmod(vnode_t *vp, int mode) +{ + vattr_t vattr; + + vattr.va_mode = mode & MODEMASK; + vattr.va_mask = AT_MODE; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +static long +lx_fchmodat_wrapper(int fd, char *path, int mode) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, 0)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchmodat(int fd, char *path, int mode) +{ + return (lx_fchmodat_wrapper(fd, path, mode)); +} + +long +lx_fchmod(int fd, int mode) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchmod cannot leverage + * lx_fchmodat with a NULL path since the desired behavior differs. 
+ */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_chmod(char *path, int mode) +{ + return (lx_fchmodat_wrapper(LX_AT_FDCWD, path, mode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c new file mode 100644 index 0000000000..830fba0a73 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c @@ -0,0 +1,180 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> + +long +lx_vn_chown(vnode_t *vp, uid_t uid, gid_t gid) +{ + vattr_t vattr; + zone_t *zone = crgetzone(CRED()); + + if ((uid != (uid_t)-1 && !VALID_UID(uid, zone)) || + (gid != (gid_t)-1 && !VALID_GID(gid, zone))) { + return (EINVAL); + } + vattr.va_uid = uid; + vattr.va_gid = gid; + vattr.va_mask = 0; + if (vattr.va_uid != -1) + vattr.va_mask |= AT_UID; + if (vattr.va_gid != -1) + vattr.va_mask |= AT_GID; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +long +lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, native_flag)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchown_wrapper(int fd, uid_t uid, gid_t gid) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchown cannot leverage + * lx_fchownat with a NULL path since the desired behavior differs. 
+ */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag) +{ + int native_flag = 0; + + if (flag & LX_AT_EMPTY_PATH) { + char c; + + /* + * According to fchownat(2), when AT_EMPTY_PATH is set: "if + * path is an empty string, operate on the file referred to by + * fd". We pass NULL in place of the empty string, which + * causes fchownat() to operate on the fd we passed without an + * additional lookup. + */ + if (copyin(path, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + path = NULL; + } + + flag &= ~LX_AT_EMPTY_PATH; + } + if (flag & LX_AT_SYMLINK_NOFOLLOW) { + flag &= ~LX_AT_SYMLINK_NOFOLLOW; + native_flag |= AT_SYMLINK_NOFOLLOW; + } + if (flag != 0) { + return (set_errno(EINVAL)); + } + + return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag)); +} + +long +lx_fchown(int fd, uid_t uid, gid_t gid) +{ + return (lx_fchown_wrapper(fd, uid, gid)); +} + +long +lx_lchown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, + AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0)); +} + +long +lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchown_wrapper(fd, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid))); +} + +long +lx_lchown16(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), 0)); +} 
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c new file mode 100644 index 0000000000..50cdeaeab9 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> + +/* + * Our lwp has already been created at this point, so this routine is + * responsible for setting up all the state needed to track this as a + * linux cloned thread. 
+ */ +/* ARGSUSED */ +int +lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + struct lx_proc_data *lproc = ttolxproc(curthread); + struct ldt_info info; + struct user_desc descr; + int tls_index; + int entry = -1; + int signo; + + signo = flags & LX_CSIGNAL; + if (signo < 0 || signo > LX_NSIG) + return (set_errno(EINVAL)); + + if (!(flags & LX_CLONE_THREAD)) { + lproc->l_signal = signo; + } else { + if (flags & LX_CLONE_SETTLS) { + if (get_udatamodel() == DATAMODEL_ILP32) { + if (copyin((caddr_t)tls, &info, sizeof (info))) + return (set_errno(EFAULT)); + + if (LDT_INFO_EMPTY(&info)) + return (set_errno(EINVAL)); + + entry = info.entry_number; + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + tls_index = entry - GDT_TLSMIN; + + /* + * Convert the user-space structure into a real + * x86 descriptor and copy it into this LWP's + * TLS array. We also load it into the GDT. + */ + LDT_INFO_TO_DESC(&info, &descr); + bcopy(&descr, &lwpd->br_tls[tls_index], + sizeof (descr)); + lx_set_gdt(entry, &lwpd->br_tls[tls_index]); + } else { + /* + * Set the Linux %fsbase for this LWP. We will + * restore it the next time we return to Linux + * via setcontext()/lx_restorecontext(). + */ + lwpd->br_lx_fsbase = (uintptr_t)tls; + } + } + + lwpd->br_clear_ctidp = + (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL; + + if (signo && ! 
(flags & LX_CLONE_DETACH)) + lwpd->br_signal = signo; + else + lwpd->br_signal = 0; + + if (flags & LX_CLONE_THREAD) + lwpd->br_tgid = curthread->t_procp->p_pid; + + if (flags & LX_CLONE_PARENT) + lwpd->br_ppid = 0; + + if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) && + (suword32(ctidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) && + (suword32(ptidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + } + + *rval = lwpd->br_pid; + return (0); +} + +long +lx_set_tid_address(int *tidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + long rv; + + lwpd->br_clear_ctidp = tidp; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + rv = lwpd->br_pid; + } + + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_close.c b/usr/src/uts/common/brand/lx/syscall/lx_close.c new file mode 100644 index 0000000000..8df0cbbe2f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_close.c @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> + + +extern int close(int); +/* + * lx_close: close file descriptor fdes on behalf of a Linux process. + * LX_PROC_AIO_USED is sampled from the process's l_flags under p_lock. If + * it is clear, the native close() is called directly and its result is + * returned. If it is set, br_eosys is set to JUSTRETURN and the call is + * forwarded, with fdes as its only argument, to the userspace emulation + * (lx_emulate_user32() for a non-native data model on an _LP64 kernel, + * lx_emulate_user() otherwise). The trailing return is marked NOTREACHED, + * so presumably the hand-off does not come back here; confirm against + * lx_emulate_user(). + */ +long +lx_close(int fdes) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + boolean_t aio_used; + uintptr_t uargs[1] = {(uintptr_t)fdes}; + + mutex_enter(&curproc->p_lock); + aio_used = ((lxpd->l_flags & LX_PROC_AIO_USED) != 0); + mutex_exit(&curproc->p_lock); + + if (!aio_used) { + return (close(fdes)); + } + + /* + * If the process potentially has any AIO contexts open, the userspace + * emulation must be used so that libc can properly maintain its state. + */ + + ttolxlwp(curthread)->br_eosys = JUSTRETURN; +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(ttolwp(curthread), LX_SYS32_close, uargs); + } else +#endif + { + lx_emulate_user(ttolwp(curthread), LX_SYS_close, uargs); + } + /* NOTREACHED */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c new file mode 100644 index 0000000000..ec8b7576d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * We support neither the second argument (NUMA node), nor the third (obsolete + * pre-2.6.24 caching functionality which was ultimately broken). 
+ */ +long +lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3) +{ + unsigned int curcpu = curthread->t_cpu->cpu_id; + + /* + * Report the id of the CPU the calling thread is on (t_cpu->cpu_id) + * through the user pointer cpu; p2 and p3 are accepted but ignored. + * Returns 0 on success, or sets errno to EFAULT when the copyout + * fails. Linux getcpu(2) writes nothing through a NULL pointer, so + * a caller passing cpu == NULL must succeed rather than fail the + * copyout with EFAULT. + */ + if (cpu != NULL && + copyout(&curcpu, cpu, sizeof (curcpu)) != 0) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_epoll.c b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c new file mode 100644 index 0000000000..62a0eccf4b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c @@ -0,0 +1,272 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/epoll.h> +#include <sys/devpoll.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/vnode.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_signal.h> + +static major_t devpoll_major = 0; + +static boolean_t +lx_epoll_isvalid(file_t *fp) +{ + vnode_t *vp = fp->f_vnode; + + if (vp->v_type == VCHR && getmajor(vp->v_rdev) == devpoll_major) + return (B_TRUE); + return (B_FALSE); +} + +long +lx_epoll_create1(int flags) +{ + int err, fd, rv; + int fmode = FREAD | FWRITE; + boolean_t cloexec = B_FALSE; + vnode_t *vp = NULL; + file_t *fp = NULL; + + if (flags & EPOLL_CLOEXEC) { + cloexec = B_TRUE; + flags &= ~EPOLL_CLOEXEC; + } + if (flags != 0) { + /* No other flags accepted at this time */ + return (set_errno(EINVAL)); + } + + if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) { + err = 
EMFILE; + goto error; + } + if (ldi_vp_from_name("/devices/pseudo/poll@0:poll", &vp) != 0) { + err = ENOENT; + goto error; + } + if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) { + goto error; + } + err = VOP_IOCTL(vp, DP_EPOLLCOMPAT, 0, fmode, CRED(), &rv, NULL); + if (err != 0) { + (void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL); + goto error; + } + + devpoll_major = getmajor(vp->v_rdev); + + fp->f_vnode = vp; + mutex_exit(&fp->f_tlock); + setf(fd, fp); + if (cloexec) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); + +error: + if (fp != NULL) { + setf(fd, NULL); + unfalloc(fp); + } + if (vp != NULL) { + VN_RELE(vp); + } + return (set_errno(err)); +} + +long +lx_epoll_create(int size) +{ + if (size <= 0) { + return (set_errno(EINVAL)); + } + + return (lx_epoll_create1(0)); +} + + +/* Match values from libc implementation */ +#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP) +#define EPOLLSWIZZLED \ + (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM) + +long +lx_epoll_ctl(int fd, int op, int pfd, void *event) +{ + epoll_event_t epevent; + dvpoll_epollfd_t dpevent[2]; + file_t *fp; + iovec_t aiov; + uio_t auio; + uint32_t events, ev = 0; + int error = 0, i = 0; + + dpevent[i].dpep_pollfd.fd = pfd; + switch (op) { + case EPOLL_CTL_DEL: + dpevent[i].dpep_pollfd.events = POLLREMOVE; + break; + + case EPOLL_CTL_MOD: + /* + * In the modify case, we pass down two events: one to + * remove the event and another to add it back. + */ + dpevent[i++].dpep_pollfd.events = POLLREMOVE; + dpevent[i].dpep_pollfd.fd = pfd; + /* FALLTHROUGH */ + + case EPOLL_CTL_ADD: + if (copyin(event, &epevent, sizeof (epevent)) != 0) + return (set_errno(EFAULT)); + + /* + * Mask off the events that we ignore, and then swizzle the + * events for which our values differ from their epoll(7) + * equivalents. 
+ */ + events = epevent.events; + ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED); + + if (events & EPOLLRDHUP) + ev |= POLLRDHUP; + if (events & EPOLLET) + ev |= POLLET; + if (events & EPOLLONESHOT) + ev |= POLLONESHOT; + if (events & EPOLLWRNORM) + ev |= POLLWRNORM; + if (events & EPOLLWRBAND) + ev |= POLLWRBAND; + + dpevent[i].dpep_data = epevent.data.u64; + dpevent[i].dpep_pollfd.events = ev; + break; + + default: + return (set_errno(EINVAL)); + } + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if (!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + + aiov.iov_base = (void *)dpevent; + aiov.iov_len = sizeof (dvpoll_epollfd_t) * (i + 1); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = aiov.iov_len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_loffset = 0; + auio.uio_fmode = fp->f_flag; + + error = VOP_WRITE(fp->f_vnode, &auio, 1, fp->f_cred, NULL); + + releasef(fd); + if (error) + return (set_errno(error)); + return (0); +} + +long +lx_epoll_wait(int fd, void *events, int maxevents, int timeout) +{ + struct dvpoll arg; + file_t *fp; + int rv = 0, error, flag; + + if (maxevents <= 0) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if (!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, DP_POLL, (uintptr_t)&arg, flag, + fp->f_cred, &rv, NULL); + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (rv); +} + +long +lx_epoll_pwait(int fd, void *events, int maxevents, int timeout, void *sigmask) +{ + struct dvpoll arg; + file_t *fp; + int rv = 0, error, flag; + k_sigset_t ksig; + + if (maxevents <= 0) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if 
(!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + if (sigmask != NULL) { + lx_sigset_t lsig; + + if (copyin(sigmask, &lsig, sizeof (lsig)) != 0) { + releasef(fd); + return (set_errno(EFAULT)); + } + lx_ltos_sigset(&lsig, &ksig); + arg.dp_setp = (sigset_t *)&ksig; + } else { + arg.dp_setp = NULL; + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, DP_PPOLL, (uintptr_t)&arg, flag, + fp->f_cred, &rv, NULL); + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c new file mode 100644 index 0000000000..338e4399fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c @@ -0,0 +1,251 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/nbmlock.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/sdt.h> + +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); + +#define LX_FALLOC_FL_KEEP_SIZE 0x01 +#define LX_FALLOC_FL_PUNCH_HOLE 0x02 +#define LX_FALLOC_FL_NO_HIDE_STALE 0x04 +#define LX_FALLOC_FL_COLLAPSE_RANGE 0x08 +#define LX_FALLOC_FL_ZERO_RANGE 0x10 + +#define LX_FALLOC_VALID (LX_FALLOC_FL_KEEP_SIZE | LX_FALLOC_FL_PUNCH_HOLE | \ + LX_FALLOC_FL_NO_HIDE_STALE | LX_FALLOC_FL_COLLAPSE_RANGE | \ + LX_FALLOC_FL_ZERO_RANGE) + +#define LX_FALLOC_UNSUPP (LX_FALLOC_FL_NO_HIDE_STALE | \ + LX_FALLOC_FL_COLLAPSE_RANGE) + +long +lx_fallocate(int fd, int mode, off_t offset, off_t len) +{ + int error = 0; + file_t *fp; + vnode_t *vp; + int64_t tot; + struct flock64 bf; + vattr_t vattr; + u_offset_t f_offset; + boolean_t in_crit = B_FALSE; + + /* + * Error checking is in a specific order to make LTP happy. + */ + + tot = offset + len; + if (tot > (LLONG_MAX / (int64_t)1024)) + return (set_errno(EFBIG)); + + if (mode & LX_FALLOC_UNSUPP) + return (set_errno(EOPNOTSUPP)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto done; + } + + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = EINVAL; + goto done; + } + + if (offset < 0 || len <= 0) { + error = EINVAL; + goto done; + } + + if (tot < 0LL) { + error = EFBIG; + goto done; + } + + if ((mode & ~LX_FALLOC_VALID) != 0) { + error = EINVAL; + goto done; + } + + /* + * If this is the only flag then we don't actually do any work. 
+ */ + if (mode == LX_FALLOC_FL_KEEP_SIZE) + goto done; + + bzero(&bf, sizeof (bf)); + + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) + goto done; + + if (mode == 0) { + /* Nothing to do if not extending the file */ + if (vattr.va_size >= tot) + goto done; + + /* Extend the file. */ + bf.l_start = (off64_t)tot; + bf.l_len = (off64_t)0; + + } else if (mode & LX_FALLOC_FL_PUNCH_HOLE) { + /* + * Deallocate space in the file. + */ + if ((mode & LX_FALLOC_FL_KEEP_SIZE) == 0) { + /* this flag is required with punch hole */ + error = EINVAL; + goto done; + } + + if (mode & + ~(LX_FALLOC_FL_PUNCH_HOLE | LX_FALLOC_FL_KEEP_SIZE)) { + error = EINVAL; + goto done; + } + + /* Make sure we don't extend since keep_size is set. */ + if (vattr.va_size < tot) { + if (offset > vattr.va_size) + goto done; + len = (off_t)vattr.va_size - offset; + } + + bf.l_start = (off64_t)offset; + bf.l_len = (off64_t)len; + + } else if (mode & LX_FALLOC_FL_ZERO_RANGE) { + /* + * Zero out the space in the file. + */ + if (mode & + ~(LX_FALLOC_FL_ZERO_RANGE | LX_FALLOC_FL_KEEP_SIZE)) { + error = EINVAL; + goto done; + } + + /* Make sure we don't extend when keep_size is set. */ + if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) { + if (offset > vattr.va_size) + goto done; + len = vattr.va_size - offset; + } + + bf.l_start = (off64_t)offset; + bf.l_len = (off64_t)len; + } else { + /* We should have already handled all flags */ + VERIFY(0); + } + + /* + * Check for locks in the range. + */ + f_offset = fp->f_offset; + error = flock_check(vp, &bf, f_offset, MAXOFF_T); + if (error != 0) + goto done; + + /* + * Check for conflicting non-blocking mandatory locks. + * We need to get the size again under nbl_start_crit. 
+ */ + if (nbl_need_check(vp)) { + u_offset_t begin; + ssize_t length; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) + goto done; + + /* + * Make sure we don't extend when keep_size is set. + */ + if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) { + ASSERT(mode & (LX_FALLOC_FL_PUNCH_HOLE | + LX_FALLOC_FL_ZERO_RANGE)); + + /* + * If the size grew we can short-circuit the rest of + * the work, otherwise adjust bf for the vop_space + * call. + */ + if (offset >= vattr.va_size) + goto done; + len = vattr.va_size - offset; + bf.l_len = (off64_t)len; + } + + if (offset > vattr.va_size) { + begin = vattr.va_size; + length = offset - vattr.va_size; + } else { + begin = offset; + length = vattr.va_size - offset; + } + + if (nbl_conflict(vp, NBL_WRITE, begin, length, 0, NULL)) { + error = EACCES; + goto done; + } + } + + error = VOP_SPACE(vp, F_FREESP, &bf, 0, f_offset, fp->f_cred, NULL); + +done: + if (in_crit) + nbl_end_crit(vp); + + releasef(fd); + if (error != 0) + return (set_errno(error)); + + return (0); +} + +long +lx_fallocate32(int fd, int mode, uint32_t offl, uint32_t offh, uint32_t lenl, + uint32_t lenh) +{ + int64_t offset = 0, len = 0; + + /* + * From 32-bit callers, Linux passes the 64-bit offset and len by + * concatenating consecutive arguments. We must perform the same + * conversion here. + */ + offset = offh; + offset = offset << 32; + offset |= offl; + len = lenh; + len = len << 32; + len |= lenl; + + return (lx_fallocate(fd, mode, (off_t)offset, (off_t)len)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c new file mode 100644 index 0000000000..2699b9bac7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c @@ -0,0 +1,644 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/pathname.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/fs/fifonode.h> +#include <sys/strsubr.h> +#include <sys/stream.h> + +extern int fcntl(int, int, intptr_t); +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); + + +int +lx_vp_at(int fd, char *upath, vnode_t **vpp, int flag) +{ + vnode_t *startvp; + int error; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + if ((error = fgetstartvp(fd, upath, &startvp)) != 0) { + return (error); + } + + if (upath != NULL) { + uio_seg_t seg = UIO_USERSPACE; + + error = lookupnameat(upath, seg, + (flag == AT_SYMLINK_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW, + NULLVPP, vpp, startvp); + if (startvp != NULL) { + VN_RELE(startvp); + } + return (error); + } else { + /* VN_HOLD was established in fgetstartvp */ + *vpp = startvp; + VERIFY(*vpp); + return (0); + } +} + +#define LTOS_FLOCK(l, s) \ +{ \ + s->l_type = ltos_type(l->l_type); \ + s->l_whence = l->l_whence; \ + s->l_start = l->l_start; \ + s->l_len = l->l_len; \ + s->l_sysid = 0; /* not defined in linux */ \ + s->l_pid = (pid_t)l->l_pid; \ +} + +#define STOL_FLOCK(s, l) \ +{ \ + l->l_type = stol_type(s->l_type); \ + l->l_whence = s->l_whence; \ + l->l_start = s->l_start; \ + l->l_len = s->l_len; \ + l->l_pid = (int)s->l_pid; \ +} + +static short +ltos_type(short l_type) +{ + switch (l_type) { + case LX_F_RDLCK: + return (F_RDLCK); + case LX_F_WRLCK: + return (F_WRLCK); + case LX_F_UNLCK: + return (F_UNLCK); + default: + return (-1); + } +} + +static short +stol_type(short l_type) +{ + switch (l_type) { + case F_RDLCK: + return (LX_F_RDLCK); + case F_WRLCK: + return (LX_F_WRLCK); + case F_UNLCK: + return (LX_F_UNLCK); + default: + /* can't ever happen */ + return (0); + } +} + +static void +ltos_flock(struct lx_flock *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock(struct flock64 *s, struct lx_flock *l) +{ + STOL_FLOCK(s, l) +} + +static void +ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock64(struct flock64 *s, struct lx_flock64_32 *l) +{ + STOL_FLOCK(s, l) +} + +static int +lx_fcntl_getfl(int fd) +{ + int retval; + int rc; + + retval = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + if ((retval & O_ACCMODE) == O_RDONLY) + rc = LX_O_RDONLY; + else if ((retval & O_ACCMODE) == O_WRONLY) + rc = LX_O_WRONLY; + else + rc = LX_O_RDWR; + /* O_NDELAY != O_NONBLOCK, so we need to check for both */ + if (retval & O_NDELAY) + rc |= LX_O_NDELAY; + if (retval & O_NONBLOCK) + rc |= LX_O_NONBLOCK; + if (retval & 
O_APPEND) + rc |= LX_O_APPEND; + if (retval & O_SYNC) + rc |= LX_O_SYNC; + if (retval & O_LARGEFILE) + rc |= LX_O_LARGEFILE; + if (retval & FASYNC) + rc |= LX_O_ASYNC; + + return (rc); +} + +static int +lx_fcntl_setfl(int fd, ulong_t arg) +{ + int new_arg; + + new_arg = 0; + /* LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one */ + if (arg & LX_O_NDELAY) + new_arg |= O_NONBLOCK; + if (arg & LX_O_APPEND) + new_arg |= O_APPEND; + if (arg & LX_O_SYNC) + new_arg |= O_SYNC; + if (arg & LX_O_LARGEFILE) + new_arg |= O_LARGEFILE; + if (arg & LX_O_ASYNC) + new_arg |= FASYNC; + + return (fcntl(fd, F_SETFL, new_arg)); +} + +/* The default unprivileged limit in Linux is 1MB */ +static int lx_pipe_max_size = 1048576; + +static int +lx_fcntl_pipesz(int fd, int cmd, ulong_t arg) +{ + file_t *fp; + vnode_t *vp; + stdata_t *str; + int err = 0, res = 0; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VFIFO || vp->v_op != fifo_vnodeops) { + err = EBADF; + goto out; + } + VERIFY((str = vp->v_stream) != NULL); + + if (cmd == LX_F_SETPIPE_SZ) { + stdata_t *mate; + intptr_t val = arg; + + if (val < PAGESIZE || val > lx_pipe_max_size) { + err = EINVAL; + goto out; + } + if (!STRMATED(str)) { + err = strqset(RD(str->sd_wrq), QHIWAT, 0, val); + goto out; + } + + /* + * Ensure consistent order so the set operation is always + * attempted on the "higher" stream first. + */ + if (str > str->sd_mate) { + VERIFY((mate = str->sd_mate) != NULL); + } else { + mate = str; + VERIFY((str = mate->sd_mate) != NULL); + } + + /* + * While it is unfortunate that an error could occur for the + * latter half of the stream pair, there is little to be done + * about it aside from reporting the failure. 
+ */ + if ((err = strqset(RD(str->sd_wrq), QHIWAT, 0, val)) != 0) { + goto out; + } + err = strqset(RD(mate->sd_wrq), QHIWAT, 0, val); + } else if (cmd == LX_F_GETPIPE_SZ) { + size_t val; + + err = strqget(RD(str->sd_wrq), QHIWAT, 0, &val); + res = val; + } else { + /* NOTREACHED */ + ASSERT(0); + } + +out: + releasef(fd); + if (err != 0) { + return (set_errno(err)); + } + return (res); +} + +static int +lx_fcntl_common(int fd, int cmd, ulong_t arg) +{ + int rc = 0; + pid_t pid; + int error; + int rv; + int32_t flag; + file_t *fp; + + /* + * We depend on the call to fcntl to set the errno if necessary. + */ + ttolwp(curthread)->lwp_errno = 0; + + switch (cmd) { + case LX_F_SETSIG: + case LX_F_GETSIG: + case LX_F_SETLEASE: + case LX_F_GETLEASE: + case LX_F_NOTIFY: + case LX_F_CANCELLK: + { + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "unsupported fcntl command: %d", cmd); + lx_unsupported(buf); + } + return (set_errno(ENOTSUP)); + + case LX_F_DUPFD: + rc = fcntl(fd, F_DUPFD, arg); + break; + + case LX_F_DUPFD_CLOEXEC: + rc = fcntl(fd, F_DUPFD_CLOEXEC, arg); + break; + + case LX_F_GETFD: + rc = fcntl(fd, F_GETFD, 0); + break; + + case LX_F_SETFD: + rc = fcntl(fd, F_SETFD, arg); + break; + + case LX_F_GETFL: + rc = lx_fcntl_getfl(fd); + break; + + case LX_F_SETFL: + rc = lx_fcntl_setfl(fd, arg); + break; + + case LX_F_SETOWN: + pid = (pid_t)arg; + if (pid == 1) { + /* Setown for the init process uses the real pid. */ + pid = curzone->zone_proc_initpid; + } + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) { + /* + * On illumos F_SETOWN is only defined for sockets, but + * some apps hardcode to do this fcntl on other devices + * (e.g. /dev/tty) to setup signal handling. If the + * app is only setting itself to be the signal + * handler, we pretend to succeed. 
+ */ + if (error != EINVAL || + curthread->t_procp->p_pid != pid) { + return (set_errno(error)); + } + } + + rc = 0; + break; + + case LX_F_GETOWN: + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) + return (set_errno(error)); + + if (pid == curzone->zone_proc_initpid) { + /* Getown for the init process returns 1. */ + pid = 1; + } + + rc = pid; + break; + + case LX_F_SETPIPE_SZ: + case LX_F_GETPIPE_SZ: + rc = lx_fcntl_pipesz(fd, cmd, arg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rc); +} + +static int +lx_fcntl_lock_cmd_to_s(int lx_cmd) +{ + switch (lx_cmd) { + case LX_F_GETLK: + return (F_GETLK); + case LX_F_SETLK: + return (F_SETLK); + case LX_F_SETLKW: + return (F_SETLKW); + case LX_F_GETLK64: + return (F_GETLK64); + case LX_F_SETLK64: + return (F_SETLK64); + case LX_F_SETLKW64: + return (F_SETLKW64); + default: + VERIFY(0); + /*NOTREACHED*/ + return (0); + } +} + +/* + * This is a pain but we can't re-use the fcntl code for locking since it does + * its own copyin/copyout for the flock struct. Since we have to convert the + * struct we have to do our own copyin/out. Thus we replicate the fcntl code for + * these 3 cmds. Luckily it's not much. 
+ */ +static int +lx_fcntl_lock(int fd, int lx_cmd, void *arg) +{ + int cmd; + int error = 0; + file_t *fp; + vnode_t *vp; + int flag; + offset_t maxoffset; + u_offset_t offset; + model_t datamodel; + lx_flock_t lxflk; + lx_flock64_32_t lxflk64; + struct flock64 bf; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + maxoffset = MAXOFF_T; + datamodel = DATAMODEL_NATIVE; +#if defined(_SYSCALL32_IMPL) + if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32) + maxoffset = MAXOFF32_T; +#endif + vp = fp->f_vnode; + flag = fp->f_flag; + offset = fp->f_offset; + + cmd = lx_fcntl_lock_cmd_to_s(lx_cmd); + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + + lxflk.l_type = lxflk32.l_type; + lxflk.l_whence = lxflk32.l_whence; + lxflk.l_start = (off64_t)lxflk32.l_start; + lxflk.l_len = (off64_t)lxflk32.l_len; + lxflk.l_pid = lxflk32.l_pid; + } +#endif /* _SYSCALL32_IMPL */ + + ltos_flock(&lxflk, &bf); + + if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) + break; + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock(&bf, &lxflk); + + /* + * If no lock is found, only the type field is changed. 
+ */ + if (lxflk.l_type == LX_F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type, + sizeof (lxflk.l_type))) + error = EFAULT; + break; + } + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (datamodel == DATAMODEL_NATIVE) { + if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) { + error = EOVERFLOW; + break; + } + + lxflk32.l_type = lxflk.l_type; + lxflk32.l_whence = lxflk.l_whence; + lxflk32.l_start = lxflk.l_start; + lxflk32.l_len = lxflk.l_len; + lxflk32.l_pid = lxflk.l_pid; + + if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + } +#endif /* _SYSCALL32_IMPL */ + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + /* + * Large File support is only used for ILP32 apps. + */ + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + + if (cmd == F_GETLK64) + cmd = F_GETLK; + else if (cmd == F_SETLK64) + cmd = F_SETLK; + else if (cmd == F_SETLKW64) + cmd = F_SETLKW; + + if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) { + error = EFAULT; + break; + } + + ltos_flock64(&lxflk64, &bf); + + if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) + break; + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock64(&bf, &lxflk64); + + /* + * If no lock is found, only the type field is changed. 
+ */ + if (lxflk64.l_type == LX_F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&lxflk64.l_type, + &((lx_flock64_t *)arg)->l_type, + sizeof (lxflk64.l_type))) + error = EFAULT; + break; + } + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) { + error = EFAULT; + break; + } + break; + } + + releasef(fd); + if (error) + return (set_errno(error)); + + return (0); +} + +long +lx_fcntl(int fd, int cmd, intptr_t arg) +{ + switch (cmd) { + case LX_F_GETLK64: + case LX_F_SETLK64: + case LX_F_SETLKW64: + /* The 64-bit fcntl commands must go through fcntl64(). */ + return (set_errno(EINVAL)); + + case LX_F_GETLK: + case LX_F_SETLK: + case LX_F_SETLKW: + return (lx_fcntl_lock(fd, cmd, (void *)arg)); + + default: + return (lx_fcntl_common(fd, cmd, arg)); + } +} + +long +lx_fcntl64(int fd, int cmd, intptr_t arg) +{ + switch (cmd) { + case LX_F_GETLK: + case LX_F_SETLK: + case LX_F_SETLKW: + case LX_F_GETLK64: + case LX_F_SETLKW64: + case LX_F_SETLK64: + return (lx_fcntl_lock(fd, cmd, (void *)arg)); + + default: + return (lx_fcntl_common(fd, cmd, (ulong_t)arg)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c new file mode 100644 index 0000000000..e7648e1fc3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c @@ -0,0 +1,1104 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/page.h> +#include <sys/priv.h> +#include <sys/mman.h> +#include <sys/timer.h> +#include <sys/condvar.h> +#include <sys/inttypes.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_futex.h> +#include <sys/lx_impl.h> + +/* + * Futexes are a Linux-specific implementation of inter-process mutexes. + * They are designed to use shared memory for simple, uncontested + * operations, and rely on the kernel to resolve any contention issues. + * + * Most of the information in this section comes from the paper "Futexes + * Are Tricky", by Ulrich Drepper. This paper is currently available at: + * http://people.redhat.com/~drepper/futex.pdf. + * + * A futex is itself a 4-byte integer, which must be 4-byte aligned. The + * value of this integer is expected to be modified using user-level atomic + * operations. The futex(4) design itself does not impose any semantic + * constraints on the value stored in the futex; it is up to the + * application to define its own protocol. + * + * When the application decides that kernel intervention is required, it + * will use the futex(2) system call. There are 5 different operations + * that can be performed on a futex, using this system call.
Since this + * interface has evolved over time, there are several different prototypes + * available to the user. Fortunately, there is only a single kernel-level + * interface: + * + * long sys_futex(void *futex1, int cmd, int val1, + * struct timespec *timeout, void *futex2, int val2) + * + * The kernel-level operations that may be performed on a futex are: + * + * FUTEX_WAIT + * + * Atomically verify that futex1 contains the value val1. If it + * doesn't, return EWOULDBLOCK. If it does contain the expected + * value, the thread will sleep until somebody performs a FUTEX_WAKE + * on the futex. The caller may also specify a timeout, indicating + * the maximum time the thread should sleep. If the timer expires, + * the call returns ETIMEDOUT. If the thread is awoken with a signal, + * the call returns EINTR. Otherwise, the call returns 0. + * + * FUTEX_WAKE + * + * Wake up val1 processes that are waiting on futex1. The call + * returns the number of blocked threads that were woken up. + * + * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET + * + * Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument + * denoting a bit vector, with wakers only waking waiters that match + * in one or more bits. These semantics are dubious enough, but the + * interface has an inconsistency that is glaring even by the + * embarrassingly low standards that Linux sets for itself: the timeout + * argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for + * FUTEX_WAIT. And as if that weren't enough unnecessary complexity, + * the caller may specify this absolute timeout to be against either + * CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET, + * of course! + * + * FUTEX_WAKE_OP + * + * The implementation of a condition variable in terms of futexes + * actually uses two futexes: one to assure sequential access and one to + * represent the condition variable.
This implementation gives rise to a + * particular performance problem whereby a thread is awoken on the futex + * that represents the condition variable only to have to (potentially) + * immediately wait on the futex that protects the condition variable. + * (Do not confuse the futex that serves to protect the condition variable + * with the pthread_mutex_t associated with pthread_cond_t -- which + * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP + * was invented, which performs an atomic compare-and-exchange on a + * second address in a specified fashion (that is, with a specified + * operation). Here are the possible operations (OPARG is defined + * to be a 12-bit value embedded in the operation): + * + * - FUTEX_OP_SET: Sets the value at the second address to OPARG + * - FUTEX_OP_ADD: Adds the value to OPARG + * - FUTEX_OP_OR: OR's the value with OPARG + * - FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG + * - FUTEX_OP_XOR: XOR's the value with OPARG + * + * After this compare-and-exchange on the second address, a FUTEX_WAKE is + * performed on the first address and -- if the compare-and-exchange + * matches a specified result based on a specified comparison operation -- + * a FUTEX_WAKE is performed on the second address.
Here are the possible + * comparison operations: + * + * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake + * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake + * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake + * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake + * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake + * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake + * + * As a practical matter, the only way that this is used (or, some might + * argue, is usable) is by the implementation of pthread_cond_signal(), + * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the + * futex that protects the condition variable and wake the futex that + * represents the condition variable. The second wake-up is conditional + * because the futex that protects the condition variable (rather than the + * one that represents it) may or may not have waiters. Given that this + * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for + * five different kinds of operations and six different kinds of + * comparison operations, in practice only one is used. (Namely, setting + * to 0 and waking if the old value is greater than 1 -- which denotes + * that waiters are present and the wakeup should be performed.) Moreover, + * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the + * case that the pthread_mutex_t associated with the pthread_cond_t is + * held at the time of a pthread_cond_signal(), this entire mechanism is + * essentially for naught in this case.
As one can imagine (and can + * verify on just about any source base that uses pthread_cond_signal()), + * it is overwhelmingly the common case that the lock associated with the + * pthread_cond_t is held at the time of pthread_cond_signal(), assuring + * that the problem that all of this complexity was designed to solve + * isn't, in fact, solved because the signalled thread simply wakes up + * only to block again on the held mutex. Cue a slow clap! + * + * FUTEX_CMP_REQUEUE + * + * If the value stored in futex1 matches that passed in in val2, wake + * up val1 processes that are waiting on futex1. Otherwise, return + * EAGAIN. + * + * If there are more than val1 threads waiting on the futex, remove + * the remaining threads from this futex, and requeue them on futex2. + * The caller can limit the number of threads being requeued by + * encoding an integral numerical value in the position usually used + * for the timeout pointer. + * + * The call returns the number of blocked threads that were woken up + * or requeued. + * + * FUTEX_REQUEUE + * + * Identical to FUTEX_CMP_REQUEUE except that it does not use val2. + * This command has been declared broken and obsolete, but we still + * need to support it. + * + * FUTEX_FD + * + * Return a file descriptor, which can be used to refer to the futex. + * This operation was broken by design, and was blessedly removed in + * Linux 2.6.26 ("because it was inherently racy"); it should go without + * saying that we don't support this operation. + */ + +/* + * This structure is used to track all the threads currently waiting on a + * futex. There is one fwaiter_t for each blocked thread. We store all + * fwaiter_t's in a hash structure, indexed by the memid_t of the integer + * containing the futex's value. + * + * At the moment, all fwaiter_t's for a single futex are simply dumped into + * the hash bucket. If futex contention ever becomes a hot path, we can + * chain a single futex's waiters together. 
+ */ +typedef struct fwaiter { + memid_t fw_memid; /* memid of the user-space futex */ + kcondvar_t fw_cv; /* cond var */ + struct fwaiter *fw_next; /* hash queue */ + struct fwaiter *fw_prev; /* hash queue */ + uint32_t fw_bits; /* bits waiting on */ + volatile int fw_woken; +} fwaiter_t; + +/* + * The structure of the robust_list, as set with the set_robust_list() system + * call. See lx_futex_robust_exit(), below, for details. + */ +typedef struct futex_robust_list { + uintptr_t frl_head; /* list of robust locks held */ + uint64_t frl_offset; /* offset of lock word within a lock */ + uintptr_t frl_pending; /* pending operation */ +} futex_robust_list_t; + +#if defined(_SYSCALL32_IMPL) + +#pragma pack(4) +typedef struct futex_robust_list32 { + uint32_t frl_head; /* list of robust locks held */ + uint32_t frl_offset; /* offset of lock word within a lock */ + uint32_t frl_pending; /* pending operation */ +} futex_robust_list32_t; +#pragma pack() + +#endif + +#define MEMID_COPY(s, d) \ + { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; } +#define MEMID_EQUAL(s, d) \ + ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1]) + +/* + * Because collisions on this hash table can be a source of negative + * scalability, we make it pretty large: 4,096 entries -- 64K. If this + * size is found to be insufficient, the size should be made dynamic. + * (Making it dynamic will be delicate because the per-chain locking will + * necessitate memory retiring or similar; see the 2008 ACM Queue article + * "Real-world concurrency" for details on this technique.) 
+ */ +#define HASH_SHIFT_SZ 12 +#define HASH_SIZE (1 << HASH_SHIFT_SZ) +#define HASH_FUNC(id) \ + ((((uintptr_t)((id)->val[1]) >> 3) + \ + ((uintptr_t)((id)->val[1]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[1]) >> (3 + 2 * HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> 3) + \ + ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \ + (HASH_SIZE - 1)) + +/* + * We place the per-chain lock next to the pointer to the chain itself. + * When compared to an array of orthogonal locks, this reduces false sharing + * (though adjacent entries can still be falsely shared -- just not as many), + * while having the additional bonus of increasing locality. + */ +typedef struct futex_hash { + kmutex_t fh_lock; + fwaiter_t *fh_waiters; +} futex_hash_t; + +static futex_hash_t futex_hash[HASH_SIZE]; + +static void +futex_hashin(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + fwp->fw_prev = NULL; + fwp->fw_next = futex_hash[index].fh_waiters; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp; + futex_hash[index].fh_waiters = fwp; +} + +static void +futex_hashout(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + if (fwp->fw_prev) + fwp->fw_prev->fw_next = fwp->fw_next; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp->fw_prev; + if (futex_hash[index].fh_waiters == fwp) + futex_hash[index].fh_waiters = fwp->fw_next; + + fwp->fw_prev = NULL; + fwp->fw_next = NULL; +} + +/* + * Go to sleep until somebody does a WAKE operation on this futex, we get a + * signal, or the timeout expires. 
+ */ +static int +futex_wait(memid_t *memid, caddr_t addr, + int val, timespec_t *timeout, uint32_t bits) +{ + kthread_t *t = curthread; + int err, ret; + int32_t curval; + fwaiter_t fw; + int index; + + /* + * The LMS_USER_LOCK micro state becomes valid if we sleep; otherwise + * our time will accrue against LMS_SYSTEM. Use of this micro state + * is modelled on lwp_mutex_timedlock(), a native analogue of + * futex_wait(). + */ + (void) new_mstate(t, LMS_USER_LOCK); + + fw.fw_woken = 0; + fw.fw_bits = bits; + + MEMID_COPY(memid, &fw.fw_memid); + cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL); + + index = HASH_FUNC(&fw.fw_memid); + mutex_enter(&futex_hash[index].fh_lock); + + if (fuword32(addr, (uint32_t *)&curval)) { + err = set_errno(EFAULT); + goto out; + } + if (curval != val) { + err = set_errno(EWOULDBLOCK); + goto out; + } + + futex_hashin(&fw); + + err = 0; + while ((fw.fw_woken == 0) && (err == 0)) { + ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash[index].fh_lock, + timeout, timechanged); + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* + * According to signal(7), a futex(2) call with the + * FUTEX_WAIT operation is restartable. + */ + ttolxlwp(t)->br_syscall_restart = B_TRUE; + err = set_errno(EINTR); + } + } + + /* + * The futex is normally hashed out in wakeup. If we timed out or + * got a signal, we need to hash it out here instead. + */ + if (fw.fw_woken == 0) + futex_hashout(&fw); + +out: + mutex_exit(&futex_hash[index].fh_lock); + + return (err); +} + +/* + * Wake up to wake_threads threads that are blocked on the futex at memid. 
+ */ +static int +futex_wake(memid_t *memid, int wake_threads, uint32_t mask) +{ + fwaiter_t *fwp, *next; + int index; + int ret = 0; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash[index].fh_lock); + + for (fwp = futex_hash[index].fh_waiters; + fwp != NULL && ret < wake_threads; fwp = next) { + next = fwp->fw_next; + if (MEMID_EQUAL(&fwp->fw_memid, memid) && + (fwp->fw_bits & mask)) { + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + ret++; + } + } + + mutex_exit(&futex_hash[index].fh_lock); + + return (ret); +} + +static int +futex_wake_op_execute(int32_t *addr, int32_t val3) +{ + int32_t op = FUTEX_OP_OP(val3); + int32_t cmp = FUTEX_OP_CMP(val3); + int32_t cmparg = FUTEX_OP_CMPARG(val3); + int32_t oparg, oldval, newval; + label_t ljb; + int rval; + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + if (on_fault(&ljb)) + return (set_errno(EFAULT)); + + oparg = FUTEX_OP_OPARG(val3); + + do { + oldval = *addr; + newval = oparg; + + switch (op) { + case FUTEX_OP_SET: + break; + + case FUTEX_OP_ADD: + newval += oparg; + break; + + case FUTEX_OP_OR: + newval |= oparg; + break; + + case FUTEX_OP_ANDN: + newval &= ~oparg; + break; + + case FUTEX_OP_XOR: + newval ^= oparg; + break; + + default: + no_fault(); + return (set_errno(EINVAL)); + } + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + no_fault(); + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + rval = (oldval == cmparg); + break; + + case FUTEX_OP_CMP_NE: + rval = (oldval != cmparg); + break; + + case FUTEX_OP_CMP_LT: + rval = (oldval < cmparg); + break; + + case FUTEX_OP_CMP_LE: + rval = (oldval <= cmparg); + break; + + case FUTEX_OP_CMP_GT: + rval = (oldval > cmparg); + break; + + case FUTEX_OP_CMP_GE: + rval = (oldval >= cmparg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rval); +} + +static int +futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2, + int wake_threads, int wake_threads2, int val3) +{ + 
kmutex_t *l1, *l2; + int ret = 0, ret2 = 0, wake; + fwaiter_t *fwp, *next; + int index1, index2; + + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(memid2); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + /* LINTED: alignment */ + if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0) + goto out; + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret >= wake_threads) { + break; + } + } + + if (!wake) + goto out; + + for (fwp = futex_hash[index2].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid2)) + continue; + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret2 >= wake_threads2) { + break; + } + } + + ret += ret2; +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + return (ret); +} + +/* + * Wake up to wake_threads waiting on the futex at memid. If there are + * more than that many threads waiting, requeue the remaining threads on + * the futex at requeue_memid. + */ +static int +futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads, + ulong_t requeue_threads, caddr_t addr, int *cmpval) +{ + fwaiter_t *fwp, *next; + int index1, index2; + int ret = 0; + int32_t curval; + kmutex_t *l1, *l2; + + /* + * To ensure that we don't miss a wakeup if the value of cmpval + * changes, we need to grab locks on both the original and new hash + * buckets. To avoid deadlock, we always grab the lower-indexed + * lock first. 
+ */ + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(requeue_memid); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + if (cmpval != NULL) { + if (fuword32(addr, (uint32_t *)&curval)) { + ret = -EFAULT; + goto out; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out; + } + } + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + futex_hashout(fwp); + if (ret++ < wake_threads) { + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } else { + MEMID_COPY(requeue_memid, &fwp->fw_memid); + futex_hashin(fwp); + + if ((ret - wake_threads) >= requeue_threads) + break; + } + } + +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + if (ret < 0) + return (set_errno(-ret)); + return (ret); +} + +/* + * Copy in the relative timeout provided by the application and convert it + * to an absolute timeout. Sadly, this is complicated by the different + * timeout of semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET. (Yes, you read + * that correctly; FUTEX_WAIT and FUTEX_WAIT_BITSET have different timeout + * semantics; see the block comment at the top of the file for commentary + * on this inanity.) 
+ */ +static int +get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd, int clock) +{ + timestruc_t now; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(lx_timeout, timeout, sizeof (timestruc_t))) + return (EFAULT); + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t))) + return (EFAULT); + timeout->tv_sec = (time_t)timeout32.tv_sec; + timeout->tv_nsec = timeout32.tv_nsec; + } +#endif + if (itimerspecfix(timeout)) + return (EINVAL); + + if (cmd == FUTEX_WAIT) { + /* + * We've been given a relative time; add it to the current + * time to derive an absolute time. + */ + gethrestime(&now); + timespecadd(timeout, &now); + } else { + /* + * This is a FUTEX_WAIT_BITSET operation, which (1) specifies + * the timeout as an absolute rather than a relative timeout + * and (2) allows for different clock types to be specified. + * If the clock is CLOCK_REALTIME, we actually have nothing + * to do -- but if this is CLOCK_MONOTONIC, we need to convert + * our absolute time back into a relative time and then add + * it to our current hrestime to get an absolute CLOCK_REALTIME + * timeout. + */ + if (clock == CLOCK_MONOTONIC) { + /* + * Get our current time, and subtract it from our + * timeout to get the relative value. + */ + hrt2ts(gethrtime(), &now); + timespecsub(timeout, &now); + + /* + * If our timeout is in the past, set it to be 0. + */ + if (timeout->tv_sec < 0) { + timeout->tv_sec = 0; + timeout->tv_nsec = 0; + } + + /* + * Add the relative time back into the current time. 
+ */ + gethrestime(&now); + timespecadd(timeout, &now); + } + } + + return (0); +} + +long +lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val3) +{ + struct as *as = curproc->p_as; + memid_t memid, memid2; + timestruc_t timeout; + timestruc_t *tptr = NULL; + int val2 = NULL; + int rval = 0; + int cmd = op & FUTEX_CMD_MASK; + int private = op & FUTEX_PRIVATE_FLAG; + char dmsg[32]; + + /* must be aligned on int boundary */ + if (addr & 0x3) + return (set_errno(EINVAL)); + + /* Sanity check the futex command */ + if (cmd < 0 || cmd > FUTEX_MAX_CMD) + return (set_errno(EINVAL)); + + if (cmd == FUTEX_FD) { + /* + * FUTEX_FD was sentenced to death for grievous crimes of + * semantics against humanity; it has been ripped out of Linux + * and will never be supported by us. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + /* + * These are operations that we don't currently support, but + * may well need to in the future. For now, callers need to + * deal with these being missing -- but if and as that changes, + * they may well need to be implemented. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) { + /* + * Linux only allows FUTEX_CLOCK_REALTIME to be set on the + * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands. + */ + return (set_errno(ENOSYS)); + } + + /* Copy in the timeout structure from userspace. */ + if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET) && + lx_timeout != NULL) { + rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd, + op & FUTEX_CLOCK_REALTIME ? 
CLOCK_REALTIME : + CLOCK_MONOTONIC); + + if (rval != 0) + return (set_errno(rval)); + tptr = &timeout; + } + + switch (cmd) { + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + /* + * lx_timeout is nominally a pointer to a userspace address. + * For several commands, however, it actually contains + * an additional integer parameter. This is horrible, and + * the people who did this to us should be sorry. + */ + val2 = (int)lx_timeout; + } + + /* + * Translate the process-specific, user-space futex virtual + * address(es) to a universal memid. If the private bit is set, we + * can just use our as plus the virtual address, saving quite a bit + * of effort. + */ + if (private) { + memid.val[0] = (uintptr_t)as; + memid.val[1] = (uintptr_t)addr; + } else { + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) + return (set_errno(rval)); + } + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) { + if (addr2 & 0x3) + return (set_errno(EINVAL)); + + if (private) { + memid2.val[0] = (uintptr_t)as; + memid2.val[1] = (uintptr_t)addr2; + } else { + rval = as_getmemid(as, (void *)addr2, &memid2); + if (rval) + return (set_errno(rval)); + } + } + + switch (cmd) { + case FUTEX_WAIT: + rval = futex_wait(&memid, (void *)addr, val, + tptr, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAIT_BITSET: + rval = futex_wait(&memid, (void *)addr, val, tptr, val3); + break; + + case FUTEX_WAKE: + rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAKE_BITSET: + rval = futex_wake(&memid, val, val3); + break; + + case FUTEX_WAKE_OP: + rval = futex_wake_op(&memid, (void *)addr2, &memid2, + val, val2, val3); + break; + + case FUTEX_CMP_REQUEUE: + case FUTEX_REQUEUE: + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, &val3); + + break; + } + + return (rval); +} + +/* + * Does the dirty work of actually dropping a held robust lock in the event + * of the untimely death of the owner; see 
lx_futex_robust_exit(), below. + */ +static void +lx_futex_robust_drop(uintptr_t addr, uint32_t tid) +{ + memid_t memid; + uint32_t oldval, newval; + + VERIFY(addr + sizeof (uint32_t) < KERNELBASE); + + do { + fuword32_noerr((void *)addr, &oldval); + + if ((oldval & FUTEX_TID_MASK) != tid) + return; + + newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + /* + * We have now denoted that this lock's owner is dead; we need to + * wake any waiters. + */ + if (as_getmemid(curproc->p_as, (void *)addr, &memid) != 0) + return; + + (void) futex_wake(&memid, 1, FUTEX_BITSET_MATCH_ANY); +} + +/* + * Called when a thread is exiting. The role of the kernel is very clearly + * spelled out in the Linux design document entitled robust-futex-ABI.txt: + * we must (carefully!) iterate over the list of held locks pointed to by + * the robust list head; for each lock, we'll check to see if the calling + * (exiting) thread is the owner, and if so, denote that the lock is dead + * and wake any waiters. (The "pending" field of the head points to a lock + * that is in transition; it should be dropped if held.) If there are any + * errors through here at all (including memory operations), we abort the + * entire operation. + */ +void +lx_futex_robust_exit(uintptr_t addr, uint32_t tid) +{ + futex_robust_list_t list; + uintptr_t entry, next; + model_t model = get_udatamodel(); + int length = 0; + label_t ljb; + + if (on_fault(&ljb)) + return; + + if (addr + sizeof (futex_robust_list_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + copyin_noerr((void *)addr, &list, sizeof (list)); + } +#if defined(_SYSCALL32_IMPL) + else { + futex_robust_list32_t list32; + + copyin_noerr((void *)addr, &list32, sizeof (list32)); + list.frl_head = list32.frl_head; + list.frl_offset = list32.frl_offset; + list.frl_pending = list32.frl_pending; + } +#endif + + /* + * Strip off the PI bit, if any. 
+ */ + entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI; + + while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) { + if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + fulword_noerr((void *)entry, &next); + } +#if defined(_SYSCALL32_IMPL) + else { + uint32_t next32; + fuword32_noerr((void *)entry, &next32); + next = next32; + } +#endif + + /* + * Drop the robust mutex -- but only if our pending lock didn't + * somehow sneak on there. + */ + if (entry != list.frl_pending) + lx_futex_robust_drop(entry + list.frl_offset, tid); + + entry = next & ~FUTEX_LOCK_PI; + } + + /* + * Finally, drop the pending lock if there is one. + */ + if (list.frl_pending != NULL && list.frl_pending + + list.frl_offset + sizeof (uint32_t) < KERNELBASE) + lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid); + +out: + no_fault(); +} + +long +lx_set_robust_list(void *listp, size_t len) +{ + proc_t *p = curproc; + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (len != sizeof (futex_robust_list_t)) + return (set_errno(EINVAL)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (len != sizeof (futex_robust_list32_t)) + return (set_errno(EINVAL)); + } +#endif + + /* + * To assure that we are serialized with respect to any racing call + * to lx_get_robust_list(), we lock ourselves to set the value. (Note + * that sprunlock() drops p_lock.) 
+ */ + mutex_enter(&p->p_lock); + sprlock_proc(p); + lwpd->br_robust_list = listp; + sprunlock(p); + + return (0); +} + +long +lx_get_robust_list(pid_t pid, void **listp, size_t *lenp) +{ + model_t model = get_udatamodel(); + pid_t rpid; + id_t rtid; + proc_t *rproc; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + kthread_t *rthr; + void *list; + int err = 0; + + if (pid == 0) { + /* + * A pid of 0 denotes the current thread; we lock the current + * process even though it isn't strictly necessary (we can't + * race with set_robust_list() because a thread may only set + * its robust list on itself). + */ + rproc = curproc; + rlwpd = lwptolxlwp(ttolwp(curthread)); + mutex_enter(&curproc->p_lock); + sprlock_proc(rproc); + } else { + if (lx_lpid_to_spair(pid, &rpid, &rtid) != 0 || + (rproc = sprlock(rpid)) == NULL) { + /* + * We couldn't find the specified process. + */ + return (set_errno(ESRCH)); + } + + if (rproc->p_model != model || + (rthr = idtot(rproc, rtid)) == NULL || + (rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL) { + /* + * The target process does not match our data model, or + * we couldn't find the LWP, or the target process is + * not branded. + */ + err = ESRCH; + goto out; + } + } + + if (curproc != rproc && + priv_proc_cred_perm(curproc->p_cred, rproc, NULL, VREAD) != 0) { + /* + * We don't have the permission to examine the target. 
+ */ + err = EPERM; + goto out; + } + + list = rlwpd->br_robust_list; + +out: + sprunlock(rproc); + + if (err != 0) + return (set_errno(err)); + + if (model == DATAMODEL_NATIVE) { + if (sulword(listp, (uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (sulword(lenp, sizeof (futex_robust_list_t)) != 0) + return (set_errno(EFAULT)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (suword32(listp, (uint32_t)(uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0) + return (set_errno(EFAULT)); + } +#endif + + return (0); +} + +void +lx_futex_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + mutex_init(&futex_hash[i].fh_lock, NULL, MUTEX_DEFAULT, NULL); +} + +int +lx_futex_fini(void) +{ + int i, err; + + err = 0; + for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { + mutex_enter(&futex_hash[i].fh_lock); + if (futex_hash[i].fh_waiters != NULL) + err = EBUSY; + mutex_exit(&futex_hash[i].fh_lock); + } + return (err); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c new file mode 100644 index 0000000000..7fcc594d81 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/pathname.h> + +/* + * getcwd() - Linux syscall semantics are slightly different; we need to return + * the length of the pathname copied (+ 1 for the terminating NULL byte.) 
+ */ +long +lx_getcwd(char *buf, int size) +{ + int len; + int error; + vnode_t *vp; + char path[MAXPATHLEN + 1]; + + vp = PTOU(curproc)->u_cdir; + VN_HOLD(vp); + if ((error = vnodetopath(NULL, vp, path, sizeof (path), CRED())) != 0) { + VN_RELE(vp); + return (set_errno(error)); + } + VN_RELE(vp); + + len = strlen(path) + 1; + if (len > size) + return (set_errno(ERANGE)); + + if (copyout(path, buf, len) != 0) + return (set_errno(EFAULT)); + + return (len); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c new file mode 100644 index 0000000000..102d521e02 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c @@ -0,0 +1,350 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/vnode.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> + +#include <sys/lx_types.h> +#include <sys/lx_misc.h> + +#define LX_NAMEMAX 256 + +#define LX_GETDENTS_MAX_BUFSZ 65536 + +/* + * Because the Linux dirent has an extra field (d_type), it's possible that + * each entry will be 8 bytes larger (and aligned to 8 bytes) due to padding. + * To prevent overrun during translation, the illumos-native buffer is sized + * pessimistically. + */ +#define LTOS_GETDENTS_BUFSZ(bufsz, datasz) \ + (((bufsz) / (((datasz) + 15) & ~7)) * sizeof (struct dirent)) + +/* + * Record must be long enough to house d_name string, null terminator and + * d_type field. It's then padded to nearest 8-byte boundary + */ +#define LX_RECLEN(l, t) \ + ((offsetof(t, d_name) + 2 + (l) + 7) & ~7) + +/* + * Bytes after d_name string until d_reclen should be zeroed. + * Includes zero-terminating d_name + */ +#define LX_ZEROLEN(l, t) \ + (LX_RECLEN(l, t) - \ + ((offsetof(t, d_name) + (l)))) + +/* The output format of getdents differs if the caller is 32 or 64 bit. 
*/ +struct lx_dirent_32 { + uint32_t d_ino; + int32_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +struct lx_dirent_64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +static long +lx_getdents_common(int fd, caddr_t uptr, size_t count, + unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int)) +{ + vnode_t *vp; + file_t *fp; + struct uio auio; + struct iovec aiov; + int error; + int sbufsz, lbufsz, bufsz; + void *lbuf, *sbuf; + size_t outb = 0; + + if (count < lx_size) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VDIR) { + releasef(fd); + return (set_errno(ENOTDIR)); + } + if (!(fp->f_flag & FREAD)) { + releasef(fd); + return (set_errno(EBADF)); + } + + if (count > LX_GETDENTS_MAX_BUFSZ) { + /* + * If the target buffer passed to us is huge, keep the + * translation buffers moderate in size. Iteration will be + * used to fill the request. + */ + lbufsz = LX_GETDENTS_MAX_BUFSZ; + sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size); + } else if (count < (lx_size + MAXPATHLEN)) { + /* + * If the target buffer is tiny, allocate a Linux-format buffer + * big enough to hold at least one max-length row while keeping + * the illumos-format buffer pesimistic in size. + * + * Assuming the buffer is truely tiny, it's likely that the + * result will not fit and an EINVAL will be tossed. 
+ */ + lbufsz = (lx_size + MAXPATHLEN); + sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)), + sizeof (struct dirent)); + } else { + lbufsz = count; + sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size); + } + bufsz = sbufsz; + lbuf = kmem_alloc(lbufsz, KM_SLEEP); + sbuf = kmem_alloc(sbufsz, KM_SLEEP); + + aiov.iov_base = sbuf; + aiov.iov_len = sbufsz; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = sbufsz; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + + /* + * Since we use a conservative buffer allocation for the differing + * struct sizing and Linux places fewer limits on getdents buffers in + * general, there's a chance we'll undershoot on the record count. + * When this happens, we can simply repeat the READDIR operation until + * the available records are exhausted or we've filled the user buffer. + */ + while (1) { + int at_eof, res; + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); + error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0); + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); + if (error != 0 || auio.uio_resid == sbufsz) { + break; + } + res = outcb(sbuf, lbuf, bufsz - auio.uio_resid); + VERIFY(res <= lbufsz); + if (res == 0) { + /* no records to copyout from this batch */ + break; + } else if (res > count) { + /* + * For very small buffer sizes, it's possible that a + * single record is too large due to a long filename. + */ + error = EINVAL; + break; + } + + VERIFY(outb + res <= count); + if (copyout(lbuf, (void *)(uptr + outb), res) != 0) { + error = EFAULT; + break; + } + outb += res; + + if (at_eof != 0 || (count - outb) < (lx_size + MAXPATHLEN)) { + /* + * If there are no records left or the remaining buffer + * space is not large enough to hold a max-length + * filename, do not continue iteration. + */ + break; + } + + /* + * We undershot the request buffer. + * Reset for another READDIR, taking care not to overshoot. 
+ */ + bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size)); + auio.uio_resid = bufsz; + aiov.iov_len = bufsz; + aiov.iov_base = sbuf; + } + + kmem_free(lbuf, lbufsz); + kmem_free(sbuf, sbufsz); + + if (error) { + releasef(fd); + return (set_errno(error)); + } + + fp->f_offset = auio.uio_loffset; + releasef(fd); + return (outb); +} + + +static int +lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent_32 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent_32 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_32); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_32)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +static int +lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent_64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent_64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_64); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_64)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +long +lx_getdents_32(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_32), lx_getdents_format32)); +} + 
+long +lx_getdents_64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_64), lx_getdents_format64)); +} + +struct lx_dirent64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + uchar_t d_type; + char d_name[1]; +}; + +#define LX_RECLEN64(namelen) \ + ((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7) + +#define LX_ZEROLEN64(namelen) \ + (LX_RECLEN64(namelen) - \ + ((offsetof(struct lx_dirent64, d_name) + (namelen)))) + +static int +lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + ld->d_type = 0; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN64(namelen); + /* Zero out any alignment padding */ + bzero(ld->d_name + namelen, LX_ZEROLEN64(namelen)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + + +long +lx_getdents64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent64), lx_getdents64_format)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c new file mode 100644 index 0000000000..c2506f52c5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * return the pid + */ +long +lx_getpid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + long rv; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + VERIFY(lwpd != NULL); + + if (lwpd->br_lx_thunk_pid != 0) { + rv = lwpd->br_lx_thunk_pid; + } else { + rv = lwpd->br_tgid; + } + } + + return (rv); +} + +/* + * return the parent pid + */ +long +lx_getppid(void) +{ + return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL)); +} + +/* + * return the thread id + */ +long +lx_gettid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + return (lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c new file mode 100644 index 0000000000..acc4073483 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * From "uts/common/syscall/getrandom.c": + */ +extern int getrandom(void *, size_t, int); + +long +lx_getrandom(void *bufp, size_t buflen, int flags) +{ + /* + * According to signal(7), calls to getrandom(2) are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (getrandom(bufp, buflen, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c new file mode 100644 index 0000000000..baa41f52fa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -0,0 +1,296 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/zone.h> +#include <sys/cred_impl.h> +#include <sys/policy.h> + +typedef ushort_t l_uid16_t; +typedef ushort_t l_gid16_t; +typedef uint_t l_uid_t; +typedef uint_t l_gid_t; + +#define LINUX_UID16_TO_UID32(uid16) \ + (((uid16) == (l_uid16_t)-1) ? ((l_uid_t)-1) : (l_uid_t)(uid16)) + +#define LINUX_GID16_TO_GID32(gid16) \ + (((gid16) == (l_gid16_t)-1) ? ((l_gid_t)-1) : (l_gid_t)(gid16)) + +#define LX_NGROUPS_MAX 32 +extern int setgroups(int, gid_t *); + +/* + * This function is based on setreuid in common/syscall/uid.c and exists + * because illumos does not have a way to explicitly set the saved uid (suid) + * from any other system call. + */ +long +lx_setresuid(l_uid_t ruid, l_uid_t euid, l_uid_t suid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + int uidchge = 0; + uid_t oldruid = ruid; + cred_t *cr, *newcr; + zoneid_t zoneid = getzoneid(); + + if ((ruid != -1 && (ruid > MAXUID)) || + (euid != -1 && (euid > MAXUID)) || + (suid != -1 && (suid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (ruid != -1 && + ruid != cr->cr_ruid && ruid != cr->cr_uid && + ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) { + error = EPERM; + } else if (euid != -1 && + euid != cr->cr_ruid && euid != cr->cr_uid && + euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) { + error = EPERM; + } else if (suid != -1 && + suid != cr->cr_ruid && suid != cr->cr_uid && + suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) { + error = EPERM; + } else { + if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) { + /* + * The ruid of the process is going to change. 
In order + * to avoid a race condition involving the + * process count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. + */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(ruid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (euid != -1) + newcr->cr_uid = euid; + if (suid != -1) + newcr->cr_suid = suid; + if (ruid != -1) { + oldruid = newcr->cr_ruid; + newcr->cr_ruid = ruid; + ASSERT(ruid != oldruid ? uidchge : 1); + } + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_uid != newcr->cr_uid || + cr->cr_ruid != newcr->cr_ruid || + cr->cr_suid != newcr->cr_suid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + /* + * We decrement the number of processes associated with the oldruid + * to match the increment above, even if the ruid of the process + * did not change or an error occurred (oldruid == uid). 
+ */ + if (uidchge) { + ASSERT(oldruid != -1 && ruid != -1); + mutex_enter(&pidlock); + upcount_dec(oldruid, zoneid); + mutex_exit(&pidlock); + } + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16) +{ + long rval; + + rval = lx_setresuid( + LINUX_UID16_TO_UID32(ruid16), + LINUX_UID16_TO_UID32(euid16), + LINUX_UID16_TO_UID32(suid16)); + + return (rval); +} + +/* + * This function is based on setregid in common/syscall/gid.c + */ +long +lx_setresgid(l_gid_t rgid, l_gid_t egid, l_gid_t sgid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + cred_t *cr, *newcr; + + if ((rgid != -1 && (rgid > MAXUID)) || + (egid != -1 && (egid > MAXUID)) || + (sgid != -1 && (sgid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (rgid != -1 && + rgid != cr->cr_rgid && rgid != cr->cr_gid && + rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (egid != -1 && + egid != cr->cr_rgid && egid != cr->cr_gid && + egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (sgid != -1 && + sgid != cr->cr_rgid && sgid != cr->cr_gid && + sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else { + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (egid != -1) + newcr->cr_gid = egid; + if (sgid != -1) + newcr->cr_sgid = sgid; + if (rgid != -1) + newcr->cr_rgid = rgid; + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. 
+ */ + if ((cr->cr_gid != newcr->cr_gid || + cr->cr_rgid != newcr->cr_rgid || + cr->cr_sgid != newcr->cr_sgid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16) +{ + long rval; + + rval = lx_setresgid( + LINUX_GID16_TO_GID32(rgid16), + LINUX_GID16_TO_GID32(egid16), + LINUX_GID16_TO_GID32(sgid16)); + + return (rval); +} + +/* + * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ + * the terrible hack below so that tests may proceed, if only on DEBUG kernels. + */ +long +lx_helper_setgroups(int ngroups, gid_t *grouplist) +{ +#ifdef DEBUG + if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) + ngroups = ngroups_max; +#endif /* DEBUG */ + + return (setgroups(ngroups, grouplist)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c new file mode 100644 index 0000000000..2bd5da9961 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c @@ -0,0 +1,1741 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/termio.h> +#include <sys/termios.h> +#include <sys/ptyvar.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <sys/sockio.h> +#include <sys/stropts.h> +#include <sys/ptms.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/sysmacros.h> +#include <sys/lx_misc.h> +#include <sys/lx_ptm.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/kmem.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if_arp.h> +#include <sys/ioccom.h> +#include <sys/dtrace.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/lx_autofs.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> +#include <sys/dkio.h> +#include <sys/sdt.h> + +/* + * Linux ioctl types + */ +#define LX_IOC_TYPE_HD 0x03 +#define LX_IOC_TYPE_BLK 0x12 +#define LX_IOC_TYPE_FD 0x54 +#define LX_IOC_TYPE_DTRACE 0x68 +#define LX_IOC_TYPE_SOCK 0x89 +#define LX_IOC_TYPE_AUTOFS 0x93 + +/* + * Supported ioctls + */ +#define LX_HDIO_GETGEO 0x0301 +#define LX_BLKGETSIZE 0x1260 +#define LX_BLKSSZGET 0x1268 +#define LX_BLKGETSIZE64 0x80081272 +#define LX_TCGETS 0x5401 +#define LX_TCSETS 0x5402 +#define LX_TCSETSW 0x5403 +#define LX_TCSETSF 0x5404 +#define LX_TCGETA 0x5405 +#define LX_TCSETA 0x5406 +#define LX_TCSETAW 0x5407 +#define LX_TCSETAF 0x5408 +#define LX_TCSBRK 0x5409 +#define LX_TCXONC 0x540a +#define LX_TCFLSH 0x540b +#define LX_TIOCEXCL 0x540c +#define LX_TIOCNXCL 0x540d +#define LX_TIOCSCTTY 0x540e +#define LX_TIOCGPGRP 0x540f +#define LX_TIOCSPGRP 0x5410 +#define LX_TIOCOUTQ 0x5411 +#define LX_TIOCSTI 0x5412 +#define LX_TIOCGWINSZ 0x5413 +#define LX_TIOCSWINSZ 0x5414 +#define LX_TIOCMGET 0x5415 +#define LX_TIOCMBIS 0x5416 +#define LX_TIOCMBIC 0x5417 +#define LX_TIOCMSET 0x5418 +#define LX_TIOCGSOFTCAR 0x5419 +#define 
LX_TIOCSSOFTCAR 0x541a +#define LX_FIONREAD 0x541b +#define LX_TIOCPKT 0x5420 +#define LX_FIONBIO 0x5421 +#define LX_TIOCNOTTY 0x5422 +#define LX_TIOCSETD 0x5423 +#define LX_TIOCGETD 0x5424 +#define LX_TCSBRKP 0x5425 +#define LX_TIOCGSID 0x5429 +#define LX_TIOCGPTN 0x80045430 +#define LX_TIOCSPTLCK 0x40045431 +#define LX_FIONCLEX 0x5450 +#define LX_FIOCLEX 0x5451 +#define LX_FIOASYNC 0x5452 +#define LX_FIOSETOWN 0x8901 +#define LX_SIOCSPGRP 0x8902 +#define LX_FIOGETOWN 0x8903 +#define LX_SIOCGPGRP 0x8904 +#define LX_SIOCATMARK 0x8905 +#define LX_SIOCGSTAMP 0x8906 +#define LX_SIOCADDRT 0x890b +#define LX_SIOCDELRT 0x890c +#define LX_SIOCRTMSG 0x890d +#define LX_SIOCGIFNAME 0x8910 +#define LX_SIOCSIFLINK 0x8911 +#define LX_SIOCGIFCONF 0x8912 +#define LX_SIOCGIFFLAGS 0x8913 +#define LX_SIOCSIFFLAGS 0x8914 +#define LX_SIOCGIFADDR 0x8915 +#define LX_SIOCSIFADDR 0x8916 +#define LX_SIOCGIFDSTADDR 0x8917 +#define LX_SIOCSIFDSTADDR 0x8918 +#define LX_SIOCGIFBRDADDR 0x8919 +#define LX_SIOCSIFBRDADDR 0x891a +#define LX_SIOCGIFNETMASK 0x891b +#define LX_SIOCSIFNETMASK 0x891c +#define LX_SIOCGIFMETRIC 0x891d +#define LX_SIOCSIFMETRIC 0x891e +#define LX_SIOCGIFMEM 0x891f +#define LX_SIOCSIFMEM 0x8920 +#define LX_SIOCGIFMTU 0x8921 +#define LX_SIOCSIFMTU 0x8922 +#define LX_SIOCSIFNAME 0x8923 +#define LX_SIOCSIFHWADDR 0x8924 +#define LX_SIOCGIFENCAP 0x8925 +#define LX_SIOCSIFENCAP 0x8926 +#define LX_SIOCGIFHWADDR 0x8927 +#define LX_SIOCGIFSLAVE 0x8929 +#define LX_SIOCSIFSLAVE 0x8930 +#define LX_SIOCADDMULTI 0x8931 +#define LX_SIOCDELMULTI 0x8932 +#define LX_SIOCGIFINDEX 0x8933 +#define LX_SIOCSIFPFLAGS 0x8934 +#define LX_SIOCGIFPFLAGS 0x8935 +#define LX_SIOCDIFADDR 0x8936 +#define LX_SIOCSIFHWBROADCAST 0x8937 +#define LX_SIOCGIFCOUNT 0x8938 +#define LX_SIOCGIFBR 0x8940 +#define LX_SIOCSIFBR 0x8941 +#define LX_SIOCGIFTXQLEN 0x8942 +#define LX_SIOCSIFTXQLEN 0x8943 +#define LX_SIOCETHTOOL 0x8946 +#define LX_SIOCGMIIPHY 0x8947 +#define LX_SIOCGMIIREG 0x8948 +#define LX_SIOCSMIIREG 
0x8949 +#define LX_SIOCWANDEV 0x894a +#define LX_SIOCOUTQNSD 0x894b +#define LX_SIOCDARP 0x8953 +#define LX_SIOCGARP 0x8954 +#define LX_SIOCSARP 0x8955 +#define LX_SIOCDRARP 0x8960 +#define LX_SIOCGRARP 0x8961 +#define LX_SIOCSRARP 0x8962 +#define LX_SIOCGIFMAP 0x8970 +#define LX_SIOCSIFMAP 0x8971 +#define LX_SIOCADDDLCI 0x8980 +#define LX_SIOCDELDLCI 0x8981 +#define LX_SIOCGIFVLAN 0x8982 +#define LX_SIOCSIFVLAN 0x8983 +#define LX_SIOCBONDENSLAVE 0x8990 +#define LX_SIOCBONDRELEASE 0x8991 +#define LX_SIOCBONDSETHWADDR 0x8992 +#define LX_SIOCBONDSLAVEINFOQUERY 0x8993 +#define LX_SIOCBONDINFOQUERY 0x8994 +#define LX_SIOCBONDCHANGEACTIVE 0x8995 +#define LX_SIOCBRADDBR 0x89a0 +#define LX_SIOCBRDELBR 0x89a1 +#define LX_SIOCBRADDIF 0x89a2 +#define LX_SIOCBRDELIF 0x89a3 +#define LX_SIOCSHWTSTAMP 0x89b0 +#define LX_SIOCGHWTSTAMP 0x89b1 +#define LX_SIOCDEVPRIVATE 0x89f0 +#define LX_SIOCPROTOPRIVATE 0x89e0 + +#define FLUSER(fp) fp->f_flag | get_udatamodel() +#define FLFAKE(fp) fp->f_flag | FKIOCTL + +/* + * LX_NCC must be different from LX_NCCS since while the termio and termios + * structures may look similar they are fundamentally different sizes and + * have different members. 
+ */ +#define LX_NCC 8 +#define LX_NCCS 19 + +struct lx_termio { + unsigned short c_iflag; /* input mode flags */ + unsigned short c_oflag; /* output mode flags */ + unsigned short c_cflag; /* control mode flags */ + unsigned short c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCC]; /* control characters */ +}; + +struct lx_termios { + uint32_t c_iflag; /* input mode flags */ + uint32_t c_oflag; /* output mode flags */ + uint32_t c_cflag; /* control mode flags */ + uint32_t c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCCS]; /* control characters */ +}; + +/* + * c_cc characters which are valid for lx_termio and lx_termios + */ +#define LX_VINTR 0 +#define LX_VQUIT 1 +#define LX_VERASE 2 +#define LX_VKILL 3 +#define LX_VEOF 4 +#define LX_VTIME 5 +#define LX_VMIN 6 +#define LX_VSWTC 7 + +/* + * c_cc characters which are valid for lx_termios + */ +#define LX_VSTART 8 +#define LX_VSTOP 9 +#define LX_VSUSP 10 +#define LX_VEOL 11 +#define LX_VREPRINT 12 +#define LX_VDISCARD 13 +#define LX_VWERASE 14 +#define LX_VLNEXT 15 +#define LX_VEOL2 16 + +/* + * Defaults needed for SunOS to Linux format conversion. 
+ * See INIT_C_CC in linux-stable/include/asm-generic/termios.h + */ +#define LX_DEF_VTIME 0 +#define LX_DEF_VMIN 1 +#define LX_DEF_VEOF '\004' +#define LX_DEF_VEOL 0 + +/* VSD key for lx_cc information */ +static uint_t lx_ioctl_vsd = 0; + +extern int lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid); + +/* Terminal helpers */ + +static void +l2s_termios(struct lx_termios *l_tios, struct termios *s_tios) +{ + ASSERT((l_tios != NULL) && (s_tios != NULL)); + + bzero(s_tios, sizeof (*s_tios)); + + s_tios->c_iflag = l_tios->c_iflag; + s_tios->c_oflag = l_tios->c_oflag; + s_tios->c_cflag = l_tios->c_cflag; + s_tios->c_lflag = l_tios->c_lflag; + + if (s_tios->c_lflag & ICANON) { + s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF]; + s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL]; + } else { + s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN]; + s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME]; + } + + s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2]; + s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE]; + s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL]; + s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT]; + s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT]; + s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE]; + s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR]; + s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT]; + s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC]; + s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART]; + s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP]; + s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP]; + s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD]; +} + +static void +l2s_termio(struct lx_termio *l_tio, struct termio *s_tio) +{ + ASSERT((l_tio != NULL) && (s_tio != NULL)); + + bzero(s_tio, sizeof (*s_tio)); + + s_tio->c_iflag = l_tio->c_iflag; + s_tio->c_oflag = l_tio->c_oflag; + s_tio->c_cflag = l_tio->c_cflag; + s_tio->c_lflag = l_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF]; + } else { + s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN]; + s_tio->c_cc[VTIME] = 
l_tio->c_cc[LX_VTIME]; + } + + s_tio->c_cc[VINTR] = l_tio->c_cc[LX_VINTR]; + s_tio->c_cc[VQUIT] = l_tio->c_cc[LX_VQUIT]; + s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE]; + s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL]; + s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC]; +} + +static void +termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio) +{ + ASSERT((l_tios != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tios->c_cc[LX_VEOF]; + lio->veol = l_tios->c_cc[LX_VEOL]; + lio->vmin = l_tios->c_cc[LX_VMIN]; + lio->vtime = l_tios->c_cc[LX_VTIME]; +} + +static void +termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio) +{ + ASSERT((l_tio != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tio->c_cc[LX_VEOF]; + lio->veol = 0; + lio->vmin = l_tio->c_cc[LX_VMIN]; + lio->vtime = l_tio->c_cc[LX_VTIME]; +} + +static void +s2l_termios(struct termios *s_tios, struct lx_termios *l_tios) +{ + ASSERT((s_tios != NULL) && (l_tios != NULL)); + + bzero(l_tios, sizeof (*l_tios)); + + l_tios->c_iflag = s_tios->c_iflag; + l_tios->c_oflag = s_tios->c_oflag; + l_tios->c_cflag = s_tios->c_cflag; + l_tios->c_lflag = s_tios->c_lflag; + + /* + * Since use of the VMIN/VTIME and VEOF/VEOL control characters is + * mutually exclusive (determined by ICANON), SunOS aliases them in the + * c_cc field in termio/termios. Linux does not perform this aliasing, + * so it expects that the default values are present regardless of + * ICANON status. + * + * These defaults can be overridden later by any values stored via the + * lx_cc mechanism. 
+ */ + if (s_tios->c_lflag & ICANON) { + l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF]; + l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL]; + l_tios->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tios->c_cc[LX_VMIN] = LX_DEF_VMIN; + + } else { + l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN]; + l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME]; + l_tios->c_cc[LX_VEOF] = LX_DEF_VEOF; + l_tios->c_cc[LX_VEOL] = LX_DEF_VEOL; + } + + l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2]; + l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE]; + l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL]; + l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT]; + l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT]; + l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE]; + l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR]; + l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT]; + l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH]; + l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART]; + l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP]; + l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP]; + l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD]; +} + +static void +s2l_termio(struct termio *s_tio, struct lx_termio *l_tio) +{ + ASSERT((s_tio != NULL) && (l_tio != NULL)); + + bzero(l_tio, sizeof (*l_tio)); + + l_tio->c_iflag = s_tio->c_iflag; + l_tio->c_oflag = s_tio->c_oflag; + l_tio->c_cflag = s_tio->c_cflag; + l_tio->c_lflag = s_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF]; + l_tio->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tio->c_cc[LX_VMIN] = LX_DEF_VMIN; + } else { + l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN]; + l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME]; + l_tio->c_cc[LX_VEOF] = LX_DEF_VEOF; + } + + l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR]; + l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT]; + l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE]; + l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL]; + l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH]; +} + +static void +set_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + /* + * Linux expects that 
the termio/termios control characters are + * preserved more strictly than illumos supports. In order to preserve + * the illusion that the characters are maintained, they are stored as + * vnode-specific data. + */ + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur == NULL) { + cur = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP); + bcopy(lio, cur, sizeof (struct lx_cc)); + (void) vsd_set(vp, lx_ioctl_vsd, cur); + } else { + bcopy(lio, cur, sizeof (struct lx_cc)); + } + mutex_exit(&vp->v_vsd_lock); +} + +static int +get_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + int rv = 1; + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur != NULL) { + bcopy(cur, lio, sizeof (*lio)); + rv = 0; + } + mutex_exit(&vp->v_vsd_lock); + return (rv); +} + +/* Socket helpers */ + +typedef struct lx_ifreq32 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + }; +} lx_ifreq32_t; + +typedef struct lx_ifreq64 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + /* pad this out to the Linux size */ + uint64_t ifmap[3]; + }; +} lx_ifreq64_t; + +typedef struct lx_ifconf32 { + int32_t if_len; + caddr32_t if_buf; +} lx_ifconf32_t; + +typedef struct lx_ifconf64 { + int32_t if_len; + caddr_t if_buf; +} lx_ifconf64_t; + + +/* Generic translators */ + +static int +ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error = 0; + int rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + return ((error != 0) ? 
set_errno(error) : 0); +} + +static int +ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + int32_t iflag, flags; + int error; + + if (copyin((caddr_t)arg, &iflag, sizeof (iflag))) + return (set_errno(EFAULT)); + + mutex_enter(&fp->f_tlock); + vp = fp->f_vnode; + flags = fp->f_flag; + /* Linux sets NONBLOCK instead of FIONBIO */ + if (iflag) + flags |= FNONBLOCK; + else + flags &= ~FNONBLOCK; + /* push the flag down */ + error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL); + fp->f_flag = flags; + mutex_exit(&fp->f_tlock); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + struct vattr vattr; + int error = 0; + int rv; + /* + * offset is int32_t because that is what FIONREAD is defined in terms + * of. We cap at INT_MAX as in other cases for this ioctl. + */ + int32_t offset; + + vp = fp->f_vnode; + + if (vp->v_type == VREG || vp->v_type == VDIR) { + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred, NULL); + if (error != 0) + return (set_errno(error)); + offset = MIN(vattr.va_size - fp->f_offset, INT_MAX); + if (copyout(&offset, (caddr_t)arg, sizeof (offset))) + return (set_errno(EFAULT)); + } else { + error = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred, + &rv, NULL); + if (error) + return (set_errno(error)); + } + return (0); +} + +/* + * hard disk-related translators + * + * Note that the normal disk ioctls only work for VCHR devices. See spec_ioctl + * which will return ENOTTY for a VBLK device. However, fdisk, etc. expect to + * work with block devices. + * + * We expect a zvol to be the primary block device we're interacting with and + * we use the zone's lxzd_vdisks list to handle zvols specifically. 
+ */ + +typedef struct lx_hd_geom { + unsigned char heads; + unsigned char sectors; + unsigned short cylinders; + unsigned long start; +} lx_hd_geom_t; + +static lx_virt_disk_t * +lx_lookup_zvol(lx_zone_data_t *lxzd, dev_t dev) +{ + lx_virt_disk_t *vd; + + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZVOL && vd->lxvd_real_dev == dev) + return (vd); + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + return (NULL); +} + +/* + * See zvol_ioctl() which always fails for DKIOCGGEOM. The geometry for a + * zvol (or really any modern disk) is made up, so we do that here as well. + */ +static int +ict_hdgetgeo(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_hd_geom_t lx_geom; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev); + if (vd == NULL) { + /* should only happen if new zvol */ + bzero(&lx_geom, sizeof (lx_geom)); + } else { + diskaddr_t tot; + + tot = vd->lxvd_volsize / vd->lxvd_blksize; + + /* + * Since the 'sectors' value is only one byte we make + * up heads/cylinder values to get things to fit. + * We roundup the number of heads to ensure we don't + * overflow the sectors due to truncation. 
+ */ + lx_geom.heads = lx_geom.cylinders = (tot / 0xff) + 1; + lx_geom.sectors = tot / lx_geom.heads; + lx_geom.start = 0; + } + } else { + int res, rv; + struct dk_geom geom; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGGEOM, (intptr_t)&geom, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + lx_geom.heads = geom.dkg_nhead; + lx_geom.sectors = geom.dkg_nsect; + lx_geom.cylinders = geom.dkg_ncyl; + lx_geom.start = 0; + } + + if (copyout(&lx_geom, (caddr_t)arg, sizeof (lx_geom))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Per the Linux sd(4) man page, get the number of sectors. The linux/fs.h + * header says its 512 byte blocks. + */ +static int +ict_blkgetsize(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + diskaddr_t tot; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev); + if (vd == NULL) { + /* should only happen if new zvol */ + tot = 0; + } else { + tot = vd->lxvd_volsize / 512; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + tot = minfo.dki_capacity; + if (minfo.dki_lbsize > 512) { + uint_t bsize = minfo.dki_lbsize / 512; + + tot *= bsize; + } + } + + if (copyout(&tot, (caddr_t)arg, sizeof (long))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Get the sector size (i.e. the logical block size). 
+ */ +static int +ict_blkgetssize(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint_t bsize; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev); + if (vd == NULL) { + /* should only happen if new zvol */ + bsize = 0; + } else { + bsize = (uint_t)vd->lxvd_blksize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + bsize = (uint_t)minfo.dki_lbsize; + } + + if (copyout(&bsize, (caddr_t)arg, sizeof (bsize))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Get the size. The linux/fs.h header says its in bytes. 
+ */ +static int +ict_blkgetsize64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint64_t tot; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = lx_lookup_zvol(lxzd, fp->f_vnode->v_rdev); + if (vd == NULL) { + /* should only happen if new zvol */ + tot = 0; + } else { + tot = vd->lxvd_volsize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + tot = minfo.dki_capacity * minfo.dki_lbsize; + } + + if (copyout(&tot, (caddr_t)arg, sizeof (uint64_t))) + return (set_errno(EFAULT)); + return (0); +} + +/* Terminal-related translators */ + +static int +ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF); + + if (copyin((struct lx_termios *)arg, &l_tios, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + termios2lx_cc(&l_tios, &lio); + l2s_termios(&l_tios, &s_tios); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +static int +ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF); + + if (copyin((struct lx_termio *)arg, &l_tio, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + l2s_termio(&l_tio, &s_tio); + termio2lx_cc(&l_tio, &lio); + + error = 
VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +static int +ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios, *s_tiosd; + uint_t s_tiosl; + + /* get termios defaults */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd, + &s_tiosl) != DDI_SUCCESS) + return (EIO); + ASSERT(s_tiosl == sizeof (*s_tiosd)); + bcopy(s_tiosd, &s_tios, sizeof (s_tios)); + ddi_prop_free(s_tiosd); + + /* Now munge the data to how Linux wants it. */ + s2l_termios(&s_tios, &l_tios); + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + /* Now munge the data to how Linux wants it. 
*/ + s2l_termios(&s_tios, &l_tios); + + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tios.c_cc[LX_VEOF] = lio.veof; + l_tios.c_cc[LX_VEOL] = lio.veol; + l_tios.c_cc[LX_VMIN] = lio.vmin; + l_tios.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (getmajor(fp->f_vnode->v_rdev) == ddi_name_to_major(LX_PTM_DRV)) + return (ict_tcgets_ptm(fp, cmd, arg, lxcmd)); + else + return (ict_tcgets_native(fp, cmd, arg, lxcmd)); +} + +static int +ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + s2l_termio(&s_tio, &l_tio); + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tio.c_cc[LX_VEOF] = lio.veof; + l_tio.c_cc[LX_VMIN] = lio.vmin; + l_tio.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tio, (struct lx_termios *)arg, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t lpid, spid, tid; + int error, rv; + + /* Converting to the illumos pid is necessary */ + if (copyin((pid_t *)arg, &lpid, sizeof (lpid)) < 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(lpid, &spid, &tid) < 0) + return (set_errno(EPERM)); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spid, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + return ((error != 0) ? 
set_errno(error) : 0); +} + +static int +ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int rv, error; + /* use null duration to emulate TCSBRKP */ + int dur = 0; + error = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&dur, + FLFAKE(fp), fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t spgrp; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spgrp, FLFAKE(fp), + fp->f_cred, &rv, NULL); + if (error == 0) { + if (spgrp == curproc->p_zone->zone_proc_initpid) { + spgrp = 1; + } + if (copyout(&spgrp, (caddr_t)arg, sizeof (spgrp))) { + return (set_errno(EFAULT)); + } + } + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + int error, rv; + + istr.ic_cmd = UNLKPT; + istr.ic_len = 0; + istr.ic_timout = 0; + istr.ic_dp = NULL; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + /* + * The success/fail return values are different between Linux + * and illumos. Linux expects 0 or -1. Illumos can return + * positive number on success. + */ + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + cred_t *cr; + pt_own_t pto; + int error, rv; + int ptyno; + + /* This operation is only valid for the lx_ptm device. */ + if (getmajor(fp->f_vnode->v_rdev) != ddi_name_to_major(LX_PTM_DRV)) + return (set_errno(ENOTTY)); + + cr = CRED(); + pto.pto_ruid = cr->cr_uid; + pto.pto_rgid = cr->cr_gid; + + istr.ic_cmd = OWNERPT; + istr.ic_len = sizeof (pto); + istr.ic_timout = 0; + istr.ic_dp = (char *)&pto; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (error) + return (set_errno((error == ENOTTY) ? 
error: EACCES)); + + ptyno = getminor(fp->f_vnode->v_rdev) - 1; + if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno))) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + + /* + * A few Linux libc's (e.g. musl) have chosen to implement isatty() + * using the TIOCGWINSZ ioctl. Some apps also do the same thing + * directly. On Linux that ioctl will return a size of 0x0 for dumb + * terminals but on illumos see the handling for TIOCGWINSZ in ptem's + * ptioc(). We fail if the winsize is all zeros. To emulate the Linux + * behavior use the native ioctl check that we do for isatty and return + * a size of 0x0 if that succeeds. + */ + if (error == EINVAL) { + int err; + struct termio s_tio; + + err = VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (err == 0) { + struct winsize w; + + bzero(&w, sizeof (w)); + if (copyout(&w, (struct winsize *)arg, sizeof (w)) != 0) + return (set_errno(EFAULT)); + return (0); + } + } + + if (error != 0) + return (set_errno(error)); + + return (0); +} + +static int +ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t ttysid, mysid; + int error, rv; + proc_t *p = curproc; + + /* getsid */ + mutex_enter(&p->p_splock); + mysid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + /* + * Report success if we already control the tty. + * If no one controls it, TIOCSCTTY will change that later. + */ + error = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&ttysid, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error == 0 && ttysid == mysid) + return (0); + + /* + * Need to make sure we're a session leader, otherwise the + * TIOCSCTTY ioctl will fail. 
+ */ + mutex_enter(&pidlock); + if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + sess_create(); + } else { + mutex_exit(&pidlock); + } + + error = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp), + fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* Socket-related translators */ + +static int +ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp = fp->f_vnode; + int error, rv; + /* + * Linux expects a SIOCATMARK of a UDP socket to return ENOTTY, while + * Illumos allows it. Linux prior to 2.6.39 returned EINVAL for this. + */ + if (vp->v_type != VSOCK || VTOSO(vp)->so_type != SOCK_STREAM) + return (set_errno(ENOTTY)); + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (error) + return (set_errno(error)); + + return (0); +} + +static int +ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred) +{ + int error, rv; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + ksocket_t ks; + + ASSERT(lxzd != NULL); + + /* + * For ioctls of this type, we are strict about address family + * whereas Linux is lenient. This strictness can be avoided by using + * an internal AF_INET ksocket, which we use if the family is anything + * but AF_PACKET. + */ + if (vn->v_type == VSOCK && VTOSO(vn)->so_family == AF_PACKET) + return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL)); + + mutex_enter(&lxzd->lxzd_lock); + ks = lxzd->lxzd_ioctl_sock; + if (ks == NULL) { + /* + * Linux is not at all picky about address family when it comes + * to supporting interface-related ioctls. To mimic this + * behavior, we'll attempt those ioctls against a ksocket + * configured for that purpose. 
+ */ + (void) ksocket_socket(&lxzd->lxzd_ioctl_sock, AF_INET, + SOCK_DGRAM, 0, 0, curproc->p_zone->zone_kcred); + ks = lxzd->lxzd_ioctl_sock; + } + mutex_exit(&lxzd->lxzd_lock); + + if (ks != NULL) { + error = ksocket_ioctl(ks, cmd, arg, &rv, cred); + } else { + error = VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL); + } + + return (error); +} + +static int +ict_sioghwaddr(file_t *fp, struct lifreq *lreq) +{ + struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr; + struct sockaddr hwaddr; + int error, size; + + error = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq, + FLFAKE(fp), fp->f_cred); + + if (error == EADDRNOTAVAIL && + strncmp(lreq->lifr_name, "lo", 2) == 0) { + /* Emulate success on suspected loopbacks */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + bzero(LLADDR(sdl), sdl->sdl_alen); + error = 0; + } + + if (error == 0) { + bzero(&hwaddr, sizeof (hwaddr)); + lx_stol_hwaddr(sdl, &hwaddr, &size); + bcopy(&hwaddr, &lreq->lifr_addr, + size + sizeof (sdl->sdl_family)); + } + + return (error); +} + +static int +ict_siocgifname(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + int len; + char name[LIFNAMSIZ]; + netstack_t *ns; + ip_stack_t *ipst; + phyint_t *phyi; + + if (fp->f_vnode->v_type != VSOCK) { + return (set_errno(EINVAL)); + } + + len = (curproc->p_model == DATAMODEL_LP64) ? sizeof (lx_ifreq64_t) : + sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) { + return (set_errno(EFAULT)); + } + + /* + * Since Linux calls this ioctl on all sorts of sockets, perform the + * interface name lookup manually. 
+ */ + if ((ns = netstack_get_current()) == NULL) { + return (set_errno(EINVAL)); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &req.ifr_index, NULL); + if (phyi != NULL) { + strncpy(name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + if (strlen(name) != 0) { + /* Truncate for ifreq and copyout */ + strncpy(req.ifr_name, name, IFNAMSIZ); + if (copyout(&req, (struct ifreq *)arg, len) != 0) { + return (set_errno(EFAULT)); + } + return (0); + } + + return (set_errno(EINVAL)); +} + +static int +ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + struct lifreq lreq; + int error, len; + + /* Convert from Linux ifreq to illumos lifreq */ + if (curproc->p_model == DATAMODEL_LP64) + len = sizeof (lx_ifreq64_t); + else + len = sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) + return (set_errno(EFAULT)); + bzero(&lreq, sizeof (lreq)); + strncpy(lreq.lifr_name, req.ifr_name, IFNAMSIZ); + bcopy(&req.ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ); + lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE); + + switch (cmd) { + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMTU: + case SIOCSIFMTU: + /* + * Convert cmd from SIO*IF* to SIO*LIF*. + * This is needed since Linux allows ifreq operations on ipv6 + * sockets where illumos does not. 
+ */ + cmd = ((cmd & IOC_INOUT) | + _IOW('i', ((cmd & 0xff) + 100), struct lifreq)); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFINDEX: + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFFLAGS: + cmd = SIOCGLIFFLAGS; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE); + break; + case SIOCSIFFLAGS: + cmd = SIOCSLIFFLAGS; + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFHWADDR: + error = ict_sioghwaddr(fp, &lreq); + break; + case LX_SIOCGIFTXQLEN: + /* + * Illumos lacks the notion of txqlen. Confirm the provided + * interface is valid with SIOCGLIFINDEX and return a fake + * txqlen of 1. Loopback devices will report txqlen of 0. + */ + if (strncmp(lreq.lifr_name, "lo", 2) == 0) { + lreq.lifr_index = 0; + error = 0; + break; + } + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) { + /* lifr_index aliases to the qlen field */ + lreq.lifr_index = 1; + } + break; + case LX_SIOCSIFHWADDR: + /* + * We're not going to support SIOCSIFHWADDR, but we need to be + * able to check the result of the copyin first to see if the + * command should have returned EFAULT. 
+ */ + default: + error = EINVAL; + } + + if (error != 0) + return (set_errno(error)); + + /* Convert back to a Linux ifreq */ + lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE); + bzero(&req, sizeof (req)); + strncpy(req.ifr_name, lreq.lifr_name, IFNAMSIZ); + bcopy(&lreq.lifr_lifru, &req.ifr_ifru, len - IFNAMSIZ); + + if (copyout(&req, (struct lifreq *)arg, len) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf32_t conf; + lx_ifreq32_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq32_t); + + if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq32_t); + } + + /* Get interface configuration list. 
*/ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq32_t); + oreq = (lx_ifreq32_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq32_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +static int +ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf64_t conf; + lx_ifreq64_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq64_t); + + if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq64_t); + } + + /* Get interface configuration list. 
*/ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq64_t); + oreq = (lx_ifreq64_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq64_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +static int +ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (curproc->p_model == DATAMODEL_LP64) + return (ict_siocgifconf64(fp, cmd, arg, lxcmd)); + else + return (ict_siocgifconf32(fp, cmd, arg, lxcmd)); +} + +/* + * Unfortunately some of the autofs ioctls want to return a positive integer + * result which does not indicate an error. To minimize disruption in the + * rest of the code, we'll treat a positive return as an errno and a negative + * return as the non-error return (which we then negate). + */ +static int +ict_autofs(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int res = 0; + int rv; + + res = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (res > 0) + return (set_errno(res)); + if (res == 0) + return (0); + return (-res); +} + +/* Structure used to define an ioctl translator. 
*/ +typedef struct lx_ioc_cmd_translator { + int lict_lxcmd; + int lict_cmd; + int (*lict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd); +} lx_ioc_cmd_translator_t; + +#define LX_IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_END \ + {0, 0, NULL} + +static lx_ioc_cmd_translator_t lx_ioc_xlate_fd[] = { + LX_IOC_CMD_TRANSLATOR_FILTER(FIONBIO, ict_fionbio) + LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread) + LX_IOC_CMD_TRANSLATOR_PASS(FIOASYNC) + + /* streams related */ + LX_IOC_CMD_TRANSLATOR_PASS(TCXONC) + LX_IOC_CMD_TRANSLATOR_PASS(TCFLSH) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCEXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSTI) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIS) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIC) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMSET) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSETD) + LX_IOC_CMD_TRANSLATOR_PASS(TCSBRK) + + /* terminal related */ + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGETD) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGSID) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCPKT) + + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETS, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSW, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSF, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETA, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAW, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAF, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETS, ict_tcgets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETA, ict_tcgeta) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ, 
ict_tiocgwinsz) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP, ict_tcsbrkp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP, ict_tiocspgrp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP, ict_tiocgpgrp) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK, ict_sptlock) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN, ict_gptn) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY, ict_tiocsctty) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_socket[] = { + LX_IOC_CMD_TRANSLATOR_PASS(FIOGETOWN) + + LX_IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK, ict_siocatmark) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF, ict_siocgifconf) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFNAME, ict_siocgifname) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_dtrace[] = { + 
LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_autofs[] = { + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_READY) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_FAIL) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_CATATONIC) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_SETTIMEOUT) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE_MULTI) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOSUBVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_ASKUMOUNT) + + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_VERSION_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_READY_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_FAIL_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CATATONIC_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_TIMEOUT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_REQUESTER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_EXPIRE_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD, + ict_autofs) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_hd[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_HDIO_GETGEO, ict_hdgetgeo) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_blk[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE, ict_blkgetsize) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKSSZGET, ict_blkgetssize) + 
LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE64, ict_blkgetsize64) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static void +lx_ioctl_vsd_free(void *data) +{ + kmem_free(data, sizeof (struct lx_cc)); +} + +void +lx_ioctl_init() +{ + vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free); +} + +void +lx_ioctl_fini() +{ + vsd_destroy(&lx_ioctl_vsd); +} + +long +lx_ioctl(int fdes, int cmd, intptr_t arg) +{ + file_t *fp; + int res = 0, error = ENOTTY; + lx_ioc_cmd_translator_t *ict = NULL; + + if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) { + res = f_setfd_error(fdes, (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0); + return ((res != 0) ? set_errno(res) : 0); + } + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + switch ((cmd & 0xff00) >> 8) { + case LX_IOC_TYPE_FD: + ict = lx_ioc_xlate_fd; + break; + + case LX_IOC_TYPE_DTRACE: + ict = lx_ioc_xlate_dtrace; + break; + + case LX_IOC_TYPE_SOCK: + ict = lx_ioc_xlate_socket; + error = EOPNOTSUPP; + break; + + case LX_IOC_TYPE_AUTOFS: + ict = lx_ioc_xlate_autofs; + break; + + case LX_IOC_TYPE_BLK: + ict = lx_ioc_xlate_blk; + break; + + case LX_IOC_TYPE_HD: + ict = lx_ioc_xlate_hd; + break; + + default: + releasef(fdes); + return (set_errno(ENOTTY)); + } + + /* + * Today, none of the ioctls supported by the emulation possess + * overlapping cmd values. Because of that, no type interrogation of + * the fd is done before executing specific ioctl emulation. It's + * assumed that the vnode-specific logic called by the emulation + * function will reject ioctl commands not supported by the fd. 
+ */ + VERIFY(ict != NULL); + while (ict->lict_func != NULL) { + if (ict->lict_lxcmd == cmd) + break; + ict++; + } + if (ict->lict_func == NULL) { + releasef(fdes); + return (set_errno(error)); + } + + res = ict->lict_func(fp, ict->lict_cmd, arg, ict->lict_lxcmd); + releasef(fdes); + return (res); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c new file mode 100644 index 0000000000..13397e199e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c @@ -0,0 +1,66 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/lx_brand.h> + +/* 'which' values. */ +#define LX_IOPRIO_WHO_PROCESS 1 +#define LX_IOPRIO_WHO_PGRP 2 +#define LX_IOPRIO_WHO_USER 3 + +/* + * The possible values for the class. We report best effort (BE) as the class + * in use. + */ +#define LX_IOPRIO_CLASS_RT 1 +#define LX_IOPRIO_CLASS_BE 2 +#define LX_IOPRIO_CLASS_IDLE 3 + +/* Macro to determine the class from the input mask */ +#define LX_IOPRIO_PRIO_CLASS(m) ((m) >> 13) + +/* ARGSUSED */ +long +lx_ioprio_get(int which, int who) +{ + if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER) + return (set_errno(EINVAL)); + + return (LX_IOPRIO_CLASS_BE); +} + +/* + * We allow setting any valid class, even though it's ignored. + * We ignore the 'who' parameter which means that we're not searching for + * the specified target in order to return a specific errno in the case that + * the target does not exist. 
+ */
+/* ARGSUSED */
+long
+lx_ioprio_set(int which, int who, int mask)
+{
+	/* Scheduling class requested by the caller, extracted from the mask. */
+	const int ioclass = LX_IOPRIO_PRIO_CLASS(mask);
+
+	/* 'which' must be one of the three LX_IOPRIO_WHO_* selectors. */
+	switch (which) {
+	case LX_IOPRIO_WHO_PROCESS:
+	case LX_IOPRIO_WHO_PGRP:
+	case LX_IOPRIO_WHO_USER:
+		break;
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	/* Likewise the class must be one of RT, BE or IDLE. */
+	switch (ioclass) {
+	case LX_IOPRIO_CLASS_RT:
+	case LX_IOPRIO_CLASS_BE:
+	case LX_IOPRIO_CLASS_IDLE:
+		break;
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	/* A valid request is accepted but nothing is actually changed. */
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c new file mode 100644 index 0000000000..eeed914566 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -0,0 +1,402 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/thread.h> +#include <sys/signal.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <lx_signum.h> +#include <sys/contract/process_impl.h> + +extern int kill(pid_t, int); + +/* + * Check if it is legal to send this signal to the init process.
Linux + * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid + * 1. + */ +static int +init_sig_check(int sig, pid_t pid) +{ + proc_t *p; + int rv = 0; + + mutex_enter(&pidlock); + + if (((p = prfind(pid)) == NULL) || (p->p_stat == SIDL)) + rv = ESRCH; + else if (sig && (sigismember(&cantmask, sig) || + (PTOU(p)->u_signal[sig-1] == SIG_DFL) || + (PTOU(p)->u_signal[sig-1] == SIG_IGN))) + rv = EPERM; + + mutex_exit(&pidlock); + + return (rv); +} + +static long +lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill) +{ + kthread_t *t; + proc_t *pp, *cp = curproc; + pid_t initpid; + sigqueue_t *sqp; + int tid = 1; /* default tid */ + int sig, rv; + + /* + * Unlike kill(2), Linux tkill(2) doesn't allow signals to + * be sent to process IDs <= 0 as it doesn't overlay any special + * semantics on the pid. + */ + if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * If the Linux pid is 1, translate the pid to the actual init + * pid for the zone. Note that Linux dictates that no unhandled + * signals may be sent to init, so check for that, too. + * + * Otherwise, extract the tid and real pid from the Linux pid. + */ + initpid = cp->p_zone->zone_proc_initpid; + if (pid == 1) + pid = initpid; + if ((pid == initpid) && ((rv = init_sig_check(sig, pid)) != 0)) + return (set_errno(rv)); + else if (lx_lpid_to_spair(pid, &pid, &tid) < 0) + return (set_errno(ESRCH)); + + if (tgkill && tgid != pid) + return (set_errno(ESRCH)); + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + + /* + * Find the process for the passed pid... 
+ */ + mutex_enter(&pidlock); + if (((pp = prfind(pid)) == NULL) || (pp->p_stat == SIDL)) { + mutex_exit(&pidlock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + mutex_enter(&pp->p_lock); + mutex_exit(&pidlock); + + /* + * Deny permission to send the signal if either of the following + * is true: + * + * + The signal is SIGCONT and the target pid is not in the same + * session as the sender + * + * + prochasprocperm() shows the user lacks sufficient permission + * to send the signal to the target pid + */ + if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) || + (!prochasprocperm(pp, cp, CRED()))) { + mutex_exit(&pp->p_lock); + rv = set_errno(EPERM); + goto free_and_exit; + } + + /* check for the tid */ + if ((t = idtot(pp, tid)) == NULL) { + mutex_exit(&pp->p_lock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + + /* a signal of 0 means just check for the existence of the thread */ + if (lx_sig == 0) { + mutex_exit(&pp->p_lock); + rv = 0; + goto free_and_exit; + } + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = cp->p_pid; + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(pp, t, sqp); + + mutex_exit(&pp->p_lock); + + return (0); + +free_and_exit: + kmem_free(sqp, sizeof (sigqueue_t)); + return (rv); +} + +long +lx_tgkill(pid_t tgid, pid_t pid, int lx_sig) +{ + return (lx_thrkill(tgid, pid, lx_sig, B_TRUE)); +} + +long +lx_tkill(pid_t pid, int lx_sig) +{ + return (lx_thrkill(0, pid, lx_sig, B_FALSE)); +} + +long +lx_kill(pid_t lx_pid, int lx_sig) +{ + pid_t s_pid, initpid; + sigsend_t v; + zone_t *zone = curproc->p_zone; + struct proc *p; + int err, sig, nfound; + + if ((lx_sig < 0) || (lx_sig > LX_NSIG) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * Since some linux apps rely on init(1M) having PID 1, we + * transparently translate 1 to the real init(1M)'s pid. 
We then + * check to be sure that it is legal for this process to send this + * signal to init(1M). + */ + initpid = zone->zone_proc_initpid; + if (lx_pid == 1) { + s_pid = initpid; + } else if (lx_pid == 0 || lx_pid == -1) { + s_pid = 0; + } else if (lx_pid > 0) { + if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } else { + ASSERT(lx_pid < 0); + if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid it means that the + * process group leader doesn't exist in this zone. + * In this case assuming that the Linux pid is + * the same as the Solaris pid will get us the + * correct behavior. + */ + s_pid = -lx_pid; + } + } + + if ((s_pid == initpid) && ((err = init_sig_check(sig, s_pid)) != 0)) + return (set_errno(err)); + + /* + * For individual processes, kill() semantics are the same between + * Solaris and Linux. + */ + if (lx_pid >= 0) + return (kill(s_pid, sig)); + + /* + * In Solaris, sending a signal to -pid means "send a signal to + * everyone in process group pid." In Linux it means "send a + * signal to everyone in the group other than init." Sending a + * signal to -1 means "send a signal to every process except init + * and myself." + */ + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + err = 0; + + mutex_enter(&pidlock); + + p = (lx_pid == -1) ? practive : pgfind(s_pid); + nfound = 0; + while (err == 0 && p != NULL) { + if ((p->p_zone == zone) && (p->p_stat != SIDL) && + (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) { + nfound++; + err = sigsendproc(p, &v); + } + + p = (lx_pid == -1) ? p->p_next : p->p_pglink; + } + mutex_exit(&pidlock); + + /* + * If we found no processes, we'll return ESRCH -- but unlike our + * native kill(2), we do not return EPERM if processes are found but + * we did not have permission to send any of them a signal. 
+ */ + if (nfound == 0) + err = ESRCH; + + return (err ? set_errno(err) : 0); +}
+
+/*
+ * Kernel helper for the rt_sigqueueinfo emulation.  It handles the unusual
+ * case where the user sends a non-queueable signal through rt_sigqueueinfo.
+ * Signals sent with codes that indicate they are queueable are sent through
+ * the sigqueue syscall via the user level function lx_rt_sigqueueinfo().
+ *
+ * tgid is the Linux pid of the target process (pid 1 is mapped to the zone's
+ * init), sig is handed to sigsendproc() untranslated (so it is presumably
+ * already a native signal number -- confirm against the user level caller),
+ * and uinfo is the user address of the siginfo_t whose si_code and si_value
+ * accompany the signal.
+ *
+ * Returns 0 on success.  On failure errno is set through set_errno():
+ *	EFAULT	uinfo could not be copied in
+ *	ESRCH	tgid is 0 or no matching process was found
+ *	EINVAL	tgid is negative, sig is out of range, or si_code is queueable
+ *	EPERM	the permission check reported by sigsendproc() failed
+ * Any other error returned by sigsendproc() is passed through.
+ */
+int
+lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
+{
+	proc_t *target_proc;
+	pid_t s_pid;
+	zone_t *zone = curproc->p_zone;
+	sigsend_t send;
+	int err;
+	siginfo_t kinfo;
+
+	if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0)
+		return (set_errno(EFAULT));
+	/* Unlike in lx_kill, this process id must be exact, no negatives. */
+	if (tgid == 0)
+		return (set_errno(ESRCH));
+	if (tgid < 0)
+		return (set_errno(EINVAL));
+	/*
+	 * Translate init directly, otherwise use the convenient utility
+	 * function to translate. Since we're sending to the whole group, we
+	 * only need the solaris pid, and not the lwp id.
+	 */
+	if (tgid == 1) {
+		s_pid = zone->zone_proc_initpid;
+	} else {
+		if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) {
+			/*
+			 * If we didn't find this pid that means it doesn't
+			 * exist in this zone.
+			 */
+			return (set_errno(ESRCH));
+		}
+	}
+	/*
+	 * The signal number comes straight from the syscall arguments and is
+	 * handed to sigsendproc() below, so reject out-of-range values here,
+	 * exactly as lx_helper_rt_tgsigqueueinfo() does.
+	 */
+	if (sig < 0 || sig >= NSIG)
+		return (set_errno(EINVAL));
+	/*
+	 * We shouldn't have queueable signals here, those are sent elsewhere by
+	 * the usermode handler for this emulated call.  (The test used to be
+	 * inverted, which rejected precisely the non-queueable codes this
+	 * helper exists to deliver.)
+	 */
+	if (SI_CANQUEUE(kinfo.si_code)) {
+		return (set_errno(EINVAL));
+	}
+	/* Since our signal shouldn't queue, we just call sigsendproc(). */
+	bzero(&send, sizeof (send));
+	send.sig = sig;
+	send.checkperm = 1;
+	send.sicode = kinfo.si_code;
+	send.value = kinfo.si_value;
+
+	/* pidlock is held across both the lookup and the delivery. */
+	mutex_enter(&pidlock);
+	target_proc = prfind(s_pid);
+	err = 0;
+	if (target_proc != NULL) {
+		err = sigsendproc(target_proc, &send);
+		/* No error, but the permission check did not pass. */
+		if (err == 0 && send.perm == 0)
+			err = EPERM;
+	} else {
+		err = ESRCH;
+	}
+	mutex_exit(&pidlock);
+
+	return (err ?
set_errno(err) : 0); +} + +/* + * Unlike the above function, this handles all system calls to rt_tgsigqueue + * regardless of si_code. + */ +int +lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) +{ + id_t s_tid; + pid_t s_pid; + proc_t *target_proc; + sigqueue_t *sqp; + kthread_t *t; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(tid, &s_pid, &s_tid) != 0) + return (set_errno(ESRCH)); + /* + * For group leaders, solaris pid == linux pid, so the solaris leader + * pid should be the same as the tgid but since the tgid comes in via + * the syscall we need to check for an invalid value. + */ + if (s_pid != tgid) + return (set_errno(EINVAL)); + + mutex_enter(&pidlock); + target_proc = prfind(s_pid); + if (target_proc != NULL) + mutex_enter(&target_proc->p_lock); + mutex_exit(&pidlock); + + if (target_proc == NULL) { + return (set_errno(ESRCH)); + } + if (sig < 0 || sig >= NSIG) + return (set_errno(EINVAL)); + + /* + * Some code adapted from lwp_kill, duplicated here because we do some + * customization to the sq_info field of sqp. + */ + if ((t = idtot(target_proc, s_tid)) == NULL) { + mutex_exit(&target_proc->p_lock); + return (set_errno(ESRCH)); + } + /* Just checking for existence of the process, not sending a signal. 
*/ + if (sig == 0) { + mutex_exit(&target_proc->p_lock); + return (0); + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = kinfo.si_code; + sqp->sq_info.si_pid = target_proc->p_pid; + sqp->sq_info.si_ctid = PRCTID(target_proc); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(target_proc, t, sqp); + mutex_exit(&target_proc->p_lock); + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_link.c b/usr/src/uts/common/brand/lx/syscall/lx_link.c new file mode 100644 index 0000000000..23e0768581 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_link.c @@ -0,0 +1,97 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/systm.h> +#include <sys/lx_fcntl.h> + +#define LX_LINK_ALLOWED (LX_AT_SYMLINK_FOLLOW | LX_AT_EMPTY_PATH) + +static long +lx_link_common(int ffd, char *from, int tfd, char *to, int flags) +{ + int error; + vnode_t *fsvp = NULL, *tsvp = NULL; + enum symfollow follow = NO_FOLLOW; + + if ((flags & ~LX_LINK_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flags & LX_AT_EMPTY_PATH) == 0) { + char c; + + /* + * Check that both 'from' and 'to' names are non-empty if + * AT_EMPTY_PATH is not set. 
+ */ + if (copyin(from, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + if (copyin(to, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + + /* + * XXX: When our support for LX capabilities improves, ENOENT + * should be thrown when a process lacking CAP_DAC_READ_SEARCH + * attempts to use the AT_EMPTY_PATH flag. + */ + } + if ((flags & LX_AT_SYMLINK_FOLLOW) != 0) { + follow = FOLLOW; + } + + if ((error = fgetstartvp(ffd, from, &fsvp)) != 0) { + goto out; + } + if ((error = fgetstartvp(tfd, to, &tsvp)) != 0) { + goto out; + } + error = vn_linkat(fsvp, from, follow, tsvp, to, UIO_USERSPACE); + +out: + if (fsvp != NULL) { + VN_RELE(fsvp); + } + if (tsvp != NULL) { + VN_RELE(tsvp); + } + if (error) { + return (set_errno(error)); + } + return (0); +} + +long +lx_link(char *from, char *to) +{ + return (lx_link_common(AT_FDCWD, from, AT_FDCWD, to, 0)); +} + +long +lx_linkat(int ffd, char *from, int tfd, char *to, int flags) +{ + ffd = (ffd == LX_AT_FDCWD) ? AT_FDCWD : ffd; + tfd = (tfd == LX_AT_FDCWD) ? AT_FDCWD : tfd; + + return (lx_link_common(ffd, from, tfd, to, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c new file mode 100644 index 0000000000..2f29f56d5f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/fcntl.h> +#include <sys/lx_fcntl.h> + +/* + * From "uts/common/syscall/mkdir.c": + */ +extern int mkdirat(int, char *, int); + +long +lx_mkdirat(int fd, char *dname, int dmode) +{ + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + return (mkdirat(fd, dname, dmode)); +} + +long +lx_mkdir(char *dname, int dmode) +{ + return (mkdirat(AT_FDCWD, dname, dmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c new file mode 100644 index 0000000000..aa6e12a7d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/segments.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/sysi86.h> +#include <sys/cmn_err.h> +#include <sys/lx_ldt.h> + +/* + * Read the ldt_info structure in from the Linux app, convert it to an ssd + * structure, and then call setdscr() to do all the heavy lifting. + */ +static int +write_ldt(void *data, ulong_t count) +{ + user_desc_t usd; + struct ssd ssd; + struct ldt_info ldt_inf; + proc_t *pp = curthread->t_procp; + int err; + + if (count != sizeof (ldt_inf)) + return (set_errno(EINVAL)); + + if (copyin(data, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + if (ldt_inf.entry_number >= MAXNLDT) + return (set_errno(EINVAL)); + + LDT_INFO_TO_DESC(&ldt_inf, &usd); + usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number)); + + /* + * Get everyone into a safe state before changing the LDT. + */ + if (!holdlwps(SHOLDFORK1)) + return (set_errno(EINTR)); + + err = setdscr(&ssd); + + /* + * Release the hounds! + */ + mutex_enter(&pp->p_lock); + continuelwps(pp); + mutex_exit(&pp->p_lock); + + return (err ? 
set_errno(err) : 0); +} + +static int +read_ldt(void *uptr, ulong_t count) +{ + proc_t *pp = curproc; + int bytes; + + if (pp->p_ldt == NULL) + return (0); + + bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + if (bytes > count) + bytes = count; + + if (copyout(pp->p_ldt, uptr, bytes)) + return (set_errno(EFAULT)); + + return (bytes); +} + +long +lx_modify_ldt(int op, void *data, ulong_t count) +{ + int rval; + + switch (op) { + case 0: + rval = read_ldt(data, count); + break; + + case 1: + rval = write_ldt(data, count); + break; + + default: + rval = set_errno(ENOSYS); + break; + } + + return (rval); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c new file mode 100644 index 0000000000..431c2ed1ba --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/mutex.h> + +#include <sys/lx_types.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +extern int fcntl(int, int, intptr_t); +extern int openat(int, char *, int, int); +extern int open(char *, int, int); +extern int close(int); +extern int cioctl(file_t *, int, intptr_t, int *); +extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **, + vnode_t *); + + +static int +ltos_open_flags(int input) +{ + int flags; + + if (input & LX_O_PATH) { + input &= (LX_O_DIRECTORY | LX_O_NOFOLLOW | LX_O_CLOEXEC); + } + + /* This depends on the Linux ACCMODE flags being the same as SunOS. */ + flags = (input & LX_O_ACCMODE); + + if (input & LX_O_CREAT) { + flags |= O_CREAT; + } + + if (input & LX_O_EXCL) + flags |= O_EXCL; + if (input & LX_O_NOCTTY) + flags |= O_NOCTTY; + if (input & LX_O_TRUNC) + flags |= O_TRUNC; + if (input & LX_O_APPEND) + flags |= O_APPEND; + if (input & LX_O_NONBLOCK) + flags |= O_NONBLOCK; + if (input & LX_O_SYNC) + flags |= O_SYNC; + if (input & LX_O_LARGEFILE) + flags |= O_LARGEFILE; + if (input & LX_O_NOFOLLOW) + flags |= O_NOFOLLOW; + if (input & LX_O_CLOEXEC) + flags |= O_CLOEXEC; + + /* + * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the + * device backing the fd in question. Illumos doesn't have similar + * functionality, but we can attempt to simulate it using the flags + * (O_RSYNC|O_SYNC) and directio(3C). + * + * The LX_O_DIRECT flag also requires that the transfer size and + * alignment of I/O buffers be a multiple of the logical block size for + * the underlying file system, but frankly there isn't an easy way to + * support that functionality without doing something like adding an + * fcntl(2) flag to denote LX_O_DIRECT mode. 
+ * + * Since LX_O_DIRECT is merely a performance advisory, we'll just + * emulate what we can and trust that the only applications expecting + * an error when performing I/O from a misaligned buffer or when + * passing a transfer size is not a multiple of the underlying file + * system block size will be test suites. + */ + if (input & LX_O_DIRECT) + flags |= (O_RSYNC|O_SYNC); + + return (flags); +} + +#define LX_POSTPROCESS_OPTS (LX_O_DIRECT | LX_O_ASYNC | LX_O_PATH) + +static int +lx_open_postprocess(int fd, int fmode) +{ + file_t *fp; + int rv, error = 0; + + if ((fmode & LX_POSTPROCESS_OPTS) == 0) { + /* Skip out early, if possible */ + return (0); + } + + if ((fp = getf(fd)) == NULL) { + /* + * It is possible that this fd was closed by the time we + * arrived here if some one is hammering away with close(). + */ + return (EIO); + } + + if (fmode & LX_O_DIRECT && error == 0) { + (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, + fp->f_flag, fp->f_cred, &rv, NULL); + } + + if (fmode & LX_O_ASYNC && error == 0) { + if ((error = VOP_SETFL(fp->f_vnode, fp->f_flag, FASYNC, + fp->f_cred, NULL)) == 0) { + mutex_enter(&fp->f_tlock); + fp->f_flag |= FASYNC; + mutex_exit(&fp->f_tlock); + } + } + + if (fmode & LX_O_PATH && error == 0) { + /* + * While the O_PATH flag has no direct analog in SunOS, it is + * emulated by removing both FREAD and FWRITE from f_flag. + * This causes read(2) and write(2) result in EBADF and can be + * checked for in other syscalls to tigger the correct behavior + * there. + */ + mutex_enter(&fp->f_tlock); + fp->f_flag &= ~(FREAD|FWRITE); + mutex_exit(&fp->f_tlock); + } + + releasef(fd); + if (error != 0) { + (void) closeandsetf(fd, NULL); + } + return (error); +} + +long +lx_openat(int atfd, char *path, int fmode, int cmode) +{ + int flags, fd, error; + mode_t mode = 0; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + flags = ltos_open_flags(fmode); + + /* + * We use the FSEARCH flag to make sure this is a directory. 
We have to + * explicitly add 1 to emulate the FREAD/FWRITE mapping of the OPENMODE + * macro since it won't get set via OPENMODE when FSEARCH is used. + */ + if (fmode & LX_O_DIRECTORY) { + flags |= FSEARCH; + flags++; + } + + if (flags & O_CREAT) + mode = (mode_t)cmode; + + ttolwp(curthread)->lwp_errno = 0; + fd = openat(atfd, path, flags, mode); + if (ttolwp(curthread)->lwp_errno != 0) { + if ((fmode & LX_O_DIRECTORY) && + ttolwp(curthread)->lwp_errno != ENOTDIR) { + /* + * We got an error trying to open a file as a directory. + * We need to determine if we should return the original + * error or ENOTDIR. + */ + vnode_t *startvp; + vnode_t *vp; + int oerror, error = 0; + + oerror = ttolwp(curthread)->lwp_errno; + + if (atfd == AT_FDCWD) { + /* regular open */ + startvp = NULL; + } else { + char startchar; + + if (copyin(path, &startchar, sizeof (char))) + return (set_errno(oerror)); + + /* if startchar is / then startfd is ignored */ + if (startchar == '/') { + startvp = NULL; + } else { + file_t *startfp; + + if ((startfp = getf(atfd)) == NULL) + return (set_errno(oerror)); + startvp = startfp->f_vnode; + VN_HOLD(startvp); + releasef(atfd); + } + } + + if (lookupnameat(path, UIO_USERSPACE, + (fmode & LX_O_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW, + NULLVPP, &vp, startvp) != 0) { + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(oerror)); + } + + if (startvp != NULL) + VN_RELE(startvp); + + if (vp->v_type != VDIR) + error = ENOTDIR; + + VN_RELE(vp); + if (error != 0) + return (set_errno(ENOTDIR)); + + set_errno(oerror); + } + return (ttolwp(curthread)->lwp_errno); + } + + if ((error = lx_open_postprocess(fd, fmode)) != 0) { + return (set_errno(error)); + } + return (fd); +} + +long +lx_open(char *path, int fmode, int cmode) +{ + return (lx_openat(LX_AT_FDCWD, path, fmode, cmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_personality.c b/usr/src/uts/common/brand/lx/syscall/lx_personality.c new file mode 100644 index 0000000000..e7aa945b50 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_personality.c @@ -0,0 +1,112 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + + +/* + * These flags are for what Linux calls "bug emulation". + * (Descriptions from the personality(2) Linux man page.) + * + * Flags which are currently actionable in LX: + * - READ_IMPLIES_EXEC (since Linux 2.6.8) + * With this flag set, PROT_READ implies PROT_EXEC for mmap(2). + * + * Flags which are current accepted but ignored: + * - UNAME26 (since Linux 3.1) + * Have uname(2) report a 2.6.40+ version number rather than a 3.x version + * number. 
Added as a stopgap measure to support broken applications that + * could not handle the kernel version- numbering switch from 2.6.x to 3.x. + * + * - ADDR_NO_RANDOMIZE (since Linux 2.6.12) + * With this flag set, disable address-space-layout randomization. + * + * - FDPIC_FUNCPTRS (since Linux 2.6.11) + * User-space function pointers to signal handlers point (on certain + * architectures) to descriptors. + * + * - MMAP_PAGE_ZERO (since Linux 2.4.0) + * Map page 0 as read-only (to support binaries that depend on this SVr4 + * behavior). + * + * - ADDR_COMPAT_LAYOUT (since Linux 2.6.9) + * With this flag set, provide legacy virtual address space layout. + * + * - ADDR_LIMIT_32BIT (since Linux 2.2) + * Limit the address space to 32 bits. + * + * - SHORT_INODE (since Linux 2.4.0) + * No effects(?). + * + * - WHOLE_SECONDS (since Linux 1.2.0) + * No effects(?). + * + * - STICKY_TIMEOUTS (since Linux 1.2.0) + * With this flag set, select(2), pselect(2), and ppoll(2) do not modify the + * returned timeout argument when interrupted by a signal handler. + * + * - ADDR_LIMIT_3GB (since Linux 2.4.0) + * With this flag set, use 0xc0000000 as the offset at which to search a + * virtual memory chunk on mmap(2); otherwise use 0xffffe000. + */ + +#define LX_PER_GET 0xffffffff + +long +lx_personality(unsigned int arg) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + unsigned int result = 0; + + mutex_enter(&curproc->p_lock); + result = lxpd->l_personality; + + if (arg == LX_PER_GET) { + mutex_exit(&curproc->p_lock); + return (result); + } + + /* + * Prevent changes to the personality if the process is undergoing an + * exec. This will allow elfexec and friends to manipulate the + * personality without hinderance. + */ + if ((curproc->p_flag & P_PR_EXEC) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EINVAL)); + } + + /* + * Keep tabs when a non-Linux personality is set. 
This is silently + * allowed to succeed, even though the emulation required is almost + * certainly missing. + */ + if ((arg & LX_PER_MASK) != LX_PER_LINUX) { + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "invalid personality: %02X", + arg & LX_PER_MASK); + lx_unsupported(buf); + } + + lxpd->l_personality = arg; + mutex_exit(&curproc->p_lock); + return (result); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c new file mode 100644 index 0000000000..519c742abc --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c @@ -0,0 +1,200 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved. + * + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
/*
 * Based on native pipe(2) system call, except that the pipe is half-duplex.
 *
 * arg   - user-space address of an int[2]; on success element 0 receives the
 *         read descriptor and element 1 the write descriptor.
 * flags - native flags; only FCLOEXEC and FNONBLOCK are accepted, anything
 *         else fails with EINVAL.
 *
 * Returns 0 on success, otherwise the result of set_errno().
 */
static int
lx_hd_pipe(intptr_t arg, int flags)
{
	vnode_t *vp1, *vp2;
	struct file *fp1, *fp2;
	int error = 0;
	int flag1, flag2, iflags;
	int fd1, fd2;

	/*
	 * Validate allowed flags.
	 */
	if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) {
		return (set_errno(EINVAL));
	}
	/*
	 * Allocate and initialize two vnodes.
	 */
	makepipe(&vp1, &vp2);

	/*
	 * Allocate and initialize two file table entries and two
	 * file pointers. The first file pointer is open for read and the
	 * second is open for write.
	 */
	if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) {
		/* Nothing else exists yet; just drop both vnodes. */
		VN_RELE(vp1);
		VN_RELE(vp2);
		return (set_errno(error));
	}

	if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0)
		goto out2;

	/*
	 * Create two stream heads and attach to each vnode.
	 */
	if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0)
		goto out;

	if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) {
		/* The read side was already opened above; close it again. */
		(void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0,
		    fp1->f_cred, NULL);
		goto out;
	}

	strmate(vp1, vp2);

	/* Both ends report the same inode number. */
	VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid();

	/*
	 * Set the O_NONBLOCK flag if requested.
	 */
	if (flags & FNONBLOCK) {
		flag1 = fp1->f_flag;
		flag2 = fp2->f_flag;
		iflags = flags & FNONBLOCK;

		if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred,
		    NULL)) != 0) {
			goto out_vop_close;
		}
		fp1->f_flag |= iflags;

		if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred,
		    NULL)) != 0) {
			goto out_vop_close;
		}
		fp2->f_flag |= iflags;
	}

	/*
	 * Return the file descriptors to the user. They now
	 * point to two different vnodes which have different
	 * stream heads.
	 */
	if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) ||
	    copyout(&fd2, &((int *)arg)[1], sizeof (int))) {
		error = EFAULT;
		goto out_vop_close;
	}

	/*
	 * Now fill in the entries that falloc reserved
	 *
	 * NOTE(review): f_tlock is dropped here without a matching
	 * mutex_enter() in this function; presumably falloc() returns with
	 * it held -- confirm against falloc().
	 */
	mutex_exit(&fp1->f_tlock);
	mutex_exit(&fp2->f_tlock);
	setf(fd1, fp1);
	setf(fd2, fp2);

	/*
	 * Optionally set the FCLOEXEC flag
	 */
	if ((flags & FCLOEXEC) != 0) {
		f_setfd(fd1, FD_CLOEXEC);
		f_setfd(fd2, FD_CLOEXEC);
	}

	return (0);

	/*
	 * Error unwinding.  The labels fall through into one another, so each
	 * entry point undoes progressively less work:
	 *   out_vop_close - both stream heads were opened; close them.
	 *   out           - both file entries were allocated; undo the second.
	 *   out2          - only the first file entry exists; undo it and
	 *                   release both vnodes.
	 */
out_vop_close:
	(void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL);
	(void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL);
out:
	setf(fd2, NULL);
	unfalloc(fp2);
out2:
	setf(fd1, NULL);
	unfalloc(fp1);
	VN_RELE(vp1);
	VN_RELE(vp2);
	return (set_errno(error));
}
+ */ + if (lxflags & LX_O_NONBLOCK) { + flags |= FNONBLOCK; + } + if (lxflags & LX_O_CLOEXEC) { + flags |= FCLOEXEC; + } + + return (lx_hd_pipe(arg, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_poll.c b/usr/src/uts/common/brand/lx/syscall/lx_poll.c new file mode 100644 index 0000000000..1d92a55ddf --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_poll.c @@ -0,0 +1,762 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/poll_impl.h> +#include <sys/schedctl.h> +#include <sys/lx_signal.h> + + +/* From uts/common/syscall/poll.c */ +extern int poll_copyin(pollstate_t *, pollfd_t *, nfds_t); +extern int poll_common(pollstate_t *, pollfd_t *, nfds_t, timespec_t *, int *); + +/* + * These events are identical between Linux and SunOS + */ +#define LX_POLLIN 0x001 +#define LX_POLLPRI 0x002 +#define LX_POLLOUT 0x004 +#define LX_POLLERR 0x008 +#define LX_POLLHUP 0x010 +#define LX_POLLNVAL 0x020 +#define LX_POLLRDNORM 0x040 +#define LX_POLLRDBAND 0x080 + +#define LX_POLL_COMMON_EVENTS (LX_POLLIN | LX_POLLPRI | LX_POLLOUT | \ + LX_POLLERR | LX_POLLHUP | LX_POLLNVAL | LX_POLLRDNORM | LX_POLLRDBAND) + +/* + * These events differ between Linux and SunOS + */ +#define LX_POLLWRNORM 0x0100 +#define LX_POLLWRBAND 0x0200 +#define LX_POLLRDHUP 0x2000 + + +#define LX_POLL_SUPPORTED_EVENTS \ + (LX_POLL_COMMON_EVENTS | 
LX_POLLWRNORM | LX_POLLWRBAND | LX_POLLRDHUP) + + +static int +lx_poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i, error = 0; + pollfd_t *pollfdp; + + if ((error = poll_copyin(ps, fds, nfds)) != 0) { + return (error); + } + pollfdp = ps->ps_pollfd; + + /* Convert the Linux events bitmask into SunOS equivalent. */ + for (i = 0; i < nfds; i++) { + short lx_events = pollfdp[i].events; + short events; + + /* + * If the caller is polling for an unsupported event, we + * have to bail out. + */ + if (lx_events & ~LX_POLL_SUPPORTED_EVENTS) { + return (ENOTSUP); + } + + events = lx_events & LX_POLL_COMMON_EVENTS; + if (lx_events & LX_POLLWRNORM) + events |= POLLWRNORM; + if (lx_events & LX_POLLWRBAND) + events |= POLLWRBAND; + if (lx_events & LX_POLLRDHUP) + events |= POLLRDHUP; + pollfdp[i].events = events; + oldevt[i] = lx_events; + } + return (0); +} + +static int +lx_poll_copyout(pollfd_t *pollfdp, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i; + + /* + * Convert SunOS revents bitmask into Linux equivalent and restore + * cached events field which was swizzled by lx_poll_copyin. + */ + for (i = 0; i < nfds; i++) { + short revents = pollfdp[i].revents; + short lx_revents = revents & LX_POLL_COMMON_EVENTS; + short orig_events = oldevt[i]; + + if (revents & POLLWRBAND) + lx_revents |= LX_POLLWRBAND; + if (revents & POLLRDHUP) + lx_revents |= LX_POLLRDHUP; + /* + * Because POLLOUT and POLLWRNORM are native defined as the + * same value, care must be taken when translating them to + * Linux where they differ. 
/*
 * Common implementation behind the Linux poll(2) and ppoll(2) emulation.
 *
 * fds   - user-space pollfd array (Linux event encoding)
 * nfds  - number of entries in 'fds'; 0 means no descriptors are polled
 * tsp   - timeout handed to poll_common(), or NULL
 * ksetp - signal mask to install for the duration of the call, or NULL to
 *         leave the mask alone
 *
 * Returns the descriptor count reported by poll_common(), or the result of
 * set_errno() on failure.
 */
static long
lx_poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	pollstate_t *ps = NULL;
	pollfd_t *pollfdp = NULL;
	short *oldevt = NULL;
	int error = 0, fdcnt = 0;

	/*
	 * Reset our signal mask, if requested.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		schedctl_finish_sigblock(t);
		/* Remember the old mask so it can be put back at pollout. */
		lwp->lwp_sigoldmask = t->t_hold;
		t->t_hold = *ksetp;
		t->t_flag |= T_TOMASK;
		/*
		 * Call cv_reltimedwait_sig() just to check for signals.
		 * We will return immediately with either 0 or -1.
		 */
		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
		    TR_CLOCK_TICK)) {
			mutex_exit(&p->p_lock);
			error = EINTR;
			goto pollout;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Initialize pollstate and copy in pollfd data if present.
	 */
	if (nfds != 0) {
		/* More entries than the descriptor limit allows: EINVAL. */
		if (nfds > p->p_fno_ctl) {
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
			    p->p_rctls, p, RCA_SAFE);
			mutex_exit(&p->p_lock);
			error = EINVAL;
			goto pollout;
		}

		/*
		 * Need to allocate memory for pollstate before anything
		 * because the mutex and cv are created in this space
		 */
		ps = pollstate_create();
		if (ps->ps_pcache == NULL)
			ps->ps_pcache = pcache_alloc();

		/*
		 * Certain event types which are distinct on Linux are aliased
		 * against each other on illumos.  In order to properly
		 * translate back into the Linux format, the original events of
		 * interest are stored in 'oldevt' for use during
		 * lx_poll_copyout.
		 */
		oldevt = kmem_alloc(nfds * sizeof (short), KM_SLEEP);
		if ((error = lx_poll_copyin(ps, fds, nfds, oldevt)) != 0)
			goto pollout;
		pollfdp = ps->ps_pollfd;
	}

	/*
	 * Perform the actual poll.
	 */
	error = poll_common(ps, fds, nfds, tsp, &fdcnt);

pollout:
	/*
	 * If we changed the signal mask but we received no signal then restore
	 * the signal mask.  Otherwise psig() will deal with the signal mask.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		if (lwp->lwp_cursig == 0) {
			t->t_hold = lwp->lwp_sigoldmask;
			t->t_flag &= ~T_TOMASK;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Copy out the events and return the fdcnt to the user.
	 * This is skipped on every error path, so pollfdp and oldevt are only
	 * consumed after a successful copyin and poll.
	 */
	if (nfds != 0 && error == 0) {
		error = lx_poll_copyout(pollfdp, fds, nfds, oldevt);
	}
	/* oldevt is allocated only when nfds != 0; free with the same size. */
	if (oldevt != NULL) {
		kmem_free(oldevt, nfds * sizeof (short));
	}
	if (error) {
		return (set_errno(error));
	}
	return (fdcnt);
}
+ */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset; + + if (copyin(setp, &lset, sizeof (lset))) + return (set_errno(EFAULT)); + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + + return (lx_poll_common(fds, nfds, tsp, ksetp)); +} + +typedef struct lx_select_buf_s { + long *lsb_rfds; + long *lsb_wfds; + long *lsb_efds; + unsigned int lsb_size; +} lx_select_buf_t; + +/* + * Size (in bytes) of buffer appropriate for fd_set copyin/copyout. + * Linux uses buffers of 'long' to accomplish this. + */ +#define LX_FD_SET_BYTES (sizeof (long)) +#define LX_FD_SET_BITS (8 * LX_FD_SET_BYTES) +#define LX_FD_SET_SIZE(nfds) \ + ((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES) + +static int +lx_select_copyin(pollstate_t *ps, lx_select_buf_t *sbuf, int nfds, + long *rfds, long *wfds, long *efds) +{ + int n; + long *in, *out, *ex; + long absent = 0; + pollfd_t *pfd; + nfds_t old_nfds; + + /* + * Just like pollsys and lx_poll, attempt to reuse ps_pollfd if it is + * appropriately sized. See poll_copyin for more detail. 
/*
 * Translate poll results back into select(2)-style fd_set bitmaps and copy
 * them out to the caller.
 *
 * pollfdp          - array of 'nfds' polled entries; entry i is treated as
 *                    descriptor i
 * sbuf             - kernel staging buffers for the three fd_sets
 * rfds, wfds, efds - user-space fd_set pointers; a NULL set is neither
 *                    filled in nor copied out
 * fdcnt            - in: the count reported by the poll (0 short-circuits
 *                    the translation); out: the number of bits set across
 *                    all three result sets
 *
 * Returns 0 on success, EBADF if any entry reported POLLNVAL, or EFAULT if a
 * copyout fails.
 */
static int
lx_select_copyout(pollfd_t *pollfdp, lx_select_buf_t *sbuf, int nfds,
    long *rfds, long *wfds, long *efds, int *fdcnt)
{
	int n;
	pollfd_t *pfd;
	long rv = 0;

	/*
	 * If poll did not find any fds of interest, we can just zero out the
	 * fd_set fields for copyout.
	 */
	if (*fdcnt == 0) {
		if (rfds != NULL) {
			bzero(sbuf->lsb_rfds, sbuf->lsb_size);
		}
		if (wfds != NULL) {
			bzero(sbuf->lsb_wfds, sbuf->lsb_size);
		}
		if (efds != NULL) {
			bzero(sbuf->lsb_efds, sbuf->lsb_size);
		}
		goto copyout;
	}

	/*
	 * For each pollfd entry, convert any returned events into bits in the
	 * read/write/except fd_set words.  (Derived from libc's select logic)
	 * The outer loop handles one fd_set word (LX_FD_SET_BITS descriptors)
	 * per iteration; 'm' is the bit for the current descriptor.
	 */
	pfd = pollfdp;
	for (n = 0; n < nfds; n += LX_FD_SET_BITS) {
		unsigned long m, j;
		long in = 0, out = 0, ex = 0;

		m = 1;
		for (j = 0; j < LX_FD_SET_BITS; j++) {
			/* The final word may be only partially populated. */
			if ((n + j) >= nfds)
				break;
			if (pfd->revents != 0) {
				if (pfd->revents & POLLNVAL) {
					return (EBADF);
				}
				if (pfd->revents & POLLRDNORM) {
					in |= m;
					rv++;
				}
				if (pfd->revents & POLLWRNORM) {
					out |= m;
					rv++;
				}
				if (pfd->revents & POLLRDBAND) {
					ex |= m;
					rv++;
				}
				/*
				 * Only set this bit on return if we asked
				 * about input conditions.
				 */
				if ((pfd->revents & (POLLHUP|POLLERR)) &&
				    (pfd->events & POLLRDNORM)) {
					if ((in & m) == 0) {
						/* wasn't already set */
						rv++;
					}
					in |= m;
				}
				/*
				 * Only set this bit on return if we asked
				 * about output conditions.
				 */
				if ((pfd->revents & (POLLHUP|POLLERR)) &&
				    (pfd->events & POLLWRNORM)) {
					if ((out & m) == 0) {
						/* wasn't already set */
						rv++;
					}
					out |= m;
				}
				/*
				 * Only set this bit on return if we asked
				 * about exceptional conditions.
				 */
				if ((pfd->revents & (POLLHUP|POLLERR)) &&
				    (pfd->events & POLLRDBAND)) {
					if ((ex & m) == 0) {
						/* wasn't already set */
						rv++;
					}
					ex |= m;
				}
			}
			m <<= 1;
			pfd++;
		}
		/* Store the finished word into each set the caller supplied. */
		if (rfds != NULL)
			sbuf->lsb_rfds[n / LX_FD_SET_BITS] = in;
		if (wfds != NULL)
			sbuf->lsb_wfds[n / LX_FD_SET_BITS] = out;
		if (efds != NULL)
			sbuf->lsb_efds[n / LX_FD_SET_BITS] = ex;
	}

copyout:
	if (rfds != NULL) {
		if (copyout(sbuf->lsb_rfds, rfds, sbuf->lsb_size) != 0) {
			return (EFAULT);
		}
	}
	if (wfds != NULL) {
		if (copyout(sbuf->lsb_wfds, wfds, sbuf->lsb_size) != 0) {
			return (EFAULT);
		}
	}
	if (efds != NULL) {
		if (copyout(sbuf->lsb_efds, efds, sbuf->lsb_size) != 0) {
			return (EFAULT);
		}
	}
	/* Report the number of bits set, not the number of descriptors. */
	*fdcnt = rv;
	return (0);
}
+ */ + if (rfds != NULL) { + fake_fds = (pollfd_t *)rfds; + } else if (wfds != NULL) { + fake_fds = (pollfd_t *)wfds; + } else if (efds != NULL) { + fake_fds = (pollfd_t *)efds; + } else { + /* + * A non-zero nfds was supplied but all three fd_set pointers + * were null. Fall back to doing a simple timeout. + */ + nfds = 0; + } + + /* + * Initialize pollstate and copy in pollfd data if present. + */ + if (nfds != 0) { + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto out; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + sbuf.lsb_size = LX_FD_SET_SIZE(nfds); + if (rfds != NULL) + sbuf.lsb_rfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (wfds != NULL) + sbuf.lsb_wfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (efds != NULL) + sbuf.lsb_efds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + + error = lx_select_copyin(ps, &sbuf, nfds, rfds, wfds, efds); + if (error != 0) { + goto out; + } + + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fake_fds, (nfds_t)nfds, tsp, &fdcnt); + +out: + /* + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + /* + * Copy out the events and return the fdcnt to the user. 
+ */ + if (error == 0 && nfds != 0) { + error = lx_select_copyout(pollfdp, &sbuf, nfds, rfds, wfds, + efds, &fdcnt); + } + if (sbuf.lsb_size != 0) { + if (sbuf.lsb_rfds != NULL) + kmem_free(sbuf.lsb_rfds, sbuf.lsb_size); + if (sbuf.lsb_wfds != NULL) + kmem_free(sbuf.lsb_wfds, sbuf.lsb_size); + if (sbuf.lsb_efds != NULL) + kmem_free(sbuf.lsb_efds, sbuf.lsb_size); + } + if (error) { + return (set_errno(error)); + } + return (fdcnt); +} + +long +lx_select(int nfds, long *rfds, long *wfds, long *efds, + struct timeval *timeoutp) +{ + timespec_t ts, *tsp = NULL; + + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + struct timeval tv; + + if (copyin(timeoutp, &tv, sizeof (tv))) + return (set_errno(EFAULT)); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * (NANOSEC / MICROSEC); + } else { + struct timeval32 tv32; + + if (copyin(timeoutp, &tv32, sizeof (tv32))) + return (set_errno(EFAULT)); + ts.tv_sec = tv32.tv_sec; + ts.tv_nsec = tv32.tv_usec * (NANOSEC / MICROSEC); + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, NULL)); +} + + +typedef struct { + uintptr_t lpsa_addr; + unsigned long lpsa_len; +} lx_pselect_sig_arg_t; + +#if defined(_LP64) +typedef struct { + caddr32_t lpsa_addr; + uint32_t lpsa_len; +} lx_pselect_sig_arg32_t; +#endif /* defined(_LP64) */ + +long +lx_pselect(int nfds, long *rfds, long *wfds, long *efds, + timespec_t *timeoutp, void *setp) +{ + timespec_t ts, *tsp = NULL; + k_sigset_t kset, *ksetp = NULL; + + /* + * Copy in timeout and sigmask. 
+ */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset, *sigaddr = NULL; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_pselect_sig_arg_t lpsa; + + if (copyin(setp, &lpsa, sizeof (lpsa)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)lpsa.lpsa_addr; + } +#if defined(_LP64) + else { + lx_pselect_sig_arg32_t lpsa32; + + if (copyin(setp, &lpsa32, sizeof (lpsa32)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa32.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)(uint64_t)lpsa32.lpsa_addr; + } +#endif /* defined(_LP64) */ + + /* This is where we check if the sigset is *really* NULL. */ + if (sigaddr != NULL) { + if (copyin(sigaddr, &lset, sizeof (lset)) != 0) + return (set_errno(EFAULT)); + + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, ksetp)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_prctl.c b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c new file mode 100644 index 0000000000..091a6f547b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c @@ -0,0 +1,210 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/user.h> +#include <sys/priv.h> +#include <sys/brand.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <lx_signum.h> + +#define LX_PR_SET_PDEATHSIG 1 +#define LX_PR_GET_PDEATHSIG 2 +#define LX_PR_GET_DUMPABLE 3 +#define LX_PR_SET_DUMPABLE 4 +#define LX_PR_GET_UNALIGN 5 +#define LX_PR_SET_UNALIGN 6 +#define LX_PR_GET_KEEPCAPS 7 +#define LX_PR_SET_KEEPCAPS 8 +#define LX_PR_GET_FPEMU 9 +#define LX_PR_SET_FPEMU 10 +#define LX_PR_GET_FPEXC 11 +#define LX_PR_SET_FPEXC 12 +#define LX_PR_GET_TIMING 13 +#define LX_PR_SET_TIMING 14 +#define LX_PR_SET_NAME 15 +#define LX_PR_GET_NAME 16 +#define LX_PR_GET_ENDIAN 19 +#define LX_PR_SET_ENDIAN 20 +#define LX_PR_GET_SECCOMP 21 +#define LX_PR_SET_SECCOMP 22 +#define LX_PR_CAPBSET_READ 23 +#define LX_PR_CAPBSET_DROP 24 +#define LX_PR_GET_TSC 25 +#define LX_PR_SET_TSC 26 +#define LX_PR_GET_SECUREBITS 27 +#define LX_PR_SET_SECUREBITS 28 +#define LX_PR_SET_TIMERSLACK 29 +#define LX_PR_GET_TIMERSLACK 30 +#define LX_PR_TASK_PERF_EVENTS_DISABLE 31 +#define LX_PR_TASK_PERF_EVENTS_ENABLE 32 +#define LX_PR_MCE_KILL 33 +#define LX_PR_MCE_KILL_GET 34 +#define LX_PR_SET_MM 35 +#define LX_PR_SET_CHILD_SUBREAPER 36 +#define LX_PR_GET_CHILD_SUBREAPER 37 +#define LX_PR_SET_NO_NEW_PRIVS 38 +#define LX_PR_GET_NO_NEW_PRIVS 39 +#define LX_PR_GET_TID_ADDRESS 40 +#define LX_PR_SET_THP_DISABLE 41 +#define LX_PR_GET_THP_DISABLE 42 + +#define LX_PR_SET_NAME_NAMELEN 16 + +long +lx_prctl(int opt, uintptr_t data) +{ + long err; + char ebuf[64]; + + switch (opt) { + case LX_PR_GET_DUMPABLE: { + /* Indicate that process is always dumpable */ 
+ return (1); + } + + case LX_PR_SET_DUMPABLE: { + if (data != 0 && data != 1) { + return (set_errno(EINVAL)); + } + /* Lie about altering process dumpability */ + return (0); + } + + case LX_PR_GET_SECUREBITS: { + /* Our bits are always 0 */ + return (0); + } + + case LX_PR_SET_SECUREBITS: { + /* Ignore setting any bits from arg2 */ + return (0); + } + + case LX_PR_SET_KEEPCAPS: { + /* + * The closest illumos analog to SET_KEEPCAPS is the PRIV_AWARE + * flag. There are probably some cases where it's not exactly + * the same, but this will do for a first try. + */ + if (data == 0) { + err = setpflags(PRIV_AWARE_RESET, 1, NULL); + } else { + err = setpflags(PRIV_AWARE, 1, NULL); + } + + if (err != 0) { + return (set_errno(err)); + } + return (0); + } + + case LX_PR_SET_NAME: { + char name[LX_PR_SET_NAME_NAMELEN + 1]; + proc_t *p = curproc; + /* + * In Linux, PR_SET_NAME sets the name of the thread, not the + * process. Due to the historical quirks of Linux's asinine + * thread model, this name is effectively the name of the + * process (as visible via ps(1)) if the thread is the first of + * its task group. The first thread is therefore special, and + * to best mimic Linux semantics (and absent a notion of + * per-LWP names), we do nothing (but return success) on LWPs + * other than LWP 1. 
+ */ + if (curthread->t_tid != 1) { + return (0); + } + if (copyin((void *)data, name, LX_PR_SET_NAME_NAMELEN) != 0) { + return (set_errno(EFAULT)); + } + name[LX_PR_SET_NAME_NAMELEN] = '\0'; + mutex_enter(&p->p_lock); + (void) strncpy(p->p_user.u_comm, name, MAXCOMLEN + 1); + (void) strncpy(p->p_user.u_psargs, name, PSARGSZ); + mutex_exit(&p->p_lock); + return (0); + } + + case LX_PR_GET_PDEATHSIG: { + int sig; + lx_proc_data_t *lxpd; + + mutex_enter(&curproc->p_lock); + VERIFY(lxpd = ptolxproc(curproc)); + sig = lxpd->l_parent_deathsig; + mutex_exit(&curproc->p_lock); + + return (sig); + } + + case LX_PR_SET_PDEATHSIG: { + int sig = lx_ltos_signo((int)data, 0); + proc_t *pp = NULL; + lx_proc_data_t *lxpd; + + if (sig == 0 && data != 0) { + return (set_errno(EINVAL)); + } + + mutex_enter(&pidlock); + /* Set signal on our self */ + mutex_enter(&curproc->p_lock); + VERIFY(lxpd = ptolxproc(curproc)); + lxpd->l_parent_deathsig = sig; + pp = curproc->p_parent; + mutex_exit(&curproc->p_lock); + + /* Configure parent to potentially signal children on death */ + mutex_enter(&pp->p_lock); + if (PROC_IS_BRANDED(pp)) { + VERIFY(lxpd = ptolxproc(pp)); + /* + * Mark the parent as having children which wish to be + * signaled on death of parent. + */ + lxpd->l_flags |= LX_PROC_CHILD_DEATHSIG; + } else { + /* + * If the parent is not a branded process, the needed + * hooks to facilitate this mechanism will not fire + * when it dies. We lie about success in this case. + */ + } + mutex_exit(&pp->p_lock); + mutex_exit(&pidlock); + return (0); + } + + case LX_PR_CAPBSET_DROP: { + /* + * On recent versions of Linux the login svc drops capabilities + * and if that fails the svc dies and is restarted by systemd. + * For now we pretend dropping capabilities succeeded. 
+ */ + return (0); + } + + default: + break; + } + + snprintf(ebuf, 64, "prctl option %d", opt); + lx_unsupported(ebuf); + return (set_errno(EINVAL)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c new file mode 100644 index 0000000000..6581ead25b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c @@ -0,0 +1,575 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/zone.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#define LX_RLIMIT_CPU 0 +#define LX_RLIMIT_FSIZE 1 +#define LX_RLIMIT_DATA 2 +#define LX_RLIMIT_STACK 3 +#define LX_RLIMIT_CORE 4 +#define LX_RLIMIT_RSS 5 +#define LX_RLIMIT_NPROC 6 +#define LX_RLIMIT_NOFILE 7 +#define LX_RLIMIT_MEMLOCK 8 +#define LX_RLIMIT_AS 9 +#define LX_RLIMIT_LOCKS 10 /* NA limit on locks, early 2.4 only */ +#define LX_RLIMIT_SIGPENDING 11 +#define LX_RLIMIT_MSGQUEUE 12 +#define LX_RLIMIT_NICE 13 /* NA ceiling for nice */ +#define LX_RLIMIT_RTPRIO 14 /* NA ceiling on the RT priority */ +#define LX_RLIMIT_RTTIME 15 /* NA cpu limit for RT proc. 
*/ + +#define LX_RLIMIT_NLIMITS 16 + +#define RCTL_INFINITE(x) \ + ((x->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + (x->rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct { + ulong_t rlim_cur; + ulong_t rlim_max; +} lx_rlimit_t; + +typedef struct { + uint32_t rlim_cur; + uint32_t rlim_max; +} lx_rlimit32_t; + +/* + * Linux supports many of the same resources that we do, but on Illumos these + * are rctls. Instead of using rlimit, we use rctls for all of the limits. + * This table is used to translate Linux rlimit keys into the Illumos legacy + * rlimit. We then primarily use the rctl/rlimit compatability code to + * manage these. + */ +static int l_to_r[LX_RLIMIT_NLIMITS] = { + RLIMIT_CPU, /* 0 CPU */ + RLIMIT_FSIZE, /* 1 FSIZE */ + RLIMIT_DATA, /* 2 DATA */ + RLIMIT_STACK, /* 3 STACK */ + RLIMIT_CORE, /* 4 CORE */ + -1, /* 5 RSS */ + -1, /* 6 NPROC */ + RLIMIT_NOFILE, /* 7 NOFILE */ + -1, /* 8 MEMLOCK */ + RLIMIT_AS, /* 9 AS */ + -1, /* 10 LOCKS */ + -1, /* 11 SIGPENDING */ + -1, /* 12 MSGQUEUE */ + -1, /* 13 NICE */ + -1, /* 14 RTPRIO */ + -1 /* 15 RTTIME */ +}; + +/* + * Magic value Linux uses to indicate infinity + */ +#define LX_RLIM_INFINITY_N ULONG_MAX + +static void +lx_get_rctl(char *nm, struct rlimit64 *rlp64) +{ + rctl_hndl_t hndl; + rctl_val_t *oval, *nval; + + rlp64->rlim_cur = RLIM_INFINITY; + rlp64->rlim_max = RLIM_INFINITY; + + nval = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP); + mutex_enter(&curproc->p_lock); + + hndl = rctl_hndl_lookup(nm); + oval = NULL; + while ((hndl != -1) && rctl_local_get(hndl, oval, nval, curproc) == 0) { + oval = nval; + switch (nval->rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_cur = nval->rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_max = nval->rcv_value; + break; + } + } + + mutex_exit(&curproc->p_lock); + kmem_free(nval, sizeof (rctl_val_t)); + + if (rlp64->rlim_cur == RLIM_INFINITY && + rlp64->rlim_max != RLIM_INFINITY) + 
rlp64->rlim_cur = rlp64->rlim_max; +} + +static int +lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int resource; + int64_t cur = -1; + boolean_t cur_inf = B_FALSE; + int64_t max = -1; + boolean_t max_inf = B_FALSE; + struct rlimit64 rlim64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max; + break; + + case LX_RLIMIT_NICE: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max; + break; + + case LX_RLIMIT_RTTIME: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max; + break; + + case LX_RLIMIT_RSS: + /* zone.max-physical-memory */ + rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_phys_mem_ctl; + break; + + case LX_RLIMIT_NPROC: + /* zone.max-lwps */ + rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_nlwps_ctl; + break; + + case LX_RLIMIT_MEMLOCK: + /* zone.max-locked-memory */ + rlim64.rlim_cur = rlim64.rlim_max = + curzone->zone_locked_mem_ctl; + break; + + case LX_RLIMIT_SIGPENDING: + lx_get_rctl("process.max-sigqueue-size", &rlim64); + break; + + case LX_RLIMIT_MSGQUEUE: + lx_get_rctl("process.max-msg-messages", &rlim64); + break; + + default: + resource = l_to_r[lx_resource]; + + mutex_enter(&curproc->p_lock); + (void) rctl_rlimit_get(rctlproc_legacy[resource], curproc, + &rlim64); + mutex_exit(&curproc->p_lock); + break; + } + + + if (rlim64.rlim_cur == RLIM64_INFINITY) { + cur = LX_RLIM_INFINITY_N; + } else { + cur = rlim64.rlim_cur; + } + if 
(rlim64.rlim_max == RLIM64_INFINITY) { + max = LX_RLIM_INFINITY_N; + } else { + max = rlim64.rlim_max; + } + + if (lx_resource == LX_RLIMIT_STACK && cur > INT_MAX) { + /* + * Stunningly, Linux has somehow managed to confuse the concept + * of a "limit" with that of a "default" -- and the value of + * RLIMIT_STACK is used by NPTL as the _default_ stack size if + * it isn't specified. (!!) Even for a system that prides + * itself on slapdash castles of junk, this is an amazingly + * willful act of incompetence -- and one that is gleefully + * confessed in the pthread_create() man page: "if the + * RLIMIT_STACK soft resource limit at the time the program + * started has any value other than 'unlimited', then it + * determines the default stack size of new threads." A + * typical stack limit for us is 32TB; if it needs to be said, + * setting the default stack size to be 32TB doesn't work so + * well! Of course, glibc dropping a deuce in its pants + * becomes our problem -- so to prevent smelly accidents we + * tell Linux that any stack limit over the old (32-bit) values + * for infinity are just infinitely large. + */ + cur_inf = B_TRUE; + max_inf = B_TRUE; + } + + if (cur_inf) { + *rlim_curp = LX_RLIM64_INFINITY; + } else { + *rlim_curp = cur; + } + + if (max_inf) { + *rlim_maxp = LX_RLIM64_INFINITY; + } else { + *rlim_maxp = max; + } + + return (0); +} + +/* + * This is the 'new' getrlimit, variously called getrlimit or ugetrlimit + * in Linux headers and code. The only difference between this and the old + * getrlimit (variously called getrlimit or old_getrlimit) is the value of + * RLIM_INFINITY, which is smaller for the older version. Modern code will + * use this version by default. 
+ */ +long +lx_getrlimit(int resource, lx_rlimit_t *rlp) +{ + int rv; + lx_rlimit_t rl; + uint64_t rlim_cur, rlim_max; + + rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max); + if (rv != 0) + return (set_errno(rv)); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (rlim_cur == LX_RLIM64_INFINITY) + rl.rlim_cur = LX_RLIM_INFINITY_N; + else if (rlim_cur > LX_RLIM_INFINITY_N) + rl.rlim_cur = LX_RLIM_INFINITY_N; + else + rl.rlim_cur = (ulong_t)rlim_cur; + + if (rlim_max == LX_RLIM64_INFINITY) + rl.rlim_max = LX_RLIM_INFINITY_N; + else if (rlim_max > LX_RLIM_INFINITY_N) + rl.rlim_max = LX_RLIM_INFINITY_N; + else + rl.rlim_max = (ulong_t)rlim_max; + + if (copyout(&rl, rlp, sizeof (rl)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + lx_rlimit32_t rl32; + + if (rlim_cur > UINT_MAX) + rl.rlim_cur = UINT_MAX; + else + rl.rlim_cur = (ulong_t)rlim_cur; + + if (rlim_max > UINT_MAX) + rl.rlim_max = UINT_MAX; + else + rl.rlim_max = (ulong_t)rlim_max; + + rl32.rlim_cur = rl.rlim_cur; + rl32.rlim_max = rl.rlim_max; + + if (copyout(&rl32, rlp, sizeof (rl32)) != 0) + return (set_errno(EFAULT)); + } +#endif + + return (0); +} + +/* + * This is the 'old' getrlimit, variously called getrlimit or old_getrlimit + * in Linux headers and code. The only difference between this and the new + * getrlimit (variously called getrlimit or ugetrlimit) is the value of + * RLIM_INFINITY, which is smaller for the older version. + * + * This is only used for 32-bit code. 
+ */ +long +lx_oldgetrlimit(int resource, lx_rlimit_t *rlp) +{ + int rv; + lx_rlimit32_t rl32; + uint64_t rlim_cur, rlim_max; + + rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max); + if (rv != 0) + return (set_errno(rv)); + + if (rlim_cur > INT_MAX) + rl32.rlim_cur = INT_MAX; + else + rl32.rlim_cur = (ulong_t)rlim_cur; + + if (rlim_max > INT_MAX) + rl32.rlim_max = INT_MAX; + else + rl32.rlim_max = (ulong_t)rlim_cur; + + if (copyout(&rl32, rlp, sizeof (rl32)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +lx_set_rctl(char *nm, struct rlimit64 *rlp64) +{ + int err; + rctl_hndl_t hndl; + rctl_alloc_gp_t *gp; + + gp = rctl_rlimit_set_prealloc(1); + + mutex_enter(&curproc->p_lock); + + hndl = rctl_hndl_lookup(nm); + + /* + * We're not supposed to do this but since we want all our rctls to + * behave like rlimits, we take advantage of this function to set up + * this way. + */ + err = rctl_rlimit_set(hndl, curproc, rlp64, gp, RCTL_LOCAL_DENY, 0, + CRED()); + + mutex_exit(&curproc->p_lock); + + rctl_prealloc_destroy(gp); + + return (err); +} + +static int +lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int err; + int resource; + rctl_alloc_gp_t *gp; + struct rlimit64 rl64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max; + break; + + case LX_RLIMIT_NICE: + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTTIME: + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max; + 
break; + + case LX_RLIMIT_RSS: + /* + * zone.max-physical-memory + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_NPROC: + /* + * zone.max-lwps + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_MEMLOCK: + /* + * zone.max-locked-memory + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_SIGPENDING: + /* + * On Ubuntu at least, the login and sshd processes expect to + * set this limit to 16k and login will fail if this fails. On + * Illumos we have a system limit of 8k and normally the + * privileged limit is 512. We simply pretend this works to + * allow login to work. + */ + if (rlim_max > 8192) + return (0); + + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64)) + != 0) + return (set_errno(err)); + break; + + case LX_RLIMIT_MSGQUEUE: + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0) + return (set_errno(err)); + break; + + default: + resource = l_to_r[lx_resource]; + + /* + * Linux limits the max number of open files to 1m and there is + * a test for this. 
+ */ + if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024)) + return (EPERM); + + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + gp = rctl_rlimit_set_prealloc(1); + + mutex_enter(&curproc->p_lock); + err = rctl_rlimit_set(rctlproc_legacy[resource], curproc, + &rl64, gp, rctlproc_flags[resource], + rctlproc_signals[resource], CRED()); + mutex_exit(&curproc->p_lock); + + rctl_prealloc_destroy(gp); + if (err != 0) + return (set_errno(err)); + break; + } + + return (0); +} + +long +lx_setrlimit(int resource, lx_rlimit_t *rlp) +{ + int rv; + lx_rlimit_t rl; + uint64_t rlim_cur, rlim_max; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(rlp, &rl, sizeof (rl)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + lx_rlimit32_t rl32; + + if (copyin(rlp, &rl32, sizeof (rl32)) != 0) + return (set_errno(EFAULT)); + + rl.rlim_cur = rl32.rlim_cur; + rl.rlim_max = rl32.rlim_max; + } +#endif + + if ((rl.rlim_max != LX_RLIM_INFINITY_N && + rl.rlim_cur == LX_RLIM_INFINITY_N) || + rl.rlim_cur > rl.rlim_max) + return (set_errno(EINVAL)); + + if (rl.rlim_cur == LX_RLIM_INFINITY_N) + rlim_cur = LX_RLIM64_INFINITY; + else + rlim_cur = rl.rlim_cur; + + if (rl.rlim_max == LX_RLIM_INFINITY_N) + rlim_max = LX_RLIM64_INFINITY; + else + rlim_max = rl.rlim_max; + + rv = lx_setrlimit_common(resource, rlim_cur, rlim_max); + if (rv != 0) + return (set_errno(rv)); + return (0); +} + +/* + * From the man page: + * The Linux-specific prlimit() system call combines and extends the + * functionality of setrlimit() and getrlimit(). It can be used to both set + * and get the resource limits of an arbitrary process. + * + * If pid is 0, then the call applies to the calling process. 
+ */ +long +lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp) +{ + int rv; + lx_rlimit64_t nrl, orl; + + if (pid != 0) { + /* XXX TBD if needed */ + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "setting prlimit %d for another process\n", resource); + lx_unsupported(buf); + return (ENOTSUP); + } + + if (orlp != NULL) { + /* we first get the current limits */ + rv = lx_getrlimit_common(resource, &orl.rlim_cur, + &orl.rlim_max); + if (rv != 0) + return (set_errno(rv)); + } + + if (nrlp != NULL) { + if (copyin(nrlp, &nrl, sizeof (nrl)) != 0) + return (set_errno(EFAULT)); + + if ((nrl.rlim_max != LX_RLIM64_INFINITY && + nrl.rlim_cur == LX_RLIM64_INFINITY) || + nrl.rlim_cur > nrl.rlim_max) + return (set_errno(EINVAL)); + + rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max); + if (rv != 0) + return (set_errno(rv)); + } + + if (orlp != NULL) { + /* now return the original limits, if necessary */ + if (copyout(&orl, orlp, sizeof (orl)) != 0) + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c new file mode 100644 index 0000000000..50d532ff51 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c @@ -0,0 +1,949 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/nbmlock.h> +#include <sys/limits.h> + +/* uts/common/syscall/rw.c */ +extern size_t copyout_max_cached; + + +/* Common routines */ + +static int +lx_iovec_copyin(void *uiovp, int iovcnt, iovec_t *kiovp, ssize_t *count) +{ +#ifdef _SYSCALL32_IMPL + /* + * 32-bit callers need to have their iovec expanded, while ensuring + * that they can't move more than 2Gbytes of data in a single call. + */ + if (get_udatamodel() == DATAMODEL_ILP32) { + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len = 0; + ssize32_t total32 = 0; + int i; + + if (iovcnt > IOV_MAX_STACK) { + aiov32len = iovcnt * sizeof (iovec32_t); + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + } + + if (copyin(uiovp, aiov32, iovcnt * sizeof (iovec32_t))) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32 = aiov32[i].iov_len; + total32 += iovlen32; + if (iovlen32 < 0 || total32 < 0) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EINVAL); + } + kiovp[i].iov_len = iovlen32; + kiovp[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT32) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + } + *count = total32; + + if (aiov32len != 0) + kmem_free(aiov32, aiov32len); + } else +#endif + { + ssize_t total = 0; + int i; + + if (copyin(uiovp, kiovp, iovcnt * sizeof (iovec_t))) + return (EFAULT); + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = kiovp[i].iov_len; + total += iovlen; + if (iovlen < 0 || total < 0) { + return (EINVAL); + } + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT) { + return (EFAULT); + } 
+ } + *count = total; + } + return (0); +} + +static int +lx_read_common(file_t *fp, uio_t *uiop, size_t *nread, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 0, ioflag; + ssize_t count = uiop->uio_resid; + size_t rcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, uiop->uio_offset, count, svmand, + NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + /* + * For non-positioned reads, recheck offset/count validity inside + * VOP_WRLOCK to prevent filesize from changing during validation. + */ + if (!positioned) { + u_offset_t uoffset = (u_offset_t)(ulong_t)fp->f_offset; + + if ((vp->v_type == VREG) && (uoffset >= OFFSET_MAX(fp))) { + struct vattr va; + + va.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL); + VOP_RWUNLOCK(vp, rwflag, NULL); + if (error != 0) + goto out; + /* We have to return EOF if fileoff is >= file size. */ + if (uoffset >= va.va_size) + goto out; + /* + * File is greater than or equal to maxoff and + * therefore we return EOVERFLOW. 
+ */ + error = EOVERFLOW; + goto out; + } + if ((vp->v_type == VREG) && + (uoffset + count > OFFSET_MAX(fp))) { + count = (ssize_t)(OFFSET_MAX(fp) - uoffset); + uiop->uio_resid = count; + } + uiop->uio_offset = uoffset; + } + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, uiop, ioflag, fp->f_cred, NULL); + rcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)rcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)rcount; + /* Store offset for non-positioned reads */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = rcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX */ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nread = rcount; + return (error); +} + +static int +lx_write_common(file_t *fp, uio_t *uiop, size_t *nwrite, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 1, ioflag; + ssize_t count = uiop->uio_resid; + size_t wcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset, count, + svmand, NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + if (!positioned) { + /* + * For non-positioned writes, the value of fp->f_offset is + * re-queried while inside VOP_RWLOCK. 
This ensures that other + * writes which alter the filesize will be taken into account. + */ + uiop->uio_loffset = fp->f_offset; + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } else { + /* + * In a senseless departure from POSIX, positioned write calls + * on Linux do _not_ ignore the O_APPEND flag. + */ + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } + if (vp->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)(ulong_t)uiop->uio_loffset; + + if (fileoff >= curproc->p_fsz_ctl) { + VOP_RWUNLOCK(vp, rwflag, NULL); + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + error = EFBIG; + goto out; + } + if (fileoff >= OFFSET_MAX(fp)) { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EFBIG; + goto out; + } + if (fileoff + count > OFFSET_MAX(fp)) { + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + uiop->uio_resid = count; + } + } + + error = VOP_WRITE(vp, uiop, ioflag, fp->f_cred, NULL); + wcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)wcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)wcount; + + /* Store offset for non-positioned writes */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = wcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX */ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nwrite = wcount; + return (error); +} + +/* + * The Linux routines for reading and writing data from file descriptors behave + * differently from their SunOS counterparts in a few key ways: + * + * - Passing an iovcnt of 0 to the vectored functions results in an error on + * SunOS, but on Linux it yields return value of 0. 
+ * + * - If any data is successfully read or written, Linux will return a success. + * This is unlike SunOS which would return an error code for the entire + * operation in cases where vectors had gone unprocessed. + * + * - Breaking from POSIX, Linux positioned writes (pwrite/pwritev) on Linux + * will obey the O_APPEND flag if it is set on the descriptor. + */ + +ssize_t +lx_read(int fdes, void *cbuf, size_t ccount) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + if (fp->f_vnode->v_type == VDIR) { + error = EISDIR; + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return ((ssize_t)nread); +} + +ssize_t +lx_write(int fdes, void *cbuf, size_t ccount) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + 
aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return (nwrite); +} + +ssize_t +lx_readv(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + if (fp->f_vnode->v_type == VDIR) { + error = EISDIR; + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error != 0) { + if (nread != 0) { + error = 0; + } else if (error == EINTR) { + 
ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nread); +} + +ssize_t +lx_writev(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nwrite = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error != 0) { + if (nwrite != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_pread(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return 
(set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write + * a file beyond the maximum offset representable in + * an off_t structure. + */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff); + } else if (fp->f_vnode->v_type == VFIFO) { + error = ESPIPE; + goto out; + } else if (fp->f_vnode->v_type == VDIR) { + error = EISDIR; + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_read_common(fp, &auio, &nread, B_TRUE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error) { + return (set_errno(error)); + } + return ((ssize_t)nread); + +} + +ssize_t +lx_pwrite(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & (FWRITE)) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * return EINVAL for offsets that cannot be + * represented in an off_t. 
+ */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwrite to cause file sizes to exceed maxoffset. + */ + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } else if (fp->f_vnode->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_write_common(fp, &auio, &nwrite, B_TRUE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_pread32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pread(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_pwrite32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pwrite(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_preadv(int fdes, void *iovp, int iovcnt, off64_t offset) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt 
> IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write a file beyond + * the maximum offset representable in an off_t structure. + */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff); + } else if (fp->f_vnode->v_type == VDIR) { + error = EISDIR; + goto out; + } else if (fp->f_vnode->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_TRUE); + + if (error != 0) { + if (nread != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nread); +} + +ssize_t +lx_pwritev(int fdes, void *iovp, int iovcnt, off64_t offset) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nwrite = 0; 
+ int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pwritev. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit: run the RLIMIT_FSIZE rctl action under p_lock + * and fail the write with EFBIG. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwritev to cause file sizes to exceed maxoffset. + */ + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + /* + * Limit offset such that we don't read or write a file beyond + * the maximum offset representable in an off_t structure. 
+ */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } else if (fp->f_vnode->v_type == VFIFO) { + error = ESPIPE; + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_TRUE); + + if (error != 0) { + if (nwrite != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_preadv32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, uint32_t off_hi) +{ + return (lx_preadv(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_pwritev32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pwritev(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi))); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c new file mode 100644 index 0000000000..0def559e29 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/processor.h> +#include <sys/brand.h> +#include <sys/lx_sched.h> +#include <sys/lx_brand.h> + +extern int yield(); +extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); + +long +lx_sched_yield(void) +{ + yield(); + + return (0); +} + +int +lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp, + int64_t *rval) +{ + pid_t s_pid; + id_t s_tid; + kthread_t *t = curthread; + lx_lwp_data_t *lx_lwp; + + if (cmd != B_GET_AFFINITY_MASK && cmd != B_SET_AFFINITY_MASK) + return (set_errno(EINVAL)); + + /* + * The caller wants to know how large the mask should be. + */ + if (cmd == B_GET_AFFINITY_MASK && len == 0) { + *rval = sizeof (lx_affmask_t); + return (0); + } + + /* + * Otherwise, ensure they have a large enough mask. + */ + if (cmd == B_GET_AFFINITY_MASK && len < sizeof (lx_affmask_t)) { + *rval = -1; + return (set_errno(EINVAL)); + } + + if (pid == 0) { + s_pid = curproc->p_pid; + s_tid = curthread->t_tid; + } else if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) == -1) { + return (set_errno(ESRCH)); + } + + /* + * For now, we only support manipulating threads in the + * same process. + */ + if (curproc->p_pid != s_pid) + return (set_errno(EPERM)); + + /* + * We must hold the process lock so that the thread list + * doesn't change while we're looking at it. We'll hold + * the lock until we no longer reference the + * corresponding lwp. 
+ */ + + mutex_enter(&curproc->p_lock); + + do { + if (t->t_tid == s_tid) + break; + t = t->t_forw; + } while (t != curthread); + + /* + * If the given PID is in the current thread's process, + * then we _must_ find it in the process's thread list. + */ + ASSERT(t->t_tid == s_tid); + + lx_lwp = t->t_lwp->lwp_brand; + + if (cmd == B_SET_AFFINITY_MASK) { + if (copyin_nowatch((void *)maskp, &lx_lwp->br_affinitymask, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = 0; + } else { + if (copyout_nowatch(&lx_lwp->br_affinitymask, (void *)maskp, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = sizeof (lx_affmask_t); + } + + mutex_exit(&curproc->p_lock); + return (0); +} + +long +lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (policy < 0) { + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; + } else { + return (set_errno(EINVAL)); + } + } + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getscheduler(l_pid_t pid) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int policy; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if 
(lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + policy = set_errno(EINVAL); + + return (policy); +} + +long +lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int policy; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_sched_param local_param; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + tsinfo_t *tsi; + int prio, scale; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + 
(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + bzero(&local_param, sizeof (local_param)); + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + /* + * I don't know if we need to do this, coz it can't be + * changed from zero anyway..... + */ + tsi = (tsinfo_t *)pcinfo.pc_clinfo; + prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + scale = tsi->ts_maxupri; + if (scale == 0) + local_param.lx_sched_prio = 0; + else + local_param.lx_sched_prio = -(prio * 20) / scale; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + local_param.lx_sched_prio = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else { + rv = set_errno(EINVAL); + } + + if (rv == 0) + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (rv); +} + +long +lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival) +{ + klwp_t *lwp = ttolwp(curthread); + struct timespec interval; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0); + bzero(&pcinfo, sizeof (pcinfo)); + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (pcparm.pc_cid == pcinfo.pc_cid && + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) { + interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs; + interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs; + + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + + return (0); + } + + return (set_errno(EINVAL)); +} + +int 
+sched_setprocset(procset_t *procset, l_pid_t pid) +{ + id_t lid, rid; + idtype_t lidtype, ridtype; + + /* + * define the target lwp + */ + if (pid == 0) { + ridtype = P_ALL; + lidtype = P_PID; + rid = 0; + lid = P_MYID; + } else { + if (lx_lpid_to_spair(pid, &pid, &lid) < 0) + return (set_errno(ESRCH)); + if (pid != curproc->p_pid) + return (set_errno(ESRCH)); + rid = 0; + ridtype = P_ALL; + lidtype = P_LWPID; + } + setprocset(procset, POP_AND, lidtype, lid, ridtype, rid); + + return (0); +} + +long +do_priocntlsys(int cmd, procset_t *procset, void *arg) +{ + return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0, + UIO_SYSSPACE)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_socket.c b/usr/src/uts/common/brand/lx/syscall/lx_socket.c new file mode 100644 index 0000000000..e8e9714143 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_socket.c @@ -0,0 +1,3750 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/sockio.h> +#include <sys/thread.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kmem.h> +#include <sys/un.h> +#include <sys/sunddi.h> +#include <sys/cred.h> +#include <sys/ucred.h> +#include <sys/model.h> +#include <sys/brand.h> +#include <sys/vmsystm.h> +#include <sys/limits.h> +#include <sys/fcntl.h> +#include <sys/sysmacros.h> +#include <netpacket/packet.h> +#include <sockcommon.h> +#include <socktpi_impl.h> +#include <netinet/udp.h> +#include <sys/sdt.h> +#include <netinet/tcp.h> +#include <netinet/igmp.h> +#include <netinet/icmp6.h> +#include <lx_errno.h> + +#include <sys/lx_brand.h> +#include <sys/lx_socket.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> + + +typedef struct lx_ucred { + pid_t lxu_pid; + lx_uid_t lxu_uid; + lx_gid_t lxu_gid; +} lx_ucred_t; + +typedef struct lx_socket_aux_data +{ + kmutex_t lxsad_lock; + enum lxsad_status_t { + LXSS_NONE = 0, + LXSS_CONNECTING, + LXSS_CONNECTED + } lxsad_status; + boolean_t lxsad_stream_cred; +} lx_socket_aux_data_t; + +static lx_socket_aux_data_t *lx_sad_acquire(vnode_t *); + +/* VSD key for lx-specific socket information */ +static uint_t lx_socket_vsd = 0; + +/* Convenience enum to enforce translation direction */ +typedef enum lx_xlate_dir { + SUNOS_TO_LX, + LX_TO_SUNOS +} lx_xlate_dir_t; + +/* enum for getpeername/getsockname handling */ +typedef enum lx_getname_type { + LX_GETPEERNAME, + LX_GETSOCKNAME +} lx_getname_type_t; + +/* + * What follows are a series of tables we use to translate Linux constants + * into equivalent Illumos constants and back again. I wish this were + * cleaner, more programmatic, and generally nicer. Sadly, life is messy, + * and Unix networking even more so. 
+ */ +static const int ltos_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* LX_AF_UNSPEC */ + AF_UNIX, /* LX_AF_UNIX */ + AF_INET, /* LX_AF_INET */ + AF_NOTSUPPORTED, /* LX_AF_AX25 */ + AF_NOTSUPPORTED, /* LX_AF_IPX */ + AF_NOTSUPPORTED, /* LX_AF_APPLETALK */ + AF_NOTSUPPORTED, /* LX_AF_NETROM */ + AF_NOTSUPPORTED, /* LX_AF_BRIDGE */ + AF_NOTSUPPORTED, /* LX_AF_ATMPVC */ + AF_NOTSUPPORTED, /* LX_AF_X25 */ + AF_INET6, /* LX_AF_INET6 */ + AF_NOTSUPPORTED, /* LX_AF_ROSE */ + AF_NOTSUPPORTED, /* LX_AF_DECNET */ + AF_NOTSUPPORTED, /* LX_AF_NETBEUI */ + AF_NOTSUPPORTED, /* LX_AF_SECURITY */ + AF_NOTSUPPORTED, /* LX_AF_KEY */ + AF_LX_NETLINK, /* LX_AF_NETLINK */ + AF_PACKET, /* LX_AF_PACKET */ + AF_NOTSUPPORTED, /* LX_AF_ASH */ + AF_NOTSUPPORTED, /* LX_AF_ECONET */ + AF_NOTSUPPORTED, /* LX_AF_ATMSVC */ + AF_NOTSUPPORTED, /* LX_AF_RDS */ + AF_NOTSUPPORTED, /* LX_AF_SNA */ + AF_NOTSUPPORTED, /* LX_AF_IRDA */ + AF_NOTSUPPORTED, /* LX_AF_PPOX */ + AF_NOTSUPPORTED, /* LX_AF_WANPIPE */ + AF_NOTSUPPORTED, /* LX_AF_LLC */ + AF_NOTSUPPORTED, /* EMPTY */ + AF_NOTSUPPORTED, /* EMPTY */ + AF_NOTSUPPORTED, /* LX_AF_CAN */ + AF_NOTSUPPORTED, /* LX_AF_TIPC */ + AF_NOTSUPPORTED, /* LX_AF_BLUETOOTH */ + AF_NOTSUPPORTED, /* LX_AF_IUCV */ + AF_NOTSUPPORTED /* LX_AF_RXRPC */ + /* LX_AF_ISDN */ + /* LX_AF_PHONET */ + /* LX_AF_IEEE802154 */ + /* LX_AF_CAIF */ + /* LX_AF_ALG */ + /* LX_AF_NFC */ + /* LX_AF_VSOCK */ +}; + +static const int stol_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* AF_UNSPEC */ + AF_UNIX, /* AF_UNIX */ + AF_INET, /* AF_INET */ + AF_NOTSUPPORTED, /* AF_IMPLINK */ + AF_NOTSUPPORTED, /* AF_PUP */ + AF_NOTSUPPORTED, /* AF_CHAOS */ + AF_NOTSUPPORTED, /* AF_NS */ + AF_NOTSUPPORTED, /* AF_NBS */ + AF_NOTSUPPORTED, /* AF_ECMA */ + AF_NOTSUPPORTED, /* AF_DATAKIT */ + AF_NOTSUPPORTED, /* AF_CCITT */ + AF_NOTSUPPORTED, /* AF_SNA */ + AF_NOTSUPPORTED, /* AF_DECNET */ + AF_NOTSUPPORTED, /* AF_DLI */ + AF_NOTSUPPORTED, /* AF_LAT */ + AF_NOTSUPPORTED, /* AF_HYLINK */ + AF_NOTSUPPORTED, /* 
AF_APPLETALK */ + AF_NOTSUPPORTED, /* AF_NIT */ + AF_NOTSUPPORTED, /* AF_802 */ + AF_NOTSUPPORTED, /* AF_OSI */ + AF_NOTSUPPORTED, /* AF_X25 */ + AF_NOTSUPPORTED, /* AF_OSINET */ + AF_NOTSUPPORTED, /* AF_GOSIP */ + AF_NOTSUPPORTED, /* AF_IPX */ + AF_NOTSUPPORTED, /* AF_ROUTE */ + AF_NOTSUPPORTED, /* AF_LINK */ + LX_AF_INET6, /* AF_INET6 */ + AF_NOTSUPPORTED, /* AF_KEY */ + AF_NOTSUPPORTED, /* AF_NCA */ + AF_NOTSUPPORTED, /* AF_POLICY */ + AF_NOTSUPPORTED, /* AF_INET_OFFLOAD */ + AF_NOTSUPPORTED, /* AF_TRILL */ + LX_AF_PACKET, /* AF_PACKET */ + LX_AF_NETLINK /* AF_LX_NETLINK */ +}; + +#define LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL) +#define STOL_FAMILY(d) ((d) <= LX_AF_MAX ? stol_family[(d)] : AF_INVAL) + + +static const int ltos_socktype[LX_SOCK_PACKET + 1] = { + SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, + SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, + SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED +}; + +static const int stol_socktype[SOCK_SEQPACKET + 1] = { + SOCK_NOTSUPPORTED, LX_SOCK_DGRAM, LX_SOCK_STREAM, SOCK_NOTSUPPORTED, + LX_SOCK_RAW, LX_SOCK_RDM, LX_SOCK_SEQPACKET +}; + +/* + * Socket type translation. LTOS_SOCKTYPE takes a Linux socket type and + * indexes ltos_socktype; STOL_SOCKTYPE takes an illumos socket type and + * indexes stol_socktype. Values past the end of the respective table + * yield SOCK_INVAL. + */ +#define LTOS_SOCKTYPE(t) \ + ((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL) +#define STOL_SOCKTYPE(t) \ + ((t) <= SOCK_SEQPACKET ? stol_socktype[(t)] : SOCK_INVAL) + + +/* + * This string is used to prefix all abstract namespace Unix sockets, ie all + * abstract namespace sockets are converted to regular sockets in the /tmp + * directory with .ABSK_ prefixed to their names. 
+ */ +#define ABST_PRFX "/tmp/.ABSK_" +#define ABST_PRFX_LEN (sizeof (ABST_PRFX) - 1) + +#define DATAFILT "datafilt" + +typedef enum { + lxa_none, + lxa_abstract, + lxa_devlog +} lx_addr_type_t; + +static int +ltos_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + + +typedef struct lx_flag_map { + enum { + LXFM_MAP, + LXFM_IGNORE, + LXFM_UNSUP + } lxfm_action; + int lxfm_sunos_flag; + int lxfm_linux_flag; + char *lxfm_name; +} lx_flag_map_t; + +static lx_flag_map_t lx_flag_map_tbl[] = { + { LXFM_MAP, MSG_OOB, LX_MSG_OOB, NULL }, + { LXFM_MAP, MSG_PEEK, LX_MSG_PEEK, NULL }, + { LXFM_MAP, MSG_DONTROUTE, LX_MSG_DONTROUTE, NULL }, + { LXFM_MAP, MSG_CTRUNC, LX_MSG_CTRUNC, NULL }, + { LXFM_MAP, MSG_TRUNC, LX_MSG_TRUNC, NULL }, + { LXFM_MAP, MSG_DONTWAIT, LX_MSG_DONTWAIT, NULL }, + { LXFM_MAP, MSG_EOR, LX_MSG_EOR, NULL }, + { LXFM_MAP, MSG_WAITALL, LX_MSG_WAITALL, NULL }, + /* MSG_CONFIRM is safe to ignore */ + { LXFM_IGNORE, 0, LX_MSG_CONFIRM, NULL }, + /* + * The NOSIGNAL and CMSG_CLOEXEC flags are handled by the emulation + * outside of the flag-conversion routine. 
+ */ + { LXFM_IGNORE, 0, LX_MSG_NOSIGNAL, NULL }, + { LXFM_IGNORE, 0, LX_MSG_CMSG_CLOEXEC, NULL }, + /* + * Linux-only flags with no illumos counterpart. The Linux value belongs + * in the lxfm_linux_flag column so that it is matched when translating + * in the LX_TO_SUNOS direction. + */ + { LXFM_UNSUP, 0, LX_MSG_PROXY, "MSG_PROXY" }, + { LXFM_UNSUP, 0, LX_MSG_FIN, "MSG_FIN" }, + { LXFM_UNSUP, 0, LX_MSG_SYN, "MSG_SYN" }, + { LXFM_UNSUP, 0, LX_MSG_RST, "MSG_RST" }, + { LXFM_UNSUP, 0, LX_MSG_ERRQUEUE, "MSG_ERRQUEUE" }, + { LXFM_UNSUP, 0, LX_MSG_MORE, "MSG_MORE" }, + { LXFM_UNSUP, 0, LX_MSG_WAITFORONE, "MSG_WAITFORONE" }, + { LXFM_UNSUP, 0, LX_MSG_FASTOPEN, "MSG_FASTOPEN" }, +}; + +#define LX_FLAG_MAP_MAX \ + (sizeof (lx_flag_map_tbl) / sizeof (lx_flag_map_tbl[0])) + +#define LX_UNSUP_BUFSZ 64 + +/* + * Translate a set of message flags between illumos and Linux values in the + * direction given by dir, and return the translated set. Table entries + * marked LXFM_MAP are converted, LXFM_IGNORE entries are dropped silently, + * and LXFM_UNSUP entries are reported by name through lx_unsupported() and + * left out of the result. Any bits that match no table entry are reported + * together in a final lx_unsupported() call. + */ +static int +lx_xlate_sock_flags(int inflags, lx_xlate_dir_t dir) +{ + int i, outflags = 0; + char buf[LX_UNSUP_BUFSZ]; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_FLAG_MAP_MAX; i++) { + lx_flag_map_t *map = &lx_flag_map_tbl[i]; + int match, out; + + if (dir == SUNOS_TO_LX) { + match = inflags & map->lxfm_sunos_flag; + out = map->lxfm_linux_flag; + } else { + match = inflags & map->lxfm_linux_flag; + out = map->lxfm_sunos_flag; + } + switch (map->lxfm_action) { + case LXFM_MAP: + if (match != 0) { + inflags &= ~(match); + outflags |= out; + } + break; + case LXFM_IGNORE: + if (match != 0) { + inflags &= ~(match); + } + break; + case LXFM_UNSUP: + if (match != 0) { + /* + * Clear the bit so the flag is reported once, + * by name, and not again in the catch-all + * report below. + */ + inflags &= ~(match); + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "unsupported sock flag %s", map->lxfm_name); + lx_unsupported(buf); + } + break; + } + } + /* Whatever is left matched no table entry at all. */ + if (inflags != 0) { + (void) snprintf(buf, LX_UNSUP_BUFSZ, "unsupported sock flags 0x%08x", + inflags); + lx_unsupported(buf); + } + + return (outflags); +} + +typedef enum lx_sun_type { + LX_SUN_NORMAL, + LX_SUN_ABSTRACT, +} lx_sun_type_t; + +static void +ltos_sockaddr_ux(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + struct sockaddr_un buf; + /* Calculate size of (sun_family + any padding) in sockaddr */ + int sizediff = (sizeof (buf) - sizeof (buf.sun_path)); + int len = inlen 
- sizediff; + + VERIFY(len > 0); + VERIFY(len <= sizeof (buf.sun_path)); + bzero(&buf, sizeof (buf)); + + if (inaddr->sa_data[0] == '\0') { + /* + * Linux supports abstract Unix sockets, which are simply + * sockets that do not exist on the file system. These sockets + * are denoted by beginning the path with a NULL character. To + * support these, we strip out the leading NULL character and + * change the path to point to a real place in /tmp directory, + * by prepending ABST_PRFX and replacing all illegal characters + * with * '_'. + * + * Since these sockets are supposed to exist outside the + * filesystem, they must be cleaned up after use. This removal + * is performed during bind(). + */ + int idx, odx; + + /* Add our abstract prefix */ + (void) strcpy(buf.sun_path, ABST_PRFX); + for (idx = 1, odx = ABST_PRFX_LEN; + idx < len && odx < sizeof (buf.sun_path); + idx++, odx++) { + char c = inaddr->sa_data[idx]; + if (c == '\0' || c == '/') { + buf.sun_path[odx] = '_'; + } else { + buf.sun_path[odx] = c; + } + } + + /* + * Since abstract socket addresses might not be NUL terminated, + * we must explicitly NUL terminate the translated path. + * Care is taken not to overflow the buffer. 
+ */ + if (odx == sizeof (buf.sun_path)) { + buf.sun_path[odx - 1] = '\0'; + } else { + buf.sun_path[odx] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_ABSTRACT; + } + } else { + /* Copy the address directly, minding termination */ + (void) strncpy(buf.sun_path, inaddr->sa_data, len); + len = strnlen(buf.sun_path, len); + if (len == sizeof (buf.sun_path)) { + buf.sun_path[len - 1] = '\0'; + } else { + VERIFY(len < sizeof (buf.sun_path)); + buf.sun_path[len] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_NORMAL; + } + } + buf.sun_family = AF_UNIX; + *outlen = strlen(buf.sun_path) + 1 + sizediff; + VERIFY(*outlen <= sizeof (struct sockaddr_un)); + + *outaddr = kmem_alloc(*outlen, KM_SLEEP); + bcopy(&buf, *outaddr, *outlen); +} + +/* + * Copy in a Linux-native socket address from userspace and convert it into + * illumos format. When successful, it will allocate an appropriately sized + * struct to be freed by the caller. + */ +static long +ltos_sockaddr_copyin(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + sa_family_t family; + struct sockaddr *laddr; + struct sockaddr_ll *sal; + int proto, error = 0; + + VERIFY(inaddr != NULL); + + if (inlen < sizeof (sa_family_t) || + inlen > sizeof (struct sockaddr_storage)) { + return (EINVAL); + } + laddr = kmem_alloc(inlen, KM_SLEEP); + if (copyin(inaddr, laddr, inlen) != 0) { + kmem_free(laddr, inlen); + return (EFAULT); + } + + family = LTOS_FAMILY(laddr->sa_family); + switch (family) { + case (sa_family_t)AF_NOTSUPPORTED: + error = EPROTONOSUPPORT; + break; + + case (sa_family_t)AF_INVAL: + error = EAFNOSUPPORT; + break; + + case AF_UNIX: + if (inlen < sizeof (sa_family_t) + 2 || + inlen > sizeof (struct sockaddr_un)) { + error = EINVAL; + break; + } + ltos_sockaddr_ux(laddr, inlen, outaddr, outlen, + sun_type); + + /* AF_UNIX bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + case 
AF_PACKET: + if (inlen < sizeof (struct sockaddr_ll)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_ll); + + /* sll_protocol must be translated */ + sal = (struct sockaddr_ll *)laddr; + proto = ltos_pkt_proto(sal->sll_protocol); + if (proto < 0) { + error = EINVAL; + } + sal->sll_protocol = proto; + break; + + case AF_INET: + if (inlen < sizeof (struct sockaddr)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr); + break; + + case AF_INET6: + /* + * The illumos sockaddr_in6 has one more 32-bit field + * than the Linux version. We simply zero that field + * via kmem_zalloc. + */ + if (inlen < sizeof (lx_sockaddr_in6_t)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_in6); + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, + KM_SLEEP); + bcopy(laddr, *outaddr, sizeof (lx_sockaddr_in6_t)); + (*outaddr)->sa_family = AF_INET6; + /* AF_INET6 bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + default: + *outlen = inlen; + } + + if (error == 0) { + /* + * For most address families, just copying into a sockaddr of + * the correct size and updating sa_family is adequate. + */ + VERIFY(inlen >= *outlen); + + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, KM_SLEEP); + bcopy(laddr, *outaddr, *outlen); + (*outaddr)->sa_family = family; + } + kmem_free(laddr, inlen); + return (error); +} + +/* + * Convert an illumos-native socket address into Linux format and copy it out + * to userspace. + */ +static long +stol_sockaddr_copyout(struct sockaddr *inaddr, socklen_t inlen, + struct sockaddr *outaddr, void *outlenp, socklen_t orig) +{ + socklen_t size = inlen; + struct sockaddr_storage buf; + struct sockaddr *bufaddr; + + /* + * Either we were passed a valid sockaddr (with length) or the length + * is set to 0. 
+ */ + VERIFY(inaddr != NULL || inlen == 0); + + if (inlen == 0) { + goto finish; + } + + + switch (inaddr->sa_family) { + case AF_INET: + if (inlen != sizeof (struct sockaddr)) { + return (EINVAL); + } + break; + + case AF_INET6: + if (inlen != sizeof (struct sockaddr_in6)) { + return (EINVAL); + } + /* + * The linux sockaddr_in6 is shorter than illumos. + * Truncate the extra field on the way out. + */ + size = (sizeof (lx_sockaddr_in6_t)); + inlen = (sizeof (lx_sockaddr_in6_t)); + break; + + case AF_UNIX: + if (inlen > sizeof (struct sockaddr_un)) { + return (EINVAL); + } + + /* + * On Linux an empty AF_UNIX address is returned as NULL, which + * means setting the returned length to only encompass the + * address family part of the buffer. However, some code also + * references the address portion of the buffer and uses it, + * even though the returned length has been shortened. Thus, we + * clear the buffer to ensure that the address portion is NULL. + */ + if (inaddr->sa_data[0] == '\0') { + bzero(&buf, sizeof (buf)); + inlen = sizeof (inaddr->sa_family); + } + break; + + case (sa_family_t)AF_NOTSUPPORTED: + return (EPROTONOSUPPORT); + + case (sa_family_t)AF_INVAL: + return (EAFNOSUPPORT); + + default: + break; + } + + /* + * The input should be smaller than sockaddr_storage, the largest + * sockaddr we support. + */ + VERIFY(inlen <= sizeof (buf)); + + bufaddr = (struct sockaddr *)&buf; + bcopy(inaddr, bufaddr, inlen); + bufaddr->sa_family = STOL_FAMILY(bufaddr->sa_family); + + /* + * It is possible that userspace passed us a smaller buffer than we + * hope to output. When this is the case, we will truncate our output + * to the max size of their buffer but report the true size of the + * sockaddr when outputting the outlen value. + */ + size = (orig < size) ? 
orig : size; + + if (copyout(bufaddr, outaddr, size) != 0) { + return (EFAULT); + } + +finish: +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)inlen; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&inlen, outlenp, sizeof (inlen)) != 0) { + return (EFAULT); + } + } + + return (0); +} + +typedef struct lx_cmsg_xlate { + int lcx_sunos_level; + int lcx_sunos_type; + int (*lcx_stol_conv)(struct cmsghdr *, struct cmsghdr *); + int lcx_linux_level; + int lcx_linux_type; + int (*lcx_ltos_conv)(struct cmsghdr *, struct cmsghdr *); +} lx_cmsg_xlate_t; + +static int cmsg_conv_generic(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int ltos_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_recvttl(struct cmsghdr *, struct cmsghdr *); + +/* + * Table describing SunOS <-> Linux cmsg translation mappings. + * Certain types (IP_RECVTTL) are only converted in one direction and are + * indicated by one of the translation functions being set to NULL. 
+ */ +static lx_cmsg_xlate_t lx_cmsg_xlate_tbl[] = { + { SOL_SOCKET, SCM_RIGHTS, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_RIGHTS, cmsg_conv_generic }, + { SOL_SOCKET, SCM_UCRED, stol_conv_ucred, + LX_SOL_SOCKET, LX_SCM_CRED, ltos_conv_ucred }, + { SOL_SOCKET, SCM_TIMESTAMP, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_TIMESTAMP, cmsg_conv_generic }, + { IPPROTO_IP, IP_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_PKTINFO, cmsg_conv_generic }, + { IPPROTO_IP, IP_RECVTTL, stol_conv_recvttl, + LX_IPPROTO_IP, LX_IP_TTL, NULL }, + { IPPROTO_IP, IP_TTL, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_TTL, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_HOPLIMIT, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_HOPLIMIT, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_PKTINFO, cmsg_conv_generic } +}; + +#define LX_MAX_CMSG_XLATE \ + (sizeof (lx_cmsg_xlate_tbl) / sizeof (lx_cmsg_xlate_tbl[0])) + +#if defined(_LP64) + +typedef struct { + int64_t cmsg_len; + int32_t cmsg_level; + int32_t cmsg_type; +} lx_cmsghdr64_t; + +/* The alignment/padding for 64bit Linux cmsghdr is not the same. */ +#define LX_CMSG64_ALIGNMENT 8 +#define ISALIGNED_LX_CMSG64(addr) \ + (((uintptr_t)(addr) & (LX_CMSG64_ALIGNMENT - 1)) == 0) +#define ROUNDUP_LX_CMSG64_LEN(len) \ + (((len) + LX_CMSG64_ALIGNMENT - 1) & ~(LX_CMSG64_ALIGNMENT - 1)) + +#define LX_CMSG64_IS_ALIGNED(m) \ + (((uintptr_t)(m) & (_CMSG_DATA_ALIGNMENT - 1)) == 0) +#define LX_CMSG64_DATA(c) ((unsigned char *)(((lx_cmsghdr64_t *)(c)) + 1)) +/* + * LX_CMSG64_VALID is closely derived from CMSG_VALID with one particularly + * important addition. Since cmsg_len is 64bit, (cmsg + cmsg_len) is checked + * against the start address as well. This prevents bogus inputs from wrapping + * around the address space. 
+ */ +#define LX_CMSG64_VALID(cmsg, start, end) \ + (ISALIGNED_LX_CMSG64(cmsg) && \ + ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ + ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ + ((cmsg)->cmsg_len >= sizeof (lx_cmsghdr64_t)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len >= (uintptr_t)(start))) +#define LX_CMSG64_NEXT(cmsg) \ + (lx_cmsghdr64_t *)((uintptr_t)(cmsg) + \ + ROUNDUP_LX_CMSG64_LEN((cmsg)->cmsg_len)) +#define LX_CMSG64_DIFF sizeof (uint32_t) + +#endif /* defined(_LP64) */ + +/* + * convert ucred_s to lx_ucred. + */ +static int +stol_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * Format the data correctly in the omsg buffer. + */ + if (omsg != NULL) { + struct ucred_s *scred = (struct ucred_s *)CMSG_CONTENT(inmsg); + prcred_t *cr; + lx_ucred_t lcred; + + lcred.lxu_pid = scred->uc_pid; + cr = UCCRED(scred); + if (cr != NULL) { + lcred.lxu_uid = cr->pr_euid; + lcred.lxu_gid = cr->pr_egid; + } else { + lcred.lxu_uid = lcred.lxu_gid = 0; + } + + bcopy(&lcred, CMSG_CONTENT(omsg), sizeof (lx_ucred_t)); + } + + return (sizeof (struct cmsghdr) + sizeof (lx_ucred_t)); +} + +static int +ltos_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + struct ucred_s *uc; + prcred_t *pc; + lx_ucred_t *lcred; + + uc = (struct ucred_s *)CMSG_CONTENT(omsg); + pc = (prcred_t *)((char *)uc + sizeof (struct ucred_s)); + + uc->uc_credoff = sizeof (struct ucred_s); + + lcred = (lx_ucred_t *)CMSG_CONTENT(inmsg); + + uc->uc_pid = lcred->lxu_pid; + pc->pr_euid = lcred->lxu_uid; + pc->pr_egid = lcred->lxu_gid; + } + + return (sizeof (struct cmsghdr) + sizeof (struct ucred_s) + + sizeof (prcred_t)); + +} + +static int +stol_conv_recvttl(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * SunOS communicates the TTL of incoming packets via IP_RECVTTL using + * a uint8_t value instead of IP_TTL using an int. 
This conversion is + * only needed in the one direction since Linux does not handle + * IP_RECVTTL in the sendmsg path. + */ + if (omsg != NULL) { + uint8_t *inttl = (uint8_t *)CMSG_CONTENT(inmsg); + int *ottl = (int *)CMSG_CONTENT(omsg); + + *ottl = (int)*inttl; + } + + return (sizeof (struct cmsghdr) + sizeof (int)); +} + +static int +cmsg_conv_generic(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + size_t data_len; + + data_len = inmsg->cmsg_len - sizeof (struct cmsghdr); + bcopy(CMSG_CONTENT(inmsg), CMSG_CONTENT(omsg), data_len); + } + + return (inmsg->cmsg_len); +} + +static int +lx_xlate_cmsg(struct cmsghdr *inmsg, struct cmsghdr *omsg, lx_xlate_dir_t dir) +{ + int i; + int len; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_MAX_CMSG_XLATE; i++) { + lx_cmsg_xlate_t *xlate = &lx_cmsg_xlate_tbl[i]; + if (dir == LX_TO_SUNOS && + inmsg->cmsg_level == xlate->lcx_linux_level && + inmsg->cmsg_type == xlate->lcx_linux_type && + xlate->lcx_ltos_conv != NULL) { + len = xlate->lcx_ltos_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_sunos_level; + omsg->cmsg_type = xlate->lcx_sunos_type; + } + return (len); + } else if (dir == SUNOS_TO_LX && + inmsg->cmsg_level == xlate->lcx_sunos_level && + inmsg->cmsg_type == xlate->lcx_sunos_type && + xlate->lcx_stol_conv != NULL) { + len = xlate->lcx_stol_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_linux_level; + omsg->cmsg_type = xlate->lcx_linux_type; + } + return (len); + } + } + /* + * The Linux man page for sendmsg does not define a specific error for + * unsupported cmsgs. While it is meant to indicated bad values for + * passed flags, EOPNOTSUPP appears to be the next closest choice. 
+ */ + return (-EOPNOTSUPP); +} + +static long +ltos_cmsgs_copyin(void *addr, socklen_t inlen, void **outmsg, + socklen_t *outlenp) +{ + void *inbuf, *obuf; + struct cmsghdr *inmsg, *omsg; + int slen = 0; + + if (inlen < sizeof (struct cmsghdr) || inlen > SO_MAXARGSIZE) { + return (EINVAL); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE && + inlen < sizeof (lx_cmsghdr64_t)) { + /* The size requirements are more strict for 64bit. */ + return (EINVAL); + } +#endif /* defined(_LP64) */ + + inbuf = kmem_alloc(inlen, KM_SLEEP); + if (copyin(addr, inbuf, inlen) != 0) { + kmem_free(inbuf, inlen); + return (EFAULT); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Linux cmsg headers are longer than illumos under x86_64. + * Convert to regular cmsgs first. + */ + lx_cmsghdr64_t *lmsg; + struct cmsghdr *smsg; + void *newbuf; + int len = 0; + + /* Inventory the new cmsg size */ + for (lmsg = (lx_cmsghdr64_t *)inbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg)) { + len += ROUNDUP_cmsglen(lmsg->cmsg_len - LX_CMSG64_DIFF); + } + + VERIFY(len < inlen); + if (len == 0) { + /* Input was bogus, so we can give up early. 
*/ + kmem_free(inbuf, inlen); + *outmsg = NULL; + *outlenp = 0; + return (EINVAL); + } + + newbuf = kmem_alloc(len, KM_SLEEP); + + for (lmsg = (lx_cmsghdr64_t *)inbuf, + smsg = (struct cmsghdr *)newbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg), smsg = CMSG_NEXT(smsg)) { + smsg->cmsg_level = lmsg->cmsg_level; + smsg->cmsg_type = lmsg->cmsg_type; + smsg->cmsg_len = lmsg->cmsg_len - LX_CMSG64_DIFF; + + /* The above length measurement should ensure this */ + ASSERT(CMSG_VALID(smsg, newbuf, + (uintptr_t)newbuf + len)); + + bcopy(LX_CMSG64_DATA(lmsg), CMSG_CONTENT(smsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(inbuf, inlen); + inbuf = newbuf; + inlen = len; + } +#endif /* defined(_LP64) */ + + /* + * Now determine how much space we need for the conversion. + */ + for (inmsg = (struct cmsghdr *)inbuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, LX_TO_SUNOS)) < 0) { + /* unsupported msg */ + kmem_free(inbuf, inlen); + return (-sz); + } + + slen += ROUNDUP_cmsglen(sz); + } + + obuf = kmem_zalloc(slen, KM_SLEEP); + + /* + * Now do the conversion. 
+ */ + for (inmsg = (struct cmsghdr *)inbuf, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, LX_TO_SUNOS) >= 0); + } + + kmem_free(inbuf, inlen); + *outmsg = obuf; + *outlenp = slen; + return (0); +} + +static long +stol_cmsgs_copyout(void *input, socklen_t inlen, void *addr, + void *outlenp, socklen_t orig_outlen) +{ + void *obuf; + struct cmsghdr *inmsg, *omsg; + int error = 0; + socklen_t lx_len = 0; +#if defined(_LP64) + model_t model = get_udatamodel(); +#endif + + if (inlen == 0) { + /* Simply output the zero controllen */ + goto finish; + } + + VERIFY(inlen > sizeof (struct cmsghdr)); + + /* + * First determine how much space we need for the conversion and + * make sure the caller has provided at least that much space to return + * results. + */ + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, SUNOS_TO_LX)) < 0) { + /* unsupported msg */ + return (-sz); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* + * The converted 64-bit cmsgs require an additional 4 + * bytes of header space and must be aligned to 8 bytes + * (instead of the typical 4 for x86) + */ + sz = ROUNDUP_LX_CMSG64_LEN(sz + LX_CMSG64_DIFF); + } else +#endif /* defined(_LP64) */ + { + /* + * The converted 32-bit cmsgs do not require additional + * header space or padding for Linux conversion. + */ + sz = ROUNDUP_cmsglen(sz); + } + + /* + * Unlike SunOS, Linux requires that the last cmsg be + * adequately padded for alignment. 
+ */ + lx_len += sz; + } + + if (lx_len > orig_outlen || addr == NULL) { + /* This will be interpreted by the caller */ + error = EMSGSIZE; + lx_len = 0; + goto finish; + } + + /* + * Since cmsgs are often padded to an aligned size, kmem_zalloc is + * necessary to prevent leaking the contents of uninitialized memory. + */ + obuf = kmem_zalloc(lx_len, KM_SLEEP); + + /* + * Convert the msgs. + */ + for (inmsg = (struct cmsghdr *)input, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, SUNOS_TO_LX) >= 0); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* Linux cmsg headers are longer than illumos under x86_64. */ + struct cmsghdr *smsg; + lx_cmsghdr64_t *lmsg; + void *newbuf; + + /* + * Once again, kmem_zalloc is needed to avoid leaking the + * contents of uninialized memory + */ + newbuf = kmem_zalloc(lx_len, KM_SLEEP); + for (smsg = (struct cmsghdr *)obuf, + lmsg = (lx_cmsghdr64_t *)newbuf; + CMSG_VALID(smsg, obuf, (uintptr_t)obuf + inlen) != 0; + smsg = CMSG_NEXT(smsg), lmsg = LX_CMSG64_NEXT(lmsg)) { + lmsg->cmsg_level = smsg->cmsg_level; + lmsg->cmsg_type = smsg->cmsg_type; + lmsg->cmsg_len = smsg->cmsg_len + LX_CMSG64_DIFF; + + ASSERT(LX_CMSG64_VALID(lmsg, newbuf, + (uintptr_t)newbuf + lx_len) != 0); + + bcopy(CMSG_CONTENT(smsg), LX_CMSG64_DATA(lmsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(obuf, lx_len); + obuf = newbuf; + } +#endif /* defined(_LP64) */ + + if (copyout(obuf, addr, lx_len) != 0) { + kmem_free(obuf, lx_len); + return (EFAULT); + } + kmem_free(obuf, lx_len); + +finish: + if (outlenp != NULL) { +#if defined(_LP64) + if (model != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)lx_len; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&lx_len, outlenp, sizeof (lx_len)) != 0) { + return (EFAULT); + } + } + } + 
return (error); +} + +static void +lx_cmsg_set_cloexec(void *input, socklen_t inlen) +{ + struct cmsghdr *inmsg; + + if (inlen == 0) { + return; + } + + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + if (inmsg->cmsg_level == SOL_SOCKET && + inmsg->cmsg_type == SCM_RIGHTS) { + int *fds = (int *)CMSG_CONTENT(inmsg); + int i, num = (int)CMSG_CONTENTLEN(inmsg) / sizeof (int); + for (i = 0; i < num; i++) { + char flags; + file_t *fp; + /* set CLOEXEC on the fd */ + fp = getf(fds[i]); + VERIFY(fp != NULL); + flags = f_getfd(fds[i]); + flags |= FD_CLOEXEC; + f_setfd(fds[i], flags); + releasef(fds[i]); + } + } + } +} + +static int +lx_cmsg_try_ucred(sonode_t *so, struct nmsghdr *msg, socklen_t origlen) +{ + lx_socket_aux_data_t *sad; + struct cmsghdr *cmsg = NULL; + int msgsize; + cred_t *cred; + + if (origlen == 0) { + return (0); + } + sad = lx_sad_acquire(SOTOV(so)); + if (!sad->lxsad_stream_cred) { + mutex_exit(&sad->lxsad_lock); + return (0); + } + mutex_exit(&sad->lxsad_lock); + + mutex_enter(&so->so_lock); + if (so->so_peercred == NULL) { + mutex_exit(&so->so_lock); + return (0); + } + crhold(cred = so->so_peercred); + mutex_exit(&so->so_lock); + + msgsize = ucredminsize(cred) + sizeof (struct cmsghdr); + if (msg->msg_control == NULL) { + msg->msg_controllen = msgsize; + msg->msg_control = cmsg = kmem_zalloc(msgsize, KM_SLEEP); + } else { + /* + * The so_recvmsg operation may have allocated a msg_control + * buffer which precisely fits all returned cmsgs. We must + * manually verify the length of that cmsg data and reallocate + * the buffer if it lacks the necessary space. 
+ */ + uintptr_t start = (uintptr_t)msg->msg_control; + uintptr_t end = start + msg->msg_controllen; + + ASSERT(msg->msg_controllen > 0); + cmsg = (struct cmsghdr *)msg->msg_control; + while (CMSG_VALID(cmsg, start, end) != 0) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_UCRED) { + /* + * If some later code change results in a ucred + * being attached anyways, there is no need for + * us to do it manually + */ + crfree(cred); + return (0); + } + cmsg = CMSG_NEXT(cmsg); + } + if (((uintptr_t)cmsg + msgsize) > end) { + socklen_t offset = (uintptr_t)cmsg - start; + socklen_t newsize = offset + msgsize; + void *newbuf; + + if (newsize < msg->msg_controllen) { + /* size overflow, bail */ + crfree(cred); + return (-1); + } + newbuf = kmem_alloc(newsize, KM_SLEEP); + bcopy(msg->msg_control, newbuf, msg->msg_controllen); + kmem_free(msg->msg_control, msg->msg_controllen); + + msg->msg_control = newbuf; + msg->msg_controllen = newsize; + cmsg = (struct cmsghdr *)((uintptr_t)newbuf + offset); + } + } + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + cmsg->cmsg_len = msgsize; + (void) cred2ucred(cred, so->so_cpid, CMSG_CONTENT(cmsg), CRED()); + crfree(cred); + return (0); +} + +static lx_socket_aux_data_t * +lx_sad_acquire(vnode_t *vp) +{ + lx_socket_aux_data_t *cur, *created; + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + /* perform our allocation carefully */ + mutex_exit(&vp->v_vsd_lock); + + created = (lx_socket_aux_data_t *)kmem_zalloc( + sizeof (*created), KM_SLEEP); + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + mutex_init(&created->lxsad_lock, NULL, MUTEX_DEFAULT, + NULL); + (void) vsd_set(vp, lx_socket_vsd, created); + cur = created; + } else { + kmem_free(created, sizeof (*created)); + } + } + mutex_exit(&vp->v_vsd_lock); + mutex_enter(&cur->lxsad_lock); + return (cur); +} + +static int 
+lx_convert_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + +static int +lx_convert_sock_args(int in_dom, int in_type, int in_proto, int *out_dom, + int *out_type, int *out_options, int *out_proto) +{ + int domain, type, options; + + if (in_dom < 0 || in_type < 0 || in_proto < 0) + return (EINVAL); + + domain = LTOS_FAMILY(in_dom); + if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC) + return (EAFNOSUPPORT); + if (domain == AF_INVAL) + return (EINVAL); + + type = LTOS_SOCKTYPE(in_type & LX_SOCK_TYPE_MASK); + if (type == SOCK_NOTSUPPORTED) + return (ESOCKTNOSUPPORT); + if (type == SOCK_INVAL) + return (EINVAL); + + /* + * Linux does not allow the app to specify IP Protocol for raw sockets. + * SunOS does, so bail out here. 
+ */ + if (domain == AF_INET && type == SOCK_RAW && in_proto == IPPROTO_IP) + return (ESOCKTNOSUPPORT); + + options = 0; + in_type &= ~(LX_SOCK_TYPE_MASK); + if (in_type & LX_SOCK_NONBLOCK) { + in_type ^= LX_SOCK_NONBLOCK; + options |= SOCK_NONBLOCK; + } + if (in_type & LX_SOCK_CLOEXEC) { + in_type ^= LX_SOCK_CLOEXEC; + options |= SOCK_CLOEXEC; + } + if (in_type != 0) { + return (EINVAL); + } + + /* Protocol definitions for PF_PACKET differ between Linux and SunOS */ + if (domain == PF_PACKET && + (in_proto = lx_convert_pkt_proto(in_proto)) < 0) + return (EINVAL); + + *out_dom = domain; + *out_type = type; + *out_options = options; + *out_proto = in_proto; + return (0); +} + +long +lx_socket(int domain, int type, int protocol) +{ + int fd, error, options; + sonode_t *so; + vnode_t *vp; + struct file *fp; + + if ((error = lx_convert_sock_args(domain, type, protocol, &domain, + &type, &options, &protocol)) != 0) { + return (set_errno(error)); + } + + /* logic cloned from so_socket */ + so = socket_create(domain, type, protocol, NULL, NULL, SOCKET_SLEEP, + SOV_DEFAULT, CRED(), &error); + + if (so == NULL) { + if (error == EPROTOTYPE || error == EPROTONOSUPPORT) { + error = ESOCKTNOSUPPORT; + } + return (set_errno(error)); + } + + /* Allocate a file descriptor for the socket */ + vp = SOTOV(so); + if ((error = falloc(vp, FWRITE|FREAD, &fp, &fd)) != 0) { + (void) socket_close(so, 0, CRED()); + socket_destroy(so); + return (set_errno(error)); + } + + /* + * Linux programs do not tolerate errors appearing from asynchronous + * events (such as ICMP messages arriving). Setting SM_DEFERERR will + * prevent checking/delivery of such errors. 
+ */ + so->so_mode |= SM_DEFERERR; + + /* Now fill in the entries that falloc reserved */ + if (options & SOCK_NONBLOCK) { + so->so_state |= SS_NONBLOCK; + fp->f_flag |= FNONBLOCK; + } + mutex_exit(&fp->f_tlock); + setf(fd, fp); + if ((options & SOCK_CLOEXEC) != 0) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); +} + +long +lx_bind(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + socklen_t len = 0; + file_t *fp; + int error; + lx_sun_type_t sun_type; + boolean_t not_sock = B_FALSE; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, &sun_type); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + if (addr != NULL && addr->sa_family == AF_UNIX) { + vnode_t *vp; + + error = so_ux_lookup(so, (struct sockaddr_un *)addr, B_TRUE, + &vp); + if (error == 0) { + /* A valid socket exists and is open at this address. */ + VN_RELE(vp); + } else { + /* Keep track of paths which are not valid sockets. */ + if (error == ENOTSOCK) { + not_sock = B_TRUE; + } + + /* + * When binding to an abstract namespace address or + * /dev/log, implicit clean-up must occur if there is + * not a valid socket at the specififed address. See + * ltos_sockaddr_copyin for details about why these + * socket types act differently. + */ + if (sun_type == LX_SUN_ABSTRACT) { + (void) vn_removeat(NULL, addr->sa_data, + UIO_SYSSPACE, RMFILE); + } + } + } + + error = socket_bind(so, addr, len, _SOBIND_XPG4_2, CRED()); + + /* + * Linux returns EADDRINUSE for attempts to bind to Unix domain + * sockets that aren't sockets. 
+ */ + if (error == EINVAL && addr != NULL && addr->sa_family == AF_UNIX && + not_sock == B_TRUE) { + error = EADDRINUSE; + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_connect(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + lx_socket_aux_data_t *sad = NULL; + socklen_t len = 0; + file_t *fp; + int error; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + /* + * Ensure the name is sized appropriately before we alloc memory and + * copy it in from userspace. We need at least the address family to + * make later sizing decisions. + */ + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, NULL); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + error = socket_connect(so, addr, len, fp->f_flag, + _SOCONNECT_XPG4_2, CRED()); + + /* + * Linux connect(2) behavior is rather strange when using the + * O_NONBLOCK flag. The first call will return EINPROGRESS, as + * expected. Provided that is successful, a second call to connect + * will return 0 instead of EISCONN. Subsequent connect calls will + * return EISCONN. + */ + if ((fp->f_flag & FNONBLOCK) != 0 && error != 0) { + sad = lx_sad_acquire(SOTOV(so)); + if (error == EISCONN && + sad->lxsad_status == LXSS_CONNECTING) { + /* Report the one success */ + sad->lxsad_status = LXSS_CONNECTED; + error = 0; + } else if (error == EINPROGRESS) { + sad->lxsad_status = LXSS_CONNECTING; + } + mutex_exit(&sad->lxsad_lock); + } + + /* + * When connecting to a UDP socket, configure it so that future + * sendto/sendmsg operations are allowed to specify a destination + * address. See the Posix spec. for sendto(2). Linux allows this while + * illumos would return EISCONN if the option is not set. 
+ */ + if (error == 0 && so->so_protocol == IPPROTO_UDP && + (so->so_family == AF_INET || so->so_family == AF_INET6)) { + int val = 1; + + DTRACE_PROBE(lx__connect__udp); + (void) socket_setsockopt(so, IPPROTO_UDP, UDP_SND_TO_CONNECTED, + &val, sizeof (val), CRED()); + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +/* + * Custom version of socket_recvmsg for error-handling overrides. + */ +static int +lx_socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr) +{ + int error; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache when reading data, as the application + * is likely to access the data shortly. + */ + uiop->uio_extflg |= UIO_COPY_CACHED; + + error = SOP_RECVMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial read */ + if (uiop->uio_resid != orig_resid) + error = 0; + break; + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + } + break; + default: + break; + } + return (error); +} + +static long +lx_recv_common(int sock, struct nmsghdr *msg, xuio_t *xuiop, int flags, + void *namelenp, void *controllenp, void *flagsp) +{ + struct sonode *so; + file_t *fp; + void *name; + socklen_t namelen; + void *control; + socklen_t controllen; + ssize_t len; + int error; + boolean_t fd_cloexec; + boolean_t is_peek_trunc; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + fd_cloexec = ((flags & LX_MSG_CMSG_CLOEXEC) != 0); + flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + is_peek_trunc = (flags & (MSG_PEEK|MSG_TRUNC)) == (MSG_PEEK|MSG_TRUNC); + len = xuiop->xu_uio.uio_resid; + xuiop->xu_uio.uio_fmode 
= fp->f_flag; + xuiop->xu_uio.uio_extflg = UIO_COPY_CACHED; + + /* + * Linux accepts MSG_TRUNC as an input flag, unlike SunOS and many + * other UNIX distributions. When combined with MSG_PEEK, it causes + * recvmsg to return the size of the waiting message, regardless of + * buffer size. This behavior is commonly used with a 0-length buffer + * to interrogate the size of a queued message prior to allocating a + * buffer for it. + * + * In order to support this functionality, a custom XUIO type is used + * to communicate the total message size out from the depths of sockfs. + */ + if (is_peek_trunc) { + xuiop->xu_uio.uio_extflg |= UIO_XUIO; + xuiop->xu_type = UIOTYPE_PEEKSIZE; + xuiop->xu_ext.xu_ps.xu_ps_set = B_FALSE; + xuiop->xu_ext.xu_ps.xu_ps_size = 0; + } + + name = msg->msg_name; + namelen = msg->msg_namelen; + control = msg->msg_control; + controllen = msg->msg_controllen; + + /* + * socket_recvmsg will allocate these if needed. + * NULL them out to prevent any confusion. + */ + msg->msg_name = NULL; + msg->msg_control = NULL; + + msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + + error = lx_socket_recvmsg(so, msg, (struct uio *)xuiop, CRED()); + if (error) { + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGRCV, 1); + releasef(sock); + + if (namelen != 0) { + error = stol_sockaddr_copyout(msg->msg_name, msg->msg_namelen, + name, namelenp, namelen); + + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + msg->msg_namelen = 0; + } + + /* + * Errors during copyout of the name are not a concern to Linux + * callers at this point in the syscall + */ + if (error != 0 && error != EFAULT) { + goto err; + } + } + + if (controllen != 0) { + if (fd_cloexec) { + /* + * If CLOEXEC needs to set on file descriptors passed + * via SCM_RIGHTS, do so before formatting the cmsgs + * for Linux. 
+ */ + lx_cmsg_set_cloexec(msg->msg_control, + msg->msg_controllen); + } + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + /* + * It may be necessary to append a SCM_UCRED cmsg to + * the controls if SO_PASSCRED is set on a + * connection-oriented AF_UNIX socket. + * + * See lx_setsockopt_socket for more details. + */ + if (lx_cmsg_try_ucred(so, msg, controllen) != 0) { + msg->msg_flags |= MSG_CTRUNC; + } + } + + error = stol_cmsgs_copyout(msg->msg_control, + msg->msg_controllen, control, controllenp, controllen); + + if (error != 0) { + /* + * If there was an error during cmsg translation or + * copyout, we need to clean up any FDs that are being + * passed back via SCM_RIGHTS. This prevents us from + * leaking those open files. + */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, + 0); + + /* + * An error during cmsg_copyout means we had + * _something_ to process. + */ + VERIFY(msg->msg_controllen != 0); + + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + + if (error == EMSGSIZE) { + /* Communicate that messages were truncated */ + msg->msg_flags |= MSG_CTRUNC; + error = 0; + } else { + goto err; + } + } else if (msg->msg_controllen != 0) { + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + } + } + + if (flagsp != NULL) { + int flags; + + /* Clear internal flag. */ + flags = msg->msg_flags & ~MSG_XPG4_2; + flags = lx_xlate_sock_flags(flags, SUNOS_TO_LX); + + if (copyout(&flags, flagsp, sizeof (flags) != 0)) { + error = EFAULT; + goto err; + } + } + + /* + * If both MSG_PEEK|MSG_TRUNC were set on the input flags and the + * socket layer was able to calculate the total message size for us, + * return that instead of the copied size. 
+ */ + if (is_peek_trunc && xuiop->xu_ext.xu_ps.xu_ps_set == B_TRUE) { + return (xuiop->xu_ext.xu_ps.xu_ps_size); + } + + return (len - xuiop->xu_uio.uio_resid); + +err: + if (msg->msg_controllen != 0) { + /* Prevent FD leakage (see above) */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, 0); + kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + } + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + } + return (set_errno(error)); +} + +long +lx_recv(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* + * The input len is unsigned, so limit it to SSIZE_MAX since + * the return value is signed. + */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_namelen = 0; + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + return (lx_recv_common(sock, &smsg, &xuio, flags, NULL, NULL, NULL)); +} + +long +lx_recvfrom(int sock, void *buffer, size_t len, int flags, + struct sockaddr *srcaddr, socklen_t *addrlenp) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_name = (char *)srcaddr; + if (addrlenp != NULL && srcaddr != NULL) { + /* + * Despite addrlenp being defined as a socklen_t *, Linux + * treats it internally as an int *. Certain LTP tests depend + * upon this behavior, so we must emulate it as well. 
+ */ + int namelen; + + if (copyin(addrlenp, &namelen, sizeof (namelen)) != 0) { + return (set_errno(EFAULT)); + } + if (namelen < 0) { + return (set_errno(EINVAL)); + } + smsg.msg_namelen = namelen; + } else { + smsg.msg_namelen = 0; + } + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + + return (lx_recv_common(sock, &smsg, &xuio, flags, addrlenp, NULL, + NULL)); +} + +long +lx_recvmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec luiov[IOV_MAX_STACK], *uiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + void *namelenp, *controllenp, *flagsp; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + + namelenp = &((lx_msghdr32_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr32_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr32_t *)msg)->msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + + namelenp = &((lx_msghdr_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr_t *)msg)->msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * 
sizeof (struct iovec); + uiov = kmem_alloc(iovsize, KM_SLEEP); + } else if (iovcnt > 0) { + iovsize = 0; + uiov = luiov; + } else { + iovsize = 0; + uiov = NULL; + goto noiov; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + struct iovec32 luiov32[IOV_MAX_STACK], *uiov32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + uiov32 = kmem_alloc(iov32size, KM_SLEEP); + } else { + uiov32 = luiov32; + } + + if (copyin((struct iovec32 *)smsg.msg_iov, uiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = uiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + uiov[i].iov_len = iovlen32; + uiov[i].iov_base = + (caddr_t)(uintptr_t)uiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, uiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = uiov[i].iov_len; + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + return (set_errno(EINVAL)); + } + } + } + +noiov: + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = uiov; + xuio.xu_uio.uio_iovcnt = iovcnt; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + res = lx_recv_common(sock, &smsg, &xuio, flags, namelenp, controllenp, + 
flagsp); + + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + + return (res); +} + +/* + * Custom version of socket_sendmsg for error-handling overrides. + */ +static int +lx_socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr, boolean_t nosig) +{ + int error = 0; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache if we are doing a local (AF_UNIX) write. + */ + if (so->so_family == AF_UNIX) { + uiop->uio_extflg |= UIO_COPY_CACHED; + } else { + uiop->uio_extflg &= ~UIO_COPY_CACHED; + } + + error = SOP_SENDMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + case ENOMEM: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial send */ + if (uiop->uio_resid != orig_resid) { + error = 0; + } + break; + + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + break; + } + + /* Appease LTP and match behavior detailed in the man page */ + error = EPIPE; + /* FALLTHROUGH */ + case EPIPE: + if (nosig == B_FALSE) { + tsignal(curthread, SIGPIPE); + } + break; + + default: + break; + } + + return (error); +} + +static long +lx_send_common(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) +{ + struct sonode *so; + file_t *fp; + struct sockaddr *name = NULL; + socklen_t namelen; + void *control = NULL; + socklen_t controllen; + ssize_t len = 0; + int error; + boolean_t nosig; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + uiop->uio_fmode = fp->f_flag; + + /* Allocate and copyin name and control */ + if (msg->msg_name != NULL && msg->msg_namelen != 0) { + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + + error = ltos_sockaddr_copyin((struct sockaddr *)msg->msg_name, + msg->msg_namelen, &name, &namelen, NULL); + if (error != 0) { + goto done; + } + /* 
copyin_name null terminates addresses for AF_UNIX */ + msg->msg_namelen = namelen; + msg->msg_name = name; + } else { + msg->msg_name = name = NULL; + msg->msg_namelen = namelen = 0; + } + + if (msg->msg_control != NULL && msg->msg_controllen != 0) { + /* + * Verify that the length is not excessive to prevent + * an application from consuming all of kernel memory. + */ + if (msg->msg_controllen > SO_MAXARGSIZE) { + error = EINVAL; + goto done; + } + if ((error = ltos_cmsgs_copyin(msg->msg_control, + msg->msg_controllen, &control, &controllen)) != 0) { + goto done; + } + msg->msg_control = control; + msg->msg_controllen = controllen; + } else { + msg->msg_control = control = NULL; + msg->msg_controllen = controllen = 0; + } + + len = uiop->uio_resid; + msg->msg_flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + nosig = ((flags & LX_MSG_NOSIGNAL) != 0); + + error = lx_socket_sendmsg(so, msg, uiop, CRED(), nosig); +done: + if (control != NULL) { + kmem_free(control, controllen); + } + if (name != NULL) { + kmem_free(name, namelen); + } + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGSND, 1); + releasef(sock); + return (len - uiop->uio_resid); +} + +long +lx_send(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = NULL; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendto(int sock, void *buffer, size_t len, int flags, + struct sockaddr *dstaddr, socklen_t addrlen) +{ + struct nmsghdr smsg; + struct uio 
auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = (char *)dstaddr; + smsg.msg_namelen = addrlen; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt <= 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } else { + iovsize = 0; + aiov = buf; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + 
struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + } + + if (copyin((struct iovec32 *)smsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = aiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + aiov[i].iov_len = iovlen32; + aiov[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, aiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = aiov[i].iov_len; + + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + } + } + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + res = lx_send_common(sock, &smsg, &auio, flags); + + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + + return (res); +} + +/* + * Linux socket option type definitions + * + * The protocol `levels` are well defined (see in.h) The option values are + * not so well defined. Linux often uses different values vs. Illumos + * although they mean the same thing. 
For example, IP_TOS in Linux is + * defined as value 1 but in Illumos it is defined as value 3. This table + * maps all the Protocol levels to their options and maps them between + * Linux and Illumos and vice versa. Hence the reason for the complexity. + * + * For a certain subset of sockopts, Linux will implicitly truncate optval + * input, so long as optlen meets a minimum size. Because illumos is strict + * about optlen, we must cap optlen for those options. + */ + +typedef struct lx_sockopt_map { + const int lsm_opt; /* Illumos-native equivalent */ + const int lsm_lcap; /* Cap optlen to this size. (Ignored if 0) */ +} lx_sockopt_map_t; + +typedef struct lx_proto_opts { + const lx_sockopt_map_t *lpo_entries; /* Linux->SunOS map entries */ + unsigned int lpo_max; /* max entries in table */ +} lx_proto_opts_t; + +#define OPTNOTSUP -1 /* we don't support it */ + +#define PROTO_SOCKOPTS(opts) \ + { (opts), sizeof ((opts)) / sizeof ((opts)[0]) } + +/* Shorten name so the columns can line up */ +#define IP_MREQ_SZ sizeof (struct ip_mreq) + +static const lx_sockopt_map_t ltos_ip_sockopts[LX_IP_UNICAST_IF + 1] = { + { OPTNOTSUP, 0 }, + { IP_TOS, sizeof (int) }, /* IP_TOS */ + { IP_TTL, sizeof (int) }, /* IP_TTL */ + { IP_HDRINCL, sizeof (int) }, /* IP_HDRINCL */ + { IP_OPTIONS, 0 }, /* IP_OPTIONS */ + { OPTNOTSUP, 0 }, /* IP_ROUTER_ALERT */ + { IP_RECVOPTS, sizeof (int) }, /* IP_RECVOPTS */ + { IP_RETOPTS, sizeof (int) }, /* IP_RETOPTS */ + { IP_PKTINFO, sizeof (int) }, /* IP_PKTINFO */ + { OPTNOTSUP, 0 }, /* IP_PKTOPTIONS */ + { OPTNOTSUP, 0 }, /* IP_MTUDISCOVER */ + { OPTNOTSUP, 0 }, /* IP_RECVERR */ + { IP_RECVTTL, sizeof (int) }, /* IP_RECVTTL */ + { OPTNOTSUP, 0 }, /* IP_RECVTOS */ + { OPTNOTSUP, 0 }, /* IP_MTU */ + { OPTNOTSUP, 0 }, /* IP_FREEBIND */ + { OPTNOTSUP, 0 }, /* IP_IPSEC_POLICY */ + { OPTNOTSUP, 0 }, /* IP_XFRM_POLICY */ + { OPTNOTSUP, 0 }, /* IP_PASSSEC */ + { OPTNOTSUP, 0 }, /* IP_TRANSPARENT */ + { OPTNOTSUP, 0 }, /* IP_ORIGDSTADDR */ + { OPTNOTSUP, 
0 }, /* IP_MINTTL */ + { OPTNOTSUP, 0 }, /* IP_NODEFRAG */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IP_MULTICAST_IF, sizeof (int) }, /* IP_MULTICAST_IF */ + { IP_MULTICAST_TTL, sizeof (int) }, /* IP_MULTICAST_TTL */ + { IP_MULTICAST_LOOP, sizeof (int) }, /* IP_MULTICAST_LOOP */ + { IP_ADD_MEMBERSHIP, IP_MREQ_SZ }, /* IP_ADD_MEMBERSHIP */ + { IP_DROP_MEMBERSHIP, IP_MREQ_SZ }, /* IP_DROP_MEMBERSHIP */ + { IP_UNBLOCK_SOURCE, 0 }, /* IP_UNBLOCK_SOURCE */ + { IP_BLOCK_SOURCE, 0 }, /* IP_BLOCK_SOURCE */ + { IP_ADD_SOURCE_MEMBERSHIP, 0 }, /* IP_ADD_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_DROP_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_MSFILTER */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */ + { OPTNOTSUP, 0 }, /* IP_MULTICAST_ALL */ + { OPTNOTSUP, 0 } /* IP_UNICAST_IF */ +}; + +static const lx_sockopt_map_t ltos_ipv6_sockopts[LX_IPV6_TCLASS + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* IPV6_ADDRFORM */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTINFO */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292DSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTOPTIONS */ + { IPV6_CHECKSUM, sizeof (int) }, /* IPV6_CHECKSUM */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPLIMIT */ + { OPTNOTSUP, 0 }, /* IPV6_NEXTHOP */ + { OPTNOTSUP, 0 }, /* IPV6_AUTHHDR */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IPV6_UNICAST_HOPS, sizeof (int) }, /* IPV6_UNICAST_HOPS */ + { IPV6_MULTICAST_IF, sizeof (int) }, /* IPV6_MULTICAST_IF */ + { IPV6_MULTICAST_HOPS, sizeof 
(int) }, /* IPV6_MULTICAST_HOPS */ + { IPV6_MULTICAST_LOOP, sizeof (int) }, /* IPV6_MULTICAST_LOOP */ + { OPTNOTSUP, 0 }, /* IPV6_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* IPV6_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* IPV6_ROUTER_ALERT */ + { OPTNOTSUP, 0 }, /* IPV6_MTU_DISCOVER */ + { OPTNOTSUP, 0 }, /* IPV6_MTU */ + { OPTNOTSUP, 0 }, /* IPV6_RECVERR */ + { IPV6_V6ONLY, sizeof (int) }, /* IPV6_V6ONLY */ + { OPTNOTSUP, 0 }, /* IPV6_JOIN_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_LEAVE_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_IPSEC_POLICY */ + { OPTNOTSUP, 0 }, /* IPV6_XFRM_POLICY */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IPV6_RECVPKTINFO, sizeof (int) }, /* IPV6_RECVPKTINFO */ + { IPV6_PKTINFO, 0 }, /* IPV6_PKTINFO */ + { IPV6_RECVHOPLIMIT, sizeof (int) }, /* IPV6_RECVHOPLIMIT */ + { IPV6_HOPLIMIT, 0 }, /* IPV6_HOPLIMIT */ + { OPTNOTSUP, 0 }, /* IPV6_RECVHOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDRDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RECVRTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RECVDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_DSTOPTS */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* IPV6_RECVTCLASS */ + { IPV6_TCLASS, sizeof (int) } /* IPV6_TCLASS */ +}; + +static const lx_sockopt_map_t ltos_icmpv6_sockopts[LX_ICMP6_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { ICMP6_FILTER, 0 } /* ICMP6_FILTER */ +}; + +static const lx_sockopt_map_t ltos_tcp_sockopts[LX_TCP_NOTSENT_LOWAT + 1] = { + { OPTNOTSUP, 0 }, + { TCP_NODELAY, sizeof (int) }, /* TCP_NODELAY */ + { TCP_MAXSEG, sizeof (int) }, /* TCP_MAXSEG */ + { 
TCP_CORK, sizeof (int) }, /* TCP_CORK */ + { TCP_KEEPIDLE, sizeof (int) }, /* TCP_KEEPIDLE */ + { TCP_KEEPINTVL, sizeof (int) }, /* TCP_KEEPINTVL */ + { TCP_KEEPCNT, sizeof (int) }, /* TCP_KEEPCNT */ + { OPTNOTSUP, 0 }, /* TCP_SYNCNT */ + { TCP_LINGER2, sizeof (int) }, /* TCP_LINGER2 */ + { OPTNOTSUP, 0 }, /* TCP_DEFER_ACCEPT */ + { OPTNOTSUP, 0 }, /* TCP_WINDOW_CLAMP */ + { OPTNOTSUP, 0 }, /* TCP_INFO */ + { OPTNOTSUP, 0 }, /* TCP_QUICKACK */ + { OPTNOTSUP, 0 }, /* TCP_CONGESTION */ + { OPTNOTSUP, 0 }, /* TCP_MD5SIG */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* TCP_THIN_LINEAR_TIMEOUTS */ + { OPTNOTSUP, 0 }, /* TCP_THIN_DUPACK */ + { OPTNOTSUP, 0 }, /* TCP_USER_TIMEOUT */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_QUEUE */ + { OPTNOTSUP, 0 }, /* TCP_QUEUE_SEQ */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_OPTIONS */ + { OPTNOTSUP, 0 }, /* TCP_FASTOPEN */ + { OPTNOTSUP, 0 }, /* TCP_TIMESTAMP */ + { OPTNOTSUP, 0 } /* TCP_NOTSENT_LOWAT */ +}; + +static const lx_sockopt_map_t ltos_igmp_sockopts[IGMP_MTRACE + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MINLEN, 0 }, /* IGMP_MINLEN */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MEMBERSHIP_QUERY, 0 }, /* IGMP_HOST_MEMBERSHIP_QUERY */ + { IGMP_V1_MEMBERSHIP_REPORT, 0 }, /* IGMP_HOST_MEMBERSHIP_REPORT */ + { IGMP_DVMRP, 0 }, /* IGMP_DVMRP */ + { IGMP_PIM, 0 }, /* IGMP_PIM */ + { OPTNOTSUP, 0 }, /* IGMP_TRACE */ + { IGMP_V2_MEMBERSHIP_REPORT, 0 }, /* IGMPV2_HOST_MEMBERSHIP_REPORT */ + { IGMP_V2_LEAVE_GROUP, 0 }, /* IGMP_HOST_LEAVE_MESSAGE */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MTRACE_RESP, 0 }, /* IGMP_MTRACE_RESP */ + { IGMP_MTRACE, 0 } /* IGMP_MTRACE */ +}; + 
+static const lx_sockopt_map_t ltos_socket_sockopts[LX_SO_BPF_EXTENSIONS + 1] = { + { OPTNOTSUP, 0 }, + { SO_DEBUG, sizeof (int) }, /* SO_DEBUG */ + { SO_REUSEADDR, sizeof (int) }, /* SO_REUSEADDR */ + { SO_TYPE, 0 }, /* SO_TYPE */ + { SO_ERROR, 0 }, /* SO_ERROR */ + { SO_DONTROUTE, sizeof (int) }, /* SO_DONTROUTE */ + { SO_BROADCAST, sizeof (int) }, /* SO_BROADCAST */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUF */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUF */ + { SO_KEEPALIVE, sizeof (int) }, /* SO_KEEPALIVE */ + { SO_OOBINLINE, sizeof (int) }, /* SO_OOBINLINE */ + { OPTNOTSUP, 0 }, /* SO_NO_CHECK */ + { OPTNOTSUP, 0 }, /* SO_PRIORITY */ + { SO_LINGER, 0 }, /* SO_LINGER */ + { OPTNOTSUP, 0 }, /* SO_BSDCOMPAT */ + { SO_REUSEPORT, sizeof (int) }, /* SO_REUSEPORT */ + { SO_RECVUCRED, sizeof (int) }, /* SO_PASSCRED */ + { OPTNOTSUP, 0 }, /* SO_PEERCRED */ + { SO_RCVLOWAT, sizeof (int) }, /* SO_RCVLOWAT */ + { SO_SNDLOWAT, sizeof (int) }, /* SO_SNDLOWAT */ + { SO_RCVTIMEO, 0 }, /* SO_RCVTIMEO */ + { SO_SNDTIMEO, 0 }, /* SO_SNDTIMEO */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_AUTHENTICATION */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_TRANSPORT */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_NETWORK */ + { OPTNOTSUP, 0 }, /* SO_BINDTODEVICE */ + { SO_ATTACH_FILTER, 0 }, /* SO_ATTACH_FILTER */ + { SO_DETACH_FILTER, 0 }, /* SO_DETACH_FILTER */ + { OPTNOTSUP, 0 }, /* SO_PEERNAME */ + { SO_TIMESTAMP, sizeof (int) }, /* SO_TIMESTAMP */ + { SO_ACCEPTCONN, 0 }, /* SO_ACCEPTCONN */ + { OPTNOTSUP, 0 }, /* SO_PEERSEC */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUFFORCE */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUFFORCE */ + { OPTNOTSUP, 0 }, /* SO_PASSSEC */ + { OPTNOTSUP, 0 }, /* SO_TIMESTAMPNS */ + { OPTNOTSUP, 0 }, /* SO_MARK */ + { OPTNOTSUP, 0 }, /* SO_TIMESTAMPING */ + { SO_PROTOTYPE, 0 }, /* SO_PROTOCOL */ + { SO_DOMAIN, 0 }, /* SO_DOMAIN */ + { OPTNOTSUP, 0 }, /* SO_RXQ_OVFL */ + { OPTNOTSUP, 0 }, /* SO_WIFI_STATUS */ + { OPTNOTSUP, 0 }, /* SO_PEEK_OFF */ + { OPTNOTSUP, 0 
}, /* SO_NOFCS */ + { OPTNOTSUP, 0 }, /* SO_LOCK_FILTER */ + { OPTNOTSUP, 0 }, /* SO_SELECT_ERR_QUEUE */ + { OPTNOTSUP, 0 }, /* SO_BUSY_POLL */ + { OPTNOTSUP, 0 }, /* SO_MAX_PACING_RATE */ + { OPTNOTSUP, 0 } /* SO_BPF_EXTENSIONS */ +}; + +static const lx_sockopt_map_t ltos_raw_sockopts[LX_ICMP_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 } /* ICMP_FILTER */ +}; + +static const lx_sockopt_map_t ltos_packet_sockopts[LX_PACKET_STATISTICS + 1] = { + { OPTNOTSUP, 0 }, + { PACKET_ADD_MEMBERSHIP, 0 }, /* PACKET_ADD_MEMBERSHIP */ + { PACKET_DROP_MEMBERSHIP, 0 }, /* PACKET_DROP_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* PACKET_RECV_OUTPUT */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* PACKET_RX_RING */ + { PACKET_STATISTICS, 0 } /* PACKET_STATISTICS */ +}; + +/* Needed for SO_ATTACH_FILTER */ +struct lx_bpf_program { + unsigned short bf_len; + caddr_t bf_insns; +}; + +/* Invert filter fields as Linux expects */ +#define LX_ICMP6_FILTER_INVERT(filterp) ( \ + ((filterp)->__icmp6_filt[0] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[1] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[2] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[3] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[4] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[5] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[6] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[7] ^= 0xFFFFFFFFU)) + +static boolean_t +lx_sockopt_lookup(lx_proto_opts_t tbl, int *optname, socklen_t *optlen) +{ + const lx_sockopt_map_t *entry; + + if (*optname > tbl.lpo_max) { + return (B_FALSE); + } + entry = &tbl.lpo_entries[*optname]; + if (entry->lsm_opt == OPTNOTSUP) { + return (B_FALSE); + } + *optname = entry->lsm_opt; + /* Truncate the optlen if needed/allowed */ + if (entry->lsm_lcap != 0 && *optlen > entry->lsm_lcap) { + *optlen = entry->lsm_lcap; + } + return (B_TRUE); +} + +static int +lx_setsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = 
PROTO_SOCKOPTS(ltos_ip_sockopts); + + switch (optname) { + case LX_IP_RECVERR: + /* + * Ping sets this option to receive errors on raw sockets. + * Currently we just ignore it to make ping happy. From the + * Linux ip.7 man page: + * + * For raw sockets, IP_RECVERR enables passing of all + * received ICMP errors to the application. + * + * Programs known to depend upon this: + * - ping + * - traceroute + * - mount.nfs + */ + return (0); + + case LX_IP_MTU_DISCOVER: { + int val; + + /* + * We translate Linux's IP_MTU_DISCOVER into our IP_DONTFRAG, + * allowing this to be a byte or an integer and observing the + * inverted sense of the two relative to one another (and + * translating accordingly). + * + * NOTE(review): optval is dereferenced below without a check + * for optlen == 0 -- confirm that the caller rejects a + * zero-length or NULL option value before reaching here. + */ + if (optlen < sizeof (int)) { + val = *((uint8_t *)optval); + } else { + val = *((int *)optval); + } + + switch (val) { + case LX_IP_PMTUDISC_DONT: + val = 1; + break; + + case LX_IP_PMTUDISC_DO: + case LX_IP_PMTUDISC_WANT: + val = 0; + break; + + default: + return (EOPNOTSUPP); + } + + error = socket_setsockopt(so, IPPROTO_IP, IP_DONTFRAG, + &val, sizeof (val), CRED()); + return (error); + } + + case LX_IP_MULTICAST_TTL: + case LX_IP_MULTICAST_LOOP: + /* + * For IP_MULTICAST_TTL and IP_MULTICAST_LOOP, Linux defines + * the option value to be an integer while we define it to be + * an unsigned character. To prevent the kernel from spitting + * back an error on an illegal length, verify that the option + * value is less than UCHAR_MAX before truncating optlen. 
+ */ + if (optlen <= 0 || optlen > sizeof (int) || + *intval > UINT8_MAX) { + return (EINVAL); + } + optlen = sizeof (uchar_t); + break; + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + if (optname == LX_IPV6_MTU) { + /* + * There isn't a good translation for IPV6_MTU and certain apps + * such as bind9 will bail if it cannot be set. + * We just lie about the success for now. + */ + return (0); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_IPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_icmpv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER && optval != NULL) { + /* + * Surprise! The input to ICMP6_FILTER on Linux is inverted + * when compared to illumos. 
+ */ + if (optlen != sizeof (icmp6_filter_t)) { + return (EINVAL); + } + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + + if (optname == LX_TCP_DEFER_ACCEPT) { + int *intval; + char *dfp; + + /* + * Emulate TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we can't emulate the timeout aspect so treat any + * non-zero value as enabling and zero as disabling. + */ + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + + /* + * socket_setsockopt asserts that the optval is aligned, so + * we use kmem_alloc to ensure this. + */ + dfp = (char *)kmem_alloc(sizeof (DATAFILT), KM_SLEEP); + (void) strcpy(dfp, DATAFILT); + + if (*intval > 0) { + error = socket_setsockopt(so, SOL_FILTER, FIL_ATTACH, + dfp, 9, CRED()); + if (error == EEXIST) { + error = 0; + } + } else { + error = socket_setsockopt(so, SOL_FILTER, FIL_DETACH, + dfp, 9, CRED()); + if (error == ENXIO) { + error = 0; + } + } + kmem_free(dfp, sizeof (DATAFILT)); + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_socket(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + struct lx_bpf_program *lbp; + int *intval; + struct bpf_program bp; + + switch (optname) { + case LX_SO_BSDCOMPAT: + /* Linux ignores this option. 
*/ + return (0); + + case LX_SO_TIMESTAMP: + /* + * SO_TIMESTAMP is not supported on AF_UNIX sockets but we have + * some of those which apps use for logging, etc., so pretend + * this worked. + */ + if (so->so_family == AF_UNIX) { + return (0); + } + break; + + case LX_SO_ATTACH_FILTER: + /* + * Convert the Linux bpf program struct (lx_bpf_program) into a + * struct bpf_program, which is what gets passed down instead. + */ + if (optlen != sizeof (struct lx_bpf_program)) { + return (EINVAL); + } + lbp = (struct lx_bpf_program *)optval; + bp.bf_len = lbp->bf_len; + bp.bf_insns = (struct bpf_insn *)lbp->bf_insns; + optval = &bp; + break; + + case LX_SO_PASSSEC: + /* + * SO_PASSSEC is very similar to SO_PASSCRED (emulated by + * SO_RECVUCRED) in that it requests that cmsgs containing + * identity information be attached to received messages. + * Instead of ucred information, security-module-specific + * information such as selinux label is expected. + * + * Since LX does not at all support selinux today, the + * option is silently accepted. + */ + return (0); + + case LX_SO_PASSCRED: + /* + * In many cases, the Linux SO_PASSCRED is mapped to the SunOS + * SO_RECVUCRED to enable the passing of peer credential + * information via received cmsgs. One exception is for + * connection-oriented AF_UNIX sockets which do not yet support + * that option. Instead, we track the setting internally and, + * when there is appropriate cmsg space, emulate the credential + * passing by querying the STREAMS ioctl. 
+ */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + lx_socket_aux_data_t *sad; + + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + sad = lx_sad_acquire(SOTOV(so)); + sad->lxsad_stream_cred = !(*intval == 0); + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_SOCKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + switch (optname) { + case LX_ICMP_FILTER: + /* + * This option is currently ignored to appease ping. + */ + return (0); + + case LX_IPV6_CHECKSUM: + /* + * Ping6 tries to set the IPV6_CHECKSUM offset in a way that + * illumos won't allow. Quietly ignore this to prevent it from + * complaining. 
+ */ + return (0); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_packet(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + struct packet_mreq *mr; + + switch (optname) { + case LX_PACKET_ADD_MEMBERSHIP: + case LX_PACKET_DROP_MEMBERSHIP: + /* Convert Linux mr_type to illumos */ + if (optlen != sizeof (struct packet_mreq)) { + return (EINVAL); + } + mr = (struct packet_mreq *)optval; + if (--mr->mr_type > PACKET_MR_ALLMULTI) + return (EINVAL); + optval = mr; + break; + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_PACKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IGMP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_getsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return 
(ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IPV6, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_icmpv6(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER) { + error = socket_getsockopt(so, IPPROTO_ICMPV6, ICMP6_FILTER, + optval, optlen, 0, CRED()); + + /* + * ICMP6_FILTER is inverted on Linux. Make it so before copying + * back to caller's buffer. + */ + if (error == 0) { + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + 0, CRED()); + return (error); +} + +static int +lx_getsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + + switch (optname) { + case LX_TCP_CORK: + /* + * We do not support TCP_CORK but some apps rely on it. Rather + * than return an error we just return 0. This isn't exactly a + * lie, since this option really isn't set, but it's not the + * whole truth either. Fortunately, we aren't under oath. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_TCP_DEFER_ACCEPT: + /* + * We do support TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we don't emulate the timeout aspect so treat the + * existence as 1 and absence as 0. 
+ */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + struct fil_info fi[10]; + int i; + socklen_t len = sizeof (fi); + + if ((error = socket_getsockopt(so, SOL_FILTER, + FIL_LIST, fi, &len, 0, CRED()) != 0)) { + *optlen = sizeof (int); + return (error); + } + + *intval = 0; + len = len / sizeof (struct fil_info); + for (i = 0; i < len; i++) { + if (fi[i].fi_flags == FILF_PROG && + strcmp(fi[i].fi_name, "datafilt") == 0) { + *intval = 1; + break; + } + } + } + *optlen = sizeof (int); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_TCP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_socket(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + + switch (optname) { + case LX_SO_PASSSEC: + /* + * Communicate value of 0 since selinux-related functionality + * is not supported. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_SO_PASSCRED: + /* + * Special handling for connection-oriented AF_UNIX sockets. + * See lx_setsockopt_socket for more details. 
+ */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + lx_socket_aux_data_t *sad; + + if (*optlen < sizeof (int)) { + return (EINVAL); + } + sad = lx_sad_acquire(SOTOV(so)); + *intval = sad->lxsad_stream_cred; + *optlen = sizeof (int); + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + + case LX_SO_PEERCRED: + if (*optlen < sizeof (struct lx_ucred)) { + error = EINVAL; + } else { + struct lx_ucred *lcred = (struct lx_ucred *)optval; + + mutex_enter(&so->so_lock); + if ((so->so_mode & SM_CONNREQUIRED) == 0) { + error = ENOTSUP; + } else if (so->so_peercred == NULL) { + error = EINVAL; + } else { + lcred->lxu_uid = crgetuid(so->so_peercred); + lcred->lxu_gid = crgetgid(so->so_peercred); + lcred->lxu_pid = so->so_cpid; + } + mutex_exit(&so->so_lock); + } + *optlen = sizeof (struct lx_ucred); + return (error); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, SOL_SOCKET, optname, optval, optlen, 0, + CRED()); + + if (error == 0) { + switch (optname) { + case SO_TYPE: + /* translate our type back to Linux */ + *intval = STOL_SOCKTYPE(*intval); + break; + + case SO_ERROR: + *intval = lx_errno(*intval, EINVAL); + break; + default: + break; + } + } + return (error); +} + +static int +lx_getsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_RAW, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_packet(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = 
socket_getsockopt(so, SOL_PACKET, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IGMP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +long +lx_setsockopt(int sock, int level, int optname, void *optval, socklen_t optlen) +{ + struct sonode *so; + file_t *fp; + int buflen = 0; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + int error = 0; + + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_alloc(optlen, KM_SLEEP); + } else { + /* + * Zero the on-stack buffer to avoid poisoning smaller + * optvals with stack garbage. + */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + if (copyin(optval, optbuf, optlen) != 0) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(EFAULT)); + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_setsockopt_ip(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_setsockopt_ipv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_setsockopt_icmpv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_TCP: + error = lx_setsockopt_tcp(so, optname, optbuf, optlen); + break; + case LX_SOL_SOCKET: + error = lx_setsockopt_socket(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_RAW: + error = lx_setsockopt_raw(so, optname, optbuf, optlen); + break; + case LX_SOL_PACKET: + error = lx_setsockopt_packet(so, optname, optbuf, optlen); + break; + case 
LX_IPPROTO_IGMP: + error = lx_setsockopt_igmp(so, optname, optbuf, optlen); + break; + case LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. + */ + error = socket_setsockopt(so, LX_SOL_NETLINK, optname, optval, + optlen, CRED()); + break; + default: + error = ENOPROTOOPT; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + snprintf(buf, LX_UNSUP_BUFSZ, "setsockopt(%d, %d)", level, + optname); + lx_unsupported(buf); + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getsockopt(int sock, int level, int optname, void *optval, + socklen_t *optlenp) +{ + struct sonode *so; + file_t *fp; + int error = 0, buflen = 0; + socklen_t optlen; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + + if (copyin(optlenp, &optlen, sizeof (optlen)) != 0) { + return (set_errno(EFAULT)); + } + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_zalloc(optlen, KM_SLEEP); + } else { + /* zero the on-stack buffer, just in case */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_getsockopt_ip(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_getsockopt_ipv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_getsockopt_icmpv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_TCP: + error = lx_getsockopt_tcp(so, optname, optbuf, &optlen); + break; + case LX_SOL_SOCKET: + error = lx_getsockopt_socket(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_RAW: + error = lx_getsockopt_raw(so, optname, optbuf, 
&optlen); + break; + case LX_SOL_PACKET: + error = lx_getsockopt_packet(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IGMP: + error = lx_getsockopt_igmp(so, optname, optbuf, &optlen); + break; + case LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. + */ + error = socket_getsockopt(so, LX_SOL_NETLINK, optname, optval, + &optlen, 0, CRED()); + break; + default: + error = EOPNOTSUPP; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + snprintf(buf, LX_UNSUP_BUFSZ, "getsockopt(%d, %d)", level, + optname); + lx_unsupported(buf); + } + if (copyout(&optlen, optlenp, sizeof (optlen)) != 0) { + error = EFAULT; + } + if (error == 0 && optlen > 0) { + VERIFY(optlen <= sizeof (stkbuf) || optlen <= buflen); + if (copyout(optbuf, optval, optlen) != 0) { + error = EFAULT; + } + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getname_common(lx_getname_type_t type, int sockfd, void *np, int *nlp) +{ + struct sockaddr_storage buf; + struct sockaddr *name = (struct sockaddr *)&buf; + socklen_t namelen, namelen_orig; + int err, tmp; + struct sonode *so; + + /* We need to validate the name address up front to pass LTP. 
*/ + if (copyin(np, &tmp, sizeof (tmp)) != 0) + return (set_errno(EFAULT)); + + if (copyin(nlp, &namelen, sizeof (socklen_t)) != 0) + return (set_errno(EFAULT)); + namelen_orig = namelen; + + /* LTP can pass -1 */ + if ((int)namelen < 0) + return (set_errno(EINVAL)); + + if ((so = getsonode(sockfd, &err, NULL)) == NULL) + return (set_errno(err)); + + bzero(&buf, sizeof (buf)); + namelen = sizeof (struct sockaddr_storage); + if (type == LX_GETPEERNAME) { + err = socket_getpeername(so, name, &namelen, B_FALSE, CRED()); + } else { + err = socket_getsockname(so, name, &namelen, CRED()); + } + + if (err == 0) { + ASSERT(namelen <= so->so_max_addr_len); + err = stol_sockaddr_copyout(name, namelen, + (struct sockaddr *)np, (socklen_t *)nlp, namelen_orig); + } + + releasef(sockfd); + return (err != 0 ? set_errno(err) : 0); +} + +long +lx_getpeername(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETPEERNAME, sockfd, np, nlp)); +} + +long +lx_getsockname(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETSOCKNAME, sockfd, np, nlp)); +} + +static int +lx_accept_common(int sock, struct sockaddr *name, socklen_t *nlp, int flags) +{ + struct sonode *so; + file_t *fp; + int error; + socklen_t namelen; + struct sonode *nso; + struct vnode *nvp; + struct file *nfp; + int nfd; + int arg; + + if (flags & ~(LX_SOCK_CLOEXEC | LX_SOCK_NONBLOCK)) { + return (set_errno(EINVAL)); + } + + if ((so = getsonode(sock, &error, &fp)) == NULL) + return (set_errno(error)); + + if (name != NULL) { + /* + * The Linux man page says that -1 is returned and errno is set + * to EFAULT if the "name" address is bad, but it is silent on + * what to set errno to if the "namelen" address is bad. + * LTP expects EINVAL. + * + * Note that we must first check the name pointer, as the Linux + * docs state nothing is copied out if the "name" pointer is + * NULL. If it is NULL, we don't care about the namelen + * pointer's value or about dereferencing it. 
+ */ + if (copyin(nlp, &namelen, sizeof (namelen))) { + releasef(sock); + return (set_errno(EINVAL)); + } + if (namelen == 0) { + name = NULL; + } + } else { + namelen = 0; + } + + /* + * Allocate the user fd before socket_accept() in order to + * catch EMFILE errors before calling socket_accept(). + */ + if ((error = falloc(NULL, FWRITE|FREAD, &nfp, &nfd)) != 0) { + eprintsoline(so, EMFILE); + releasef(sock); + return (set_errno(error)); + } + if ((error = socket_accept(so, fp->f_flag, CRED(), &nso)) != 0) { + setf(nfd, NULL); + unfalloc(nfp); + releasef(sock); + return (set_errno(error)); + } + + nvp = SOTOV(nso); + + if (namelen != 0) { + socklen_t addrlen = sizeof (struct sockaddr_storage); + struct sockaddr_storage buf; + struct sockaddr *addrp = (struct sockaddr *)&buf; + + if ((error = socket_getpeername(nso, addrp, &addrlen, B_TRUE, + CRED())) == 0) { + error = stol_sockaddr_copyout(addrp, addrlen, + name, nlp, namelen); + /* + * Logic might dictate that we should check if we can + * write to the namelen pointer earlier so we don't + * accept a pending connection only to fail the call + * because we can't write the namelen value back out. + * However, testing shows Linux does indeed fail the + * call after accepting the connection so we must + * behave in a compatible manner. + */ + } else { + ASSERT(error == EINVAL || error == ENOTCONN); + error = ECONNABORTED; + } + } + + if (error != 0) { + setf(nfd, NULL); + unfalloc(nfp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + releasef(sock); + return (set_errno(error)); + } + + /* Fill in the entries that falloc reserved */ + nfp->f_vnode = nvp; + mutex_exit(&nfp->f_tlock); + setf(nfd, nfp); + + /* Act on LX_SOCK_CLOEXEC from flags */ + if (flags & LX_SOCK_CLOEXEC) { + f_setfd(nfd, FD_CLOEXEC); + } + + /* + * In Linux, accept()ed sockets do not inherit anything set by fcntl(), + * so either explicitly set the flags or filter those out. 
+ * + * The VOP_SETFL code is a simplification of the F_SETFL code in + * fcntl(). Ignore any errors from VOP_SETFL. + */ + arg = 0; + if (flags & LX_SOCK_NONBLOCK) + arg |= FNONBLOCK; + + error = VOP_SETFL(nvp, nfp->f_flag, arg, nfp->f_cred, NULL); + if (error != 0) { + eprintsoline(so, error); + error = 0; + } else { + mutex_enter(&nfp->f_tlock); + nfp->f_flag &= ~FMASK | (FREAD|FWRITE); + nfp->f_flag |= arg; + mutex_exit(&nfp->f_tlock); + } + + releasef(sock); + return (nfd); +} + +long +lx_accept(int sockfd, void *np, int *nlp) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, 0)); +} + +long +lx_accept4(int sockfd, void *np, int *nlp, int flags) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, flags)); +} + +#if defined(_SYSCALL32_IMPL) + +#define LX_SYS_SOCKETCALL 102 +#define LX_SOCKETCALL_MAX 20 + +typedef long (*lx_sockfn_t)(); + +static struct { + lx_sockfn_t s_fn; /* Function implementing the subcommand */ + int s_nargs; /* Number of arguments the function takes */ +} lx_socketcall_fns[] = { + lx_socket, 3, /* socket */ + lx_bind, 3, /* bind */ + lx_connect, 3, /* connect */ + NULL, 2, /* listen */ + lx_accept, 3, /* accept */ + lx_getsockname, 3, /* getsockname */ + lx_getpeername, 3, /* getpeername */ + NULL, 4, /* socketpair */ + lx_send, 4, /* send */ + lx_recv, 4, /* recv */ + lx_sendto, 6, /* sendto */ + lx_recvfrom, 6, /* recvfrom */ + NULL, 2, /* shutdown */ + lx_setsockopt, 5, /* setsockopt */ + lx_getsockopt, 5, /* getsockopt */ + lx_sendmsg, 3, /* sendmsg */ + lx_recvmsg, 3, /* recvmsg */ + lx_accept4, 4, /* accept4 */ + NULL, 5, /* recvmmsg */ + NULL, 4 /* sendmmsg */ +}; + +long +lx_socketcall(long p1, uint32_t *p2) +{ + int subcmd, i; + unsigned long args[6] = { 0, 0, 0, 0, 0, 0 }; + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + /* incoming subcmds are 1-indexed */ + subcmd = (int)p1 - 1; + + if (subcmd < 0 || subcmd >= LX_SOCKETCALL_MAX) { + return (-EINVAL); + } + + /* Vector 
back out to userland emulation if we lack IKE */ + if (lx_socketcall_fns[subcmd].s_fn == NULL) { + uintptr_t uargs[2] = {p1, (uintptr_t)p2}; + /* The userspace emulation will handle the syscall return */ + lwpd->br_eosys = JUSTRETURN; + lx_emulate_user32(ttolwp(curthread), LX_SYS_SOCKETCALL, uargs); + return (0); + } + + /* + * Copy the arguments to the subcommand in from the app's address + * space, returning EFAULT if we get a bogus pointer. + */ + for (i = 0; i < lx_socketcall_fns[subcmd].s_nargs; i++) { + uint32_t arg; + + if (copyin(&p2[i], &arg, sizeof (uint32_t)) != 0) { + return (set_errno(EFAULT)); + } + args[i] = (unsigned long)arg; + } + + return ((lx_socketcall_fns[subcmd].s_fn)(args[0], args[1], args[2], + args[3], args[4], args[5])); +} + +#endif /* defined(_SYSCALL32_IMPL) */ + +static void +lx_socket_vsd_free(void *data) +{ + lx_socket_aux_data_t *entry; + + entry = (lx_socket_aux_data_t *)data; + mutex_destroy(&entry->lxsad_lock); + kmem_free(entry, sizeof (*entry)); +} + +void +lx_socket_init() +{ + vsd_create(&lx_socket_vsd, lx_socket_vsd_free); +} + +void +lx_socket_fini() +{ + vsd_destroy(&lx_socket_vsd); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_stat.c b/usr/src/uts/common/brand/lx/syscall/lx_stat.c new file mode 100644 index 0000000000..2ec8a4542d --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_stat.c @@ -0,0 +1,439 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/model.h> +#include <sys/mode.h> +#include <sys/stat.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> +#include <sys/brand.h> +#include <sys/ddi.h> + +/* From "uts/common/syscall/stat.c" */ +extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **); + +typedef struct lx_timespec32 { + int32_t ts_sec; + int32_t ts_nsec; +} lx_timespec32_t; + +typedef struct lx_timespec64 { + int64_t ts_sec; + int64_t ts_nsec; +}lx_timespec64_t; + +struct lx_stat32 { + uint16_t st_dev; + uint16_t st_pad1; + uint32_t st_ino; + uint16_t st_mode; + uint16_t st_nlink; + uint16_t st_uid; + uint16_t st_gid; + uint16_t st_rdev; + uint16_t st_pad2; + uint32_t st_size; + uint32_t st_blksize; + uint32_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint32_t st_pad3; + uint32_t st_pad4; +}; + +#pragma pack(4) +struct lx_stat64_32 { + uint64_t st_dev; + uint32_t st_pad1; + uint32_t st_small_ino; + uint32_t st_mode; + uint32_t st_nlink; + uint32_t st_uid; + uint32_t st_gid; + uint64_t st_rdev; + uint32_t st_pad2; + uint64_t st_size; + uint32_t st_blksize; + uint64_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint64_t st_ino; +}; +#pragma pack() + +#if defined(_LP64) +struct lx_stat64_64 { + uint64_t st_dev; + uint64_t st_ino; + uint64_t st_nlink; /* yes, the order really is */ + uint32_t st_mode; /* different for these two */ + uint32_t st_uid; + uint32_t st_gid; + uint32_t st_pad0; + uint64_t st_rdev; + int64_t st_size; + int64_t st_blksize; + int64_t st_blocks; + lx_timespec64_t st_atime; + lx_timespec64_t st_mtime; + lx_timespec64_t st_ctime; + int64_t st_unused[3]; +}; +#endif /* defined(_LP64) */ + +typedef enum lx_stat_fmt { + 
LXF_STAT32, + LXF_STAT64_32, + LXF_STAT64_64 +} lx_stat_fmt_t; + +static void +lx_stat_xlate_dev(vattr_t *vattr) +{ + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + dev_t dev = vattr->va_fsid; + lx_virt_disk_t *vd; + + /* Substitute emulated major/minor on mounted datasets */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_real_dev == dev) { + dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + /* Mangle st_dev into expected format */ + vattr->va_fsid = LX_MAKEDEVICE(getmajor(dev), getminor(dev)); +} + +static long +lx_stat_common(vnode_t *vp, cred_t *cr, void *outp, lx_stat_fmt_t fmt) +{ + vattr_t vattr; + mode_t mode; + int error; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, cr, NULL)) != 0) { + return (error); + } + + mode = VTTOIF(vattr.va_type) | vattr.va_mode; + if ((mode & S_IFMT) == S_IFBLK) { + /* Linux seems to report a 0 st_size for all block devices */ + vattr.va_size = 0; + } + if (vattr.va_rdev == NODEV) { + /* Linux leaves st_rdev zeroed when it is absent */ + vattr.va_rdev = 0; + } + + lx_stat_xlate_dev(&vattr); + + if (fmt == LXF_STAT32) { + struct lx_stat32 sb; + + if (vattr.va_fsid > USHRT_MAX || vattr.va_rdev > USHRT_MAX || + vattr.va_nlink > USHRT_MAX || vattr.va_size > INT_MAX) { + return (EOVERFLOW); + } + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = LX_UID32_TO_UID16(vattr.va_uid); + sb.st_gid = LX_GID32_TO_GID16(vattr.va_gid); + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = 
vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_32) { + struct lx_stat64_32 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_small_ino = (vattr.va_nodeid & UINT_MAX); + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_64) { +#if defined(_LP64) + struct lx_stat64_64 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); +#else + /* Invalid output format on 32-bit */ + VERIFY(0); +#endif + } + + /* Invalid output format */ + VERIFY(0); + return (0); +} + +long +lx_stat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) 
{ + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, LXF_STAT32); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat32(int fd, void *outp) +{ + file_t *fp; + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, LXF_STAT32); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, LXF_STAT32); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_stat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat64(int fd, void *outp) +{ + file_t *fp; + model_t model = get_udatamodel(); + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, + (model == DATAMODEL_LP64) ? 
LXF_STAT64_64 : LXF_STAT64_32); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +#define LX_FSTATAT_ALLOWED (LX_AT_SYMLINK_NOFOLLOW | LX_AT_EMPTY_PATH | \ + LX_AT_NO_AUTOMOUNT) + +long +lx_fstatat64(int fd, char *name, void *outp, int flag) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + enum symfollow follow = FOLLOW; + int error; + char c; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + if ((flag & ~LX_FSTATAT_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flag & LX_AT_NO_AUTOMOUNT) != 0) { + /* + * While AT_NO_AUTOMOUNT is a legal flag for fstatat64, it is + * not yet supported by lx_autofs. + */ + lx_unsupported("fstatat(AT_NO_AUTOMOUNT)"); + return (set_errno(EINVAL)); + } + if ((flag & LX_AT_SYMLINK_NOFOLLOW) != 0) { + follow = NO_FOLLOW; + } + + if (copyin(name, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + if ((flag & LX_AT_EMPTY_PATH) == 0) { + return (set_errno(ENOENT)); + } + + /* + * When AT_EMPTY_PATH is set and an empty string has been + * passed for the name parameter, direct the lookup against the + * vnode for that fd (or against the current directory when fd + * is AT_FDCWD). + */ + if (fd == AT_FDCWD) { + vp = PTOU(curproc)->u_cdir; + VN_HOLD(vp); + cr = CRED(); + crhold(cr); + } else { + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + cr = fp->f_cred; + crhold(cr); + releasef(fd); + } + } else { + if ((error = cstatat_getvp(fd, name, follow, &vp, &cr)) != 0) { + return (set_errno(error)); + } + } + + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? 
LXF_STAT64_64 : LXF_STAT64_32); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sync.c b/usr/src/uts/common/brand/lx/syscall/lx_sync.c new file mode 100644 index 0000000000..614afca0b0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sync.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +long +lx_syncfs(int fd) +{ + file_t *fp; + vfs_t *vfsp; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + vfsp = fp->f_vnode->v_vfsp; + releasef(fd); + + (void) (vfsp->vfs_op->vfs_sync)(vfsp, 0, CRED()); + + return (0); +} + +#define LX_SYNC_FILE_RANGE_WAIT_BEFORE 0x1 +#define LX_SYNC_FILE_RANGE_WRITE 0x2 +#define LX_SYNC_FILE_RANGE_WAIT_AFTER 0x4 + +#define LX_SYNC_FILE_RANGE_VALID (LX_SYNC_FILE_RANGE_WAIT_BEFORE | \ + LX_SYNC_FILE_RANGE_WRITE | LX_SYNC_FILE_RANGE_WAIT_AFTER) + + +long +lx_sync_file_range(int fd, off_t offset, off_t nbytes, int flags) +{ + file_t *fp; + int error, sflags = 0; + + if ((flags & ~LX_SYNC_FILE_RANGE_VALID) != 0) + return (set_errno(EINVAL)); + if (offset < 0 || nbytes < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + /* + * Since sync_file_range is implemented in terms of VOP_PUTPAGE, both + * SYNC_FILE_RANGE_WAIT flags are treated as forcing synchronous + * operation. While this differs from the Linux behavior where + * BEFORE/AFTER are distinct, it achieves an adequate level of safety + * since the requested data is synced out at the end of the call. 
+ */ + if ((flags & (LX_SYNC_FILE_RANGE_WAIT_BEFORE | + LX_SYNC_FILE_RANGE_WAIT_AFTER)) == 0) { + sflags |= B_ASYNC; + } + + error = VOP_PUTPAGE(fp->f_vnode, offset, nbytes, sflags, CRED(), NULL); + if (error == ENOSYS) { + error = ESPIPE; + } + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c new file mode 100644 index 0000000000..449d5882d4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <vm/anon.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/zone.h> +#include <sys/time.h> + +typedef struct lx_sysinfo { + int64_t si_uptime; /* Seconds since boot */ + uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint64_t si_totalram; /* Total memory size */ + uint64_t si_freeram; /* Available memory */ + uint64_t si_sharedram; /* Shared memory */ + uint64_t si_bufferram; /* Buffer memory */ + uint64_t si_totalswap; /* Total swap space */ + uint64_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint64_t si_totalhigh; /* High memory size */ + uint64_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ +} lx_sysinfo_t; + +#if defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit usermode struct. + */ +#pragma pack(4) +typedef struct lx_sysinfo32 { + int32_t si_uptime; /* Seconds since boot */ + uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint32_t si_totalram; /* Total memory size */ + uint32_t si_freeram; /* Available memory */ + uint32_t si_sharedram; /* Shared memory */ + uint32_t si_bufferram; /* Buffer memory */ + uint32_t si_totalswap; /* Total swap space */ + uint32_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint32_t si_totalhigh; /* High memory size */ + uint32_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ + char __si_pad[8]; +} lx_sysinfo32_t; +#pragma pack() +#endif + +extern pgcnt_t swapfs_minfree; + +static void +lx_sysinfo_common(lx_sysinfo_t *si) +{ + zone_t *zone = curthread->t_procp->p_zone; + uint64_t zphysmem, zfreemem, ztotswap, zfreeswap; + + si->si_uptime = gethrestime_sec() - zone->zone_boot_time; + + si->si_loads[0] = zone->zone_hp_avenrun[0]; + si->si_loads[1] = zone->zone_hp_avenrun[1]; + si->si_loads[2] = 
zone->zone_hp_avenrun[2]; + + /* + * In linux each thread looks like a process, so we conflate the + * two in this stat as well. + */ + si->si_procs = (int32_t)zone->zone_nlwps; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are in pages, but the + * zone values are in bytes. Likewise, ani_max and ani_free are in + * pages. + */ + if (zone->zone_phys_mem_ctl == UINT64_MAX) { + zphysmem = physmem; + zfreemem = freemem; + } else { + zphysmem = btop(zone->zone_phys_mem_ctl); + zfreemem = btop(zone->zone_phys_mem_ctl - zone->zone_phys_mem); + } + + if (zone->zone_max_swap_ctl == UINT64_MAX) { + ztotswap = k_anoninfo.ani_max; + zfreeswap = k_anoninfo.ani_free; + } else { + /* + * See the comment in swapctl for a description of how free is + * calculated within a zone. + */ + rctl_qty_t used; + spgcnt_t avail; + uint64_t max; + + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail; + + mutex_enter(&zone->zone_mem_lock); + ztotswap = btop(zone->zone_max_swap_ctl); + used = btop(zone->zone_max_swap); + mutex_exit(&zone->zone_mem_lock); + + zfreeswap = MIN(ztotswap, max) - used; + } + + /* + * If the maximum memory stat is less than 2^20 pages (i.e. 4GB), + * then we report the result in bytes. Otherwise we use pages. + * Once we start supporting >1TB systems/zones, we'll need a third + * option. + */ + if (MAX(zphysmem, ztotswap) < 1024 * 1024) { + si->si_totalram = ptob(zphysmem); + si->si_freeram = ptob(zfreemem); + si->si_totalswap = ptob(ztotswap); + si->si_freeswap = ptob(zfreeswap); + si->si_mem_unit = 1; + } else { + si->si_totalram = zphysmem; + si->si_freeram = zfreemem; + si->si_totalswap = ztotswap; + si->si_freeswap = zfreeswap; + si->si_mem_unit = PAGESIZE; + } + si->si_bufferram = 0; + si->si_sharedram = 0; + + /* + * These two stats refer to high physical memory. 
If an + * application running in a Linux zone cares about this, then + * either it or we are broken. + */ + si->si_totalhigh = 0; + si->si_freehigh = 0; +} + +long +lx_sysinfo64(caddr_t sip) +{ + lx_sysinfo_t si; + + bzero(&si, sizeof (si)); + lx_sysinfo_common(&si); + + if (copyout(&si, sip, sizeof (si)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +long +lx_sysinfo32(caddr_t sip) +{ + lx_sysinfo_t si; + lx_sysinfo32_t si32; + int i; + + lx_sysinfo_common(&si); + + /* + * Convert the lx_sysinfo_t into the legacy 32-bit view: + */ + bzero(&si32, sizeof (si32)); + si32.si_uptime = si.si_uptime; + + for (i = 0; i < 3; i++) { + if ((si.si_loads[i]) > 0x7fffffff) + si32.si_loads[i] = 0x7fffffff; + else + si32.si_loads[i] = si.si_loads[i]; + } + + si32.si_procs = si.si_procs; + si32.si_totalram = si.si_totalram; + si32.si_freeram = si.si_freeram; + si32.si_totalswap = si.si_totalswap; + si32.si_freeswap = si.si_freeswap; + si32.si_mem_unit = si.si_mem_unit; + + si32.si_bufferram = si.si_bufferram; + si32.si_sharedram = si.si_sharedram; + + si32.si_totalhigh = si.si_totalhigh; + si32.si_freehigh = si.si_freehigh; + + if (copyout(&si32, sip, sizeof (si32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c new file mode 100644 index 0000000000..48d91b09cc --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cpuvar.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <lx_syscall.h> + +long +lx_arch_prctl(int code, ulong_t addr) +{ +#if defined(__amd64) + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *llwp = lwptolxlwp(lwp); + pcb_t *pcb = &lwp->lwp_pcb; + + switch (code) { + case LX_ARCH_GET_FS: + if (copyout(&llwp->br_lx_fsbase, (void *)addr, + sizeof (llwp->br_lx_fsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_FS: + llwp->br_lx_fsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_fsbase != llwp->br_lx_fsbase) { + pcb->pcb_fsbase = llwp->br_lx_fsbase; + + /* + * Ensure we go out via update_sregs. + */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); + break; + + case LX_ARCH_GET_GS: + if (copyout(&llwp->br_lx_gsbase, (void *)addr, + sizeof (llwp->br_lx_gsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_GS: + llwp->br_lx_gsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_gsbase != llwp->br_lx_gsbase) { + pcb->pcb_gsbase = llwp->br_lx_gsbase; + + /* + * Ensure we go out via update_sregs. 
+ */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); + break; + + default: + return (set_errno(EINVAL)); + } +#endif + + return (0); +} + +long +lx_get_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + + if (fuword32(&inf->entry_number, (uint32_t *)&entry)) + return (set_errno(EFAULT)); + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + /* + * convert the solaris ldt to the linux format expected by the + * caller + */ + DESC_TO_LDT_INFO(dscrp, &ldt_inf); + ldt_inf.entry_number = entry; + + if (copyout(&ldt_inf, inf, sizeof (struct ldt_info))) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_set_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + int i; + + /* Check that casts for accessing the words in user_desc are valid */ + ASSERT(sizeof (user_desc_t) == 8); + + if (copyin(inf, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + entry = ldt_inf.entry_number; + if (entry == -1) { + /* + * Find an empty entry in the tls for this thread. + * The casts assume each user_desc_t entry is 8 bytes. 
+ */ + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) { + if (((uint_t *)dscrp)[0] == 0 && + ((uint_t *)dscrp)[1] == 0) + break; + } + + if (i < LX_TLSNUM) { + /* + * found one + */ + entry = i + GDT_TLSMIN; + if (suword32(&inf->entry_number, entry)) + return (set_errno(EFAULT)); + } else { + return (set_errno(ESRCH)); + } + } + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + /* + * convert the linux ldt info to standard intel descriptor + */ + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + if (LDT_INFO_EMPTY(&ldt_inf)) { + ((uint_t *)dscrp)[0] = 0; + ((uint_t *)dscrp)[1] = 0; + } else { + LDT_INFO_TO_DESC(&ldt_inf, dscrp); + } + + /* + * update the gdt with the new descriptor + */ + kpreempt_disable(); + + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) + lx_set_gdt(GDT_TLSMIN + i, dscrp); + + kpreempt_enable(); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c new file mode 100644 index 0000000000..c2fb4a4c7d --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c @@ -0,0 +1,379 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * The illumos kernel provides two clock backends: CLOCK_REALTIME, the + * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically + * increasing time source that is not subject to drift or adjustment. By + * contrast, the Linux kernel is furnished with an overabundance of narrowly + * differentiated clock types. 
+ * + * Fortunately, most of the commonly used Linux clock types are either similar + * enough to the native clock backends that they can be directly mapped, or + * represent queries to the per-process and per-LWP microstate counters. + * + * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into + * account time that the system is suspended. Since that is uninteresting to + * us, we treat it the same. + */ + +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * From "uts/common/os/timer.c": + */ +extern int clock_settime(clockid_t, timespec_t *); +extern int clock_gettime(clockid_t, timespec_t *); +extern int clock_getres(clockid_t, timespec_t *); +extern int nanosleep(timespec_t *, timespec_t *); + + +static int lx_emul_clock_getres(clockid_t, timespec_t *); +static int lx_emul_clock_gettime(clockid_t, timespec_t *); +static int lx_emul_clock_settime(clockid_t, timespec_t *); + +typedef struct lx_clock_backend { + clockid_t lclk_ntv_id; + int (*lclk_clock_getres)(clockid_t, timespec_t *); + int (*lclk_clock_gettime)(clockid_t, timespec_t *); + int (*lclk_clock_settime)(clockid_t, timespec_t *); +} lx_clock_backend_t; + +/* + * NOTE: The Linux man pages state this structure is obsolete and is + * unsupported, so it is declared here for sizing purposes only. 
+ */ +struct lx_timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +/* + * Use the native clock_* system call implementation, but with a translated + * clock identifier: + */ +#define NATIVE(ntv_id) \ + { ntv_id, clock_getres, clock_gettime, clock_settime } + +/* + * This backend is not supported, so we provide an emulation handler: + */ +#define EMUL(ntv_id) \ + { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime, \ + lx_emul_clock_settime } + +static lx_clock_backend_t lx_clock_backends[] = { + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC */ + EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */ + EMUL(CLOCK_THREAD_CPUTIME_ID), /* LX_CLOCK_THREAD_CPUTIME_ID */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_RAW */ + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME_COARSE */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_COARSE */ + NATIVE(CLOCK_HIGHRES) /* LX_CLOCK_BOOTTIME */ +}; + +#define LX_CLOCK_MAX \ + (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0])) +#define LX_CLOCK_BACKEND(clk) \ + ((clk) < LX_CLOCK_MAX && (clk) >= 0 ? 
&lx_clock_backends[(clk)] : NULL) + +static int +lx_emul_clock_settime(clockid_t clock, timespec_t *tp) +{ + return (set_errno(EINVAL)); +} + +static int +lx_emul_clock_gettime(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: { + proc_t *p = ttoproc(curthread); + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage() in "rusagesys.c": + */ + mutex_enter(&p->p_lock); + unsecs = mstate_aggr_state(p, LMS_USER); + snsecs = mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + case CLOCK_THREAD_CPUTIME_ID: { + klwp_t *lwp = ttolwp(curthread); + struct mstate *ms = &lwp->lwp_mstate; + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage_lwp() in "rusagesys.c": + */ + unsecs = ms->ms_acct[LMS_USER]; + snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP]; + + scalehrtime(&unsecs); + scalehrtime(&snsecs); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static int +lx_emul_clock_getres(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + if (tp == NULL) { + return (0); + } + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: + case CLOCK_THREAD_CPUTIME_ID: + /* + * These clock backends return microstate accounting values for + * the LWP or the entire process. The Linux kernel claims they + * have nanosecond resolution; so will we. 
+ */ + t.tv_sec = 0; + t.tv_nsec = 1; + break; + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static void +lx_clock_unsupported(int clock) +{ + char buf[100]; + + (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock); + lx_unsupported(buf); +} + +long +lx_clock_settime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_gettime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_getres(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if (tp == NULL) { + return (0); + } + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp)); +} + + +long +lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp) +{ + struct lx_timezone tz; + + bzero(&tz, sizeof (tz)); + + /* + * We want to be similar to libc which just does a fasttrap to + * gethrestime and simply converts that result. We follow how uniqtime + * does the conversion but we can't use that code since it does some + * extra work which can cause the result to bounce around based on which + * CPU we run on. 
+ */ + if (tvp != NULL) { + struct timeval tv; + timestruc_t ts; + int usec, nsec; + + gethrestime(&ts); + nsec = ts.tv_nsec; + usec = nsec + (nsec >> 2); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 4); + usec = nsec - (usec >> 3); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 3); + usec = nsec + (usec >> 4); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 6); + usec = usec >> 10; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = usec; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(&tv, tvp, sizeof (tv)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (copyout(&tv32, tvp, sizeof (tv32))) + return (set_errno(EFAULT)); + } +#endif + } + + /* + * The Linux man page states use of the second parameter is obsolete, + * but gettimeofday(2) should still return EFAULT if it is set + * to a bad non-NULL pointer (sigh...) + */ + if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure + * mode is documented as "undefined." 
+ */ +long +lx_time(time_t *tp) +{ + timestruc_t ts; + struct timeval tv; + + gethrestime(&ts); + tv.tv_sec = ts.tv_sec; + tv.tv_usec = 0; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (tp != NULL && + copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0) + return (set_errno(EFAULT)); + + return (tv.tv_sec); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (tp != NULL && + copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec))) + return (set_errno(EFAULT)); + + return (tv32.tv_sec); + } +#endif +} + +long +lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp) +{ + return (nanosleep(rqtp, rmtp)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_uname.c b/usr/src/uts/common/brand/lx/syscall/lx_uname.c new file mode 100644 index 0000000000..2d18408eaa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_uname.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> + +struct lx_utsname { + char lxu_sysname[LX_SYS_UTS_LN]; + char lxu_nodename[LX_SYS_UTS_LN]; + char lxu_release[LX_SYS_UTS_LN]; + char lxu_version[LX_SYS_UTS_LN]; + char lxu_machine[LX_SYS_UTS_LN]; + char lxu_domainname[LX_SYS_UTS_LN]; +}; + +long +lx_uname(void *uptr) +{ + proc_t *p = curproc; + lx_proc_data_t *lxpd = ptolxproc(p); + lx_zone_data_t *lxzd = ztolxzd(p->p_zone); + struct lx_utsname un; + + bzero(&un, sizeof (un)); + + (void) strlcpy(un.lxu_sysname, LX_UNAME_SYSNAME, LX_SYS_UTS_LN); + (void) strlcpy(un.lxu_nodename, p->p_zone->zone_nodename, + LX_SYS_UTS_LN); + + mutex_enter(&lxzd->lxzd_lock); + + if (lxpd->l_uname_release[0] != '\0') { + (void) strlcpy(un.lxu_release, lxpd->l_uname_release, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_release, lxzd->lxzd_kernel_release, + LX_SYS_UTS_LN); + } + if (lxpd->l_uname_version[0] != '\0') { + (void) strlcpy(un.lxu_version, lxpd->l_uname_version, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_version, lxzd->lxzd_kernel_version, + LX_SYS_UTS_LN); + } + + mutex_exit(&lxzd->lxzd_lock); + + if (get_udatamodel() == DATAMODEL_LP64) { + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE64, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE32, + LX_SYS_UTS_LN); + } + (void) strlcpy(un.lxu_domainname, p->p_zone->zone_domain, + LX_SYS_UTS_LN); + + if (copyout(&un, uptr, sizeof (un)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c new file mode 100644 index 0000000000..e8358f9f69 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c @@ -0,0 +1,377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common 
Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +/* + * wait() family of functions. + * + * The first minor difference between the Linux and Solaris family of wait() + * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully, + * the exit status values are identical between the two implementations. + * + * Things get very different and very complicated when we introduce the Linux + * threading model. Under linux, both threads and child processes are + * represented as processes. However, the behavior of wait() with respect to + * each child varies according to the flags given to clone() + * + * SIGCHLD The SIGCHLD signal should be sent on termination + * CLONE_THREAD The child shares the same thread group as the parent + * CLONE_DETACHED The parent receives no notification when the child exits + * + * The following flags control the Linux behavior w.r.t. 
the above attributes: + * + * __WALL Wait on all children, regardless of type + * __WCLONE Wait only on non-SIGCHLD children + * __WNOTHREAD Don't wait on children of other threads in this group + * + * The following chart shows whether wait() returns when the child exits: + * + * default __WCLONE __WALL + * no SIGCHLD - X X + * SIGCHLD X - X + * + * The following chart shows whether wait() returns when the grandchild exits: + * + * default __WNOTHREAD + * no CLONE_THREAD - - + * CLONE_THREAD X - + * + * The CLONE_DETACHED flag is universal - when the child exits, no state is + * stored and wait() has no effect. + * + * XXX Support the above combination of options, or some reasonable subset that + * covers at least fork() and pthread_create(). + */ + +#include <sys/wait.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_errno.h> +#include <lx_syscall.h> + +/* + * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c": + */ +extern int waitid(idtype_t, id_t, k_siginfo_t *, int); +extern int rusagesys(int, void *, void *, void *, void *); + +/* + * Convert between Linux options and Solaris options, returning -1 if any + * invalid flags are found. + */ +#define LX_WNOHANG 0x00000001 +#define LX_WUNTRACED 0x00000002 +#define LX_WSTOPPED LX_WUNTRACED +#define LX_WEXITED 0x00000004 +#define LX_WCONTINUED 0x00000008 +#define LX_WNOWAIT 0x01000000 + +#define LX_WNOTHREAD 0x20000000 +#define LX_WALL 0x40000000 +#define LX_WCLONE 0x80000000 + +#define LX_P_ALL 0x0 +#define LX_P_PID 0x1 +#define LX_P_GID 0x2 + +/* + * Split the passed waitpid/waitid options into two separate variables: + * those for the native illumos waitid(2), and the extra Linux-specific + * options we will handle in our brand-specific code. 
+ */ +static int +ltos_options(uintptr_t options, int *native_options, int *extra_options) +{ + int newoptions = 0; + + if (((options) & ~(LX_WNOHANG | LX_WUNTRACED | LX_WEXITED | + LX_WCONTINUED | LX_WNOWAIT | LX_WNOTHREAD | LX_WALL | + LX_WCLONE)) != 0) { + return (-1); + } + + *extra_options = options & (LX_WNOTHREAD | LX_WALL | LX_WCLONE); + + if (options & LX_WNOHANG) + newoptions |= WNOHANG; + if (options & LX_WUNTRACED) + newoptions |= WUNTRACED; + if (options & LX_WEXITED) + newoptions |= WEXITED; + if (options & LX_WCONTINUED) + newoptions |= WCONTINUED; + if (options & LX_WNOWAIT) + newoptions |= WNOWAIT; + + /* + * The trapped option is implicit on Linux. + */ + newoptions |= WTRAPPED; + + *native_options = newoptions; + return (0); +} + +static int +lx_wstat(int code, int status) +{ + int stat = 0; + + switch (code) { + case CLD_EXITED: + stat = status << 8; + break; + case CLD_DUMPED: + stat = lx_stol_signo(status, SIGKILL) | WCOREFLG; + break; + case CLD_KILLED: + stat = lx_stol_signo(status, SIGKILL); + break; + case CLD_TRAPPED: + case CLD_STOPPED: + stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG; + break; + case CLD_CONTINUED: + stat = WCONTFLG; + break; + } + + return (stat); +} + +static int +lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options, + int extra_options) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int error; + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. 
+ */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = extra_options; + + if ((error = waitid(idtype, id, sip, native_options)) == EINTR) { + /* + * According to signal(7), the wait4(2), waitid(2), and + * waitpid(2) system calls are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (error); +} + +long +lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) +{ + k_siginfo_t info = { 0 }; + idtype_t idtype; + id_t id; + int status = 0; + pid_t pid = (pid_t)p1; + int error; + int native_options, extra_options; + int *statusp = (int *)p2; + void *rup = (void *)p4; + + if (ltos_options(p3, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (pid > maxpid) { + return (set_errno(ECHILD)); + } + + /* + * While not listed as a valid return code, Linux's wait4(2) does, + * in fact, get an EFAULT if either the status pointer or rusage + * pointer is invalid. Since a failed waitpid should leave child + * process in a state where a future wait4(2) will succeed, we + * check them by copying out the values their buffers originally + * contained. (We need to do this as a failed system call should + * never affect the contents of a passed buffer.) + * + * This will fail if the buffers in question are write-only. + */ + if (statusp != NULL) { + if (copyin(statusp, &status, sizeof (status)) != 0 || + copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + /* + * Do the same check for the "struct rusage" pointer, which differs + * in size for 32- and 64-bit processes. 
+ */ + if (rup != NULL) { + struct rusage ru; + void *krup = &ru; + size_t rusz = sizeof (ru); +#if defined(_SYSCALL32_IMPL) + struct rusage32 ru32; + + if (get_udatamodel() != DATAMODEL_NATIVE) { + krup = &ru32; + rusz = sizeof (ru32); + } +#endif + + if (copyin(rup, krup, rusz) != 0 || + copyout(krup, rup, rusz) != 0) { + return (set_errno(EFAULT)); + } + } + + if (pid < -1) { + idtype = P_PGID; + id = -pid; + } else if (pid == -1) { + idtype = P_ALL; + id = 0; + } else if (pid == 0) { + idtype = P_PGID; + mutex_enter(&pidlock); + id = curproc->p_pgrp; + mutex_exit(&pidlock); + } else { + idtype = P_PID; + id = pid; + } + + native_options |= (WEXITED | WTRAPPED); + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + + status = lx_wstat(info.si_code, info.si_status); + + /* + * Unfortunately if this attempt to copy out either the status or the + * rusage fails, the process will be in an inconsistent state as + * subsequent calls to wait for the same child will fail where they + * should succeed on a Linux system. This, however, is rather + * unlikely since we tested the validity of both above. 
+ */ + if (statusp != NULL) { + if (copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + if (rup != NULL) { + if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL, + NULL, NULL)) != 0) { + return (set_errno(error)); + } + } + + return (info.si_pid); +} + +long +lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3) +{ + return (lx_wait4(p1, p2, p3, NULL)); +} + +long +lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) +{ + int error; + int native_options, extra_options; + k_siginfo_t info = { 0 }; + + if (ltos_options(opt, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) { + return (set_errno(EINVAL)); + } + + switch (idtype) { + case LX_P_ALL: + idtype = P_ALL; + break; + case LX_P_PID: + idtype = P_PID; + break; + case LX_P_GID: + idtype = P_PGID; + break; + default: + return (set_errno(EINVAL)); + } + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (stol_ksiginfo32_copyout(&info, (void *)infop)); + } else +#endif + { + return (stol_ksiginfo_copyout(&info, (void *)infop)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c new file mode 100644 index 0000000000..bd7667226f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c @@ -0,0 +1,371 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/pathname.h> + + +#define LX_XATTR_NAME_MAX 255 +#define LX_XATTR_SIZE_MAX 65536 +#define LX_XATTR_LIST_MAX 65536 + +#define LX_XATTR_FLAG_CREATE 0x1 +#define LX_XATTR_FLAG_REPLACE 0x2 +#define LX_XATTR_FLAGS_VALID (LX_XATTR_FLAG_CREATE | LX_XATTR_FLAG_REPLACE) + +#define LX_CAP_XATTR_NAME "security.capability" + +/* + * *xattr() family of functions. + * + * These are largely unimplemented. In most cases we return EOPNOTSUPP, rather + * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1). + * + * Note that CRED() is used instead of f_cred in the f*xattr functions. This + * is intentional as Linux does not have the same notion of per-fd credentials. + */ + +/* ARGSUSED */ +static int +lx_setxattr_common(vnode_t *vp, char *name, void *value, size_t size, + int flags) +{ + int error; + char name_buf[LX_XATTR_NAME_MAX + 1]; + size_t name_len; + + if ((flags & ~LX_XATTR_FLAGS_VALID) != 0) { + return (EINVAL); + } + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + if (size > LX_XATTR_SIZE_MAX) { + return (E2BIG); + } + + /* + * In order to keep package management software happy, despite lacking + * support for file-based Linux capabilities via xattrs, we fake + * success when root attempts a setxattr on that attribute. 
+ */ + if (crgetuid(CRED()) == 0 && + strcmp(name_buf, LX_CAP_XATTR_NAME) == 0) { + return (0); + } + + + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +static int +lx_getxattr_common(vnode_t *vp, char *name, char *value, size_t size, + ssize_t *osize) +{ + int error; + char name_buf[LX_XATTR_NAME_MAX + 1]; + size_t name_len; + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + /* + * Only parameter validation is attempted for now. + */ + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +static int +lx_listxattr_common(vnode_t *vp, char *list, size_t size, ssize_t *osize) +{ + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +static int +lx_removexattr_common(vnode_t *vp, char *name) +{ + int error; + char name_buf[LX_XATTR_NAME_MAX + 1]; + size_t name_len; + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + /* + * Only parameter validation is attempted for now. 
+ */ + return (EOPNOTSUPP); +} + + +long +lx_setxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lsetxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fsetxattr(int fd, char *name, void *value, size_t size, int flags) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_setxattr_common(fp->f_vnode, name, value, size, flags); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +ssize_t +lx_getxattr(char *path, char *name, void *value, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_lgetxattr(char *path, char *name, void *value, size_t size) +{ + + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_fgetxattr(int fd, 
char *name, void *value, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_getxattr_common(fp->f_vnode, name, value, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_listxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_llistxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_flistxattr(int fd, char *list, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_listxattr_common(fp->f_vnode, list, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +int +lx_removexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +int +lx_lremovexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = 
lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +int +lx_fremovexattr(int fd, char *name) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_removexattr_common(fp->f_vnode, name); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h new file mode 100644 index 0000000000..93dc316c1e --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h @@ -0,0 +1,196 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _LXSYSFS_H +#define _LXSYSFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_sysfs.h: declarations, data structures and macros for lx_sysfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> + +/* + * Convert a vnode into an lxsys_mnt_t + */ +#define VTOLXSM(vp) ((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxsys_node + */ +#define VTOLXS(vp) ((lxsys_node_t *)(vp)->v_data) + +/* + * convert a lxsys_node into a vnode + */ +#define LXSTOV(lxsnp) ((lxsnp)->lxsys_vnode) + +/* + * convert a lxsys_node into zone for fs + */ +#define LXSTOZ(lxsnp) \ + (((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone) + +#define LXSNSIZ 256 /* max size of lx /sys file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXSYS_SDSIZE 16 + +/* Root sysfs lxsys_instance */ +#define LXSYS_INST_ROOT 0 + +/* + * Node/file types for lx /sys files + * (directories and files contained therein). 
+ */ +typedef enum lxsys_nodetype { + LXSYS_NONE, /* None-type to keep inodes non-zero */ + LXSYS_STATIC, /* Statically defined entries */ + LXSYS_CLASS_NET, /* /sys/class/net/<iface> */ + LXSYS_DEV_NET, /* /sys/devices/virtual/net/<iface> */ + LXSYS_BLOCK, /* /sys/block/<dev> */ + LXSYS_DEV_ZFS, /* /sys/devices/zfs/<dev> */ + LXSYS_DEV_SYS_CPU, /* /sys/devices/system/cpu/<cpu> */ + LXSYS_DEV_SYS_CPUINFO, /* /sys/devices/system/cpu/cpuN/<info> */ + LXSYS_DEV_SYS_NODE, /* /sys/devices/system/node/node0/<info> */ + LXSYS_MAXTYPE, /* type limit */ +} lxsys_nodetype_t; + +/* + * external dirent characteristics + */ +typedef struct { + unsigned int d_idnum; + char *d_name; +} lxsys_dirent_t; + +typedef struct { + unsigned int dl_instance; + lxsys_dirent_t *dl_list; + int dl_length; +} lxsys_dirlookup_t; + +/* + * This is the lx sysfs private data object + * which is attached to v_data in the vnode structure + */ +struct lxsys_node; +typedef struct lxsys_node lxsys_node_t; +struct lxsys_node { + lxsys_nodetype_t lxsys_type; /* type ID of node */ + unsigned int lxsys_instance; /* instance ID node */ + unsigned int lxsys_endpoint; /* endpoint ID node */ + vnode_t *lxsys_vnode; /* vnode for the node */ + vnode_t *lxsys_parentvp; /* parent directory */ + lxsys_node_t *lxsys_next; /* next list entry */ + timestruc_t lxsys_time; /* creation time */ + mode_t lxsys_mode; /* file mode bits */ + uid_t lxsys_uid; /* file owner */ + gid_t lxsys_gid; /* file group owner */ + ino_t lxsys_ino; /* node id */ +}; + +/* + * This is the lxsysfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxsys_mnt { + kmutex_t lxsysm_lock; /* protects fields */ + lxsys_node_t *lxsysm_node; /* node at root of sys mount */ + zone_t *lxsysm_zone; /* zone for this mount */ +} lxsys_mnt_t; + +extern vnodeops_t *lxsys_vnodeops; + +typedef struct mounta mounta_t; + +extern void lxsys_initnodecache(); +extern void lxsys_fininodecache(); +extern ino_t 
lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int); +extern ino_t lxsys_parentinode(lxsys_node_t *); +extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int, + unsigned int); +extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int); +extern void lxsys_freenode(lxsys_node_t *); + +extern netstack_t *lxsys_netstack(lxsys_node_t *); +extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t bufsize; + char *pos; + size_t beg; + int error; +} lxsys_uiobuf_t; + +extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *); +extern void lxsys_uiobuf_free(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int); +extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t); +extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LXSYSFS_H */ diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c new file mode 100644 index 0000000000..3184b34d08 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c @@ -0,0 +1,457 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * lx_syssubr.c: Various functions for the /sys vnodeops. 
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lx_sysfs.h" + +#define LXSYSCACHE_NAME "lxsys_cache" + +static int lxsys_node_constructor(void *, void *, int); +static void lxsys_node_destructor(void *, void *); + +static kmem_cache_t *lxsys_node_cache; + +void +lxsys_initnodecache() +{ + lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME, + sizeof (lxsys_node_t), 0, + lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxsys_fininodecache() +{ + kmem_cache_destroy(lxsys_node_cache); +} + +/* ARGSUSED */ +static int +lxsys_node_constructor(void *buf, void *un, int kmflags) +{ + lxsys_node_t *lxsnp = buf; + vnode_t *vp; + + vp = lxsnp->lxsys_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxsys_vnodeops); + vp->v_data = lxsnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxsys_node_destructor(void *buf, void *un) +{ + lxsys_node_t *lxsnp = buf; + + vn_free(LXSTOV(lxsnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxsys node + */ +ino_t +lxsys_inode(lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + /* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + */ + ASSERT(instance <= 0xffff); + ASSERT(endpoint <= 0xff); + + return ((ino_t)(type << 24)|(instance << 8)|endpoint); +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxsys_parentinode(lxsys_node_t *lxsnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxsnp->lxsys_type == LXSYS_STATIC && + lxsnp->lxsys_instance == LXSYS_INST_ROOT) { + return (lxsnp->lxsys_ino); + } else { + return (VTOLXS(lxsnp->lxsys_parentvp)->lxsys_ino); + } +} + +/* + * Allocate a new lxsys node + * + * This 
also allocates the vnode associated with it + */ +lxsys_node_t * +lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + lxsys_node_t *lxsnp; + vnode_t *vp; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxsnp->lxsys_type = type; + lxsnp->lxsys_instance = instance; + lxsnp->lxsys_endpoint = endpoint; + lxsnp->lxsys_next = NULL; + lxsnp->lxsys_parentvp = dp; + VN_HOLD(dp); + + lxsnp->lxsys_time = now; + lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0; + lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint); + + /* initialize the vnode data */ + vp = lxsnp->lxsys_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Default to a directory with open permissions. + * Specific components will override this + */ + if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) { + vp->v_flag |= VROOT; + } + vp->v_type = VDIR; + lxsnp->lxsys_mode = 0555; + + return (lxsnp); +} + +lxsys_node_t * +lxsys_getnode_static(vnode_t *dp, unsigned int instance) +{ + lxsys_mnt_t *lxsm = VTOLXSM(dp); + lxsys_node_t *lnp; + + mutex_enter(&lxsm->lxsysm_lock); + lnp = lxsm->lxsysm_node; + while (1) { + if (lnp->lxsys_instance == instance) { + VERIFY(lnp->lxsys_parentvp == dp); + + VN_HOLD(lnp->lxsys_vnode); + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); + } else if (lnp->lxsys_next == NULL) { + break; + } + lnp = lnp->lxsys_next; + } + + /* + * No persistent node found. + * Create one and add it to the end of the list. 
+ */ + lnp->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0); + lnp = lnp->lxsys_next; + /* Allow mounts on static entries */ + LXSTOV(lnp)->v_flag &= (~VNOMOUNT); + + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); +} + +/* Clean up persistence for static lxsys_node */ +int +lxsys_freenode_static(lxsys_node_t *lnp) +{ + lxsys_node_t *plnp; + vnode_t *vp = LXSTOV(lnp); + lxsys_mnt_t *lxsm = VTOLXSM(vp); + + if (lnp->lxsys_instance == LXSYS_INST_ROOT) { + /* + * The root vnode does not need special cleanup since it + * anchors the list and is freed by lxsys_unmount. + */ + return (0); + } + + mutex_enter(&lxsm->lxsysm_lock); + + /* + * It is possible that a different process acquired a fresh reference + * to this vnode via lookup while we were waiting on the lxsysm_lock. + * To avoid freeing the vnode out from under them, we will double-check + * v_count and bail from the fop_inactive if it was grabbed. + */ + mutex_enter(&vp->v_lock); + if (vp->v_count != 1) { + VERIFY(vp->v_count > 0); + + /* Release our hold before bailing out of lxsys_inactive */ + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&lxsm->lxsysm_lock); + return (-1); + } + mutex_exit(&vp->v_lock); + + /* search for the record pointing to lnp */ + plnp = lxsm->lxsysm_node; + while (plnp != NULL && plnp->lxsys_next != lnp) { + plnp = plnp->lxsys_next; + } + /* entry should always be found */ + VERIFY(plnp != NULL); + plnp->lxsys_next = lnp->lxsys_next; + + mutex_exit(&lxsm->lxsysm_lock); + return (0); +} + +/* + * Free the storage obtained from lxsys_getnode(). + */ +void +lxsys_freenode(lxsys_node_t *lxsnp) +{ + vnode_t *vp = LXSTOV(lxsnp); + + VERIFY(vp != NULL); + + if (lxsnp->lxsys_type == LXSYS_STATIC) { + if (lxsys_freenode_static(lxsnp) != 0) { + return; + } + } + + /* + * delete any association with parent vp + */ + if (lxsnp->lxsys_parentvp != NULL) + VN_RELE(lxsnp->lxsys_parentvp); + + /* + * Release the lxsysnode. 
+ */ + kmem_cache_free(lxsys_node_cache, lxsnp); +} + +/* + * Get the netstack associated with this lxsys mount + */ +netstack_t * +lxsys_netstack(lxsys_node_t *lnp) +{ + zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone; + netstack_t *ns = zone->zone_netstack; + + VERIFY(ns != NULL); + + if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) { + ns = NULL; + } else { + netstack_hold(ns); + } + + return (ns); +} + +ill_t * +lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex) +{ + ill_t *ill; + phyint_t *phyi; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &ifindex, NULL); + if (phyi != NULL) { + /* + * Since interface information presented via /sys is not + * specific to IPv4 or IPv6, an ill reference from either + * protocol will be adequate. Check both, starting with IPv4 + * for a valid reference to use. + */ + for (ill = phyi->phyint_illv4; ill != phyi->phyint_illv6; + ill = phyi->phyint_illv6) { + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ill); + } + mutex_exit(&ill->ill_lock); + } + } + } + rw_exit(&ipst->ips_ill_g_lock); + return (NULL); +} + + +#define LXSYSUIOBUFSZ 4096 + +lxsys_uiobuf_t * +lxsys_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxsys_uiobuf and output buffer */ + int bufsize = LXSYSUIOBUFSZ; + lxsys_uiobuf_t *uiobuf = + kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->bufsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize); +} + +void +lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err) +{ + 
ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->bufsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxsys_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...) 
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxsys_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxsys_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c new file mode 100644 index 0000000000..9bb1d70527 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c @@ -0,0 +1,348 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * lxsysvfsops.c: vfs operations for lx sysfs. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> + +#include "lx_sysfs.h" + +/* Module level parameters */ +static int lxsysfstype; +static dev_t lxsysdev; +static kmutex_t lxsys_mount_lock; + +static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxsys_unmount(vfs_t *, int, cred_t *); +static int lxsys_root(vfs_t *, vnode_t **); +static int lxsys_statvfs(vfs_t *, statvfs64_t *); +static int lxsys_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_sysfs", + lxsys_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand sysfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxsys_node cache + */ + lxsys_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxsysfstype); + vn_freevnodeops(lxsys_vnodeops); + + mutex_destroy(&lxsys_mount_lock); +done: + return (retval); +} + +static int +lxsys_init(int fstype, char *name) +{ + static const fs_operation_def_t lxsys_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxsys_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxsys_unmount }, + VFSNAME_ROOT, { .vfs_root = lxsys_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxsys_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxsys_vnodeops_template[]; + int error; + major_t dev; + + lxsysfstype = fstype; + ASSERT(lxsysfstype != 0); + + mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxsys_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxsys_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxsys_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxsysdev = makedevice(dev, 0); + + /* + * Initialise cache for lxsys_nodes + */ + lxsys_initnodecache(); + + return (0); +} + +static int +lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt; + zone_t *zone = curproc->p_zone; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxsys" doesn't make sense + */ + vfs_setresource(vfsp, "lxsys", 0); + + lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP); + + mutex_enter(&lxsys_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxsys_mount_lock); + kmem_free(lxsys_mnt, sizeof ((*lxsys_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + + mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL); + zone_hold(lxsys_mnt->lxsysm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC, + LXSYS_INST_ROOT, 0); + lxsys_mnt->lxsysm_node->lxsys_next = NULL; + + /* Correctly set the fs for the root node */ + lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxsysfstype; + vfsp->vfs_data = (caddr_t)lxsys_mnt; + vfsp->vfs_dev = lxsysdev; + + 
mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt = (lxsys_mnt_t *)vfsp->vfs_data; + lxsys_node_t *lnp; + vnode_t *vp; + int count; + + VERIFY(lxsys_mnt != NULL); + + mutex_enter(&lxsys_mount_lock); + + /* must be root to unmount */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxsys_mount_lock); + return (EPERM); + } + + /* forced unmount is not supported by this fs */ + if (flag & MS_FORCE) { + mutex_exit(&lxsys_mount_lock); + return (ENOTSUP); + } + + /* Ensure that no vnodes are in use on this mount point. */ + lnp = lxsys_mnt->lxsysm_node; + vp = LXSTOV(lnp); + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxsys_mount_lock); + return (EBUSY); + } + + /* + * If there are no references to the root vnode the list of persistent + * static vnodes should be empty + */ + VERIFY(lnp->lxsys_next == NULL); + + (void) dnlc_purge_vfsp(vfsp, 0); + + lxsys_mnt->lxsysm_node = NULL; + lxsys_freenode(lnp); + zone_rele(lxsys_mnt->lxsysm_zone); + vfsp->vfs_data = NULL; + kmem_free(lxsys_mnt, sizeof (*lxsys_mnt)); + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxsys_mnt_t *lxsm = (lxsys_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + + VERIFY(lxsm != NULL); + VERIFY(lxsm->lxsysm_node != NULL); + + vp = LXSTOV(lxsm->lxsysm_node); + VN_HOLD(vp); + *vpp = vp; + + return (0); +} + +static int +lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + dev32_t d32; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)3; + sp->f_ffree = (fsfilcnt64_t)0; /* none */ + sp->f_favail = (fsfilcnt64_t)0; /* none */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name 
will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/sys"); + (void) strcpy(&sp->f_fstr[6], "/sys"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c new file mode 100644 index 0000000000..f3df77428c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c @@ -0,0 +1,1796 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * lx_sysfs -- a Linux-compatible /sys for the LX brand + */ + +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/sunddi.h> +#include <sys/vnode.h> +#include <sys/netstack.h> +#include <sys/ethernet.h> +#include <inet/ip_arp.h> + +#include "lx_sysfs.h" + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxsys_init() in lx_sysvfsops.c + */ +vnodeops_t *lxsys_vnodeops; + +static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxsys_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxsys_sync(void); +static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_blockdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_zfsdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpu(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpuinfo(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_sysnode(lxsys_node_t *, char *); + +static int lxsys_read_static(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_zfs_block(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_sysnode(lxsys_node_t *, lxsys_uiobuf_t *); + +static int lxsys_readdir_devices_syscpu(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_syscpuinfo(lxsys_node_t *, uio_t *, int *); +static int 
lxsys_readdir_devices_sysnode(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_blockdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_zfsdir(lxsys_node_t *, uio_t *, int *); + +static int lxsys_readlink_class_net(lxsys_node_t *, char *, size_t); +static int lxsys_readlink_block(lxsys_node_t *, char *, size_t); + +/* + * The lx /sys vnode operations vector + */ +const fs_operation_def_t lxsys_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxsys_open }, + VOPNAME_CLOSE, { .vop_close = lxsys_close }, + VOPNAME_READ, { .vop_read = lxsys_read }, + VOPNAME_GETATTR, { .vop_getattr = lxsys_getattr }, + VOPNAME_ACCESS, { .vop_access = lxsys_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxsys_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxsys_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxsys_readlink }, + VOPNAME_FSYNC, { .error = lxsys_sync }, + VOPNAME_SEEK, { .error = lxsys_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxsys_inactive }, + VOPNAME_CMP, { .vop_cmp = lxsys_cmp }, + NULL, NULL +}; + +/* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + * + * Where TYPE is one of: + * 1 - SYS_STATIC + * 2 - SYS_CLASS_NET + * 3 - SYS_DEV_NET + * 4 - SYS_BLOCK + * 5 - SYS_DEV_ZFS + * 6 - SYS_DEV_SYS_CPU + * 7 - SYS_DEV_SYS_CPUINFO + * 8 - SYS_DEV_SYS_NODE + * + * Static entries will have assigned INSTANCE identifiers: + * - 0x00: /sys + * - 0x01: /sys/class + * - 0x02: /sys/devices + * - 0x03: /sys/fs + * - 0x04: /sys/class/net + * - 0x05: /sys/devices/virtual + * - 0x06: /sys/devices/system + * - 0x07: /sys/fs/cgroup + * - 0x08: /sys/devices/virtual/net + * - 0x09: /sys/block + * - 0x0a: /sys/devices/zfs + * - 0x0b: /sys/devices/system/cpu + * - 0x0c: 
/sys/devices/system/cpu/kernel_max + * - 0x0d: /sys/devices/system/node + * + * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived + * from the corresponding ifindex. + * + * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use + * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs + * for the contained entries. + * + * Dynamic /sys/block/<dev> symlinks will use an INSTANCE derived from the + * device major and instance from records listed in kstat or zvols. + * + * Dynamic /sys/devices/zfs/<dev> directories will use an INSTANCE derived from + * the emulated minor number. + * + * Static/Dynamic /sys/devices/system/cpu contains a static kernel_max file + * and a dynamic set of cpuN subdirectories. + * + * Static/Dynamic /sys/devices/system/node/node0 currently only contains a + * static cpulist file, but will likely need future dynamic entries for cpuN + * symlinks, and perhaps other static files. By only providing 'node0' we + * pretend that there is only a single NUMA node available to a zone (trying to + * be NUMA-aware inside a zone is generally not going to work anyway). + */ + +#define LXSYS_INST_CLASSDIR 0x1 +#define LXSYS_INST_DEVICESDIR 0x2 +#define LXSYS_INST_FSDIR 0x3 +#define LXSYS_INST_CLASS_NETDIR 0x4 +#define LXSYS_INST_DEVICES_VIRTUALDIR 0x5 +#define LXSYS_INST_DEVICES_SYSTEMDIR 0x6 +#define LXSYS_INST_FS_CGROUPDIR 0x7 +#define LXSYS_INST_DEVICES_VIRTUAL_NETDIR 0x8 +#define LXSYS_INST_BLOCKDIR 0x9 +#define LXSYS_INST_DEVICES_ZFSDIR 0xa +#define LXSYS_INST_DEVICES_SYSCPU 0xb +#define LXSYS_INST_DEV_SYSCPU_KMAX 0xc +#define LXSYS_INST_DEVICES_SYSNODE 0xd + +/* + * file contents of an lx /sys directory.
+ */ +static lxsys_dirent_t dirlist_root[] = { + { LXSYS_INST_BLOCKDIR, "block" }, + { LXSYS_INST_CLASSDIR, "class" }, + { LXSYS_INST_DEVICESDIR, "devices" }, + { LXSYS_INST_FSDIR, "fs" } +}; +static lxsys_dirent_t dirlist_empty[] = {}; +static lxsys_dirent_t dirlist_class[] = { + { LXSYS_INST_CLASS_NETDIR, "net" } +}; +static lxsys_dirent_t dirlist_fs[] = { + { LXSYS_INST_FS_CGROUPDIR, "cgroup" } +}; +static lxsys_dirent_t dirlist_devices[] = { + { LXSYS_INST_DEVICES_SYSTEMDIR, "system" }, + { LXSYS_INST_DEVICES_VIRTUALDIR, "virtual" }, + { LXSYS_INST_DEVICES_ZFSDIR, "zfs" } +}; +static lxsys_dirent_t dirlist_devices_virtual[] = { + { LXSYS_INST_DEVICES_VIRTUAL_NETDIR, "net" } +}; + +/* + * XXX: The presence of the cpu tree in sysfs triggers new behavior in various + * applications. The glibc code which accesses this part of the tree expects + * dirents to have the d_type field populated. We cannot implement the 'cpu' + * hierarchy until that is addressed. One such application is java, which + * becomes unstable due to the incorrect data from glibc. 
+ */ +static lxsys_dirent_t dirlist_devices_system[] = { + /* { LXSYS_INST_DEVICES_SYSCPU, "cpu" }, */ + { LXSYS_INST_DEVICES_SYSNODE, "node" } +}; + +#define LXSYS_ENDP_NET_ADDRESS 1 +#define LXSYS_ENDP_NET_ADDRLEN 2 +#define LXSYS_ENDP_NET_FLAGS 3 +#define LXSYS_ENDP_NET_IFINDEX 4 +#define LXSYS_ENDP_NET_MTU 5 +#define LXSYS_ENDP_NET_TXQLEN 6 +#define LXSYS_ENDP_NET_TYPE 7 + +#define LXSYS_ENDP_BLOCK_DEVICE 1 + +#define LXSYS_ENDP_NODE_CPULIST 1 + +static lxsys_dirent_t dirlist_devices_virtual_net[] = { + { LXSYS_ENDP_NET_ADDRESS, "address" }, + { LXSYS_ENDP_NET_ADDRLEN, "addr_len" }, + { LXSYS_ENDP_NET_FLAGS, "flags" }, + { LXSYS_ENDP_NET_IFINDEX, "ifindex" }, + { LXSYS_ENDP_NET_MTU, "mtu" }, + { LXSYS_ENDP_NET_TXQLEN, "tx_queue_len" }, + { LXSYS_ENDP_NET_TYPE, "type" } +}; + +static lxsys_dirent_t dirlist_devices_zfs_block[] = { + { LXSYS_ENDP_BLOCK_DEVICE, "device" } +}; + +static lxsys_dirent_t dirlist_devices_sysnode[] = { + { LXSYS_ENDP_NODE_CPULIST, "cpulist" } +}; + +#define SYSDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +#define SYSDLENT(i, l) { i, l, SYSDIRLISTSZ(l) } +static lxsys_dirlookup_t lxsys_dirlookup[] = { + SYSDLENT(LXSYS_INST_ROOT, dirlist_root), + SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class), + SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs), + SYSDLENT(LXSYS_INST_FS_CGROUPDIR, dirlist_empty), + SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices), + SYSDLENT(LXSYS_INST_DEVICES_SYSTEMDIR, dirlist_devices_system), + SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual), + SYSDLENT(LXSYS_INST_DEVICES_SYSNODE, dirlist_devices_sysnode) +}; + + +/* + * Array of lookup functions, indexed by lx /sys file type. 
+ */ +static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_lookup_static, /* LXSYS_STATIC */ + lxsys_lookup_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_lookup_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_lookup_blockdir, /* LXSYS_BLOCK */ + lxsys_lookup_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_lookup_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_lookup_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_lookup_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readdir functions, indexed by /sys file type. + */ +static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_readdir_static, /* LXSYS_STATIC */ + lxsys_readdir_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_readdir_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_readdir_blockdir, /* LXSYS_BLOCK */ + lxsys_readdir_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_readdir_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_readdir_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_readdir_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of read functions, indexed by /sys file type. + */ +static int (*lxsys_read_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_read_static, /* LXSYS_STATIC */ + NULL, /* LXSYS_CLASS_NET */ + lxsys_read_devices_virtual_net, /* LXSYS_DEV_NET */ + NULL, /* LXSYS_BLOCK */ + lxsys_read_devices_zfs_block, /* LXSYS_DEV_ZFS */ + NULL, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_read_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readlink functions, indexed by /sys file type. 
+ */ +static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + NULL, /* LXSYS_STATIC */ + lxsys_readlink_class_net, /* LXSYS_CLASS_NET */ + NULL, /* LXSYS_DEV_NET */ + lxsys_readlink_block, /* LXSYS_BLOCK */ + NULL, /* LXSYS_DEV_ZFS */ + NULL, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + NULL, /* LXSYS_DEV_SYS_NODE */ +}; + +typedef struct lxsys_cpu_info { + processorid_t cpu_id; + processorid_t cpu_seqid; +} lxsys_cpu_info_t; + +/* + * lxsys_open(): Vnode operation for VOP_OPEN() + */ +static int +lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + return (0); +} + + +/* + * lxsys_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + + +/* + * lxsys_read(): Vnode operation for VOP_READ() + * Directories are rejected with EISDIR; any other node is rendered by the + * per-type handler registered in lxsys_read_function[].
+ */ +/* ARGSUSED */ +static int +lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + lxsys_uiobuf_t *luio; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type == VDIR) { + return (EISDIR); + } + + rlfunc = lxsys_read_function[type]; + if (rlfunc != NULL) { + luio = lxsys_uiobuf_new(uiop); + if ((error = rlfunc(lnp, luio)) == 0) { + error = lxsys_uiobuf_flush(luio); + } + lxsys_uiobuf_free(luio); + } else { + error = EIO; + } + + return (error); +} + +/* + * lxsys_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxsys_node_t *lxsnp = VTOLXS(vp); + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxsnp->lxsys_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxsnp->lxsys_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxsnp->lxsys_uid; + vap->va_gid = lxsnp->lxsys_gid; + vap->va_nodeid = lxsnp->lxsys_ino; + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxsys_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxsys_node_t *lxsnp = VTOLXS(vp); + int shift = 0; + + /* + * Although our lx sysfs is basically a read only file system, Linux + * expects it to be writable so we can't just error if (mode & VWRITE). + */ + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
+ */ + if (crgetuid(cr) != lxsnp->lxsys_uid) { + shift += 3; + if (!groupmember((uid_t)lxsnp->lxsys_gid, cr)) + shift += 3; + } + + mode &= ~(lxsnp->lxsys_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* + * lxsys_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + int error; + + VERIFY(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxsys_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxsnp->lxsys_parentvp); + *vpp = lxsnp->lxsys_parentvp; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxsys_lookup_function[type](lxsnp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +static lxsys_node_t * +lxsys_lookup_disk(lxsys_node_t *ldp, char *comp, lxsys_nodetype_t type) +{ + lxsys_node_t *lnp = NULL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (NULL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int inst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (strcmp(vd->lxvd_name, comp) == 0 && inst != 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, type, inst, 0); + break; + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (lnp); +} + +static vnode_t * +lxsys_lookup_static(lxsys_node_t *ldp, char *comp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (ldp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + if (dirent == NULL) { + return (NULL); + } + + for (i = 0; i < len; i++) { + if (strncmp(comp, dirent[i].d_name, MAXPATHLEN) == 0) { + lxsys_nodetype_t node_type = ldp->lxsys_type; + unsigned int node_instance = 0; + lxsys_node_t *lnp; + + switch (dirent[i].d_idnum) { + case LXSYS_INST_BLOCKDIR: + node_type = LXSYS_BLOCK; + break; + case LXSYS_INST_CLASS_NETDIR: + node_type = LXSYS_CLASS_NET; + break; + case LXSYS_INST_DEVICES_VIRTUAL_NETDIR: + node_type = LXSYS_DEV_NET; + break; + case LXSYS_INST_DEVICES_ZFSDIR: + node_type = LXSYS_DEV_ZFS; + break; + case LXSYS_INST_DEVICES_SYSCPU: + node_type = LXSYS_DEV_SYS_CPU; + break; + case LXSYS_INST_DEVICES_SYSNODE: + node_type = LXSYS_DEV_SYS_NODE; + break; + default: + /* Another static node */ + node_instance = dirent[i].d_idnum; + } + if (node_type == LXSYS_STATIC) { + lnp = lxsys_getnode_static(ldp->lxsys_vnode, + node_instance); + } else { + lnp = lxsys_getnode(ldp->lxsys_vnode, + node_type, node_instance, 0); + } + return (lnp->lxsys_vnode); + } + } + return 
(NULL); +} + +/* + * lxsys_lookup_class_netdir(): lookup handler for /sys/class/net. + * + * Lookups are honoured only on the directory node itself (instance 0). The + * component is converted to a native interface name and searched for in the + * by-name phyint tree of the netstack returned by lxsys_netstack(). A match + * yields a VLNK node whose instance is the interface's ifindex; otherwise + * NULL is returned. + */ +static vnode_t * +lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp) +{ + vnode_t *result = NULL; + lxsys_node_t *lnp; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + if (ldp->lxsys_type != LXSYS_CLASS_NET || + ldp->lxsys_instance != 0) { + /* Lookups only allowed at directory level */ + return (NULL); + } + + /* + * A component that does not fit in ifname would be left without a + * terminating NUL by strncpy(), and the string handling below could + * then run off the end of the buffer. Such a name cannot be held in + * an LIFNAMSIZ-sized buffer, so report it as not found. + */ + if (strlcpy(ifname, comp, sizeof (ifname)) >= sizeof (ifname)) { + return (NULL); + } + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + result->v_type = VLNK; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); +} + +/* + * lxsys_lookup_devices_virtual_netdir(): lookup handler for + * /sys/devices/virtual/net. + * + * On the top-level directory (instance 0) the component names an interface + * and resolves to a node whose instance is that interface's ifindex. On an + * interface directory (endpoint 0) the component is matched against + * dirlist_devices_virtual_net[]. Returns NULL when nothing matches. + */ +static vnode_t * +lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level interface listing */ + vnode_t *result = NULL; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + /* + * Reject components too long for ifname rather than + * leaving the buffer unterminated (see strncpy()). + */ + if (strlcpy(ifname, comp, sizeof (ifname)) >= + sizeof (ifname)) { + return (NULL); + } + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); + } else if (ldp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size =
SYSDIRLISTSZ(dirlist_devices_virtual_net); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_virtual_net[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_blockdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_BLOCK); + + if (lnp != NULL) { + lnp->lxsys_vnode->v_type = VLNK; + return (lnp->lxsys_vnode); + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_zfsdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_DEV_ZFS); + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + /* + * All of these entries currently look like regular files + * but on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. 
+ */ + size = SYSDIRLISTSZ(dirlist_devices_zfs_block); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_zfs_block[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_syscpu(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* top-level cpu listing */ + + /* If fixed entry */ + if (strcmp(comp, "kernel_max") == 0) { + lnp = lxsys_getnode_static(ldp->lxsys_vnode, + LXSYS_INST_DEV_SYSCPU_KMAX); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else { + /* Else dynamic cpuN entry */ + cpu_t *cp, *cpstart; + int pools_enabled; + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + char cpunm[16]; + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%d", + cp->cpu_seqid); + + if (strcmp(comp, cpunm) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + LXSYS_DEV_SYS_CPUINFO, + cp->cpu_id + 1, 0); + break; + } + if (pools_enabled) { + cp = cp->cpu_next_part; + } else { + cp = cp->cpu_next; + } + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + } + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing, currently empty */ + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_syscpuinfo(lxsys_node_t *ldp, char *comp) +{ + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_sysnode(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* + * The system is presently represented as a single node, + * regardless of any NUMA topology which exists. 
+ * The instances are offset by 1 to account for the top level + * directory occupying instance 0. + */ + if (strcmp(comp, "node0") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + 1, 0); + return (lnp->lxsys_vnode); + } + } else { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = SYSDIRLISTSZ(dirlist_devices_sysnode); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_sysnode[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static int +lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + netstack_t *ns; + ill_t *ill; + uint_t ifindex = lnp->lxsys_instance; + uint8_t *addr; + uint64_t flags; + int error = 0; + + if (ifindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (EIO); + } + + ill = lxsys_find_ill(ns->netstack_ip, ifindex); + if (ill == NULL) { + netstack_rele(ns); + return (EIO); + } + + switch (lnp->lxsys_endpoint) { + case LXSYS_ENDP_NET_ADDRESS: + if (ill->ill_phys_addr_length != ETHERADDRL) { + lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n"); + break; + } + addr = ill->ill_phys_addr; + lxsys_uiobuf_printf(luio, + "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + break; + case LXSYS_ENDP_NET_ADDRLEN: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? 
ETHERADDRL : ill->ill_phys_addr_length); + break; + case LXSYS_ENDP_NET_FLAGS: + flags = (ill->ill_flags | ill->ill_ipif->ipif_flags | + ill->ill_phyint->phyint_flags) & 0xffff; + lx_ifflags_convert(&flags, LX_IF_FROMNATIVE); + lxsys_uiobuf_printf(luio, "0x%x\n", flags); + break; + case LXSYS_ENDP_NET_IFINDEX: + lxsys_uiobuf_printf(luio, "%u\n", ifindex); + break; + case LXSYS_ENDP_NET_MTU: + lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu); + break; + case LXSYS_ENDP_NET_TXQLEN: + /* perpetuate the txqlen lie */ + if (IS_LOOPBACK(ill)) { + lxsys_uiobuf_printf(luio, "0\n"); + } else { + lxsys_uiobuf_printf(luio, "1\n"); + } + break; + case LXSYS_ENDP_NET_TYPE: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? LX_ARPHRD_LOOPBACK : + arp_hw_type(ill->ill_mactype)); + break; + default: + error = EIO; + } + + ill_refrele(ill); + netstack_rele(ns); + return (error); +} + +static int +lxsys_read_devices_zfs_block(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t dskindex = lnp->lxsys_instance; + + if (dskindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + return (EIO); +} + +static int +lxsys_read_devices_sysnode(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + if (lnp->lxsys_instance == 1 && + lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPULIST) { + /* Show the range of CPUs */ + cpu_t *cp, *cpstart; + int pools_enabled, maxid = -1; + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + if (cp->cpu_seqid > maxid) + maxid = cp->cpu_seqid; + + if (pools_enabled) { + cp = cp->cpu_next_part; + } else { + cp = cp->cpu_next; + } + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxsys_uiobuf_printf(luio, "0-%d\n", maxid); + return (0); + } + return (EISDIR); + +} + +static int +lxsys_read_static(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t inst = lnp->lxsys_instance; + + if (inst == LXSYS_INST_DEV_SYSCPU_KMAX) { + lxsys_uiobuf_printf(luio, "%d\n", NCPU); + return (0); + } + + 
/* All other static nodes are directories */ + return (EISDIR); +} + +/* + * lxsys_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + ssize_t uresid; + off_t uoffset; + int error, leof; + + ASSERT(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxsys_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXSYS_SDSIZE) + return (ENOENT); + + /* Free lower functions from having to check eofp == NULL */ + if (eofp == NULL) { + eofp = &leof; + } + + return (lxsys_readdir_function[lxsnp->lxsys_type](lxsnp, uiop, eofp)); +} + +static int +lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio) +{ + int error; + off_t offset = uio->uio_offset; + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset by the + * same amount. But we want uiop->uio_offset to change in increments + * of LXSYS_SDSIZE, which is different from the number of bytes being + * returned to the user. To accomplish this, we set uiop->uio_offset + * separately on success, overriding what uiomove() does. 
+ */ + d->d_off = (off64_t)(offset + LXSYS_SDSIZE); + d->d_reclen = n; + if ((error = uiomove(d, n, UIO_READ, uio)) != 0) { + return (error); + } + uio->uio_offset = offset + LXSYS_SDSIZE; + return (0); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(LXSYS_STATIC, + dirtab[dirindex].d_idnum, 0); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. 
*/ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0; + return (0); +} + +static int +lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(lxsnp->lxsys_type, + lxsnp->lxsys_instance, dirtab[dirindex].d_idnum); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. 
*/ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0; + return (0); +} + +static int +lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + int error, i; + + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + if ((ns = lxsys_netstack(ldp)) == NULL) { + *eofp = 1; + return (0); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_first(phytree); + if (phyi == NULL) { + *eofp = 1; + } + bzero(bp, sizeof (bp)); + + /* + * Skip records we have already passed with the offset. + * This accounts for the two "." and ".." records already seen. 
+ */ + for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2; i > 0; i--) { + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + while ((uresid = uiop->uio_resid) > 0 && phyi != NULL) { + uint_t ifindex; + int reclen; + + ifindex = phyi->phyint_ifindex; + (void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE); + dirent->d_ino = lxsys_inode(type, ifindex, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readdir_disks(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + uint_t instance; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (EINVAL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (skip > 0) { + skip--; + goto next; + } + + if (strnlen(vd->lxvd_name, sizeof (vd->lxvd_name)) > LXSNSIZ) + goto next; + + (void) strncpy(dirent->d_name, vd->lxvd_name, LXSNSIZ); + + instance = getminor(vd->lxvd_emul_dev) & 0xffff; + if (instance == 0) + goto next; + + dirent->d_ino = lxsys_inode(type, instance, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + +next: + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Indicate EOF if we reached the end of the virtual disks. */ + if (vd == NULL) { + *eofp = 1; + } + + return (error); +} + + +static int +lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (lnp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + + if (dirent == NULL) { + return (ENOTDIR); + } + + return (lxsys_readdir_common(lnp, uiop, eofp, dirent, len)); +} + +static int +lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_CLASS_NET || + lnp->lxsys_instance != 0) { + /* + * Since /sys/class/net contains only symlinks, readdir + * operations should not be performed anywhere except the top + * level (instance == 0). 
+ */ + return (ENOTDIR); + } + + return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_CLASS_NET)); +} + +static int +lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level interface listing */ + error = lxsys_readdir_ifaces(lnp, uiop, eofp, + LXSYS_DEV_NET); + } else if (lnp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_virtual_net, + SYSDIRLISTSZ(dirlist_devices_virtual_net)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_blockdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_BLOCK || + lnp->lxsys_instance != 0) { + /* + * Since /sys/block contains only symlinks, readdir operations + * should not be performed anywhere except the top level + * (instance == 0). + */ + return (ENOTDIR); + } + + return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_BLOCK)); +} + +static int +lxsys_readdir_devices_zfsdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level dev listing */ + error = lxsys_readdir_disks(lnp, uiop, eofp, + LXSYS_DEV_ZFS); + } else if (lnp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_zfs_block, + SYSDIRLISTSZ(dirlist_devices_zfs_block)); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. 
+ */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_cpu(lxsys_node_t *ldp, struct uio *uiop, int *eofp) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + cpu_t *cp, *cpstart; + int pools_enabled; + int i, cpucnt; + lxsys_cpu_info_t cpu_info[NCPU]; + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + (void) strncpy(dirent->d_name, "kernel_max", LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_STATIC, + LXSYS_INST_DEV_SYSCPU_KMAX, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + goto done; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + goto done; + } + } + + /* Collect a list of CPU info */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cpucnt = 0; + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + cpu_info[cpucnt].cpu_id = cp->cpu_id; + cpu_info[cpucnt++].cpu_seqid = cp->cpu_seqid; + ASSERT(cpucnt < NCPU); + if (pools_enabled) { + cp = cp->cpu_next_part; + } else { + cp = cp->cpu_next; + } + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + /* Output dynamic CPU info */ + for (i = 0; i < cpucnt; i++) { + char cpunm[16]; + + if (skip > 0) { + skip--; + continue; + } + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%d", + cpu_info[i].cpu_seqid); + (void) strncpy(dirent->d_name, cpunm, LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPU, + cpu_info[i].cpu_id + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid 
== oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + } + + /* Indicate EOF if we reached the end of the CPU list. */ + if (i == cpucnt) { + *eofp = 1; + } + +done: + return (error); +} + +static int +lxsys_readdir_devices_syscpu(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level cpu listing */ + error = lxsys_readdir_cpu(lnp, uiop, eofp); + } else if (lnp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_empty, SYSDIRLISTSZ(dirlist_empty)); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. + */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_devices_syscpuinfo(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_type != LXSYS_DEV_SYS_CPUINFO) { + /* + * Since /sys/devices/system/cpu/cpuN is empty, readdir + * operations should not be performed anywhere except the top + * level. + */ + return (ENOTDIR); + } + + /* + * Emit "." and ".." entries + * All cpuN directories are currently empty. + */ + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + /* Indicate EOF */ + *eofp = 1; + + return (error); +} + +static int +lxsys_readdir_devices_sysnode(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level node listing */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int reclen, skip; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + (void) strncpy(dirent->d_name, "node0", LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_NODE, + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + return (EINVAL); + } + return (0); + } + error = lxsys_dirent_out(dirent, reclen, uiop); + } + /* Indicate EOF */ + if (error == 0) { + *eofp = 1; + } + } else if (lnp->lxsys_endpoint == 0) { + /* node-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_sysnode, + SYSDIRLISTSZ(dirlist_devices_sysnode)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +/* + * lxsys_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char buf[MAXPATHLEN + 1]; + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type != VLNK) { + return (EINVAL); + } + + rlfunc = lxsys_readlink_function[lnp->lxsys_type]; + if (rlfunc != NULL) { + if ((error = rlfunc(lnp, buf, sizeof (buf))) == 0) { + error = uiomove(buf, strlen(buf), UIO_READ, uiop); + } + } else { + error = EINVAL; + } + + return (error); +} + + +static int +lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len) +{ + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + uint_t ifindex; + char ifname[LIFNAMSIZ]; + int error = EINVAL; + + if ((ifindex = lnp->lxsys_instance) == 0) { + return (error); + } + + if ((ns = 
lxsys_netstack(lnp)) == NULL) { + return (error); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_find(phytree, &ifindex, NULL); + if (phyi != NULL) { + (void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + (void) snprintf(buf, len, "/sys/devices/virtual/net/%s", + ifname); + error = 0; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readlink_block(lxsys_node_t *lnp, char *buf, size_t len) +{ + int inst, error = EINVAL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + if ((inst = lnp->lxsys_instance) == 0) { + return (error); + } + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (error); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int vinst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (vinst == inst) { + (void) snprintf(buf, len, + "../devices/zfs/%s", vd->lxvd_name); + error = 0; + break; + } + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (error); +} + +/* + * lxsys_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
+ */ +/* ARGSUSED */ +static void +lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxsys_freenode(VTOLXS(vp)); +} + +/* + * lxsys_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxsys_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxsys_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + if (vn_matchops(vp1, lxsys_vnodeops) || + vn_matchops(vp2, lxsys_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index d61928d578..32fb7d9127 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/errno.h> @@ -42,43 +43,66 @@ char *sn1_emulation_table = NULL; -void sn1_init_brand_data(zone_t *); +void sn1_init_brand_data(zone_t *, kmutex_t *); void sn1_free_brand_data(zone_t *); void sn1_setbrand(proc_t *); int sn1_getattr(zone_t *, int, void *, size_t *); int sn1_setattr(zone_t *, int, void *, size_t); int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void sn1_copy_procdata(proc_t *, proc_t *); -void sn1_proc_exit(struct proc *, klwp_t *); +void sn1_proc_exit(struct proc *); void sn1_exec(); -int sn1_initlwp(klwp_t *); +void sn1_initlwp(klwp_t *, void *); void sn1_forklwp(klwp_t *, klwp_t *); void sn1_freelwp(klwp_t *); void sn1_lwpexit(klwp_t *); int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); /* sn1 brand */ struct brand_ops sn1_brops = { - sn1_init_brand_data, - sn1_free_brand_data, - sn1_brandsys, - sn1_setbrand, - sn1_getattr, - sn1_setattr, - 
sn1_copy_procdata, - sn1_proc_exit, - sn1_exec, - lwp_setrval, - sn1_initlwp, - sn1_forklwp, - sn1_freelwp, - sn1_lwpexit, - sn1_elfexec, - NULL, - NULL, - NSIG, + sn1_init_brand_data, /* b_init_brand_data */ + sn1_free_brand_data, /* b_free_brand_data */ + sn1_brandsys, /* b_brandsys */ + sn1_setbrand, /* b_setbrand */ + sn1_getattr, /* b_getattr */ + sn1_setattr, /* b_setattr */ + sn1_copy_procdata, /* b_copy_procdata */ + sn1_proc_exit, /* b_proc_exit */ + sn1_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + sn1_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + sn1_forklwp, /* b_forklwp */ + sn1_freelwp, /* b_freelwp */ + sn1_lwpexit, /* b_lwpexit */ + sn1_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE /* b_intp_parse_arg */ }; #ifdef sparc @@ -94,9 +118,12 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, + NULL, sn1_brand_int91_callback, sn1_brand_syscall_callback, - sn1_brand_syscall32_callback + sn1_brand_syscall32_callback, + NULL, + NULL }; #else /* ! 
__amd64 */ @@ -104,7 +131,10 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, NULL, + NULL, sn1_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -115,7 +145,8 @@ struct brand sn1_brand = { BRAND_VER_1, "sn1", &sn1_brops, - &sn1_mops + &sn1_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -151,7 +182,7 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) /*ARGSUSED*/ int sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) { int res; @@ -171,9 +202,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent) } void -sn1_proc_exit(struct proc *p, klwp_t *l) +sn1_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &sn1_brand); + brand_solaris_proc_exit(p, &sn1_brand); } void @@ -182,10 +213,10 @@ sn1_exec() brand_solaris_exec(&sn1_brand); } -int -sn1_initlwp(klwp_t *l) +void +sn1_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &sn1_brand)); + brand_solaris_initlwp(l, &sn1_brand); } void @@ -214,18 +245,18 @@ sn1_free_brand_data(zone_t *zone) /*ARGSUSED*/ void -sn1_init_brand_data(zone_t *zone) +sn1_init_brand_data(zone_t *zone, kmutex_t *zsl) { } int sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME, - SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32)); + SN1_LIB, SN1_LIB32)); } int diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h index b487745e21..fef9dc128b 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.h +++ b/usr/src/uts/common/brand/sn1/sn1_brand.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 
2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SN1_BRAND_H @@ -37,20 +38,14 @@ extern "C" { #define SN1_VERSION SN1_VERSION_1 #define SN1_LIB_NAME "sn1_brand.so.1" -#define SN1_LINKER_NAME "ld.so.1" #define SN1_LIB32 BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME -#define SN1_LINKER32 "/lib/" SN1_LINKER_NAME - #define SN1_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME -#define SN1_LINKER64 "/lib/64/" SN1_LINKER_NAME #if defined(_LP64) #define SN1_LIB SN1_LIB64 -#define SN1_LINKER SN1_LINKER64 #else /* !_LP64 */ #define SN1_LIB SN1_LIB32 -#define SN1_LINKER SN1_LINKER32 #endif /* !_LP64 */ #if defined(_KERNEL) diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index f24b864eef..a02ee7de3d 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #include <sys/errno.h> @@ -45,45 +46,68 @@ char *s10_emulation_table = NULL; -void s10_init_brand_data(zone_t *); +void s10_init_brand_data(zone_t *, kmutex_t *); void s10_free_brand_data(zone_t *); void s10_setbrand(proc_t *); int s10_getattr(zone_t *, int, void *, size_t *); int s10_setattr(zone_t *, int, void *, size_t); int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void s10_copy_procdata(proc_t *, proc_t *); -void s10_proc_exit(struct proc *, klwp_t *); +void s10_proc_exit(struct proc *); void s10_exec(); -int s10_initlwp(klwp_t *); +void s10_initlwp(klwp_t *, void *); void s10_forklwp(klwp_t *, klwp_t *); void s10_freelwp(klwp_t *); void s10_lwpexit(klwp_t *); int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); void s10_sigset_native_to_s10(sigset_t *); void s10_sigset_s10_to_native(sigset_t *); /* s10 brand */ struct brand_ops s10_brops = { - s10_init_brand_data, - s10_free_brand_data, - s10_brandsys, - s10_setbrand, - s10_getattr, - s10_setattr, - s10_copy_procdata, - s10_proc_exit, - s10_exec, - lwp_setrval, - s10_initlwp, - s10_forklwp, - s10_freelwp, - s10_lwpexit, - s10_elfexec, - s10_sigset_native_to_s10, - s10_sigset_s10_to_native, - S10_NSIG, + s10_init_brand_data, /* b_init_brand_data */ + s10_free_brand_data, /* b_free_brand_data */ + s10_brandsys, /* b_brandsys */ + s10_setbrand, /* b_setbrand */ + s10_getattr, /* b_getattr */ + s10_setattr, /* b_setattr */ + s10_copy_procdata, /* b_copy_procdata */ + s10_proc_exit, /* b_proc_exit */ + s10_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + s10_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + s10_forklwp, /* b_forklwp */ + s10_freelwp, /* b_freelwp */ + s10_lwpexit, /* b_lwpexit */ + s10_elfexec, /* b_elfexec */ + s10_sigset_native_to_s10, /* 
b_sigset_native_to_brand */ + s10_sigset_s10_to_native, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + S10_NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE /* b_intp_parse_arg */ }; #ifdef sparc @@ -99,9 +123,12 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, + NULL, s10_brand_int91_callback, s10_brand_syscall_callback, - s10_brand_syscall32_callback + s10_brand_syscall32_callback, + NULL, + NULL }; #else /* ! __amd64 */ @@ -109,7 +136,10 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, NULL, + NULL, s10_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -120,7 +150,8 @@ struct brand s10_brand = { BRAND_VER_1, "solaris10", &s10_brops, - &s10_mops + &s10_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -252,7 +283,7 @@ s10_native(void *cmd, void *args) /*ARGSUSED*/ int s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) { proc_t *p = curproc; int res; @@ -326,9 +357,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent) } void -s10_proc_exit(struct proc *p, klwp_t *l) +s10_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &s10_brand); + brand_solaris_proc_exit(p, &s10_brand); } void @@ -337,10 +368,10 @@ s10_exec() brand_solaris_exec(&s10_brand); } -int -s10_initlwp(klwp_t *l) +void 
+s10_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &s10_brand)); + brand_solaris_initlwp(l, &s10_brand); } void @@ -380,7 +411,7 @@ s10_free_brand_data(zone_t *zone) } void -s10_init_brand_data(zone_t *zone) +s10_init_brand_data(zone_t *zone, kmutex_t *zsl) { ASSERT(zone->zone_brand == &s10_brand); ASSERT(zone->zone_brand_data == NULL); @@ -390,11 +421,11 @@ s10_init_brand_data(zone_t *zone) int s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, - S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); + S10_LIB, S10_LIB32)); } void diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h index 11f9853f48..ffef485e12 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.h +++ b/usr/src/uts/common/brand/solaris10/s10_brand.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #ifndef _S10_BRAND_H @@ -42,17 +43,12 @@ extern "C" { #define S10_LINKER_NAME "ld.so.1" #define S10_LIB32 BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME -#define S10_LINKER32 "/lib/" S10_LINKER_NAME - #define S10_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME -#define S10_LINKER64 "/lib/64/" S10_LINKER_NAME #if defined(_LP64) #define S10_LIB S10_LIB64 -#define S10_LINKER S10_LINKER64 #else /* !_LP64 */ #define S10_LIB S10_LIB32 -#define S10_LINKER S10_LINKER32 #endif /* !_LP64 */ /* diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index 6b3ba51f31..a71be771fd 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. */ @@ -559,8 +560,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -572,7 +573,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. 
@@ -586,8 +587,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; diff --git a/usr/src/uts/common/contract/process.c b/usr/src/uts/common/contract/process.c index 9fd23fdb61..cad5d7f955 100644 --- a/usr/src/uts/common/contract/process.c +++ b/usr/src/uts/common/contract/process.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #include <sys/mutex.h> @@ -955,6 +956,18 @@ contract_process_exit(cont_process_t *ctp, proc_t *p, int exitstatus) (void) cte_publish_all(ct, event, nvl, NULL); mutex_enter(&ct->ct_lock); } + + /* + * CT_PR_EV_EXIT is not part of the CT_PR_ALLFATAL definition since + * we never allow including this in the fatal set via a user-land + * application, but we do allow CT_PR_EV_EXIT in the contract's fatal + * set for a process setup for zone init. See zone_start_init(). + */ + if (EVFATALP(ctp, CT_PR_EV_EXIT)) { + ASSERT(MUTEX_HELD(&ct->ct_lock)); + contract_process_kill(ct, p, B_TRUE); + } + if (empty) { /* * Send EMPTY message. 
diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index bc72fa984a..75072fb686 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -70,6 +70,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 @@ -933,7 +934,8 @@ rnd_handler(void *arg) int len = 0; if (!rng_prov_found && rng_ok_to_log) { - cmn_err(CE_WARN, "No randomness provider enabled for " + (void) strlog(0, 0, 0, SL_NOTE, + "No randomness provider enabled for " "/dev/random. Use cryptoadm(1M) to enable a provider."); rng_ok_to_log = B_FALSE; } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index f461fe048c..8b2760b237 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. */ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/ctf/ctf_mod.c b/usr/src/uts/common/ctf/ctf_mod.c index b34cf400cd..421b922c96 100644 --- a/usr/src/uts/common/ctf/ctf_mod.c +++ b/usr/src/uts/common/ctf/ctf_mod.c @@ -24,8 +24,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sysmacros.h> #include <sys/modctl.h> #include <sys/debug.h> @@ -117,6 +115,15 @@ ctf_version(int version) /*ARGSUSED*/ ctf_file_t * +ctf_fdcreate_int(int fd, int *errp, ctf_sect_t *ctfp) +{ + if (errp != NULL) + *errp = ENOTSUP; + return (NULL); +} + +/*ARGSUSED*/ +ctf_file_t * ctf_modopen(struct module *mp, int *error) { ctf_sect_t ctfsect, symsect, strsect; diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index 1c5e1f79a9..3ecbf39393 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp) /* * Return non-zero if thread can migrate between "from" and "to" - * without a performance penalty + * without a performance penalty. This is true only if we share a core on + * virtually any CPU; sharing the last-level cache is insufficient to make + * migration possible without penalty. */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { - if (from->cpu_physid->cpu_cacheid == - to->cpu_physid->cpu_cacheid) + if (from->cpu_physid->cpu_coreid == + to->cpu_physid->cpu_coreid) return (1); return (0); } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..2a4365ff73 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. 
We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. 
+ * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap) ASSERT(CAP_ENABLED(cap)); waitq_block(&cap->cap_waitq); + + /* do this first to avoid race with cap_kstat_update */ + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + list_remove(l, cap); if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; - if (cap->cap_kstat != NULL) { - kstat_delete(cap->cap_kstat); - cap->cap_kstat = NULL; - } - } /* @@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. 
If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, it's + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. + */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick.
To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there are unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list.
*/ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. 
+ */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? 
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t) /* * Convert internal cap statistics into values exported by cap kstat. + * Note that the kstat is held throughout this function but caps_lock is not. 
*/ static int cap_kstat_update(kstat_t *ksp, int rw) @@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index 0c2c0b4993..5f9c2c68a2 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -105,7 +109,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. 
*/ int only_intr_kpreempt; @@ -252,7 +256,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitive). + */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; @@ -2258,7 +2278,7 @@ disp_getbest(disp_t *dp) * placed earlier. */ if (tcp == NULL || - pri >= minclsyspri || + (pri >= minclsyspri && tp->t_procp == &p0) || tp->t_cpu != tcp) break; diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c index ab5ba278a0..8260680a07 100644 --- a/usr/src/uts/common/disp/fx.c +++ b/usr/src/uts/common/disp/fx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -71,16 +71,6 @@ static struct modlinkage modlinkage = { }; -/* - * control flags (kparms->fx_cflags).
- */ -#define FX_DOUPRILIM 0x01 /* change user priority limit */ -#define FX_DOUPRI 0x02 /* change user priority */ -#define FX_DOTQ 0x04 /* change FX time quantum */ - - -#define FXMAXUPRI 60 /* maximum user priority setting */ - #define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */ /* diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c index f87f8c56ce..115e42ccb8 100644 --- a/usr/src/uts/common/disp/rt.c +++ b/usr/src/uts/common/disp/rt.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -103,13 +103,6 @@ _info(struct modinfo *modinfop) pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */ rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */ -/* - * control flags (kparms->rt_cflags). - */ -#define RT_DOPRI 0x01 /* change priority */ -#define RT_DOTQ 0x02 /* change RT time quantum */ -#define RT_DOSIG 0x04 /* change RT time quantum signal */ - static int rt_admin(caddr_t, cred_t *); static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *); static int rt_fork(kthread_t *, kthread_t *, void *); diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c index 768b499ef2..cc88ed72fc 100644 --- a/usr/src/uts/common/disp/rt_dptbl.c +++ b/usr/src/uts/common/disp/rt_dptbl.c @@ -28,8 +28,6 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #include <sys/priocntl.h> #include <sys/class.h> @@ -70,8 +68,6 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ - rtdpent_t config_rt_dptbl[] = { /* prilevel Time quantum */ diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c 
index f2685af534..ae6c5eef16 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -75,6 +75,10 @@ #include <sys/cpucaps.h> #include <sys/kiconv.h> +#ifndef STACK_GROWTH_DOWN +#error Stacks do not grow downward; 3b2 zombie attack detected! +#endif + struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ struct kmem_cache *turnstile_cache; /* cache of free turnstiles */ @@ -372,7 +376,7 @@ thread_create( if (stksize <= sizeof (kthread_t) + PTR24_ALIGN) cmn_err(CE_PANIC, "thread_create: proposed stack size" " too small to hold thread."); -#ifdef STACK_GROWTH_DOWN + stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1); stksize &= -PTR24_ALIGN; /* make thread aligned */ t = (kthread_t *)(stk + stksize); @@ -381,13 +385,6 @@ thread_create( audit_thread_create(t); t->t_stk = stk + stksize; t->t_stkbase = stk; -#else /* stack grows to larger addresses */ - stksize -= SA(sizeof (kthread_t)); - t = (kthread_t *)(stk); - bzero(t, sizeof (kthread_t)); - t->t_stk = stk + sizeof (kthread_t); - t->t_stkbase = stk + stksize + sizeof (kthread_t); -#endif /* STACK_GROWTH_DOWN */ t->t_flag |= T_TALLOCSTK; t->t_swap = stk; } else { @@ -400,13 +397,8 @@ thread_create( * Initialize t_stk to the kernel stack pointer to use * upon entry to the kernel */ -#ifdef STACK_GROWTH_DOWN t->t_stk = stk + stksize; t->t_stkbase = stk; -#else - t->t_stk = stk; /* 3b2-like */ - t->t_stkbase = stk + stksize; -#endif /* STACK_GROWTH_DOWN */ } if (kmem_stackinfo != 0) { @@ -589,6 +581,9 @@ thread_exit(void) if ((t->t_proc_flag & TP_ZTHREAD) != 0) cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called"); + if ((t->t_flag & T_SPLITSTK) != 0) + cmn_err(CE_PANIC, 
"thread_exit: called when stack is split"); + tsd_exit(); /* Clean up this thread's TSD */ kcpc_passivate(); /* clean up performance counter state */ @@ -1050,6 +1045,8 @@ installctx( ctx->free_op = free; ctx->arg = arg; ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; t->t_ctx = ctx; } @@ -1124,9 +1121,12 @@ savectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); (ctx->save_op)(ctx->arg); + } + } } void @@ -1135,9 +1135,12 @@ restorectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); (ctx->restore_op)(ctx->arg); + } + } } void @@ -1883,6 +1886,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) return (on_rq); } + +/* + * There are occasions in the kernel when we need much more stack than we + * allocate by default, but we do not wish to have that work done + * asynchronously by another thread. To accommodate these scenarios, we allow + * for a split stack (also known as a "segmented stack") whereby a new stack + * is dynamically allocated and the current thread jumps onto it for purposes + * of executing the specified function. After the specified function returns, + * the stack is deallocated and control is returned to the caller. This + * functionality is implemented by thread_splitstack(), below; there are a few + * constraints on its use: + * + * - The caller must be in a context where it is safe to block for memory. 
+ * - The caller cannot be in a t_onfault context + * - The called function must not call thread_exit() while on the split stack + * + * The code will explicitly panic if these constraints are violated. Notably, + * however, thread_splitstack() _can_ be called on a split stack -- there + * is no limit to the level that split stacks can nest. + * + * When the stack is split, it is constructed such that stack backtraces + * from kernel debuggers continue to function -- though note that DTrace's + * stack() action and stackdepth function will only show the stack up to and + * including thread_splitstack_run(); DTrace explicitly bounds itself to + * pointers that exist within the current declared stack as a safety + * mechanism. + */ +void +thread_splitstack(void (*func)(void *), void *arg, size_t stksize) +{ + kthread_t *t = curthread; + caddr_t ostk, ostkbase, stk; + ushort_t otflag; + + if (t->t_onfault != NULL) + panic("thread_splitstack: called with non-NULL t_onfault"); + + ostk = t->t_stk; + ostkbase = t->t_stkbase; + otflag = t->t_flag; + + stksize = roundup(stksize, PAGESIZE); + + if (stksize < default_stksize) + stksize = default_stksize; + + if (stksize == default_stksize) { + stk = (caddr_t)segkp_cache_get(segkp_thread); + } else { + stksize = roundup(stksize, PAGESIZE); + stk = (caddr_t)segkp_get(segkp, stksize, + (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED)); + } + + /* + * We're going to lock ourselves before we set T_SPLITSTK to assure + * that we're not swapped out in the meantime. (Note that we don't + * bother to set t_swap, as we're not going to be swapped out.) + */ + thread_lock(t); + + if (!(otflag & T_SPLITSTK)) + t->t_flag |= T_SPLITSTK; + + t->t_stk = stk + stksize; + t->t_stkbase = stk; + + thread_unlock(t); + + /* + * Now actually run on the new (split) stack... + */ + thread_splitstack_run(t->t_stk, func, arg); + + /* + * We're back onto our own stack; lock ourselves and restore our + * pre-split state. 
+ */ + thread_lock(t); + + t->t_stk = ostk; + t->t_stkbase = ostkbase; + + if (!(otflag & T_SPLITSTK)) + t->t_flag &= ~T_SPLITSTK; + + thread_unlock(t); + + /* + * Now that we are entirely back on our own stack, call back into + * the platform layer to perform any platform-specific cleanup. + */ + thread_splitstack_cleanup(); + + segkp_release(segkp, stk); +} + /* * Tunable kmem_stackinfo is set, fill the kernel thread stack with a * specific pattern. diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c index 67ccc6922f..c840bdf31a 100644 --- a/usr/src/uts/common/disp/thread_intr.c +++ b/usr/src/uts/common/disp/thread_intr.c @@ -23,19 +23,10 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - /* - * FILE NOTICE BEGIN - * - * This file should not be modified. If you wish to modify it or have it - * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com-> - * (without anti-spam dashes) - * - * FILE NOTICE END + * Copyright 2015, Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/cpuvar.h> #include <sys/stack.h> #include <vm/seg_kp.h> @@ -44,6 +35,17 @@ #include <sys/sysmacros.h> /* + * Use a slightly larger thread stack size for interrupt threads rather than the + * default. This is useful for cases where the networking stack may do an rx and + * a tx in the context of a single interrupt and when combined with various + * promisc hooks that need memory, can cause us to get dangerously close to the + * edge of the traditional stack sizes. This is only a few pages more than a + * traditional stack and given that we don't have that many interrupt threads, + * the memory costs end up being more than worthwhile. + */ +#define LL_INTR_STKSZ (32 * 1024) + +/* * Create and initialize an interrupt thread. 
*/ static void @@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp) { kthread_t *tp; - tp = thread_create(NULL, 0, + tp = thread_create(NULL, LL_INTR_STKSZ, (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0); /* @@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp) } /* - * Allocate a given number of interrupt threads for a given CPU. - * These threads will get freed by cpu_destroy_bound_threads() - * when CPU gets unconfigured. + * Allocate a given number of interrupt threads for a given CPU. These threads + * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured. + * + * Note, high level interrupts are always serviced using cpu_intr_stack and are + * not allowed to block. Low level interrupts or soft-interrupts use the + * kthread_t's that we create through the calls to thread_create_intr(). */ void cpu_intr_alloc(cpu_t *cp, int n) @@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n) thread_create_intr(cp); cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE, - KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + - INTR_STACK_SIZE - SA(MINFRAME); + KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + + INTR_STACK_SIZE - SA(MINFRAME); } diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index c775224d86..fc0206da29 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -7710,7 +7710,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -8206,7 +8206,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ 
-8817,6 +8817,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -8831,8 +8832,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - &priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) + */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 157acc25fc..3d350ff278 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, }; +/* + * When adding a new provider you must add it before sdt as sdt is a catch all + * for remaining probes. 
+ */ sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr }, { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, @@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = { { "fc", "__fc_", &fc_attr }, { "srp", "__srp_", &fc_attr }, { "sysevent", "__sysevent_", &stab_attr }, + { "vnd", "__vnd_", &stab_attr }, { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = { { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", "fc_port_info_t *" }, + { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-in", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-out", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" }, + { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "send", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "send", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" }, + { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "recv", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "recv", 2, 2, "void 
*", "ipinfo_t *" }, + { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" }, { NULL } }; diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c index fc45bd9544..5dbb2ed28c 100644 --- a/usr/src/uts/common/exec/aout/aout.c +++ b/usr/src/uts/common/exec/aout/aout.c @@ -22,6 +22,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -54,7 +55,7 @@ static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action); + caddr_t exec_file, cred_t *cred, int *brand_action); static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz, int *isdyn); static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp, @@ -130,7 +131,7 @@ _info(struct modinfo *modinfop) static int aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action) + caddr_t exec_file, cred_t *cred, int *brand_action) { auxv32_t auxflags_auxv32; int error; diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index dc04b292b0..d74737dead 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -66,6 +66,11 @@ #include <sys/sdt.h> #include <sys/siginfo.h> +#if defined(__x86) && !defined(__xpv) +#include <sys/comm_page.h> +#endif /* defined(__x86) && !defined(__xpv) */ + + extern int at_flags; #define ORIGIN_STR "ORIGIN" @@ -163,12 +168,16 @@ dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base) } /* - * Map in the executable pointed to by vp. Returns 0 on success. + * Map in the executable pointed to by vp. Returns 0 on success. Note that + * this function currently has the maximum number of arguments allowed by + * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without + * adding to MAXNARG. (Better yet, do not add to this monster of a function + * signature!) */ int mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, - intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, - caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap) + intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp) { size_t len; struct vattr vat; @@ -180,6 +189,7 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, Phdr *junk = NULL; Phdr *dynphdr = NULL; Phdr *dtrphdr = NULL; + char *interp = NULL; uintptr_t lddata; long execsz; intptr_t minaddr; @@ -187,6 +197,9 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, if (lddatap != NULL) *lddatap = NULL; + if (minaddrp != NULL) + *minaddrp = NULL; + if (error = execpermissions(vp, &vat, args)) { uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); return (error); @@ -212,25 +225,89 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr, len, &execsz, brksize)) { uprintf("%s: Cannot map %s\n", exec_file, args->pathname); + if (uphdr != NULL && uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); kmem_free(phdrbase, 
phdrsize); return (error); } + if (minaddrp != NULL) + *minaddrp = minaddr; + /* - * Inform our caller if the executable needs an interpreter. + * If the executable requires an interpreter, determine its name. */ - *interp = (dynphdr == NULL) ? 0 : 1; + if (dynphdr != NULL) { + ssize_t resid; + + if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) { + uprintf("%s: Invalid interpreter\n", exec_file); + kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + interp = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz, + (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0, + (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 || + interp[dynphdr->p_filesz - 1] != '\0') { + uprintf("%s: Cannot obtain interpreter pathname\n", + exec_file); + kmem_free(interp, MAXPATHLEN); + kmem_free(phdrbase, phdrsize); + return (error != 0 ? error : ENOEXEC); + } + } /* * If this is a statically linked executable, voffset should indicate * the address of the executable itself (it normally holds the address * of the interpreter). */ - if (ehdr->e_type == ET_EXEC && *interp == 0) + if (ehdr->e_type == ET_EXEC && interp == NULL) *voffset = minaddr; + /* + * If the caller has asked for the interpreter name, return it (it's + * up to the caller to free it); if the caller hasn't asked for it, + * free it ourselves. + */ + if (interpp != NULL) { + *interpp = interp; + } else if (interp != NULL) { + kmem_free(interp, MAXPATHLEN); + } + if (uphdr != NULL) { *uphdr_vaddr = uphdr->p_vaddr; + + if (uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); + } else if (ehdr->e_type == ET_DYN) { + /* + * If we don't have a uphdr, we'll apply the logic found + * in mapelfexec() and use the p_vaddr of the first PT_LOAD + * section as the base address of the object. 
+ */ + Phdr *phdr = (Phdr *)phdrbase; + int i, hsize = ehdr->e_phentsize; + + for (i = nphdrs; i > 0; i--) { + if (phdr->p_type == PT_LOAD) { + *uphdr_vaddr = (uintptr_t)phdr->p_vaddr + + ehdr->e_phoff; + break; + } + + phdr = (Phdr *)((caddr_t)phdr + hsize); + } + + /* + * If we don't have a PT_LOAD segment, we should have returned + * ENOEXEC when elfsize() returned 0, above. + */ + VERIFY(i > 0); } else { *uphdr_vaddr = (Addr)-1; } @@ -243,13 +320,13 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; caddr_t brkbase = 0; size_t brksize = 0; - ssize_t dlnsize; + ssize_t dlnsize, nsize = 0; aux_entry_t *aux; int error; ssize_t resid; @@ -273,6 +350,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int hasauxv = 0; int hasdy = 0; int branded = 0; + int dynuphdr = 0; struct proc *p = ttoproc(curthread); struct user *up = PTOU(p); @@ -327,7 +405,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1); } else { args->to_model = DATAMODEL_LP64; - args->stk_prot &= ~PROT_EXEC; + if (!args->stk_prot_override) { + args->stk_prot &= ~PROT_EXEC; + } #if defined(__i386) || defined(__amd64) args->dat_prot &= ~PROT_EXEC; #endif @@ -339,11 +419,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* _LP64 */ /* - * We delay invoking the brand callback until we've figured out - * what kind of elf binary we're trying to run, 32-bit or 64-bit. - * We do this because now the brand library can just check - * args->to_model to see if the target is 32-bit or 64-bit without - * having do duplicate all the code above. 
+ * We delay invoking the brand callback until we've figured out what + * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this + * because now the brand library can just check args->to_model to see if + * the target is 32-bit or 64-bit without having to duplicate all the + * code above. + * + * We also give the brand a chance to indicate that based on the ELF + * OSABI of the target binary it should become unbranded and optionally + * indicate that it should be treated as existing in a specific prefix. + * + * Note that if a brand opts to go down this route it does not actually + * end up being debranded. In other words, future programs that exec + * will still be considered for branding unless this escape hatch is + * used. Consider the case of lx brand for example. If a user runs + * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable + * of DTrace that's in /native will take this escape hatch and be run + * and interpreted using the normal system call table; however, the + * execution of a non-illumos binary in the form of /bin/ls will still + * be branded and be subject to all of the normal actions of the brand. * * The level checks associated with brand handling below are used to * prevent a loop since the brand elfexec function typically comes back @@ -351,8 +445,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * handling in the #! interpreter code will increment the level before * calling gexec to run the final elfexec interpreter. 
*/ + if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) && + (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) { + if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI], + &args->brand_nroot) == B_TRUE) { + ASSERT(ehdrp->e_ident[EI_OSABI]); + *brand_action = EBA_NATIVE; + /* Add one for the trailing '/' in the path */ + if (args->brand_nroot != NULL) + nsize = strlen(args->brand_nroot) + 1; + } + } + if ((level <= INTP_MAXDEPTH) && - (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { error = BROP(p)->b_elfexec(vp, uap, args, idatap, level + 1, execsz, setid, exec_file, cred, brand_action); @@ -423,14 +529,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_BASE * AT_FLAGS * AT_PAGESZ + * AT_RANDOM (added in stk_copyout) * AT_SUN_AUXFLAGS * AT_SUN_HWCAP * AT_SUN_HWCAP2 - * AT_SUN_PLATFORM (added in stk_copyout) - * AT_SUN_EXECNAME (added in stk_copyout) + * AT_SUN_PLATFORM (added in stk_copyout) + * AT_SUN_EXECNAME (added in stk_copyout) * AT_NULL * - * total == 9 + * total == 10 */ if (hasdy && hasu) { /* @@ -445,7 +552,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 5 */ - args->auxsize = (9 + 5) * sizeof (aux_entry_t); + args->auxsize = (10 + 5) * sizeof (aux_entry_t); } else if (hasdy) { /* * Has PT_INTERP but no PT_PHDR @@ -455,9 +562,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 2 */ - args->auxsize = (9 + 2) * sizeof (aux_entry_t); + args->auxsize = (10 + 2) * sizeof (aux_entry_t); } else { - args->auxsize = 9 * sizeof (aux_entry_t); + args->auxsize = 10 * sizeof (aux_entry_t); } } else { args->auxsize = 0; @@ -470,13 +577,41 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (args->emulator != NULL) args->auxsize += sizeof (aux_entry_t); - if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + /* + * If this is a native binary that's been given 
a modified interpreter + * root, inform it that the native system exists at that root. + */ + if (args->brand_nroot != NULL) { + args->auxsize += sizeof (aux_entry_t); + } + + + /* + * On supported kernels (64-bit, non-xpv) make room in the auxv for the + * AT_SUN_COMMPAGE entry. + */ +#if defined(__amd64) && !defined(__xpv) + args->auxsize += sizeof (aux_entry_t); +#endif /* defined(__amd64) && !defined(__xpv) */ + + /* + * If we have user credentials, we'll supply the following entries: + * AT_SUN_UID + * AT_SUN_RUID + * AT_SUN_GID + * AT_SUN_RGID + */ + if (cred != NULL) { + args->auxsize += 4 * sizeof (aux_entry_t); + } + + if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { branded = 1; /* - * We will be adding 4 entries to the aux vectors. One for - * the the brandname and 3 for the brand specific aux vectors. + * We will be adding 5 entries to the aux vectors. One for + * the brandname and 4 for the brand specific aux vectors. */ - args->auxsize += 4 * sizeof (aux_entry_t); + args->auxsize += 5 * sizeof (aux_entry_t); } /* Hardware/Software capabilities */ @@ -507,7 +642,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, aux = bigwad->elfargs; /* * Move args to the user's stack. - * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries. + * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM + * aux entries. */ if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) { if (error == -1) { @@ -534,6 +670,14 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, len, execsz, &brksize)) != 0) goto bad; + if (uphdr != NULL) { + /* + * Our uphdr has been dynamically allocated if (and only if) + * its program header flags are clear. 
+ */ + dynuphdr = (uphdr->p_flags == 0); + } + if (uphdr != NULL && dyphdr == NULL) goto bad; @@ -548,17 +692,22 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, char *p; struct vnode *nvp; - dlnsize = dyphdr->p_filesz; + dlnsize = dyphdr->p_filesz + nsize; if (dlnsize > MAXPATHLEN || dlnsize <= 0) goto bad; + if (nsize != 0) { + bcopy(args->brand_nroot, dlnp, nsize - 1); + dlnp[nsize - 1] = '/'; + } + /* * Read in "interpreter" pathname. */ - if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz, - (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0, - CRED(), &resid)) != 0) { + if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize, + dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE, + 0, (rlim64_t)0, CRED(), &resid)) != 0) { uprintf("%s: Cannot obtain interpreter pathname\n", exec_file); goto bad; @@ -703,9 +852,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; - error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk, + error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk, &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len, execsz, NULL); + if (error || junk != NULL) { VN_RELE(nvp); uprintf("%s: Cannot map %s\n", exec_file, dlnp); @@ -732,9 +882,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (hasauxv) { int auxf = AF_SUN_HWCAPVERIFY; + /* - * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via - * exec_args() + * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were + * filled in via exec_args() */ ADDAUX(aux, AT_BASE, voffset) ADDAUX(aux, AT_FLAGS, at_flags) @@ -762,7 +913,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * malicious user within the zone from crafting a wrapper to * run native suid commands with unsecure libraries interposed. 
*/ - if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && + if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && (setid &= ~EXECSETID_SETID) != 0)) auxf &= ~AF_SUN_SETUGID; @@ -775,6 +926,18 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ((char *)&aux->a_type - (char *)bigwad->elfargs)); ADDAUX(aux, AT_SUN_AUXFLAGS, auxf); + + /* + * Record information about the real and effective user and + * group IDs. + */ + if (cred != NULL) { + ADDAUX(aux, AT_SUN_UID, crgetuid(cred)); + ADDAUX(aux, AT_SUN_RUID, crgetruid(cred)); + ADDAUX(aux, AT_SUN_GID, crgetgid(cred)); + ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred)); + } + /* * Hardware capability flag word (performance hints) * Used for choosing faster library routines. @@ -804,8 +967,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX1, 0) ADDAUX(aux, AT_SUN_BRAND_AUX2, 0) ADDAUX(aux, AT_SUN_BRAND_AUX3, 0) + ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } + /* + * Add the comm page auxv entry, mapping it in if needed. 
+ */ +#if defined(__amd64) && !defined(__xpv) + if (args->commpage != NULL || + (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) { + ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage) + } +#endif /* defined(__amd64) && !defined(__xpv) */ + ADDAUX(aux, AT_NULL, 0) postfixsize = (char *)aux - (char *)bigwad->elfargs; @@ -845,6 +1019,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, } bzero(up->u_auxv, sizeof (up->u_auxv)); + up->u_commpagep = args->commpage; if (postfixsize) { int num_auxv; @@ -911,6 +1086,8 @@ bad: if (error == 0) error = ENOEXEC; out: + if (dynuphdr) + kmem_free(uphdr, sizeof (Phdr)); if (phdrbase != NULL) kmem_free(phdrbase, phdrsize); if (cap != NULL) @@ -1177,6 +1354,29 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, return (0); } + +#ifdef _ELF32_COMPAT +int +elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs, + caddr_t *phbasep, ssize_t *phsizep) +#else +int +elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs, + caddr_t *phbasep, ssize_t *phsizep) +#endif +{ + int error, nshdrs, shstrndx; + + if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx, + nphdrs)) != 0 || + (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep, + phsizep)) != 0) { + return (error); + } + return (0); +} + + static int mapelfexec( vnode_t *vp, @@ -1197,7 +1397,7 @@ mapelfexec( size_t *brksize) { Phdr *phdr; - int i, prot, error; + int i, prot, error, lastprot = 0; caddr_t addr = NULL; size_t zfodsz; int ptload = 0; @@ -1205,43 +1405,78 @@ mapelfexec( off_t offset; int hsize = ehdr->e_phentsize; caddr_t mintmp = (caddr_t)-1; + uintptr_t lastaddr = NULL; extern int use_brk_lpg; if (ehdr->e_type == ET_DYN) { - /* - * Obtain the virtual address of a hole in the - * address space to map the "interpreter". - */ - map_addr(&addr, len, (offset_t)0, 1, 0); - if (addr == NULL) - return (ENOMEM); - *voffset = (intptr_t)addr; + caddr_t vaddr; /* - * Calculate the minimum vaddr so it can be subtracted out. 
- * According to the ELF specification, since PT_LOAD sections - * must be sorted by increasing p_vaddr values, this is - * guaranteed to be the first PT_LOAD section. + * Despite the fact that mmapobj(2) refuses to load them, we + * need to support executing ET_DYN objects that have a + * non-NULL p_vaddr. When found in the wild, these objects + * are likely to be due to an old (and largely obviated) Linux + * facility, prelink(8), that rewrites shared objects to + * prefer specific (disjoint) virtual address ranges. (Yes, + * this is putatively for performance -- and yes, it has + * limited applicability, many edge conditions and grisly + * failure modes; even for Linux, it's insane.) As ELF + * mandates that the PT_LOAD segments be in p_vaddr order, we + * find the lowest p_vaddr by finding the first PT_LOAD + * segment. */ phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { if (phdr->p_type == PT_LOAD) { - *voffset -= (uintptr_t)phdr->p_vaddr; + addr = (caddr_t)(uintptr_t)phdr->p_vaddr; break; } phdr = (Phdr *)((caddr_t)phdr + hsize); } + /* + * We have a non-zero p_vaddr in the first PT_LOAD segment -- + * presumably because we're directly executing a prelink(8)'d + * ld-linux.so. While we could correctly execute such an + * object without locating it at its desired p_vaddr (it is, + * after all, still relocatable), our inner antiquarian + * derives a perverse pleasure in accommodating the steampunk + * prelink(8) contraption -- goggles on! + */ + if ((vaddr = addr) != NULL) { + if (as_gap(curproc->p_as, len, + &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) { + addr = NULL; + } + } + + if (addr == NULL) { + /* + * We either have a NULL p_vaddr (the common case, by + * many orders of magnitude) or we have a non-NULL + * p_vaddr and we were unable to obtain the specified + * VA range (presumably because it's an illegal + * address). Either way, obtain an address in which + * to map the interpreter. 
+ */ + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL) + return (ENOMEM); + } + + /* + * Our voffset is the difference between where we landed and + * where we wanted to be. + */ + *voffset = (uintptr_t)addr - (uintptr_t)vaddr; } else { *voffset = 0; } + phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { switch (phdr->p_type) { case PT_LOAD: - if ((*dyphdr != NULL) && (*uphdr == NULL)) - return (0); - ptload = 1; prot = PROT_USER; if (phdr->p_flags & PF_R) @@ -1253,6 +1488,34 @@ mapelfexec( addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset); + if ((*dyphdr != NULL) && uphdr != NULL && + (*uphdr == NULL)) { + /* + * The PT_PHDR program header is, strictly + * speaking, optional. If we find that this + * is missing, we will determine the location + * of the program headers based on the address + * of the lowest PT_LOAD segment (namely, this + * one): we subtract the p_offset to get to + * the ELF header and then add back the program + * header offset to get to the program headers. + * We then cons up a Phdr that corresponds to + * the (missing) PT_PHDR, setting the flags + * to 0 to denote that this is artificial and + * should (must) be freed by the caller. + */ + Phdr *cons; + + cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP); + + cons->p_flags = 0; + cons->p_type = PT_PHDR; + cons->p_vaddr = ((uintptr_t)addr - + phdr->p_offset) + ehdr->e_phoff; + + *uphdr = cons; + } + /* * Keep track of the segment with the lowest starting * address. @@ -1260,6 +1523,41 @@ mapelfexec( if (addr < mintmp) mintmp = addr; + /* + * Segments need not correspond to page boundaries: + * they are permitted to share a page. If two PT_LOAD + * segments share the same page, and the permissions + * of the segments differ, the behavior is historically + * that the permissions of the latter segment are used + * for the page that the two segments share. 
This is + * also historically a non-issue: binaries generated + * by most anything will make sure that two PT_LOAD + * segments with differing permissions don't actually + * share any pages. However, there exist some crazy + * things out there (including at least an obscure + * Portuguese teaching language called G-Portugol) that + * actually do the wrong thing and expect it to work: + * they have a segment with execute permission share + * a page with a subsequent segment that does not + * have execute permissions and expect the resulting + * shared page to in fact be executable. To accommodate + * such broken link editors, we take advantage of a + * latitude explicitly granted to the loader: it is + * permitted to make _any_ PT_LOAD segment executable + * (provided that it is readable or writable). If we + * see that we're sharing a page and that the previous + * page was executable, we will add execute permissions + * to our segment. + */ + if (btop(lastaddr) == btop((uintptr_t)addr) && + (phdr->p_flags & (PF_R | PF_W)) && + (lastprot & PROT_EXEC)) { + prot |= PROT_EXEC; + } + + lastaddr = (uintptr_t)addr + phdr->p_filesz; + lastprot = prot; + zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; offset = phdr->p_offset; @@ -1324,8 +1622,22 @@ mapelfexec( break; case PT_INTERP: - if (ptload) - goto bad; + /* + * The ELF specification is unequivocal about the + * PT_INTERP program header with respect to any PT_LOAD + * program header: "If it is present, it must precede + * any loadable segment entry." Linux, however, makes + * no attempt to enforce this -- which has allowed some + * binary editing tools to get away with generating + * invalid ELF binaries in the respect that PT_INTERP + * occurs after the first PT_LOAD program header. This + * is unfortunate (and of course, disappointing) but + * it's no worse than that: there is no reason that we + * can't process the PT_INTERP entry (if present) after + * one or more PT_LOAD entries. 
We therefore + * deliberately do not check ptload here and always + * store dyphdr to be the PT_INTERP program header. + */ *dyphdr = phdr; break; @@ -1334,9 +1646,12 @@ mapelfexec( break; case PT_PHDR: - if (ptload) + if (ptload || phdr->p_flags == 0) goto bad; - *uphdr = phdr; + + if (uphdr != NULL) + *uphdr = phdr; + break; case PT_NULL: @@ -2185,7 +2500,7 @@ static struct modlexec modlexec = { extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action); + int *brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c index 269ba86b1b..512cab2b66 100644 --- a/usr/src/uts/common/exec/intp/intp.c +++ b/usr/src/uts/common/exec/intp/intp.c @@ -22,6 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -47,6 +48,7 @@ #include <sys/kmem.h> #include <sys/note.h> #include <sys/sdt.h> +#include <sys/brand.h> /* * This is the loadable module wrapper. @@ -54,7 +56,7 @@ #include <sys/modctl.h> extern int intpexec(struct vnode *, struct execa *, struct uarg *, - struct intpdata *, int, long *, int, caddr_t, struct cred *, int); + struct intpdata *, int, long *, int, caddr_t, struct cred *, int *); static struct execsw esw = { intpmagicstr, @@ -126,13 +128,20 @@ getintphead(struct vnode *vp, struct intpdata *idatap) *cp = '\0'; /* - * Locate the beginning and end of the interpreter name. - * In addition to the name, one additional argument may - * optionally be included here, to be prepended to the - * arguments provided on the command line. Thus, for - * example, you can say + * Locate the beginning and end of the interpreter name. 
Historically, + * for illumos and its predecessors, in addition to the name, one + * additional argument may optionally be included here, to be prepended + * to the arguments provided on the command line. Thus, for example, + * you can say * * #! /usr/bin/awk -f + * + * However, handling of interpreter arguments varies across operating + * systems and other systems allow more than one argument. In + * particular, Linux allows more than one and delivers all arguments + * as a single string (argv[1] is "-arg1 -arg2 ..."). We support this + * style of argument handling as a brand-specific option (setting + * b_intp_parse_arg to B_FALSE). */ for (cp = &linep[2]; *cp == ' '; cp++) ; @@ -151,9 +160,12 @@ getintphead(struct vnode *vp, struct intpdata *idatap) idatap->intp_arg[0] = NULL; else { idatap->intp_arg[0] = cp; - while (*cp && *cp != ' ') - cp++; - *cp = '\0'; + if (!PROC_IS_BRANDED(curproc) || + BROP(curproc)->b_intp_parse_arg) { + while (*cp && *cp != ' ') + cp++; + *cp = '\0'; + } } } return (0); @@ -188,9 +200,8 @@ intpexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { - _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; int error = 0; struct intpdata idata; @@ -281,7 +292,7 @@ intpexec( } error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, - EBA_NONE); + brand_action); if (!error) { /* diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c index fdc327dcbb..5170fda5cb 100644 --- a/usr/src/uts/common/exec/java/java.c +++ b/usr/src/uts/common/exec/java/java.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ /* @@ -85,7 +86,7 @@ char *jexec_arg = "-jar"; static int javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t execfile, cred_t *cred, int brand_action) + caddr_t execfile, cred_t *cred, int *brand_action) { struct intpdata idata; int error; diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c index ee5060a07e..016d87b9ef 100644 --- a/usr/src/uts/common/exec/shbin/shbin.c +++ b/usr/src/uts/common/exec/shbin/shbin.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -58,7 +59,7 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action); + int *brand_action); #define SHBIN_CNTL(x) ((x)&037) #define SHBINMAGIC_LEN 4 @@ -162,7 +163,7 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index 4eaf38f484..41441ec52d 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. 
+ */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targeting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. 
+ */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. 
*/ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. 
+ */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... 
+ */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == NULL) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
+ */ + if (SDEVTOV(ddv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c new file mode 100644 index 0000000000..885191175f --- /dev/null +++ b/usr/src/uts/common/fs/dev/sdev_plugin.c @@ -0,0 +1,913 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* + * Dynamic directory plugin interface for sdev. 
+ * + * The sdev plugin interface provides a means for a dynamic directory based on + * in-kernel state to be simply created. Traditionally, dynamic directories were + * built into sdev itself. While these legacy plugins are useful, it makes more + * sense for these pieces of functionality to live with the individual drivers. + * + * The plugin interface requires folks to implement three interfaces and + * provides a series of callbacks that can be made in the context of those + * interfaces to interrogate the sdev_node_t without having to leak + * implementation details of the sdev_node_t. These interfaces are: + * + * o spo_validate + * + * Given a particular node, answer the question as to whether or not this + * entry is still valid. Here, plugins should use the name and the dev_t + * associated with the node to verify that it matches something that still + * exists. + * + * o spo_filldir + * + * Fill all the entries inside of a directory. Note that some of these entries + * may already exist. + * + * o spo_inactive + * + * The given node is no longer being used. This allows the consumer to + * potentially tear down anything that was being held open related to this. + * Note that this only fires when the given sdev_node_t becomes a zombie. + * + * During these callbacks a consumer is not allowed to register or unregister a + * plugin, especially their own. They may call the sdev_ctx style functions. All + * callbacks fire in a context where blocking is allowed (e.g. the spl is below + * LOCK_LEVEL). + * + * When a plugin is added, we create its directory in the global zone. By doing + * that, we ensure that something isn't already there and that nothing else can + * come along and try and create something without our knowledge.
We only have + * to create it in the GZ and not for all other instances of sdev because an + * instance of sdev that isn't at /dev does not have dynamic directories, and + * second, any instance of sdev present in a non-global zone cannot create + * anything, therefore we know that by it not being in the global zone's + * instance of sdev that we're good to go. + * + * Lock Ordering + * ------------- + * + * The global sdev_plugin_lock must be held before any of the individual + * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held, + * it is not legal to take any holds on any sdev_node_t or to grab the + * sdev_node_t`contents_lock in any way. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/fs/sdev_impl.h> +#include <sys/fs/sdev_plugin.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/list.h> +#include <sys/ctype.h> + +kmutex_t sdev_plugin_lock; +list_t sdev_plugin_list; +kmem_cache_t *sdev_plugin_cache; +struct vnodeops *sdev_plugin_vnops; + +#define SDEV_PLUGIN_NAMELEN 64 + +typedef struct sdev_plugin { + list_node_t sp_link; + char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */ + int sp_nflags; /* E */ + struct vnodeops *sp_vnops; /* E */ + sdev_plugin_ops_t *sp_pops; /* E */ + boolean_t sp_islegacy; /* E */ + int (*sp_lvtor)(sdev_node_t *); /* E */ + kmutex_t sp_lock; /* Protects everything below */ + kcondvar_t sp_nodecv; + size_t sp_nnodes; +} sdev_plugin_t; + +/* ARGSUSED */ +static int +sdev_plugin_cache_constructor(void *buf, void *arg, int tags) +{ + sdev_plugin_t *spp = buf; + mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0); + cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +sdev_plugin_cache_destructor(void *buf, void *arg) +{ + sdev_plugin_t *spp = buf; + cv_destroy(&spp->sp_nodecv); + mutex_destroy(&spp->sp_lock); +} + +enum vtype +sdev_ctx_vtype(sdev_ctx_t ctx) +{ + 
sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_vnode->v_type); +} + +const char * +sdev_ctx_path(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_path); +} + +const char * +sdev_ctx_name(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_name); +} + +/* + * Currently we only support passing through a single flag -- SDEV_IS_GLOBAL. + */ +sdev_ctx_flags_t +sdev_ctx_flags(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_flags & SDEV_GLOBAL); +} + +/* + * Return some amount of private data specific to the vtype. In the case of a + * character or block device this is the device number. + */ +const void * +sdev_ctx_vtype_data(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + void *ret; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + switch (sdp->sdev_vnode->v_type) { + case VCHR: + case VBLK: + ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev); + break; + default: + ret = NULL; + break; + } + + return (ret); +} + +/* + * Use the same rules as zones for a name: alphanumerics plus '-', '_', and '.'.
+ */ +static int +sdev_plugin_name_isvalid(const char *c, int buflen) +{ + int i; + + for (i = 0; i < buflen; i++, c++) { + if (*c == '\0') + return (1); + + if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.') + return (0); + } + /* Never found a null terminator */ + return (0); +} + +static int +sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name, + vattr_t *vap) +{ + int ret; + sdev_node_t *svp; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + ASSERT(spp != NULL); + svp = sdev_cache_lookup(sdvp, name); + if (svp != NULL) { + SDEV_SIMPLE_RELE(svp); + return (EEXIST); + } + + ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred, + SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(svp); + + return (0); +} + +/* + * Plugin node creation callbacks + */ +int +sdev_plugin_mkdir(sdev_ctx_t ctx, char *name) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(sdvp->sdev_private != NULL); + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +int +sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + if (mode != S_IFCHR && mode != S_IFBLK) + return (EINVAL); + + ASSERT(sdvp->sdev_private != NULL); + + vap = *sdev_getdefault_attr(mode == S_IFCHR ? 
VCHR : VBLK); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + vap.va_rdev = dev; + vap.va_mode = mode | 0666; + + /* Despite the similar name, this is in fact a different function */ + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); + +} + +static int +sdev_plugin_validate(sdev_node_t *sdp) +{ + int ret; + sdev_plugin_t *spp; + + ASSERT(sdp->sdev_private != NULL); + spp = sdp->sdev_private; + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + rw_enter(&sdp->sdev_contents, RW_READER); + ret = spp->sp_pops->spo_validate((uintptr_t)sdp); + rw_exit(&sdp->sdev_contents); + return (ret); +} + +static void +sdev_plugin_validate_dir(sdev_node_t *sdvp) +{ + int ret; + sdev_node_t *svp, *next; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) { + + next = SDEV_NEXT_ENTRY(sdvp, svp); + ASSERT(svp->sdev_state != SDEV_ZOMBIE); + /* skip nodes that aren't ready */ + if (svp->sdev_state == SDEV_INIT) + continue; + + switch (sdev_plugin_validate(svp)) { + case SDEV_VTOR_VALID: + case SDEV_VTOR_SKIP: + continue; + case SDEV_VTOR_INVALID: + case SDEV_VTOR_STALE: + break; + } + + SDEV_HOLD(svp); + + /* + * Clean out everything underneath this node before we + * remove it. 
+ */ + if (svp->sdev_vnode->v_type == VDIR) { + ret = sdev_cleandir(svp, NULL, 0); + ASSERT(ret == 0); + } + /* remove the cache node */ + (void) sdev_cache_update(sdvp, &svp, svp->sdev_name, + SDEV_CACHE_DELETE); + SDEV_RELE(svp); + } +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, + int *eofp, caller_context_t *ct_unused, int flags_unused) +{ + int ret; + sdev_node_t *sdvp = VTOSDEV(dvp); + sdev_plugin_t *spp; + + ASSERT(RW_READ_HELD(&sdvp->sdev_contents)); + + /* Sanity check we're not a zombie before we do anything else */ + if (sdvp->sdev_state == SDEV_ZOMBIE) + return (ENOENT); + + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + if (uiop->uio_offset == 0) { + /* + * We upgrade to a write lock and grab the plugin's lock along + * the way. We're almost certainly going to get creation + * callbacks, so this is the only safe way to go. + */ + if (rw_tryupgrade(&sdvp->sdev_contents) == 0) { + rw_exit(&sdvp->sdev_contents); + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_downgrade(&sdvp->sdev_contents); + return (ENOENT); + } + } + + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_downgrade(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + } + + return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); +} + +/* + * If we don't have a callback function that returns a failure, then sdev will + * try to create a node for us which violates all of our basic assertions. To + * work around that we create our own callback for devname_lookup_func which + * always returns ENOENT as at this point either it was created with the filldir + * callback or it was not.
+ */ +/*ARGSUSED*/ +static int +sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred, + void *unused, char *unused2) +{ + return (ENOENT); +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + int ret; + sdev_node_t *sdvp; + sdev_plugin_t *spp; + + /* execute access is required to search the directory */ + if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (ret); + + sdvp = VTOSDEV(dvp); + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + /* + * Go straight for the write lock. + */ + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_exit(&sdvp->sdev_contents); + return (ENOENT); + } + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_exit(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + + return (devname_lookup_func(sdvp, nm, vpp, cred, + sdev_plugin_vop_lookup_cb, SDEV_VATTR)); +} + +/* + * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes + * to zero, but it isn't necessarily a zombie yet. As such, to make things easier + * for users, we only fire the inactive callback when the node becomes a zombie + * and thus will be torn down here.
+ */ +static void +sdev_plugin_vop_inactive_cb(struct vnode *dvp) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + + rw_enter(&sdp->sdev_contents, RW_READER); + if (sdp->sdev_state != SDEV_ZOMBIE) { + rw_exit(&sdp->sdev_contents); + return; + } + spp->sp_pops->spo_inactive((uintptr_t)sdp); + mutex_enter(&spp->sp_lock); + VERIFY(spp->sp_nnodes > 0); + spp->sp_nnodes--; + cv_signal(&spp->sp_nodecv); + mutex_exit(&spp->sp_lock); + rw_exit(&sdp->sdev_contents); +} + +/*ARGSUSED*/ +static void +sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred, + caller_context_t *ct) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + ASSERT(sdp->sdev_private != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb); +} + +const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = { + VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir }, + VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup }, + VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive }, + VOPNAME_CREATE, { .error = fs_nosys }, + VOPNAME_REMOVE, { .error = fs_nosys }, + VOPNAME_MKDIR, { .error = fs_nosys }, + VOPNAME_RMDIR, { .error = fs_nosys }, + VOPNAME_SYMLINK, { .error = fs_nosys }, + VOPNAME_SETSECATTR, { .error = fs_nosys }, + NULL, NULL +}; + +/* + * construct a new template with overrides from vtab + */ +static fs_operation_def_t * +sdev_merge_vtab(const fs_operation_def_t tab[]) +{ + fs_operation_def_t *new; + const fs_operation_def_t *tab_entry; + + /* make a copy of standard vnode ops table */ + new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); + bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); + + /* replace the overrides from tab */ + for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { + fs_operation_def_t *std_entry = new; + while (std_entry->name) { + if (strcmp(tab_entry->name, std_entry->name) == 0) { + std_entry->func = tab_entry->func; + 
break; + } + std_entry++; + } + } + + return (new); +} + +/* free memory allocated by sdev_merge_vtab */ +static void +sdev_free_vtab(fs_operation_def_t *new) +{ + kmem_free(new, sdev_vnodeops_tbl_size); +} + +/* + * Register a new plugin. + */ +sdev_plugin_hdl_t +sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp) +{ + int ret, err; + sdev_plugin_t *spp, *iter; + vnode_t *vp, *nvp; + sdev_node_t *sdp, *slp; + timestruc_t now; + struct vattr vap; + + /* + * Some consumers don't care about why they failed. To keep the code + * simple, we'll just pretend they gave us something. + */ + if (errp == NULL) + errp = &err; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_version != 1) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_validate == NULL || ops->spo_filldir == NULL || + ops->spo_inactive == NULL) { + *errp = EINVAL; + return (NULL); + } + + if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) { + *errp = EINVAL; + return (NULL); + } + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN); + + spp->sp_pops = ops; + spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR; + if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE) + spp->sp_nflags |= SDEV_NO_NCACHE; + if (ops->spo_flags & SDEV_PLUGIN_SUBDIR) + spp->sp_nflags |= SDEV_SUBDIR; + spp->sp_vnops = sdev_plugin_vnops; + spp->sp_islegacy = B_FALSE; + spp->sp_lvtor = NULL; + spp->sp_nnodes = 0; + + /* + * Make sure it's unique, nothing exists with this name already, and add + * it to the list. We also need to go through and grab the sdev + * root node as we cannot grab any sdev node locks once we've grabbed + * the sdev_plugin_lock. We effectively assert that if a directory is + * not present in the GZ's /dev, then it doesn't exist in any of the + * local zones. 
+ */ + ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1); + if (ret != 0) { + *errp = ret; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + /* Make sure we have the real vnode */ + if (VOP_REALVP(vp, &nvp, NULL) == 0) { + VN_HOLD(nvp); + VN_RELE(vp); + vp = nvp; + nvp = NULL; + } + VERIFY(vp->v_op == sdev_vnodeops); + sdp = VTOSDEV(vp); + rw_enter(&sdp->sdev_contents, RW_WRITER); + slp = sdev_cache_lookup(sdp, spp->sp_name); + if (slp != NULL) { + SDEV_RELE(slp); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + + mutex_enter(&sdev_plugin_lock); + for (iter = list_head(&sdev_plugin_list); iter != NULL; + iter = list_next(&sdev_plugin_list, iter)) { + if (strcmp(spp->sp_name, iter->sp_name) == 0) { + mutex_exit(&sdev_plugin_lock); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + } + + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + /* + * Now go ahead and create the top level directory for the global zone. + */ + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap); + + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + + return ((sdev_plugin_hdl_t)spp); +} + +static void +sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg) +{ + sdev_plugin_t *spp = arg; + sdev_node_t *sdp; + + rw_enter(&rdp->sdev_contents, RW_WRITER); + sdp = sdev_cache_lookup(rdp, spp->sp_name); + /* If it doesn't exist, we're done here */ + if (sdp == NULL) { + rw_exit(&rdp->sdev_contents); + return; + } + + /* + * We first delete the directory before recursively marking everything + * else stale. This ordering should ensure that we don't accidentally + * miss anything. 
+ */ + sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE); + sdev_stale(sdp); + SDEV_RELE(sdp); + rw_exit(&rdp->sdev_contents); +} + +/* + * Remove a plugin. This will block until everything has become a zombie, thus + * guaranteeing the caller that nothing will call into them again once this call + * returns. While the call is ongoing, it could be called into. Note that while + * this is ongoing, it will block other mounts. + */ +int +sdev_plugin_unregister(sdev_plugin_hdl_t hdl) +{ + sdev_plugin_t *spp = (sdev_plugin_t *)hdl; + if (spp->sp_islegacy) + return (EINVAL); + + mutex_enter(&sdev_plugin_lock); + list_remove(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + sdev_mnt_walk(sdev_plugin_unregister_cb, spp); + mutex_enter(&spp->sp_lock); + while (spp->sp_nnodes > 0) + cv_wait(&spp->sp_nodecv, &spp->sp_lock); + mutex_exit(&spp->sp_lock); + kmem_cache_free(sdev_plugin_cache, spp); + return (0); +} + +/* + * Register an old sdev style plugin to deal with what used to be in the vtab. + */ +static int +sdev_plugin_register_legacy(struct sdev_vop_table *vtp) +{ + sdev_plugin_t *spp; + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN); + spp->sp_islegacy = B_TRUE; + spp->sp_pops = NULL; + spp->sp_nflags = vtp->vt_flags; + spp->sp_lvtor = vtp->vt_vtor; + spp->sp_nnodes = 0; + + if (vtp->vt_service != NULL) { + fs_operation_def_t *templ; + templ = sdev_merge_vtab(vtp->vt_service); + if (vn_make_ops(vtp->vt_name, + (const fs_operation_def_t *)templ, + &spp->sp_vnops) != 0) { + cmn_err(CE_WARN, "%s: malformed vnode ops\n", + vtp->vt_name); + sdev_free_vtab(templ); + kmem_cache_free(sdev_plugin_cache, spp); + return (1); + } + + if (vtp->vt_global_vops) { + *(vtp->vt_global_vops) = spp->sp_vnops; + } + + sdev_free_vtab(templ); + } else { + spp->sp_vnops = sdev_vnodeops; + } + + /* + * No need to check for EEXIST here. 
These are loaded as a part of the + * sdev's initialization function. Further, we don't have to create them + * as that's taken care of in sdev's mount for the GZ. + */ + mutex_enter(&sdev_plugin_lock); + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + return (0); +} + +/* + * We need to match off of the sdev_path, not the sdev_name. We are only allowed + * to exist directly under /dev. + */ +static sdev_plugin_t * +sdev_match(sdev_node_t *dv) +{ + int vlen; + const char *path; + sdev_plugin_t *spp; + + if (strlen(dv->sdev_path) <= 5) + return (NULL); + + if (strncmp(dv->sdev_path, "/dev/", 5) != 0) + return (NULL); + path = dv->sdev_path + 5; + + mutex_enter(&sdev_plugin_lock); + + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + if (spp->sp_nflags & SDEV_SUBDIR) { + vlen = strlen(spp->sp_name); + if ((strncmp(spp->sp_name, path, + vlen - 1) == 0) && path[vlen] == '/') { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + } + } + + mutex_exit(&sdev_plugin_lock); + return (NULL); +} + +void +sdev_set_no_negcache(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + ASSERT(dv->sdev_path); + path = dv->sdev_path + strlen("/dev/"); + + mutex_enter(&sdev_plugin_lock); + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + if (spp->sp_nflags & SDEV_NO_NCACHE) + dv->sdev_flags |= SDEV_NO_NCACHE; + break; + } + } + mutex_exit(&sdev_plugin_lock); +} + +struct vnodeops * +sdev_get_vop(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + path = dv->sdev_path; + ASSERT(path); + + /* gets the relative path to /dev/ */ + path += 5; + + if ((spp = sdev_match(dv)) != NULL) { + dv->sdev_flags |= spp->sp_nflags; + if (SDEV_IS_PERSIST(dv->sdev_dotdot) && + (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) 
+ dv->sdev_flags |= SDEV_PERSIST; + return (spp->sp_vnops); + } + + /* child inherits the persistence of the parent */ + if (SDEV_IS_PERSIST(dv->sdev_dotdot)) + dv->sdev_flags |= SDEV_PERSIST; + return (sdev_vnodeops); +} + +void * +sdev_get_vtor(sdev_node_t *dv) +{ + sdev_plugin_t *spp; + + if (dv->sdev_private == NULL) { + spp = sdev_match(dv); + if (spp == NULL) + return (NULL); + } else { + spp = dv->sdev_private; + } + + if (spp->sp_islegacy) + return ((void *)spp->sp_lvtor); + else + return ((void *)sdev_plugin_validate); +} + +void +sdev_plugin_nodeready(sdev_node_t *sdp) +{ + sdev_plugin_t *spp; + + ASSERT(RW_WRITE_HELD(&sdp->sdev_contents)); + ASSERT(sdp->sdev_private == NULL); + + spp = sdev_match(sdp); + if (spp == NULL) + return; + if (spp->sp_islegacy) + return; + sdp->sdev_private = spp; + mutex_enter(&spp->sp_lock); + spp->sp_nnodes++; + mutex_exit(&spp->sp_lock); +} + +int +sdev_plugin_init(void) +{ + sdev_vop_table_t *vtp; + fs_operation_def_t *templ; + + sdev_plugin_cache = kmem_cache_create("sdev_plugin", + sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor, + sdev_plugin_cache_destructor, NULL, NULL, NULL, 0); + if (sdev_plugin_cache == NULL) + return (1); + mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&sdev_plugin_list, sizeof (sdev_plugin_t), + offsetof(sdev_plugin_t, sp_link)); + + /* + * Register all of the legacy vnops + */ + for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++) + if (sdev_plugin_register_legacy(vtp) != 0) + return (1); + + templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl); + if (vn_make_ops("sdev_plugin", + (const fs_operation_def_t *)templ, + &sdev_plugin_vnops) != 0) { + sdev_free_vtab(templ); + return (1); + } + + sdev_free_vtab(templ); + return (0); +} diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index 9234cc4a0c..511432453f 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ -150,12 +150,6 @@ vattr_t 
sdev_vattr_chr = { kmem_cache_t *sdev_node_cache; /* sdev_node cache */ int devtype; /* fstype */ -/* static */ -static struct vnodeops *sdev_get_vop(struct sdev_node *); -static void sdev_set_no_negcache(struct sdev_node *); -static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []); -static void sdev_free_vtab(fs_operation_def_t *); - static void sdev_prof_free(struct sdev_node *dv) { @@ -313,6 +307,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv, (void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm); /* overwritten for VLNK nodes */ dv->sdev_symlink = NULL; + list_link_init(&dv->sdev_plist); vp = SDEVTOV(dv); vn_reinit(vp); @@ -401,6 +396,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp, } else { dv->sdev_nlink = 1; } + sdev_plugin_nodeready(dv); if (!(SDEV_IS_GLOBAL(dv))) { dv->sdev_origin = (struct sdev_node *)args; @@ -497,37 +493,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp, return (dv); } -/* directory dependent vop table */ -struct sdev_vop_table { - char *vt_name; /* subdirectory name */ - const fs_operation_def_t *vt_service; /* vnodeops table */ - struct vnodeops *vt_vops; /* constructed vop */ - struct vnodeops **vt_global_vops; /* global container for vop */ - int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ - int vt_flags; -}; - -/* - * A nice improvement would be to provide a plug-in mechanism - * for this table instead of a const table. 
- */ -static struct sdev_vop_table vtab[] = -{ - { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate, +struct sdev_vop_table vtab[] = { + { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate, + { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, + { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops, devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, + { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE }, - { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate, - SDEV_DYNAMIC | SDEV_VTOR }, + { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate, + SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops, + { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops, devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE }, /* @@ -542,132 +523,14 @@ static struct sdev_vop_table vtab[] = * preventing a mkdir. */ - { "lofi", NULL, NULL, NULL, NULL, + { "lofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { "rlofi", NULL, NULL, NULL, NULL, + { "rlofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { NULL, NULL, NULL, NULL, NULL, 0} + { NULL, NULL, NULL, NULL, 0} }; -/* - * We need to match off of the sdev_path, not the sdev_name. We are only allowed - * to exist directly under /dev. 
- */ -struct sdev_vop_table * -sdev_match(struct sdev_node *dv) -{ - int vlen; - int i; - const char *path; - - if (strlen(dv->sdev_path) <= 5) - return (NULL); - - if (strncmp(dv->sdev_path, "/dev/", 5) != 0) - return (NULL); - path = dv->sdev_path + 5; - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) - return (&vtab[i]); - if (vtab[i].vt_flags & SDEV_SUBDIR) { - vlen = strlen(vtab[i].vt_name); - if ((strncmp(vtab[i].vt_name, path, - vlen - 1) == 0) && path[vlen] == '/') - return (&vtab[i]); - } - - } - return (NULL); -} - -/* - * sets a directory's vnodeops if the directory is in the vtab; - */ -static struct vnodeops * -sdev_get_vop(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - char *path; - - path = dv->sdev_path; - ASSERT(path); - - /* gets the relative path to /dev/ */ - path += 5; - - /* gets the vtab entry it matches */ - if ((vtp = sdev_match(dv)) != NULL) { - dv->sdev_flags |= vtp->vt_flags; - if (SDEV_IS_PERSIST(dv->sdev_dotdot) && - (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) - dv->sdev_flags |= SDEV_PERSIST; - - if (vtp->vt_vops) { - if (vtp->vt_global_vops) - *(vtp->vt_global_vops) = vtp->vt_vops; - - return (vtp->vt_vops); - } - - if (vtp->vt_service) { - fs_operation_def_t *templ; - templ = sdev_merge_vtab(vtp->vt_service); - if (vn_make_ops(vtp->vt_name, - (const fs_operation_def_t *)templ, - &vtp->vt_vops) != 0) { - cmn_err(CE_PANIC, "%s: malformed vnode ops\n", - vtp->vt_name); - /*NOTREACHED*/ - } - if (vtp->vt_global_vops) { - *(vtp->vt_global_vops) = vtp->vt_vops; - } - sdev_free_vtab(templ); - - return (vtp->vt_vops); - } - - return (sdev_vnodeops); - } - - /* child inherits the persistence of the parent */ - if (SDEV_IS_PERSIST(dv->sdev_dotdot)) - dv->sdev_flags |= SDEV_PERSIST; - - return (sdev_vnodeops); -} - -static void -sdev_set_no_negcache(struct sdev_node *dv) -{ - int i; - char *path; - - ASSERT(dv->sdev_path); - path = dv->sdev_path + strlen("/dev/"); - - for (i = 0; vtab[i].vt_name; 
i++) { - if (strcmp(vtab[i].vt_name, path) == 0) { - if (vtab[i].vt_flags & SDEV_NO_NCACHE) - dv->sdev_flags |= SDEV_NO_NCACHE; - break; - } - } -} - -void * -sdev_get_vtor(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - - vtp = sdev_match(dv); - if (vtp) - return ((void *)vtp->vt_vtor); - else - return (NULL); -} /* * Build the base root inode @@ -947,8 +810,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) dv->sdev_path = NULL; } - if (!SDEV_IS_GLOBAL(dv)) + if (!SDEV_IS_GLOBAL(dv)) { sdev_prof_free(dv); + if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL) + SDEV_RELE(dv->sdev_origin); + } if (SDEVTOV(dv)->v_type == VDIR) { ASSERT(SDEV_FIRST_ENTRY(dv) == NULL); @@ -962,6 +828,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) (void) memset((void *)&dv->sdev_instance_data, 0, sizeof (dv->sdev_instance_data)); vn_invalid(SDEVTOV(dv)); + dv->sdev_private = NULL; kmem_cache_free(sdev_node_cache, dv); } @@ -2944,46 +2811,6 @@ sdev_modctl_devexists(const char *path) return (error); } -extern int sdev_vnodeops_tbl_size; - -/* - * construct a new template with overrides from vtab - */ -static fs_operation_def_t * -sdev_merge_vtab(const fs_operation_def_t tab[]) -{ - fs_operation_def_t *new; - const fs_operation_def_t *tab_entry; - - /* make a copy of standard vnode ops table */ - new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); - bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); - - /* replace the overrides from tab */ - for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { - fs_operation_def_t *std_entry = new; - while (std_entry->name) { - if (strcmp(tab_entry->name, std_entry->name) == 0) { - std_entry->func = tab_entry->func; - break; - } - std_entry++; - } - if (std_entry->name == NULL) - cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.", - tab_entry->name); - } - - return (new); -} - -/* free memory allocated by sdev_merge_vtab */ -static void -sdev_free_vtab(fs_operation_def_t *new) -{ - 
kmem_free(new, sdev_vnodeops_tbl_size); -} - /* * a generic setattr() function * diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c index 00e981ce9c..8de16926cd 100644 --- a/usr/src/uts/common/fs/dev/sdev_vfsops.c +++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c @@ -172,7 +172,13 @@ devinit(int fstype, char *name) if ((devmajor = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name); - return (1); + return (ENXIO); + } + + if (sdev_plugin_init() != 0) { + cmn_err(CE_WARN, "%s: failed to set init plugin subsystem", + sdev_vfssw.name); + return (EIO); } /* initialize negative cache */ @@ -349,6 +355,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, ASSERT(sdev_origins); dv->sdev_flags &= ~SDEV_GLOBAL; dv->sdev_origin = sdev_origins->sdev_root; + SDEV_HOLD(dv->sdev_origin); } else { sdev_ncache_setup(); rw_enter(&dv->sdev_contents, RW_WRITER); @@ -521,3 +528,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo) SDEVTOV(mntinfo->sdev_root)->v_count--; mutex_exit(&sdev_lock); } + +void +sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg) +{ + struct sdev_data *mntinfo; + + mutex_enter(&sdev_lock); + mntinfo = sdev_mntinfo; + while (mntinfo != NULL) { + func(mntinfo->sdev_root, arg); + mntinfo = mntinfo->sdev_next; + } + mutex_exit(&sdev_lock); +} diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index 59a3c9f17a..6ce4b0b174 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -22,7 +22,7 @@ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ /* @@ -864,6 +864,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred, } } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } @@ -1188,6 +1191,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1260,6 +1264,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1375,6 +1380,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 11384e33d3..407ad1d55b 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -459,8 +459,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index 25327d2852..c949117da6 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop) } /* - * Perform a reverse lookup in the DNLC. 
This will find the first occurrence of - * the vnode. If successful, it will return the vnode of the parent, and the - * name of the entry in the given buffer. If it cannot be found, or the buffer - * is too small, then it will return NULL. Note that this is a highly - * inefficient function, since the DNLC is constructed solely for forward - * lookups. - */ -vnode_t * -dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) -{ - nc_hash_t *nch; - ncache_t *ncp; - vnode_t *pvp; - - if (!doingcache) - return (NULL); - - for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { - mutex_enter(&nch->hash_lock); - ncp = nch->hash_next; - while (ncp != (ncache_t *)nch) { - /* - * We ignore '..' entries since it can create - * confusion and infinite loops. - */ - if (ncp->vp == vp && !(ncp->namlen == 2 && - 0 == bcmp(ncp->name, "..", 2)) && - ncp->namlen < buflen) { - bcopy(ncp->name, buf, ncp->namlen); - buf[ncp->namlen] = '\0'; - pvp = ncp->dp; - /* VN_HOLD 2 of 2 in this file */ - VN_HOLD_CALLER(pvp); - mutex_exit(&nch->hash_lock); - return (pvp); - } - ncp = ncp->hash_next; - } - mutex_exit(&nch->hash_lock); - } - - return (NULL); -} -/* * Utility routine to search for a cache entry. Return the * ncache entry if found, NULL otherwise. */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b4e28cc860..5f524def30 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. 
Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. 
+#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ 
+ t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, 
cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, 
cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define 
FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + 
bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = 
func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. * * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. 
*/ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, 
addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2081,7 +2666,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return 
((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, 
vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char 
*linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } 
int @@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int @@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2531,8 
+3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2623,7 +3209,7 @@ 
vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, 
&arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..56204c6741 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. */ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..05ee2c6e09 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 /* must be power of 2 */ +/* Hash chains and their locks; each mutex guards many chains. */ +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) +/* Seed with the parent pointer >> 8, then hash = hash * 17 + c per byte. */ +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } +/* Initialize the mutexes guarding the hash chains. */ +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} +/* Hash h by (hld_parent, hld_name) and push it on the chain head. */ +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list.
*/ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + /* h must be on its chain: the walk below has no NULL check. */ + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} +/* Find name under parent; optionally hold and pass back its hlnode. */ +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched. + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it.
+ */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' into 'dir'; on success or EEXIST *hpp (if set) gets the hlnode. + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + /* NOTE(review): an existing entry with NULL hpp yields 0; confirm. */ + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made.
*/ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . (EINVAL) or .. (EEXIST) */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list.
*/ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking (parent must already be write-locked) + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries; the zeroed allocs NUL-terminate the names */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list.
*/ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + /* NOTE(review): presumably looped nodes own no dirents; confirm. */ + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} +/* Link a new dirent for name -> hp into dir (entry list and hash). */ +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS.
*/ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc (without sleeping) and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent.
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} +/* Build the hlnode for a new DE_CREATE file or DE_MKDIR directory. */ +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit.
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..c582a8cac2 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. 
+ * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. 
+ */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), + KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. 
+ */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. + */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. 
+ */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. 
*/ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. + */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. 
+ */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..a2064dfa1f --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1441 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single 
add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } 
+ nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int 
error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. 
+ */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. 
This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. + */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. 
+ */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. 
+ */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." + */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. 
+ */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. 
*/ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. + */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. 
+ */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if 
(VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { 
.vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 55ffb94805..59ec5d1829 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -57,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr) } /* + * Clean a stale v_path from a vnode. 
This is only performed if the v_path has + * not been altered since it was found to be stale + */ +static void +vnode_clear_vpath(vnode_t *vp, char *vpath_old) +{ + mutex_enter(&vp->v_lock); + if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) { + vp->v_path = vn_vpath_empty; + mutex_exit(&vp->v_lock); + kmem_free(vpath_old, strlen(vpath_old) + 1); + } else { + mutex_exit(&vp->v_lock); + } +} + +/* + * Validate that a pathname refers to a given vnode. + */ +static int +vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn, + int flags, cred_t *cr) +{ + vnode_t *compvp; + /* + * If we are in a zone or a chroot environment, then we have to + * take additional steps, since the path to the root might not + * be readable with the current credentials, even though the + * process can legitmately access the file. In this case, we + * do the following: + * + * lookuppnvp() with all privileges to get the resolved path. + * call localpath() to get the local portion of the path, and + * continue as normal. + * + * If the the conversion to a local path fails, then we continue + * as normal. This is a heuristic to make process object file + * paths available from within a zone. Because lofs doesn't + * support page operations, the vnode stored in the seg_t is + * actually the underlying real vnode, not the lofs node itself. + * Most of the time, the lofs path is the same as the underlying + * vnode (for example, /usr/lib/libc.so.1). + */ + if (vrootp != rootdir) { + char *local = NULL; + + VN_HOLD(rootdir); + if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir, + rootdir, kcred) == 0) { + local = localpath(rpn->pn_path, vrootp, kcred); + VN_RELE(compvp); + } + + /* + * The original pn was changed through lookuppnvp(). + * Set it to local for next validation attempt. 
+ */ + if (local) { + (void) pn_set(pn, local); + } else { + return (1); + } + } + + /* + * We should have a local path at this point, so start the search from + * the root of the current process. + */ + VN_HOLD(vrootp); + if (vrootp != rootdir) + VN_HOLD(vrootp); + if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, + cr) == 0) { + /* + * Check to see if the returned vnode is the same as the one we + * expect. + */ + if (vn_compare(vp, compvp) || + vnode_match(vp, compvp, cr)) { + VN_RELE(compvp); + return (0); + } else { + VN_RELE(compvp); + } + } + + return (1); +} + +/* * Given a directory, return the full, resolved path. This looks up "..", * searches for the given vnode in the parent, appends the component, etc. It * is used to implement vnodetopath() and getcwd() when the cached path fails. @@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, char *bufloc; size_t dlen = DIRENT64_RECLEN(MAXPATHLEN); refstr_t *mntpt; + char *vpath_cached; + boolean_t vpath_stale; /* Operation only allowed on directories */ ASSERT(vp->v_type == VDIR); @@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, * Shortcut: see if this vnode has correct v_path. If so, * we have the work done. 
*/ + vpath_cached = NULL; + vpath_stale = B_FALSE; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { - - if ((err = pn_set(&pn, vp->v_path)) == 0) { - mutex_exit(&vp->v_lock); - rpn.pn_path = rpn.pn_buf; - - /* - * Ensure the v_path pointing to correct vnode - */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - if (lookuppnvp(&pn, &rpn, flags, NULL, - &cmpvp, vrootp, vrootp, cr) == 0) { - - if (VN_CMP(vp, cmpvp)) { - VN_RELE(cmpvp); + if (vp->v_path != vn_vpath_empty && + pn_set(&pn, vp->v_path) == 0) { + vpath_cached = vp->v_path; + mutex_exit(&vp->v_lock); + rpn.pn_path = rpn.pn_buf; - complen = strlen(rpn.pn_path); - bufloc -= complen; - if (bufloc < buf) { - err = ERANGE; - goto out; - } - bcopy(rpn.pn_path, bufloc, - complen); - break; - } else { - VN_RELE(cmpvp); - } + /* Ensure the v_path pointing to correct vnode */ + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, + cr) == 0) { + complen = strlen(rpn.pn_path); + bufloc -= complen; + if (bufloc < buf) { + err = ERANGE; + goto out; } + bcopy(rpn.pn_path, bufloc, complen); + break; } else { - mutex_exit(&vp->v_lock); + vpath_stale = B_TRUE; } } else { mutex_exit(&vp->v_lock); @@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, } /* - * Try to obtain the path component from dnlc cache - * before searching through the directory. - */ - if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) { - /* - * If we got parent vnode as a result, - * then the answered path is correct. - */ - if (VN_CMP(cmpvp, pvp)) { - VN_RELE(cmpvp); - complen = strlen(dbuf); - bufloc -= complen; - if (bufloc <= buf) { - err = ENAMETOOLONG; - goto out; - } - bcopy(dbuf, bufloc, complen); - - /* Prepend a slash to the current path */ - *--bufloc = '/'; - - /* And continue with the next component */ - VN_RELE(vp); - vp = pvp; - pvp = NULL; - continue; - } else { - VN_RELE(cmpvp); - } - } - - /* * Search the parent directory for the entry corresponding to * this vnode. 
*/ @@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, /* Prepend a slash to the current path. */ *--bufloc = '/'; + /* Clear vp->v_path if it was found to be stale. */ + if (vpath_stale == B_TRUE) { + vnode_clear_vpath(vp, vpath_cached); + } + /* And continue with the next component */ VN_RELE(vp); vp = pvp; @@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, VN_RELE(vp); } - pn_alloc(&pn); /* - * Check to see if we have a cached path in the vnode. + * Check to see if we have a valid cached path in the vnode. */ + pn_alloc(&pn); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { (void) pn_set(&pn, vp->v_path); mutex_exit(&vp->v_lock); - pn_alloc(&rpn); - /* We should only cache absolute paths */ ASSERT(pn.pn_buf[0] == '/'); - /* - * If we are in a zone or a chroot environment, then we have to - * take additional steps, since the path to the root might not - * be readable with the current credentials, even though the - * process can legitmately access the file. In this case, we - * do the following: - * - * lookuppnvp() with all privileges to get the resolved path. - * call localpath() to get the local portion of the path, and - * continue as normal. - * - * If the the conversion to a local path fails, then we continue - * as normal. This is a heuristic to make process object file - * paths available from within a zone. Because lofs doesn't - * support page operations, the vnode stored in the seg_t is - * actually the underlying real vnode, not the lofs node itself. - * Most of the time, the lofs path is the same as the underlying - * vnode (for example, /usr/lib/libc.so.1). 
- */ - if (vrootp != rootdir) { - char *local = NULL; - VN_HOLD(rootdir); - if (lookuppnvp(&pn, &rpn, FOLLOW, - NULL, &compvp, rootdir, rootdir, kcred) == 0) { - local = localpath(rpn.pn_path, vrootp, - kcred); - VN_RELE(compvp); - } - - /* - * The original pn was changed through lookuppnvp(). - * Set it to local for next validation attempt. - */ - if (local) { - (void) pn_set(&pn, local); - } else { - goto notcached; + pn_alloc(&rpn); + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) { + /* Return the result, if we're able. */ + if (buflen > rpn.pn_pathlen) { + bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); + pn_free(&pn); + pn_free(&rpn); + VN_RELE(vrootp); + if (doclose) { + (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, + NULL); + VN_RELE(vp); + } + return (0); } } - /* - * We should have a local path at this point, so start the - * search from the root of the current process. + * A stale v_path will be purged by the later dirtopath lookup. */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL, - &compvp, vrootp, vrootp, cr); - if (ret == 0) { - /* - * Check to see if the returned vnode is the same as - * the one we expect. If not, give up. - */ - if (!vn_compare(vp, compvp) && - !vnode_match(vp, compvp, cr)) { - VN_RELE(compvp); - goto notcached; - } - - VN_RELE(compvp); - - /* - * Return the result. - */ - if (buflen <= rpn.pn_pathlen) - goto notcached; - - bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); - pn_free(&pn); - pn_free(&rpn); - VN_RELE(vrootp); - if (doclose) { - (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); - VN_RELE(vp); - } - return (0); - } - -notcached: pn_free(&rpn); } else { mutex_exit(&vp->v_lock); } - pn_free(&pn); if (vp->v_type != VDIR) { - /* - * If we don't have a directory, try to find it in the dnlc via - * reverse lookup. Once this is found, we can use the regular - * directory search to find the full path. 
- */ - if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) { - /* - * Check if we have read privilege so, that - * we can lookup the path in the directory - */ - ret = 0; - if ((flags & LOOKUP_CHECKREAD)) { - ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL); - } - if (ret == 0) { - ret = dirtopath(vrootp, pvp, buf, buflen, - flags, cr); - } - if (ret == 0) { - len = strlen(buf); - if (len + strlen(path) + 1 >= buflen) { - ret = ENAMETOOLONG; - } else { - if (buf[len - 1] != '/') - buf[len++] = '/'; - bcopy(path, buf + len, - strlen(path) + 1); - } - } - - VN_RELE(pvp); - } else - ret = ENOENT; - } else + ret = ENOENT; + } else { ret = dirtopath(vrootp, vp, buf, buflen, flags, cr); + } VN_RELE(vrootp); if (doclose) { diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..3c1405d4af --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can 
still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +/* + * Format fmt and its arguments and append the resulting text to uiobuf. + * + * Output shorter than TYPBUFFSIZE bytes is formatted in a stack buffer; + * longer output is formatted into a temporary kmem_alloc() buffer whose + * size comes from the length reported by the first vsnprintf() pass. + * Nothing is written once an error has been recorded on uiobuf or the + * uio has no space remaining. + */ +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * The first vsnprintf() pass has already traversed args, and + * reusing a va_list after it has been consumed is undefined, + * so restart the list before formatting a second time. + */ + va_end(args); + va_start(args, fmt); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p.
+ */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. + */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number 
of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. 
Luckily if we look at the process exit code + * we see that p_stat only transitions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware of is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +/* + * lxpr_mount(): VFS operation for mounting an lxproc instance. + * + * Checks mount privilege, that the mount point is a directory and (when + * called from the global zone) that the mount point belongs to the global + * zone. It then allocates the per-mount lxpr_mnt_t, obtains an LDI ident + * for this module, refuses overlaying mounts unless MS_OVERLAY is set, + * and creates the root lxproc node. Returns 0 or an errno; every failure + * path after the ident has been obtained must release it again. + */ +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + /* Drop the LDI ident taken above so it is not leaked */ + ldi_ident_release(li); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = 
(caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +/* + * lxpr_unmount(): VFS operation for unmounting an lxproc instance. + * + * Requires unmount privilege, rejects forced unmounts (ENOTSUP) and + * returns EBUSY while the root vnode has holds beyond the mount's own. + * On success it purges the DNLC entries for this vfs and releases + * everything the mount holds: the root node, the zone hold, the LDI + * ident stored in lxprm_li and the lxpr_mnt_t itself. + */ +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. + */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode, then drop the zone hold and the LDI + * ident that lxpr_mount() stored in the mount structure + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + ldi_ident_release(lxpr_mnt->lxprm_li); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, 
vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..9c996891f3 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3099 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). 
However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, 
lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, 
lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. + */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * 
==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. + */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + -1, /* 41: illumos SIGINFO */ + LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + 
*/ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* 
/proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* 
/proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + 
lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + 
AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + 
pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + 
ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal 
*/ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... + */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t 
*uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. 
+ */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t 
*vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? 
+ vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." + */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). 
+ */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to 
these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = 
cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case 
LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
+ */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. 
+ */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) 
+ */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. 
+ */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. + */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? 
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? 
curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..eadb2ccd27 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,278 @@ +/* + * CDDL HEADER START + 
* + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 
1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + 
LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void 
lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 207a708771..2176dcb9de 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index b7354c168a..d3b12817ba 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -5523,8 +5522,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs3setattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c index bc19d5a116..7b97b090af 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/systm.h> @@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head, kex = &exi->exi_export; kex->ex_flags = EX_PSEUDO; - vpathlen = vp->v_path ? strlen(vp->v_path) : 0; + vpathlen = strlen(vp->v_path); kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX); kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP); if (vpathlen) - (void) strcpy(kex->ex_path, vp->v_path); + (void) strncpy(kex->ex_path, vp->v_path, vpathlen); (void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX); /* Transfer the secinfo data from exdata to this new pseudo node */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index 151cb62403..55f6c95289 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. 
All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index d6bf384a8b..107fe97b95 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -34,7 +34,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -3737,8 +3737,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, */ error = nfs4setattr(vp, vap, flags, cr, NULL); - if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (vap->va_mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -8061,8 +8066,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8075,23 +8081,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. 
- */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { @@ -11000,8 +11003,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs4setattr(vp, &va, 0, cr, NULL); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 268badd6c0..da60a0ccd0 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -22,6 +22,7 @@ /* * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
*/ #include <sys/param.h> @@ -558,11 +559,16 @@ retry: *access = res.ares.auth_perm; *srv_uid = res.ares.auth_srv_uid; *srv_gid = res.ares.auth_srv_gid; - *srv_gids_cnt = res.ares.auth_srv_gids.len; - *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t), - KM_SLEEP); - bcopy(res.ares.auth_srv_gids.val, *srv_gids, - *srv_gids_cnt * sizeof (gid_t)); + + if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) { + *srv_gids = kmem_alloc(*srv_gids_cnt * + sizeof (gid_t), KM_SLEEP); + bcopy(res.ares.auth_srv_gids.val, *srv_gids, + *srv_gids_cnt * sizeof (gid_t)); + } else { + *srv_gids = NULL; + } + break; case NFSAUTH_DR_EFAIL: @@ -1051,9 +1057,13 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, if (gid != NULL) *gid = p->auth_srv_gid; if (ngids != NULL && gids != NULL) { - *ngids = p->auth_srv_ngids; - *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP); - bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t)); + if ((*ngids = p->auth_srv_ngids) != 0) { + size_t sz = *ngids * sizeof (gid_t); + *gids = kmem_alloc(sz, KM_SLEEP); + bcopy(p->auth_srv_gids, *gids, sz); + } else { + *gids = NULL; + } } access = p->auth_access; diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index be28ac9071..5d2efc71b2 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -24,6 +24,7 @@ * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
*/ /* @@ -2649,6 +2650,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); @@ -3287,7 +3291,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi) char *path; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { zone = zone_find_by_any_path(vp->v_path, B_FALSE); mutex_exit(&vp->v_lock); } else { diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index 57b21778b4..ffd5380a86 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..ee3bac484f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, error = nfssetattr(vp, vap, flags, cr); - if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfssetattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index 976715e346..275330a0ae 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/param.h> @@ -826,8 +826,7 @@ top: if (error == 0) { vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } done: diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c index a8743b245a..ae72cada7a 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vnops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c @@ -781,8 +781,11 @@ pcfs_setattr( if (error) goto out; - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file modified times. diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 14be8cbbae..11b7386269 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
+ */ #include <sys/types.h> #include <sys/systm.h> @@ -1381,12 +1383,18 @@ portnowait: if (model == DATAMODEL_NATIVE) { eventsz = sizeof (port_event_t); - kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp = NULL; + } else { + kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { @@ -1423,12 +1431,18 @@ portnowait: port_event32_t *kevp32; eventsz = sizeof (port_event32_t); - kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp32 == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp32 = NULL; + } else { + kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp32 == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp32; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..b09a9c8afc --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,441 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). + * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. + */ +int +prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. + */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) { + /* + * Return the regular psargs string to the caller. 
+ */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz, + NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. 
+ */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. + */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. 
Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. + */ + if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. + */ + if (ev == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. 
+ */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 6b151a6369..07dcb1e7db 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. + */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 8ea516bf82..72f26b3c05 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_PROC_PRDATA_H @@ -123,6 +123,7 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -347,6 +348,8 @@ extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 28950bf972..21c25a01e3 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -201,6 +201,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. @@ -534,6 +535,12 @@ prexecend(void) pcp->prc_tslot = tslot; } } + + /* + * There may be threads waiting for the flag change blocked behind the + * pr_pid_cv as well. + */ + cv_signal(&pr_pid_cv[p->p_slot]); } /* diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 39f2abbc32..245133abf4 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -96,6 +96,11 @@ struct prdirect { #define PRSDSIZE (sizeof (struct prdirect)) /* + * Maximum length of the /proc/$$/argv file: + */ +int prmaxargvlen = 4096; + +/* * Directory characteristics. */ typedef struct prdirent { @@ -166,6 +171,8 @@ static prdirent_t piddir[] = { { PR_LDT, 27 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif + { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, }; #define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2) @@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(), @@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = prmaxargvlen, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. + */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. 
+ */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* ARGSUSED */ static int prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) @@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + default: return ((vp->v_type == VDIR)? 
EISDIR : EBADF); } @@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = prmaxargvlen; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type) break; case PR_PSINFO: + pnp->pr_mode = 0644; /* readable by all + owner can write */ + break; + case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: break; default: continue; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index 703e26ea61..682f1d867b 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -21,6 +21,7 @@ /* * Copyright (c) 
2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + return (0); } @@ -654,6 +658,10 @@ sonode_fini(struct sonode *so) if (so->so_filter_top != NULL) sof_sonode_cleanup(so); + /* Clean up any remnants of krecv callbacks */ + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + ASSERT(list_is_empty(&so->so_acceptq_list)); ASSERT(list_is_empty(&so->so_acceptq_defer)); ASSERT(!list_link_active(&so->so_acceptq_node)); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index e5bc6dc845..9b8186a8a0 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -305,7 +305,7 @@ so_connect(struct sonode *so, struct sockaddr *name, * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -404,7 +404,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -513,7 +513,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag, error = EPIPE; break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); - if ((so->so_mode & SM_SENDFILESUPP) == 0) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, B_FALSE); @@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open 
requires this check */ @@ -876,7 +871,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * If there is a pending error, return error * This can happen if a non blocking operation caused an error. */ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -1329,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp, } } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + boolean_t cont; + so_krecv_f func = so->so_krecv_cb; + void *arg = so->so_krecv_arg; + + mutex_exit(&so->so_lock); + cont = func(so, mp, msg_size, flags & MSG_OOB, arg); + mutex_enter(&so->so_lock); + if (cont == B_TRUE) { + space_left = so->so_rcvbuf; + } else { + so->so_rcv_queued = so->so_rcvlowat; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + mutex_exit(&so->so_lock); + if (flags & MSG_OOB) { so_queue_oob(so, mp, msg_size); mutex_enter(&so->so_lock); @@ -1607,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, return (ENOTCONN); } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + mutex_exit(&so->so_lock); + return (EOPNOTSUPP); + } + mutex_exit(&so->so_lock); + if (msg->msg_flags & MSG_PEEK) msg->msg_flags &= ~MSG_WAITALL; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 957c8f93b4..7bdd64393b 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -24,6 +24,7 @@ */ /* * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/types.h> @@ -670,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop, int more = 0; int error; ssize_t oobmark; + ssize_t copied = 0; sodirect_t *sodp = so->so_direct; + xuio_t *xuio = NULL; partial_read = B_FALSE; *mctlp = NULL; + if ((uiop->uio_extflg & UIO_XUIO) != 0) { + xuio = (xuio_t *)uiop; + } again: mutex_enter(&so->so_lock); again1: @@ -784,8 +790,6 @@ again1: * enabled socket, uio_resid can be 0. */ if (uiop->uio_resid >= 0) { - ssize_t copied = 0; - if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) { mutex_enter(&so->so_lock); ASSERT(uiop == (uio_t *)&sodp->sod_uioa); @@ -843,6 +847,18 @@ again1: } if (mp != NULL) { /* more data blocks in msg */ more |= MOREDATA; + + /* + * If requested, tally up remaining data along with the + * amount already copied. + */ + if (xuio != NULL && + xuio->xu_type == UIOTYPE_PEEKSIZE) { + xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE; + xuio->xu_ext.xu_ps.xu_ps_size = + copied + msgdsize(mp); + } + if ((flags & (MSG_PEEK|MSG_TRUNC))) { if (flags & MSG_PEEK) { freemsg(mp); @@ -2276,9 +2292,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Cannot fallback if the socket has active filters + * Cannot fallback if the socket has active filters or a krecv callback. 
*/ - if (so->so_filter_active > 0) + if (so->so_filter_active > 0 || so->so_krecv_cb != NULL) return (EINVAL); switch (so->so_family) { @@ -2456,3 +2472,50 @@ out: return (error); } + +int +so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg) +{ + int ret; + + if (cb == NULL && arg != NULL) + return (EINVAL); + + SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg)); + + mutex_enter(&so->so_lock); + if (so->so_state & SS_FALLBACK_COMP) { + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (ENOTSUP); + } + + ret = so_lock_read(so, 0); + VERIFY(ret == 0); + /* + * Other consumers may actually care about getting extant data delivered + * to them, when they come along, they should figure out the best API + * for that. + */ + so_rcv_flush(so); + + so->so_krecv_cb = cb; + so->so_krecv_arg = arg; + + so_unlock_read(so); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + + return (0); +} + +void +so_krecv_unblock(sonode_t *so) +{ + mutex_enter(&so->so_lock); + VERIFY(so->so_krecv_cb != NULL); + + so->so_rcv_queued = 0; + (void) so_check_flow_control(so); + mutex_exit(&so->so_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 971523945e..7dca6ae6fc 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. 
+ */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occurred. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. */ int @@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index 7f7aece1f1..cf2ad8b20d 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 30027200b6..eea86672b8 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -436,10 +437,12 @@ sogetoff(mblk_t *mp, t_uscalar_t offset, * * The underlying filesystem VSOCK vnode has a v_stream pointer that * references the actual stream head (hence indirectly the actual sonode). + * + * This function is non-static so it can be used by brand emulation. */ -static int +int so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, - vnode_t **vpp) + vnode_t **vpp) { vnode_t *vp; /* Underlying filesystem vnode */ vnode_t *rvp; /* real vnode */ @@ -1879,7 +1882,7 @@ ssize_t soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) { struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec aiov[1]; register vnode_t *vp; int ioflag, rwflag; ssize_t cnt; diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index 21f3744895..854dd040b5 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -21,6 +21,8 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. 
*/ @@ -54,6 +56,7 @@ #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/policy.h> +#include <sys/limits.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -86,12 +89,6 @@ extern void nl7c_init(void); extern int sockfs_defer_nl7c_init; /* - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? - */ -#define MSG_MAXIOVLEN 16 - -/* * Kernel component of socket creation. * * The socket library determines which version number to use. @@ -1026,9 +1023,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) STRUCT_HANDLE(nmsghdr, umsgptr); struct nmsghdr lmsg; struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; int *flagsp; model_t model; @@ -1071,22 +1069,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1094,15 +1107,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1110,6 +1136,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1124,12 +1153,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) (do_useracc == 0 || useracc(lmsg.msg_control, lmsg.msg_controllen, B_WRITE) != 0)) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } - return (recvit(sock, &lmsg, &auio, flags, + rval = recvit(sock, &lmsg, &auio, flags, STRUCT_FADDR(umsgptr, msg_namelen), - STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); + STRUCT_FADDR(umsgptr, msg_controllen), flagsp); + + if (iovsize != 0) + kmem_free(aiov, 
iovsize); + + return (rval); } /* @@ -1267,9 +1304,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) struct nmsghdr lmsg; STRUCT_DECL(nmsghdr, u_lmsg); struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; model_t model; @@ -1312,7 +1350,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { /* * Unless this is XPG 4.2 we allow iovcnt == 0 to * be compatible with SunOS 4.X and 4.4BSD. @@ -1321,19 +1359,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + if (iovcnt != 0 && - copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1341,17 +1394,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (iovcnt != 0 && copyin(lmsg.msg_iov, aiov, (unsigned)iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1359,6 +1425,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1369,7 +1438,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) auio.uio_segflg = UIO_USERSPACE; auio.uio_limit = 0; - return (sendit(sock, &lmsg, &auio, flags)); + rval = sendit(sock, &lmsg, &auio, flags); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } ssize_t diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h index 6a515be122..24acb81a0a 100644 --- 
a/usr/src/uts/common/fs/sockfs/socktpi_impl.h +++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKTPI_IMPL_H @@ -56,6 +57,8 @@ extern int sogetrderr(vnode_t *, int, int *); extern int sogetwrerr(vnode_t *, int, int *); extern int so_addr_verify(struct sonode *, const struct sockaddr *, socklen_t); +extern int so_ux_lookup(struct sonode *, struct sockaddr_un *, int, + vnode_t **); extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *, socklen_t, int, void **, socklen_t *); extern void so_unix_close(struct sonode *); diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. 
*/ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index f6621c8097..387cc6ae54 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -516,7 +516,7 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + kmem_free(tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -549,8 +549,8 @@ tdirinit( ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP); /* * Initialize the entries @@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + kmem_free(tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -925,7 +925,7 @@ tdiraddentry( */ namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); if (tdp == NULL) return (ENOSPC); @@ -1025,7 +1025,7 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 2e59d28d80..e6e2b392fe 100644 --- 
a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -40,9 +41,19 @@ #include <sys/policy.h> #include <sys/fs/tmp.h> #include <sys/fs/tmpnode.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#define KILOBYTE 1024 +#define MEGABYTE (1024 * KILOBYTE) +#define GIGABYTE (1024 * MEGABYTE) #define MODESHIFT 3 +#define VALIDMODEBITS 07777 + +extern pgcnt_t swapfs_minfree; + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred) * a plain file and you have write access to that file. * Function returns 0 if remove access is granted. */ - int tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, struct cred *cr) @@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. 
- */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* * Convert a string containing a number (number of bytes) to a pgcnt_t, * containing the corresponding number of pages. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * - * If the number is followed by a "k" or "K", the value is converted from - * kilobytes to bytes. If it is followed by an "m" or "M" it is converted - * from megabytes to bytes. If it is not followed by a character it is - * assumed to be in bytes. Multiple letter options are allowed, so for instance - * '2mk' is interpreted as 2gb. + * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes; + * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows + * for an arguably esoteric interpretation of multiple suffix characters: + * namely, they cascade. For example, the caller may specify "2mk", which is + * interpreted as 2 gigabytes. It would seem, at this late stage, that the + * horse has left not only the barn but indeed the country, and possibly the + * entire planetary system. Alternatively, the number may be followed by a + * single '%' sign, indicating the size is a percentage of either the zone's + * swap limit or the system's overall swap size. * * Parse and overflow errors are detected and a non-zero number returned on * error. 
*/ - int tmp_convnum(char *str, pgcnt_t *maxpg) { - uint64_t num = 0, oldnum; + u_longlong_t num = 0; #ifdef _LP64 - uint64_t max_bytes = ULONG_MAX; + u_longlong_t max_bytes = ULONG_MAX; #else - uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; + u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif char *c; - - if (str == NULL) + const struct convchar { + char *cc_char; + uint64_t cc_factor; + } convchars[] = { + { "kK", KILOBYTE }, + { "mM", MEGABYTE }, + { "gG", GIGABYTE }, + { NULL, 0 } + }; + + if (str == NULL) { return (EINVAL); + } c = str; /* - * Convert str to number + * Convert the initial numeric portion of the input string. */ - while ((*c >= '0') && (*c <= '9')) { - oldnum = num; - num = num * 10 + (*c++ - '0'); - if (oldnum > num) /* overflow */ + if (ddi_strtoull(str, &c, 10, &num) != 0) { + return (EINVAL); + } + + /* + * Handle a size in percent. Anything other than a single percent + * modifier is invalid. We use either the zone's swap limit or the + * system's total available swap size as the initial value. Perform the + * intermediate calculation in pages to avoid overflow. + */ + if (*c == '\%') { + u_longlong_t cap; + + if (*(c + 1) != '\0') + return (EINVAL); + + if (num > 100) return (EINVAL); + + cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl; + if (cap == UINT64_MAX) { + /* + * Use the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + cap = TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + } else { + cap = btop(cap); + } + + num = ptob(cap * num / 100); + goto done; } /* - * Terminate on null + * Apply the (potentially cascading) magnitude suffixes until an + * invalid character is found, or the string comes to an end. 
*/ - while (*c != '\0') { - switch (*c++) { + for (; *c != '\0'; c++) { + int i; + + for (i = 0; convchars[i].cc_char != NULL; i++) { + /* + * Check if this character matches this multiplier + * class: + */ + if (strchr(convchars[i].cc_char, *c) != NULL) { + /* + * Check for overflow: + */ + if (num > max_bytes / convchars[i].cc_factor) { + return (EINVAL); + } + + num *= convchars[i].cc_factor; + goto valid_char; + } + } /* - * convert from kilobytes + * This was not a valid multiplier suffix character. */ - case 'k': - case 'K': - if (num > max_bytes / 1024) /* will overflow */ - return (EINVAL); - num *= 1024; - break; + return (EINVAL); - /* - * convert from megabytes - */ - case 'm': - case 'M': - if (num > max_bytes / (1024 * 1024)) /* will overflow */ - return (EINVAL); - num *= 1024 * 1024; - break; - - default: - return (EINVAL); - } +valid_char: + continue; } +done: /* * Since btopr() rounds up to page granularity, this round-up can * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) @@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg) return (EINVAL); return (0); } + +/* + * Parse an octal mode string for use as the permissions set for the root + * of the tmpfs mount. + */ +int +tmp_convmode(char *str, mode_t *mode) +{ + ulong_t num; + char *c; + + if (str == NULL) { + return (EINVAL); + } + + if (ddi_strtoul(str, &c, 8, &num) != 0) { + return (EINVAL); + } + + if ((num & ~VALIDMODEBITS) != 0) { + return (EINVAL); + } + + *mode = VALIDMODEBITS & num; + return (0); +} diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..3c088c442c 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/types.h> @@ -55,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. */ static int tmpfsinit(int, char *); @@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -76,7 +87,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = { /* Option name Cancel Opt Arg Flags Data */ { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, - { "size", NULL, "0", MO_HASVALUE, NULL} + { "size", NULL, "0", MO_HASVALUE, NULL}, + { "mode", NULL, NULL, MO_HASVALUE, NULL} }; @@ -121,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -139,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. 
- * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -155,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. */ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } static int -tmp_mount( - struct vfs *vfsp, - struct vnode *mvp, - struct mounta *uap, - struct cred *cr) +tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { struct tmount *tm = NULL; struct tmpnode *tp; @@ -239,8 +240,9 @@ tmp_mount( pgcnt_t anonmax; struct vattr rattr; int got_attrs; - - char *sizestr; + boolean_t 
mode_arg = B_FALSE; + mode_t root_mode = 0777; + char *argstr; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); @@ -249,7 +251,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -275,18 +277,45 @@ tmp_mount( * tm_anonmax is set according to the mount arguments * if any. Otherwise, it is set to a maximum value. */ - if (vfs_optionisset(vfsp, "size", &sizestr)) { - if ((error = tmp_convnum(sizestr, &anonmax)) != 0) + if (vfs_optionisset(vfsp, "size", &argstr)) { + if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { anonmax = ULONG_MAX; } + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. + */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; + } + if (error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. 
+ */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + + if ((tm = kmem_zalloc(sizeof (struct tmount), + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -318,17 +347,17 @@ tmp_mount( vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); - rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ + rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, &rattr, cr); /* @@ -345,7 +374,14 @@ tmp_mount( * the previously set hardwired defaults to prevail. */ if (got_attrs == 0) { - tp->tn_mode = rattr.va_mode; + if (!mode_arg) { + /* + * Only use the underlying mount point for the + * mode if the "mode" mount argument was not + * provided. + */ + tp->tn_mode = rattr.va_mode; + } tp->tn_uid = rattr.va_uid; tp->tn_gid = rattr.va_gid; } @@ -366,6 +402,7 @@ tmp_mount( pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. 
+ * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. */ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. An open file causes everything to unwind + * unless this is a forced umount. 
+ */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. + */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. 
*/ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_unmount. + */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this
Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 3c251df0cc..18e037ee22 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. 
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -786,8 +794,13 @@ tmp_setattr( rw_exit(&tp->tn_contents); rw_exit(&tp->tn_rwlock); - if (error == 0 && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } goto out1; } @@ -833,6 +846,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -871,8 +887,7 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); + xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tm = VTOTM(dvp); tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* @@ -1302,10 +1317,8 @@ tmp_rename( vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); /* * vnevent_rename_dest is called in tdirenter(). - * Notify the target dir if not same as source dir. */ - if (ndvp != odvp) - vnevent_rename_dest_dir(ndvp, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); } done: @@ -1474,6 +1487,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. 
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1612,7 +1629,7 @@ tmp_symlink( return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1677,10 +1694,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - vp->v_count--; + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_freevfs so we need to be sure we + * don't decrement in this case. + */ + if (vp->v_count > 1) + vp->v_count--; + } else { + vp->v_count--; + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1705,7 +1739,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + kmem_free(tp->tn_symlink, tp->tn_size + 1); } /* @@ -1739,7 +1773,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + kmem_free(tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1861,6 +1899,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately.
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2082,6 +2124,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ @@ -2342,8 +2388,13 @@ tmp_space( return (EFBIG); error = tmp_freesp(vp, bfp, flag); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); } diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 93cc4d49e8..7e17f16ce2 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -569,8 +569,11 @@ udf_setattr( goto update_inode; } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file access or modified times. 
@@ -1649,8 +1652,13 @@ udf_space( } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { error = ud_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index cf45b48e3c..d689a8173b 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -2193,8 +2193,13 @@ again: goto update_inode; } - if (error == 0 && vap->va_size) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } if (ulp) { @@ -3726,12 +3731,7 @@ retry_firstlock: if (error == 0) { vnevent_rename_src(ITOV(sip), sdvp, snm, ct); - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. 
- */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); } errout: @@ -4478,8 +4478,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, return (error); error = ufs_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else if (cmd == F_ALLOCSP) { error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FALLOCATE_MASK); diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index e179d934ed..35e65f15e6 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -236,7 +236,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp) * Make sure this root has a path. With lofs, it is possible to have * a NULL mountpoint. */ - if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { + if (ret == 0 && vfsp->vfs_mntpt != NULL && + (*vpp)->v_path == vn_vpath_empty) { mntpt = vfs_getmntpoint(vfsp); vn_setpath_str(*vpp, refstr_value(mntpt), strlen(refstr_value(mntpt))); @@ -3901,6 +3902,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index e6b6adf56b..77b30da871 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -66,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -102,6 +103,9 @@ kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; +/* Global used for empty/invalid v_path */ +char *vn_vpath_empty = ""; + /* * forward declarations for internal vnode specific data (vsd) */ @@ -200,6 +204,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) @@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; vp->v_mpssdata = NULL; vp->v_vsd = NULL; vp->v_fopdata = NULL; @@ -2331,6 +2340,7 @@ void vn_recycle(vnode_t *vp) { ASSERT(vp->v_pages == NULL); + VERIFY(vp->v_path != NULL); /* * XXX - This really belongs in vn_reinit(), but we have some issues @@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp) kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); vp->v_femhead = NULL; } - if (vp->v_path) { + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } if (vp->v_fopdata != NULL) { @@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); - if (vp->v_path != NULL) { + VERIFY(vp->v_path != NULL); + if (vp->v_path != 
vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } /* If FEM was in use, make sure everything gets cleaned up */ @@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2622,6 +2635,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct) (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); } +void +vnevent_resize(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct); +} + /* * Vnode accessors. */ @@ -2981,7 +3003,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, * the potential for deadlock. 
*/ mutex_enter(&base->v_lock); - if (base->v_path == NULL) { + if (base->v_path == vn_vpath_empty) { mutex_exit(&base->v_lock); return; } @@ -3008,7 +3030,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath = kmem_alloc(rpathalloc, KM_SLEEP); mutex_enter(&base->v_lock); - if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { + if (base->v_path == vn_vpath_empty || + strlen(base->v_path) != rpathlen) { mutex_exit(&base->v_lock); kmem_free(rpath, rpathalloc); return; @@ -3022,7 +3045,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath[rpathlen + plen] = '\0'; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(rpath, rpathalloc); } else { @@ -3042,7 +3065,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len) char *buf = kmem_alloc(len + 1, KM_SLEEP); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(buf, len + 1); return; @@ -3066,10 +3089,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) mutex_enter(&vp->v_lock); tmp = vp->v_path; - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; mutex_exit(&vp->v_lock); vn_setpath(rootdir, dvp, vp, nm, len); - if (tmp != NULL) + if (tmp != vn_vpath_empty) kmem_free(tmp, strlen(tmp) + 1); } @@ -3084,7 +3107,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) int alloc; mutex_enter(&src->v_lock); - if (src->v_path == NULL) { + if (src->v_path == vn_vpath_empty) { mutex_exit(&src->v_lock); return; } @@ -3094,7 +3117,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); buf = kmem_alloc(alloc, KM_SLEEP); mutex_enter(&src->v_lock); - if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) { mutex_exit(&src->v_lock); kmem_free(buf, alloc); return; @@ -3103,7 +3126,7 @@ 
vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); mutex_enter(&dst->v_lock); - if (dst->v_path != NULL) { + if (dst->v_path != vn_vpath_empty) { mutex_exit(&dst->v_lock); kmem_free(buf, alloc); return; @@ -3261,14 +3284,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3280,14 +3346,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = 
uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3451,7 +3565,7 @@ fop_lookup( } if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, lookup); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); } } @@ -3493,7 
+3607,7 @@ fop_create( (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, create); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); } } @@ -3615,7 +3729,7 @@ fop_mkdir( (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, mkdir); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, dirname, strlen(dirname)); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 3c3cbdf4c1..5c06a1bb29 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -129,6 +129,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/multilist.h> #ifdef _KERNEL #include <sys/vmsystm.h> @@ -4343,6 +4344,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 4f469fc750..64a4cb74d0 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -678,8 +678,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than DN_MAX_BONUSLEN -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, DN_MAX_BONUSLEN)); + } + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 35015825b4..8ce9178ad2 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1818,7 +1818,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } - /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 50b8aba876..f54d67202b 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2014 Integros [integros.com] @@ -2492,8 +2492,12 @@ receive_read_record(struct receive_arg *ra) { struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); + void *buf = NULL; dmu_object_info_t doi; + + if (size > 0) + buf = kmem_zalloc(size, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 15b9459ce2..9d3db4212e 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -224,6 +224,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 7d86f72ad1..ca7d8b9bee 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -42,6 +42,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1262,7 +1263,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index ee2d8ee9eb..242db1c9f1 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -43,6 +43,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9030b855a1..296f0006fc 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -64,6 +64,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -1657,6 +1662,9 @@ metaslab_should_condense(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + if (zfs_condense_never != 0) + return (B_FALSE); + /* * Use the ms_size_tree range tree, which is ordered by size, to * obtain the largest segment in the free tree. We always condense diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 7ddf806ec5..3168b47304 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,6 +24,7 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -407,15 +408,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; - int i; + int i, size; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + + if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) { + tb->lot_attrs = kmem_alloc(size, KM_SLEEP); + bcopy(attrs, tb->lot_attrs, size); + } + tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 3f1b7d8a54..0b99d08e72 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -116,6 +116,7 @@ struct vdev_queue { avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 6d8f7601f3..6d02c95b22 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -421,6 +421,7 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; hrtime_t io_target_timestamp; + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ avl_node_t io_queue_node; avl_node_t io_offset_node; @@ -449,6 +450,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git 
a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 191259e75b..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index a6af0101e7..ab305ed694 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -26,6 +26,7 @@ */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -44,6 +45,11 @@ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; @@ -127,6 +133,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. 
@@ -586,6 +594,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -815,6 +824,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } @@ -824,6 +835,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4917cc9284..cc415e2ca0 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013, Joyent, Inc. All rights reserved. 
*/ /* @@ -34,6 +35,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> /* * ZFS I/O Scheduler @@ -142,7 +144,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -238,6 +240,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -275,6 +279,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -290,6 +295,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -649,7 +655,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c index 2b18ecb01c..132e84b111 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -20,6 +20,7 
@@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ @@ -853,9 +854,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %s is %u, " "should be at least %u", - zp->z_vnode->v_path ? zp->z_vnode->v_path : - "<unknown>", (int)zp->z_links, - zp_is_dir + 1); + zp->z_vnode->v_path != vn_vpath_empty ? + zp->z_vnode->v_path : "<unknown>", + (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index a7feada44f..6d28956707 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -615,9 +615,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
*/ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -947,6 +948,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2037,7 +2041,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2055,7 +2060,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2082,11 +2088,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2281,8 +2300,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = 
dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); @@ -2311,8 +2343,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3022,6 +3056,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3065,8 +3100,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3075,13 +3111,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 0d02fd5bec..ace4bf8173 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ 
b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -1903,6 +1904,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 9cba49b402..829d57b760 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -664,6 +664,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; + int prev_error; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; @@ -685,6 +686,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. 
+ */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -737,17 +749,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { @@ -968,7 +969,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); - ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force @@ -978,18 +978,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; + /* + * Keep track of a possible pre-existing error from a partial + * write via dmu_write_uio_dbuf above. 
+ */ + prev_error = error; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); - if (error != 0) + if (prev_error != 0 || error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); @@ -2832,8 +2834,11 @@ top: return (err); } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } } if (mask & (AT_ATIME|AT_MTIME) || @@ -3761,9 +3766,7 @@ top: if (error == 0) { vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - /* notify the target dir if it is not the same as source dir */ - if (tdvp != sdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); } out: if (zl != NULL) @@ -4255,6 +4258,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4790,10 +4795,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } @@ -4859,8 +4860,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - if (error == 0 && off == 0 && len == 0) - vnevent_truncate(ZTOV(zp), ct); + if (error == 0 && len == 0) { + if (off == 0) { + vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } + } ZFS_EXIT(zfsvfs); return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 
0000000000..4861c64f8e --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1336 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. + * + * I/O contention can be a major pain point on a multi-tenant system. A single + * zone can issue a stream of I/O operations, usually synchronous writes, which + * disrupt I/O performance for all other zones. This problem is further + * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, + * a set of blocks which are atomically synced to disk. The process of + * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving + * out any pending read operations. + * + * There are two facets to this capability: the throttle and the scheduler. + * + * Throttle + * + * The requirements on the throttle are: + * + * 1) Ensure consistent and predictable I/O latency across all zones. + * 2) Sequential and random workloads have very different characteristics, + * so it is a non-starter to track IOPS or throughput. + * 3) A zone should be able to use the full disk bandwidth if no other zone + * is actively using the disk. + * + * The throttle has two components: one to track and account for each zone's + * I/O requests, and another to throttle each zone's operations when it + * exceeds its fair share of disk I/O. 
When the throttle detects that a zone is + * consuming more than is appropriate, each read or write system call is + * delayed by up to 100 microseconds, which we've found is sufficient to allow + * other zones to interleave I/O requests during those delays. + * + * Note: The throttle will delay each logical I/O (as opposed to the physical + * I/O which will likely be issued asynchronously), so it may be easier to + * think of the I/O throttle delaying each read/write syscall instead of the + * actual I/O operation. For each zone, the throttle tracks an ongoing average + * of read and write operations performed to determine the overall I/O + * utilization for each zone. + * + * The throttle calculates an I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by IO activity, but since IO does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. 
It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). + * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. 
+ */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the + * threshold at which the throttle will start delaying zones. When the number + * of vdevs is small, the calculation should correspond closely with the %b + * column from iostat -- but as the number of vdevs becomes large, it will + * correlate less and less to any single device (therefore making it a poor + * approximation for the actual I/O utilization on such systems). We + * therefore use our derived utilization conservatively: we know that low + * derived utilization does indeed correlate to low I/O use -- but that a high + * rate of derived utilization does not necessarily alone denote saturation; + * where we see a high rate of utilization, we also look for laggard I/Os to + * attempt to detect saturation. 
+ */ +uint_t zfs_zone_util_threshold = 80; +uint_t zfs_zone_underutil_threshold = 60; + +/* + * There are three important tunables here: zfs_zone_laggard_threshold denotes + * the threshold at which an I/O is considered to be of notably high latency; + * zfs_zone_laggard_recent denotes the number of microseconds before the + * current time after which the last laggard is considered to be sufficiently + * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes + * the microseconds before the current time before which the last laggard is + * considered to be sufficiently old to merit decreasing the throttle. The + * most important tunable of these three is the zfs_zone_laggard_threshold: in + * modeling data from a large public cloud, this tunable was found to have a + * much greater effect on the throttle than the two time-based thresholds. + * This must be set high enough to not result in spurious throttling, but not + * so high as to allow pathological I/O to persist in the system. + */ +uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ +uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ +uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +/* + * How often the I/O throttle will reevaluate each zone's utilization, in + * microseconds. Default is 1/4 sec. 
+ */ +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; + boolean_t zi_underutil; + boolean_t zi_overutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. The utilization info + * for all disks on the system is aggregated into these values. + * + * Overall disk utilization for the current cycle is calculated as: + * + * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) + * ---------------------------------------------- + * ((now - zfs_zone_last_checked) * 1000); + */ +kmutex_t zfs_disk_lock; /* protects the following: */ +uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ +hrtime_t zfs_disk_rtime = 0; /* cummulative sum of time performing IO */ +hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ + +hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ +/* time that we last updated per-zone throttle info */ +hrtime_t zfs_zone_last_checked = 0; +hrtime_t zfs_disk_last_laggard = 0; + +/* + * Data used to keep track of how often txg sync is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_sync_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on the zfs_vdev_sync_read_max_active value for the + * number of I/Os that can be pending on a device. If there are more than the + * max_active ops already queued up, beyond those already issued to the vdev, + * then use zone-based scheduling to get the next synchronous zio. 
+ */ +uint32_t zfs_zone_schedule_thresh = 10; + +/* + * On each pass of the scheduler we increment the zone's weight (up to this + * maximum). The weight is used by the scheduler to prevent starvation so + * that zones which haven't been able to do any IO over many iterations + * will max out thier weight to this value. + */ +#define SCHED_WEIGHT_MAX 20 + +/* + * Tunables for delay throttling when TXG sync is occurring. + * + * If the zone is performing a write and we're doing above normal TXG syncing, + * then throttle for longer than normal. The zone's wait time is multiplied + * by the scale (zfs_zone_txg_throttle_scale). + */ +int zfs_zone_txg_throttle_scale = 2; +hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); + +typedef struct { + int zq_qdepth; + zio_priority_t zq_queue; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * See the comment on the zfs_zone_io_throttle function for which/how IOPs are + * accounted for. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. 
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t elapsed = unow - cp->cycle_start;
+	int generations;
+	int remaining;
+
+	/*
+	 * Still collecting data for the current cycle: nothing to roll over,
+	 * just report how far into the cycle we are.
+	 */
+	if (elapsed < zfs_zone_cycle_time)
+		return (elapsed);
+
+	/*
+	 * At least one full cycle has gone by. Work out how many (integer
+	 * division is intentional), since several cycles may have elapsed
+	 * without any IO from this zone.
+	 */
+	generations = (int)(elapsed / zfs_zone_cycle_time);
+
+	if (generations > 5) {
+		/* Idle for more than 5 cycles; forget the history. */
+		cp->zone_avg_cnt = 0;
+	} else {
+		/*
+		 * A zone that managed more than one IO simply carries that
+		 * count forward. Otherwise blend in half of the old
+		 * historical count, so a zone that is struggling to get ops
+		 * through is not lost to integer rounding.
+		 */
+		if (cp->cycle_cnt > 1) {
+			cp->zone_avg_cnt = cp->cycle_cnt;
+		} else {
+			cp->zone_avg_cnt = cp->cycle_cnt +
+			    (cp->zone_avg_cnt / 2);
+		}
+
+		/* Halve once more for every additional idle generation. */
+		for (remaining = generations - 1; remaining > 0; remaining--)
+			cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+	}
+
+	/* Open a fresh cycle starting now. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+
+	return (0);
+}
+
+/*
+ * Add IO op data to the zone.
+ */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. + * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last + * update was made. If it was more than one cycle ago, then we need to decay + * the average by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static boolean_t +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (B_FALSE); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. 
+ */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (B_TRUE); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. 
+ */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + DTRACE_PROBE3(zfs__zone__calc__wt__avg, + uintptr_t, cp->sys_avg_lat, + uintptr_t, cp->cycle_lat, + uintptr_t, cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id, + uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. 
+ */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, + uintptr_t, *wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, + uint_t, rops, uint_t, wops, uint_t, lwops, + uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. 
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *stats = arg;
+	uint16_t old_delay = zonep->zone_io_delay;
+	uint_t fair_share = 0;
+
+	zonep->zone_io_util_above_avg = B_FALSE;
+
+	/*
+	 * Work out this zone's fair share of the total utilization: weighted
+	 * by its I/O priority when priorities are enabled and in use,
+	 * otherwise an even split across the active zones. With neither, the
+	 * fair share stays 0.
+	 */
+	if (zfs_zone_priority_enable && stats->zi_totpri > 0) {
+		fair_share = (stats->zi_totutil * zonep->zone_zfs_io_pri) /
+		    stats->zi_totpri;
+	} else if (stats->zi_active > 0) {
+		fair_share = stats->zi_totutil / stats->zi_active;
+	}
+
+	/*
+	 * A zone over its fair share while I/O is overutilized is flagged as
+	 * above average, and throttled harder when it has competition. A zone
+	 * under its share, an underutilized system, or a lone active zone all
+	 * lead to easing the throttle. Anything else leaves the delay as is.
+	 */
+	if (zonep->zone_io_util > fair_share && stats->zi_overutil) {
+		zonep->zone_io_util_above_avg = B_TRUE;
+
+		if (stats->zi_active > 1)
+			zfs_zone_delay_inc(zonep);
+	} else if (stats->zi_active <= 1 || stats->zi_underutil ||
+	    zonep->zone_io_util < fair_share) {
+		zfs_zone_delay_dec(zonep);
+	}
+
+	/*
+	 * sdt:::zfs-zone-throttle
+	 *
+	 *	arg0: zone ID
+	 *	arg1: old delay for this zone
+	 *	arg2: new delay for this zone
+	 *	arg3: calculated fair I/O utilization
+	 *	arg4: actual I/O utilization
+	 */
+	DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+	    uintptr_t, old_delay, uintptr_t, zonep->zone_io_delay,
+	    uintptr_t, fair_share, uintptr_t, zonep->zone_io_util);
+
+	return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) +{ + zoneio_stats_t stats; + hrtime_t laggard_udelta = 0; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + if (unow > zfs_disk_last_laggard) + laggard_udelta = unow - zfs_disk_last_laggard; + + /* + * To minimize porpoising, we have three separate states for our + * assessment of I/O performance: overutilized, underutilized, and + * neither overutilized nor underutilized. We will increment the + * throttle if a zone is using more than its fair share _and_ I/O + * is overutilized; we will decrement the throttle if a zone is using + * less than its fair share _or_ I/O is underutilized. 
+ */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + + cnt = zonep->zone_zfs_queued[p]; + if (cnt == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. 
+ * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. 
+ * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, + avl_tree_t *tree) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + qbump.zq_queue = p; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(tree); zp != NULL; + zp = avl_walk(tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, + uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out TX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track and throttle IO operations per zone. 
Called from: + * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes + * go through this path) + * - arc_read for read ops that miss the ARC (both dataset and zvol) + * For each operation, increment that zone's counter based on the type of + * operation, then delay the operation, if necessary. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. We only see logical writes in this + * function; we see the physical writes in the zfs_zone_zio_start and + * zfs_zone_zio_done functions. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. 
Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow, last_checked; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. + */ + if (zonep->zone_zfs_io_pri == 0) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + zfs_zone_wait_adjust(unow, last_checked); + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. 
+ */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. 
+ */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + if (curzone->zone_io_util_above_avg) + return (zfs_zone_txg_delay_nsec); + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. 
+ */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_timestamp; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? 
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued[p] > 0); + if (zonep->zone_zfs_queued[p] == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued[p]--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued[p]++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. 
A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. + * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). 
+ */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index e3889b3a30..abbb31a199 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -1802,9 +1803,18 @@ zil_close(zilog_t *zilog) if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); - if (txg) + + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg) { txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(!zilog_is_dirty(zilog)); + } + + VERIFY(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 1acc8b2e6a..bfbcdfb511 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ @@ -40,6 +41,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -561,11 +563,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 95bb26c211..535bc057b9 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,7 +25,7 @@ * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -84,6 +84,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -138,6 +139,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. 
*/ @@ -1379,6 +1385,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1395,6 +1404,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1404,6 +1421,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1413,6 +1431,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + 
} + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1426,6 +1477,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) rl_t *rl; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1442,6 +1496,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. + */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1455,6 +1522,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1472,6 +1540,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if 
(lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h new file mode 100644 index 0000000000..a790a797d1 --- /dev/null +++ b/usr/src/uts/common/inet/inet_hash.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _INET_INET_HASH_H +#define _INET_INET_HASH_H + +/* + * Common packet hashing routines shared across MAC, UDP, and others. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define INET_PKT_HASH_L2 0x01 +#define INET_PKT_HASH_L3 0x02 +#define INET_PKT_HASH_L4 0x04 + +extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_INET_HASH_H */ diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index bcbc1c4949..b4bff4d7b4 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? 
SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } switch (name) { case IP_TTL: diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index f006e83a1f..73081b9c1c 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -12577,6 +12577,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12697,6 +12698,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. 
@@ -12704,9 +12712,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 85ee142dfc..c350d67c2d 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 33a2fa5935..dedb4dadcc 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(ip_squeue_worker_wait, pri); + sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index bc2173ff24..3a12e58c3a 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* @@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. 
+ */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk through entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. 
Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, 
connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - 
IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index c325e8dc26..2ca770ebe9 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index f6466434f6..c3139d9288 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _INET_IPCLASSIFIER_H @@ -293,7 +294,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index f958ca2261..227d2075f8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); @@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach 
*/ /* Returns: int - 0 == success, else error. */ @@ -248,6 +266,31 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + #undef UNDO_HOOK #ifdef IPFDEBUG @@ -445,6 +488,48 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + 
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + /* * Reacquire ipf_global, now it is safe. */ WRITE_ENTER(&ifs->ifs_ipf_global); @@ -1011,7 +1096,6 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; - /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -2045,6 +2129,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represent a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. 
*/ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..9aa2478c6a 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -125,6 +125,10 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_in; hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; /* flags to indicate whether hooks are registered. 
*/ boolean_t ifs_hook4_physical_in; @@ -137,10 +141,16 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; /* ip_auth.c */ int ifs_fr_authsize; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5d56debc31 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * This file implements a socketfilter used to defer TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). 
This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. 
+ */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 2e08dc359b..1009f0700f 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,7 +23,7 @@ */ /* - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ /* @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". * + * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is + * false and it will not have an associated conn_t, which means many aspects of + * the system, such as polling and switching squeues will not be used. + * * Since the processing of the connection cuts across multiple layers * but still allows packets for different connnection to be processed on * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or @@ -244,7 +248,7 @@ squeue_init(void) /* ARGSUSED */ squeue_t * -squeue_create(clock_t wait, pri_t pri) +squeue_create(clock_t wait, pri_t pri, boolean_t isip) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); @@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri) sqp->sq_enter = squeue_enter; sqp->sq_drain = squeue_drain; + sqp->sq_isip = isip; return (sqp); } /* + * We need to kill the threads and then clean up. We should VERIFY that + * polling is disabled so we don't have to worry about disassociating from + * MAC/IP/etc. 
+ */ +void +squeue_destroy(squeue_t *sqp) +{ + kt_did_t worker, poll; + mutex_enter(&sqp->sq_lock); + VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); + worker = sqp->sq_worker->t_did; + poll = sqp->sq_poll_thr->t_did; + sqp->sq_state |= SQS_EXIT; + cv_signal(&sqp->sq_poll_cv); + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + + thread_join(poll); + thread_join(worker); + kmem_cache_free(squeue_cache, sqp); +} + +/* * Bind squeue worker thread to the specified CPU, given by CPU id. * If the CPU id value is -1, bind the worker thread to the value * specified in sq_bind field. If a thread is already bound to a @@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == 
B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -779,7 +818,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -804,20 +843,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1487,6 +1546,7 @@ void squeue_synch_exit(conn_t *connp) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_run == curthread) { diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index b2b9973291..6ec2e6b2d7 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. */ @@ -134,6 +134,7 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; +struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -404,6 +405,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; + /* + * Group of tcp_t entries bound to the same address and port via + * SO_REUSEPORT. The pointer itself is protected by tf_lock in the + * containing tcps_bind_fanout slot. + */ + struct tcp_rg_s *tcp_rg_bind; + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index fba7125690..cf046c968e 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013,2014 by Delphix. 
All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. @@ -1423,6 +1423,21 @@ tcp_free(tcp_t *tcp) tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); /* + * Destroy any association with SO_REUSEPORT group. + */ + if (tcp->tcp_rg_bind != NULL) { + /* + * This is only necessary for connections which enabled + * SO_REUSEPORT but were never bound. Such connections should + * be the one and only member of the tcp_rg_tp to which they + * have been associated. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = NULL; + } + + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is @@ -2054,8 +2069,7 @@ tcp_reinit(tcp_t *tcp) * structure! */ static void -tcp_reinit_values(tcp) - tcp_t *tcp; +tcp_reinit_values(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index c6df39b91e..adc201eebb 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +56,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. 
Each hash bucket @@ -172,6 +174,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -636,13 +648,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. 
@@ -662,6 +673,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -698,6 +710,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -724,6 +737,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -829,22 +843,34 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too appears to be willing. + */ + attempt_reuse = B_TRUE; + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. 
*/ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -859,27 +885,47 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + } + /* * This port is ours. 
Insert in fanout and mark as * bound to prevent others from getting the port @@ -944,3 +990,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if 
(newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index cf8e0c6bd4..7cfdb9a4a2 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2014 by Delphix. All rights reserved. */ @@ -99,7 +99,7 @@ * tcps_time_wait_interval since the period before upper layer closes the * connection is not accounted for when tcp_time_wait_append() is called. 
* - * If uppser layer has closed the connection, call tcp_time_wait_append() + * If upper layer has closed the connection, call tcp_time_wait_append() * directly. * */ diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 1a5363bedc..835acd1b12 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -62,7 +63,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -483,6 +485,42 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (retval); } +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (do_enable && !IPCL_IS_NONSTR(connp)) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have fallen + * back to the STREAMS API. 
+ */ + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + if (tcp->tcp_state >= TCPS_BOUND || + tcp->tcp_state <= TCPS_CLOSED) + return (EINVAL); + if ((rg = tcp_rg_init(tcp)) == NULL) + return (ENOMEM); + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + /* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. @@ -653,6 +691,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -769,14 +812,37 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (*i1 == 0) { return (EINVAL); } else if (tcp->tcp_ka_rinterval == 0) { - if ((tcp->tcp_ka_abort_thres / *i1) < - tcp->tcp_rto_min || - (tcp->tcp_ka_abort_thres / *i1) > - tcp->tcp_rto_max) - return (EINVAL); + /* + * When TCP_KEEPCNT is specified without first + * specifying a TCP_KEEPINTVL, we infer an + * interval based on a tunable specific to our + * stack: the tcp_keepalive_abort_interval. + * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in + * the unlikely event that that has been set.) + * Given the abort interval's default value of + * 480 seconds, low TCP_KEEPCNT values can + * result in intervals that exceed the default + * maximum RTO of 60 seconds. 
Rather than + * fail in these cases, we (implicitly) clamp + * the interval at the maximum RTO; if the + * TCP_KEEPCNT is shortly followed by a + * TCP_KEEPINTVL (as we expect), the abort + * threshold will be recalculated correctly -- + * and if a TCP_KEEPINTVL is not forthcoming, + * keep-alive will at least operate reasonably + * given the underconfigured state. + */ + uint32_t interval; - tcp->tcp_ka_rinterval = - tcp->tcp_ka_abort_thres / *i1; + interval = tcp->tcp_ka_abort_thres / *i1; + + if (interval < tcp->tcp_rto_min) + interval = tcp->tcp_rto_min; + + if (interval > tcp->tcp_rto_max) + interval = tcp->tcp_rto_max; + + tcp->tcp_ka_rinterval = interval; } else { if ((*i1 * tcp->tcp_ka_rinterval) < tcps->tcps_keepalive_abort_interval_low || @@ -953,10 +1019,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index a431bf63d1..8f535a5dd1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* This file contains all TCP kernel socket related functions. */ @@ -1022,6 +1023,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. 
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c index b470934da0..6600296b18 100644 --- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -41,13 +41,13 @@ #include <inet/tcp_impl.h> #include <inet/tcp_cluster.h> -static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *); + +#define TW_BUCKET(t) \ + (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS) + +#define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS) -/* - * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. - * Running it every 5 seconds seems to give the best results. - */ -#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC) /* * Remove a connection from the list of detached TIME_WAIT connections. @@ -56,17 +56,17 @@ static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. */ boolean_t -tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) +tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp) { boolean_t locked = B_FALSE; - if (tcp_time_wait == NULL) { - tcp_time_wait = *((tcp_squeue_priv_t **) + if (tsp == NULL) { + tsp = *((tcp_squeue_priv_t **) squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + mutex_enter(&tsp->tcp_time_wait_lock); locked = B_TRUE; } else { - ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); + ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); } /* 0 means that the tcp_t has not been added to the time wait list. 
*/ @@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) ASSERT(tcp->tcp_time_wait_next == NULL); ASSERT(tcp->tcp_time_wait_prev == NULL); if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); return (B_FALSE); } ASSERT(TCP_IS_DETACHED(tcp)); ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); + ASSERT(tsp->tcp_time_wait_cnt > 0); - if (tcp == tcp_time_wait->tcp_time_wait_head) { - ASSERT(tcp->tcp_time_wait_prev == NULL); - tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; - if (tcp_time_wait->tcp_time_wait_head != NULL) { - tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = - NULL; - } else { - tcp_time_wait->tcp_time_wait_tail = NULL; - } - } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { - ASSERT(tcp->tcp_time_wait_next == NULL); - tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; - } else { - ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); - ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); - tcp->tcp_time_wait_prev->tcp_time_wait_next = - tcp->tcp_time_wait_next; + if (tcp->tcp_time_wait_next != NULL) { tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp->tcp_time_wait_prev; } + if (tcp->tcp_time_wait_prev != NULL) { + tcp->tcp_time_wait_prev->tcp_time_wait_next = + tcp->tcp_time_wait_next; + } else { + unsigned int bucket; + + bucket = TW_BUCKET(tcp->tcp_time_wait_expire); + ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp); + tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next; + } tcp->tcp_time_wait_next = NULL; tcp->tcp_time_wait_prev = NULL; tcp->tcp_time_wait_expire = 0; + tsp->tcp_time_wait_cnt--; if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); return (B_TRUE); } @@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 
((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \ IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6))) + /* * Add a connection to the list of detached TIME_WAIT connections * and set its time to expire. @@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; squeue_t *sqp = tcp->tcp_connp->conn_sqp; - tcp_squeue_priv_t *tcp_time_wait = + tcp_squeue_priv_t *tsp = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); - hrtime_t firetime = 0; + int64_t now, schedule; + unsigned int bucket; tcp_timers_stop(tcp); @@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_ack_tid == 0); /* must have happened at the time of detaching the tcp */ + ASSERT(TCP_IS_DETACHED(tcp)); + ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); ASSERT(tcp->tcp_ptpahn == NULL); ASSERT(tcp->tcp_flow_stopped == 0); ASSERT(tcp->tcp_time_wait_next == NULL); @@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_time_wait_expire == 0); ASSERT(tcp->tcp_listener == NULL); - tcp->tcp_time_wait_expire = ddi_get_lbolt64(); - if (IS_LOCAL_HOST(tcp)) { - /* - * This is the fastpath for handling localhost connections. - * Since we don't have to worry about packets on the localhost - * showing up after a long network delay, we want to expire - * these quickly so the port range on the localhost doesn't - * get starved by short-running, local apps. - * - * Leave tcp_time_wait_expire at the current time. This - * essentially means the connection is expired now and it will - * clean up the next time tcp_time_wait_collector runs. We set - * firetime to use a short delay so that if we have to start a - * tcp_time_wait_collector thread below, it runs soon instead - * of after a delay of time_wait_interval. 
firetime being set - * to a non-0 value is also our indicator that we should add - * this connection to the head of the time wait list (since we - * are already expired) so that its sure to get cleaned up on - * the next run of tcp_time_wait_collector (which expects the - * entries to appear in time-order and stops when it hits the - * first non-expired entry). - */ - firetime = TCP_TIME_WAIT_DELAY; - } else { - /* - * Since tcp_time_wait_expire is lbolt64, it should not wrap - * around in practice. Hence it cannot be 0. Note that zero - * means that the tcp_t is not in the TIME_WAIT list. - */ - tcp->tcp_time_wait_expire += MSEC_TO_TICK( - tcps->tcps_time_wait_interval); + TCP_DBGSTAT(tcps, tcp_time_wait); + mutex_enter(&tsp->tcp_time_wait_lock); + + /* + * Immediately expire loopback connections. Since there is no worry + * about packets on the local host showing up after a long network + * delay, this is safe and allows much higher rates of connection churn + * for applications operating locally. + * + * This typically bypasses the tcp_free_list fast path due to squeue + * re-entry for the loopback close operation. + */ + if (tcp->tcp_loopback) { + tcp_time_wait_purge(tcp, tsp); + mutex_exit(&tsp->tcp_time_wait_lock); + return; } - ASSERT(TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); - ASSERT(tcp->tcp_time_wait_next == NULL); - ASSERT(tcp->tcp_time_wait_prev == NULL); - TCP_DBGSTAT(tcps, tcp_time_wait); + /* + * In order to reap TIME_WAITs reliably, we should use a source of time + * that is not adjustable by the user. While it would be more accurate + * to grab this timestamp before (potentially) sleeping on the + * tcp_time_wait_lock, doing so complicates bucket addressing later. 
+ */ + now = ddi_get_lbolt64(); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - if (tcp_time_wait->tcp_time_wait_head == NULL) { - ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); - tcp_time_wait->tcp_time_wait_head = tcp; + /* + * Each squeue uses an arbitrary time offset when scheduling + * expiration timers. This prevents the bucketing from forcing + * tcp_time_wait_collector to run in lockstep across squeues. + * + * This offset is (re)initialized when a new TIME_WAIT connection is + * added to an squeue which has no connections waiting to expire. + */ + if (tsp->tcp_time_wait_tid == 0) { + ASSERT(tsp->tcp_time_wait_cnt == 0); + tsp->tcp_time_wait_offset = + now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + } + now -= tsp->tcp_time_wait_offset; + + /* + * Use the netstack-defined timeout, rounded up to the minimum + * time_wait_collector interval. + */ + schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval); + tcp->tcp_time_wait_expire = schedule; + + /* + * Append the connection into the appropriate bucket. + */ + bucket = TW_BUCKET(tcp->tcp_time_wait_expire); + tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket]; + tsp->tcp_time_wait_bucket[bucket] = tcp; + if (tcp->tcp_time_wait_next != NULL) { + ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL); + tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp; + } + tsp->tcp_time_wait_cnt++; + + /* + * Round delay up to the nearest bucket boundary. + */ + schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + + /* + * The newly inserted entry may require a tighter schedule for the + * expiration timer.
+ */ + if (schedule < tsp->tcp_time_wait_schedule) { + callout_id_t old_tid = tsp->tcp_time_wait_tid; + + tsp->tcp_time_wait_schedule = schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); /* - * Even if the list was empty before, there may be a timer - * running since a tcp_t can be removed from the list - * in other places, such as tcp_clean_death(). So check if - * a timer is needed. - */ - if (tcp_time_wait->tcp_time_wait_tid == 0) { - if (firetime == 0) - firetime = (hrtime_t) - (tcps->tcps_time_wait_interval + 1) * - MICROSEC; - - tcp_time_wait->tcp_time_wait_tid = - timeout_generic(CALLOUT_NORMAL, - tcp_time_wait_collector, sqp, firetime, - CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); - } - tcp_time_wait->tcp_time_wait_tail = tcp; - } else { - /* - * The list is not empty, so a timer must be running. If not, - * tcp_time_wait_collector() must be running on this - * tcp_time_wait list at the same time. + * It is possible for the timer to fire before the untimeout + * action is able to complete. In that case, the exclusion + * offered by the tcp_time_wait_collector_active flag will + * prevent multiple collector threads from processing records + * simultaneously from the same squeue. 
*/ - ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 || - tcp_time_wait->tcp_time_wait_running); - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == - TCPS_TIME_WAIT); - - if (firetime == 0) { - /* add at end */ - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = - tcp; - tcp->tcp_time_wait_prev = - tcp_time_wait->tcp_time_wait_tail; - tcp_time_wait->tcp_time_wait_tail = tcp; - } else { - /* add at head */ - tcp->tcp_time_wait_next = - tcp_time_wait->tcp_time_wait_head; - tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = - tcp; - tcp_time_wait->tcp_time_wait_head = tcp; - } + mutex_exit(&tsp->tcp_time_wait_lock); + (void) untimeout_default(old_tid, 0); + return; + } + + /* + * Start a fresh timer if none exists. + */ + if (tsp->tcp_time_wait_schedule == 0) { + ASSERT(tsp->tcp_time_wait_tid == 0); + + tsp->tcp_time_wait_schedule = schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); } - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); } /* @@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) tcp_close_detached(tcp); } + +static void +tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp) +{ + mblk_t *mp; + conn_t *connp = tcp->tcp_connp; + kmutex_t *lock; + + ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); + ASSERT(connp->conn_fanout != NULL); + + lock = &connp->conn_fanout->connf_lock; + + /* + * This is essentially a TIME_WAIT reclaim fast path optimization for + * performance where the connection is checked under the fanout lock + * (so that no one else can get access to the conn_t) that the refcnt + * is 2 (one each for TCP and the classifier hash list). 
If that is the + * case and clustering callbacks are not enabled, the conn can be + * removed under the fanout lock, avoiding clean-up under the squeue. + * + * This optimization is forgone when clustering is enabled since the + * clustering callback must be made before setting the CONDEMNED flag + * and after dropping all locks. + * + * See the comments in tcp_closei_local for additional information + * regarding the refcnt logic. + */ + if (mutex_tryenter(lock)) { + mutex_enter(&connp->conn_lock); + if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) { + ipcl_hash_remove_locked(connp, connp->conn_fanout); + /* + * Set the CONDEMNED flag now itself so that the refcnt + * cannot increase due to any walker. + */ + connp->conn_state_flags |= CONN_CONDEMNED; + mutex_exit(&connp->conn_lock); + mutex_exit(lock); + if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) { + /* + * Add to head of tcp_free_list + */ + tcp_cleanup(tcp); + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_policy == NULL); + ASSERT(tcp->tcp_tcps == NULL); + ASSERT(connp->conn_netstack == NULL); + + tcp->tcp_time_wait_next = tsp->tcp_free_list; + tcp->tcp_in_free_list = B_TRUE; + tsp->tcp_free_list = tcp; + tsp->tcp_free_list_cnt++; + } else { + /* + * Do not add to tcp_free_list + */ + tcp_bind_hash_remove(tcp); + ixa_cleanup(tcp->tcp_connp->conn_ixa); + tcp_ipsec_cleanup(tcp); + CONN_DEC_REF(tcp->tcp_connp); + } + + /* + * With the fast-path complete, we can bail. + */ + return; + } else { + /* + * Fall back to slow path. + */ + CONN_INC_REF_LOCKED(connp); + mutex_exit(&connp->conn_lock); + mutex_exit(lock); + } + } else { + CONN_INC_REF(connp); + } + + /* + * We can reuse the closemp here since conn has detached (otherwise we + * wouldn't even be in time_wait list). It is safe to change + * tcp_closemp_used without taking a lock as no other thread can + * concurrently access it at this point in the connection lifecycle.
+ */ + if (tcp->tcp_closemp.b_prev == NULL) { + tcp->tcp_closemp_used = B_TRUE; + } else { + cmn_err(CE_PANIC, + "tcp_timewait_collector: concurrent use of tcp_closemp: " + "connp %p tcp %p\n", (void *)connp, (void *)tcp); + } + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + mp = &tcp->tcp_closemp; + mutex_exit(&tsp->tcp_time_wait_lock); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL, + SQ_FILL, SQTAG_TCP_TIMEWAIT); + mutex_enter(&tsp->tcp_time_wait_lock); +} + /* - * Blows away all tcps whose TIME_WAIT has expired. List traversal - * is done forwards from the head. - * This walks all stack instances since - * tcp_time_wait remains global across all stacks. + * Purge any tcp_t instances associated with this squeue which have expired + * from the TIME_WAIT state. */ -/* ARGSUSED */ void tcp_time_wait_collector(void *arg) { tcp_t *tcp; - int64_t now; - mblk_t *mp; - conn_t *connp; - kmutex_t *lock; - boolean_t removed; - extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, uint8_t *, in_port_t, void *); + int64_t now, active_schedule, new_schedule; + unsigned int idx; squeue_t *sqp = (squeue_t *)arg; - tcp_squeue_priv_t *tcp_time_wait = + tcp_squeue_priv_t *tsp = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - tcp_time_wait->tcp_time_wait_tid = 0; -#ifdef DEBUG - tcp_time_wait->tcp_time_wait_running = B_TRUE; -#endif + mutex_enter(&tsp->tcp_time_wait_lock); + + /* + * Because of timer scheduling complexity and the fact that the + * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is + * possible for multiple tcp_time_wait_collector threads to run against + * the same squeue. This flag is used to exclude other collectors from + * the squeue during execution. 
+ */ + if (tsp->tcp_time_wait_collector_active) { + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } + tsp->tcp_time_wait_collector_active = B_TRUE; - if (tcp_time_wait->tcp_free_list != NULL && - tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { + /* + * Purge the free list if necessary + */ + if (tsp->tcp_free_list != NULL) { TCP_G_STAT(tcp_freelist_cleanup); - while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { - tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; + while ((tcp = tsp->tcp_free_list) != NULL) { + tsp->tcp_free_list = tcp->tcp_time_wait_next; tcp->tcp_time_wait_next = NULL; - tcp_time_wait->tcp_free_list_cnt--; + tsp->tcp_free_list_cnt--; ASSERT(tcp->tcp_tcps == NULL); CONN_DEC_REF(tcp->tcp_connp); } - ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); + ASSERT(tsp->tcp_free_list_cnt == 0); } /* - * In order to reap time waits reliably, we should use a - * source of time that is not adjustable by the user -- hence - * the call to ddi_get_lbolt64(). + * If there are no connections pending, clear timer-related state to be + * reinitialized by the next caller. */ - now = ddi_get_lbolt64(); - while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { + if (tsp->tcp_time_wait_cnt == 0) { + tsp->tcp_time_wait_offset = 0; + tsp->tcp_time_wait_schedule = 0; + tsp->tcp_time_wait_tid = 0; + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } + + /* + * Grab the bucket which we were scheduled to cleanse. + */ + active_schedule = tsp->tcp_time_wait_schedule; + idx = TW_BUCKET(active_schedule - 1); + now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; +retry: + tcp = tsp->tcp_time_wait_bucket[idx]; + + while (tcp != NULL) { /* - * lbolt64 should not wrap around in practice... So we can - * do a direct comparison. 
+ * Since the bucket count is sized to prevent wrap-around + * during typical operation and timers are scheduled to process + * buckets with only expired connections, there is only one + * reason to encounter a connection expiring in the future: + * The tcp_time_wait_collector thread has been so delayed in + * its processing that connections have wrapped around the + * timing wheel into this bucket. + * + * In that case, the remaining entries in the bucket can be + * ignored since, being appended sequentially, they should all + * expire in the future. */ - if (now < tcp->tcp_time_wait_expire) + if (now < tcp->tcp_time_wait_expire) { break; + } - removed = tcp_time_wait_remove(tcp, tcp_time_wait); - ASSERT(removed); + /* + * Pull the connection out of the bucket. + */ + VERIFY(tcp_time_wait_remove(tcp, tsp)); - connp = tcp->tcp_connp; - ASSERT(connp->conn_fanout != NULL); - lock = &connp->conn_fanout->connf_lock; /* - * This is essentially a TW reclaim fast path optimization for - * performance where the timewait collector checks under the - * fanout lock (so that no one else can get access to the - * conn_t) that the refcnt is 2 i.e. one for TCP and one for - * the classifier hash list. If ref count is indeed 2, we can - * just remove the conn under the fanout lock and avoid - * cleaning up the conn under the squeue, provided that - * clustering callbacks are not enabled. If clustering is - * enabled, we need to make the clustering callback before - * setting the CONDEMNED flag and after dropping all locks and - * so we forego this optimization and fall back to the slow - * path. Also please see the comments in tcp_closei_local - * regarding the refcnt logic. + * Purge the connection. * - * Since we are holding the tcp_time_wait_lock, its better - * not to block on the fanout_lock because other connections - * can't add themselves to time_wait list. So we do a - * tryenter instead of mutex_enter.
+ * While tcp_time_wait_lock will be temporarily dropped as part + * of the process, there is no risk of the timer being + * (re)scheduled while the collector is running since a value + * corresponding to the past is left in tcp_time_wait_schedule. */ - if (mutex_tryenter(lock)) { - mutex_enter(&connp->conn_lock); - if ((connp->conn_ref == 2) && - (cl_inet_disconnect == NULL)) { - ipcl_hash_remove_locked(connp, - connp->conn_fanout); - /* - * Set the CONDEMNED flag now itself so that - * the refcnt cannot increase due to any - * walker. - */ - connp->conn_state_flags |= CONN_CONDEMNED; - mutex_exit(lock); - mutex_exit(&connp->conn_lock); - if (tcp_time_wait->tcp_free_list_cnt < - tcp_free_list_max_cnt) { - /* Add to head of tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_cleanup(tcp); - ASSERT(connp->conn_latch == NULL); - ASSERT(connp->conn_policy == NULL); - ASSERT(tcp->tcp_tcps == NULL); - ASSERT(connp->conn_netstack == NULL); - - mutex_enter( - &tcp_time_wait->tcp_time_wait_lock); - tcp->tcp_time_wait_next = - tcp_time_wait->tcp_free_list; - tcp_time_wait->tcp_free_list = tcp; - tcp_time_wait->tcp_free_list_cnt++; - continue; - } else { - /* Do not add to tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_bind_hash_remove(tcp); - ixa_cleanup(tcp->tcp_connp->conn_ixa); - tcp_ipsec_cleanup(tcp); - CONN_DEC_REF(tcp->tcp_connp); - } - } else { - CONN_INC_REF_LOCKED(connp); - mutex_exit(lock); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. 
- */ + tcp_time_wait_purge(tcp, tsp); - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, - "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_close, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); - } - } else { - mutex_enter(&connp->conn_lock); - CONN_INC_REF_LOCKED(connp); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. - */ + /* + * Because tcp_time_wait_remove clears the tcp_time_wait_next + * field, the next item must be grabbed directly from the + * bucket itself. + */ + tcp = tsp->tcp_time_wait_bucket[idx]; + } + + if (tsp->tcp_time_wait_cnt == 0) { + /* + * There is not a need for the collector to schedule a new + * timer if no pending items remain. The timer state can be + * cleared only if it was untouched while the collector dropped + * its locks during tcp_time_wait_purge. 
+ */ + if (tsp->tcp_time_wait_schedule == active_schedule) { + tsp->tcp_time_wait_offset = 0; + tsp->tcp_time_wait_schedule = 0; + tsp->tcp_time_wait_tid = 0; + } + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } else { + unsigned int nidx; - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_close, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); + /* + * Locate the next bucket containing entries. + */ + new_schedule = active_schedule + + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + nidx = TW_BUCKET_NEXT(idx); + while (tsp->tcp_time_wait_bucket[nidx] == NULL) { + if (nidx == idx) { + break; + } + nidx = TW_BUCKET_NEXT(nidx); + new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); } - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL); } - if (tcp_time_wait->tcp_free_list != NULL) - tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; + /* + * It is possible that the system is under such dire load that between + * the timer scheduling and TIME_WAIT processing delay, execution + * overran the interval allocated to this bucket. + */ + now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; + if (new_schedule <= now) { + /* + * Attempt to right the situation by immediately performing a + * purge on the next bucket. This loop will continue as needed + * until the schedule can be pushed out ahead of the clock. + */ + idx = TW_BUCKET(new_schedule - 1); + goto retry; + } /* - * If the time wait list is not empty and there is no timer running, - * restart it. + * Another thread may have snuck in to reschedule the timer while locks + * were dropped during tcp_time_wait_purge. 
Defer to the running timer + * if that is the case. */ - if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL && - tcp_time_wait->tcp_time_wait_tid == 0) { - hrtime_t firetime; - - /* shouldn't be necessary, but just in case */ - if (tcp->tcp_time_wait_expire < now) - tcp->tcp_time_wait_expire = now; - - firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now); - /* This ensures that we won't wake up too often. */ - firetime = MAX(TCP_TIME_WAIT_DELAY, firetime); - tcp_time_wait->tcp_time_wait_tid = - timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, - sqp, firetime, CALLOUT_TCP_RESOLUTION, - CALLOUT_FLAG_ROUNDUP); + if (tsp->tcp_time_wait_schedule != active_schedule) { + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; } -#ifdef DEBUG - tcp_time_wait->tcp_time_wait_running = B_FALSE; -#endif - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + + /* + * Schedule the next timer. + */ + tsp->tcp_time_wait_schedule = new_schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(new_schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c index be75f1f663..f4d6c71914 100644 --- a/usr/src/uts/common/inet/tcp/tcp_tunables.c +++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. 
*/ @@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = { /* tunable - 0 */ { "_time_wait_interval", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, - {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} }, + {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} }, { "_conn_req_max_q", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, @@ -307,7 +307,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = { { "_keepalive_interval", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, - {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} }, + {1*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} }, { "_maxpsz_multiplier", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 0f0f915a2b..cb83b91fad 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and has function. It has to be a power of 2 for @@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls; */ #define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) -/* TCP timers related data strucutres. Refer to tcp_timers.c. */ +/* TCP timers related data structures. Refer to tcp_timers.c. */ typedef struct tcp_timer_s { conn_t *connp; void (*tcpt_proc)(void *); @@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache; (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl)); \ } + +/* + * Maximum TIME_WAIT timeout. 
It is defined here (instead of tcp_tunables.c) + * so that other parameters can be derived from it. + */ +#define TCP_TIME_WAIT_MAX (10 * MINUTES) + +/* + * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. + * Running it every 5 seconds seems to yield a reasonable balance between + * cleanup liveliness and system load. + */ +#define TCP_TIME_WAIT_DELAY (5 * SECONDS) + +#define TCP_TIME_WAIT_BUCKETS ((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1) + /* * For scalability, we must not run a timer for every TCP connection * in TIME_WAIT state. To see why, consider (for time wait interval of * 1 minutes): * 10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's * - * This list is ordered by time, so you need only delete from the head - * until you get to entries which aren't old enough to delete yet. - * The list consists of only the detached TIME_WAIT connections. + * Since TIME_WAIT expiration occurs on a per-squeue basis, handling + * connections from all netstacks on the system, a simple queue is inadequate + * for pending entries. This is because tcp_time_wait_interval may differ + * between connections, causing tail insertion to violate expiration order. + * + * Instead of performing expensive sorting or unnecessary list traversal to + * counteract interval variance between netstacks, a timing wheel structure is + * used. The duration covered by each bucket in the wheel is determined by the + * TCP_TIME_WAIT_DELAY (5 seconds). The number of buckets in the wheel is + * determined by dividing the maximum TIME_WAIT interval (10 minutes) by + * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection. + * (Yielding 121 buckets with the current parameters) When items are inserted + * into the set of buckets, they are indexed by using their expiration time + * divided by the bucket size, modulo the number of buckets. 
This means that + * when each bucket is processed, all items within should have expired within + * the last TCP_TIME_WAIT_DELAY interval. + * + * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY + * interval to ensure all connections in the pending bucket will be expired, a + * per-squeue offset is used when doing TIME_WAIT scheduling. This offset is + * between 0 and the TCP_TIME_WAIT_DELAY and is designed to avoid scheduling + * all of the tcp_time_wait_collector threads to run in lock-step. The offset + * is fixed while there are any connections present in the buckets. * * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is * tcps_time_wait_interval). When the tcp_t is detached (upper layer closes - * the end point), it is moved to the time wait list and another timer is - * started (expiry time is set at tcp_time_wait_expire, which is - * also calculated using tcps_time_wait_interval). This means that the - * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't - * become detached for a long time. + * the end point), it is scheduled to be cleaned up by the squeue-driving + * tcp_time_wait_collector (also using tcps_time_wait_interval). This means + * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t + * doesn't become detached for a long time. * * The list manipulations (including tcp_time_wait_next/prev) * are protected by the tcp_time_wait_lock. The content of the * detached TIME_WAIT connections is protected by the normal perimeters. * - * This list is per squeue and squeues are shared across the tcp_stack_t's. - * Things on tcp_time_wait_head remain associated with the tcp_stack_t - * and conn_netstack. - * The tcp_t's that are added to tcp_free_list are disassociated and - * have NULL tcp_tcps and conn_netstack pointers. + * These connection lists are per squeue and squeues are shared across the + * tcp_stack_t instances. 
Things in a tcp_time_wait_bucket remain associated + * with the tcp_stack_t and conn_netstack. Any tcp_t connections stored in the + * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack + * pointers. */ typedef struct tcp_squeue_priv_s { kmutex_t tcp_time_wait_lock; + boolean_t tcp_time_wait_collector_active; callout_id_t tcp_time_wait_tid; - tcp_t *tcp_time_wait_head; - tcp_t *tcp_time_wait_tail; + uint64_t tcp_time_wait_cnt; + int64_t tcp_time_wait_schedule; + int64_t tcp_time_wait_offset; + tcp_t *tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS]; tcp_t *tcp_free_list; uint_t tcp_free_list_cnt; -#ifdef DEBUG - /* - * For debugging purpose, true when tcp_time_wait_collector() is - * running. - */ - boolean_t tcp_time_wait_running; -#endif } tcp_squeue_priv_t; /* @@ -375,6 +406,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. + * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -618,6 +665,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c. 
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5a15aea4de..a88bac932c 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -76,7 +77,8 @@ #include <inet/ipclassifier.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> -#include <sys/ethernet.h> +#include <sys/vxlan.h> +#include <inet/inet_hash.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> @@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol, typedef union T_primitives *t_primp_t; /* + * Various protocols that encapsulate UDP have no real use for the source port. + * Instead, they want to vary the source port to provide better equal-cost + * multipathing and other systems that use fanout. Consider something like + * VXLAN. If you're actually sending multiple different streams to a single + * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP, + * SRC Port, DST Port) will always be the same. + * + * Here, we return a port to hash this to, if we know how to hash it. If for + * some reason we can't perform an L4 hash, then we just return the default + * value, usually the default port. After we determine the hash we transform it + * so that it's in the range of [ min, max ]. + * + * We'd like to avoid a pull up for the sake of performing the hash. If the + * first mblk_t doesn't have the full protocol header, then we just send it to + * the default. If for some reason we have an encapsulated packet that has its + * protocol header in different parts of an mblk_t, then we'll go with the + * default port. 
This means that if a driver isn't consistent about how it + * generates the frames for a given flow, it will not always be consistently + * hashed. That should be an uncommon event. + */ +uint16_t +udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, + uint16_t def) +{ + size_t szused = 0; + struct ether_header *ether; + struct ether_vlan_header *vether; + ip6_t *ip6h; + ipha_t *ipha; + uint16_t sap; + uint64_t hash; + uint32_t mod; + + ASSERT(min <= max); + + if (type != UDP_HASH_VXLAN) + return (def); + + if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) + return (def); + + /* + * The following logic is VXLAN specific to get at the header; if we + * have other formats, e.g. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + ether = (struct ether_header *)mp->b_rptr; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { + return (def); + } else { + szused = VXLAN_HDR_LEN; + ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused); + } + + /* Can we hold a MAC header? */ + if (MBLKL(mp) + szused < sizeof (struct ether_header)) + return (def); + + /* + * We need to lie about the starting offset into the message block for + * convenience. Undo it at the end. We know that inet_pkt_hash() won't + * modify the mblk_t. + */ + mp->b_rptr += szused; + hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | + INET_PKT_HASH_L3 | INET_PKT_HASH_L4); + mp->b_rptr -= szused; + + if (hash == 0) + return (def); + + mod = max - min + 1; + return ((hash % mod) + min); +} + +/* * Return the next anonymous port in the privileged port range for * bind checking. * @@ -1583,6 +1668,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_rcvhdr ?
1 : 0; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SRCPORT_HASH: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_vxlanhash; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1718,6 +1813,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, udp->udp_rcvhdr = onoff; mutex_exit(&connp->conn_lock); return (0); + case UDP_SRCPORT_HASH: + /* + * This should have already been verified, but double + * check. + */ + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + + /* First see if the val is something we understand */ + if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) + return (EINVAL); + + if (!checkonly) { + mutex_enter(&connp->conn_lock); + udp->udp_vxlanhash = *i1; + mutex_exit(&connp->conn_lock); + } + /* Fully handled this option. */ + return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -2001,13 +2121,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, uint32_t cksum; udp_t *udp = connp->conn_udp; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t ulp_hdr_len; + uint16_t srcport; data_len = msgdsize(data_mp); ulp_hdr_len = UDPH_SIZE; if (insert_spi) ulp_hdr_len += sizeof (uint32_t); + /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); if (mp == NULL) { @@ -2019,7 +2151,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); - udpha->uha_src_port = connp->conn_lport; + if (hash_srcport == B_TRUE) { + udpha->uha_src_port = htons(srcport); + } else { + udpha->uha_src_port = connp->conn_lport; + } udpha->uha_dst_port = dstport; udpha->uha_checksum = 0; udpha->uha_length = htons(data_len); @@ -3194,6 +3330,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t pktlen; uint_t alloclen; uint_t copylen; @@ -3202,10 +3339,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udpha_t *udpha; uint32_t cksum; ip_pkt_t *ipp; + uint16_t srcport; ASSERT(MUTEX_HELD(&connp->conn_lock)); /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + + /* * Copy the header template and leave space for an SPI */ copylen = connp->conn_ht_iphc_len; @@ -3303,6 +3451,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, *((uint32_t *)(udpha + 1)) = 0; udpha->uha_dst_port = dstport; + if (hash_srcport == B_TRUE) + udpha->uha_src_port = htons(srcport); + return (mp); } @@ -5947,10 +6098,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. The posix spec. allows both behaviors but + * historically we've returned an error if already connected. The + * client can allow this via a sockopt. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c279bb4a21..847e2cdde6 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 6a31ce5c22..ebba10c0f7 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _UDP_IMPL_H @@ -178,8 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ + udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ + /* Because there's only VXLAN, cheat */ + /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 29; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..a39110255a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. 
*/ /* @@ -528,8 +529,13 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) if (on) { mac_rx_clear(port->lp_mch); + /* We use the promisc callback because without hardware + * rings, we deliver through flows that will cause duplicate + * delivery of packets when we've flipped into this mode + * to compensate for the lack of hardware MAC matching + */ rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, + aggr_recv_promisc_cb, port, &port->lp_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); if (rc != 0) { mac_rx_set(port->lp_mch, aggr_recv_cb, port); diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..0dfe234b70 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ /* @@ -68,16 +69,27 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. */ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback, boolean_t promisc_path) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; + /* In the case where lp_promisc_on has been turned on to + * compensate for insufficient hardware MAC matching and + * hardware rings are not in use we will fall back to + * using flows for delivery which can result in duplicates + * pushed up the stack. Only respect the chosen path. 
+ */ + if (port->lp_promisc_on != promisc_path) { + freemsgchain(mp); + return; + } + if (grp->lg_lacp_mode == AGGR_LACP_OFF) { aggr_mac_rx(grp->lg_mh, mrh, mp); } else { @@ -161,3 +173,19 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +/* ARGSUSED */ +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE); +} + +/* ARGSUSED */ +void +aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE); +} diff --git a/usr/src/uts/common/io/axf/ax88172reg.h b/usr/src/uts/common/io/axf/ax88172reg.h new file mode 100644 index 0000000000..8ca6ebc187 --- /dev/null +++ b/usr/src/uts/common/io/axf/ax88172reg.h @@ -0,0 +1,163 @@ +/* + * @(#)ax88172reg.h 1.1 09/06/15 + * Macro definitions for ASIX AX88172 USB to fast ethernet controler + * based on ASIX AX88172/88772 data sheet + * This file is public domain. 
Coded by M.Murayama (KHF04453@nifty.com) + */ + +#ifndef __AX88172_H__ +#define __AX88172_H__ + +/* + * Vendor command definitions + */ +#define VCMD_READ_SRAM 0x02 +#define VCMD_WRITE_RXSRAM 0x03 +#define VCMD_WRITE_TXSRAM 0x04 +#define VCMD_SOFTWARE_MII_OP 0x06 +#define VCMD_READ_MII_REG 0x07 +#define VCMD_WRITE_MII_REG 0x08 +#define VCMD_READ_MII_OPMODE 0x09 +#define VCMD_HARDWARE_MII_OP 0x0a +#define VCMD_READ_SROM 0x0b +#define VCMD_WRITE_SROM 0x0c +#define VCMD_WRITE_SROM_ENABLE 0x0d +#define VCMD_WRITE_SROM_DISABLE 0x0e +#define VCMD_READ_RXCTRL 0x0f +#define VCMD_WRITE_RXCTRL 0x10 +#define VCMD_READ_IPGS 0x11 +#define VCMD_WRITE_IPG 0x12 +#define VCMD_WRITE_IPG1 0x13 +#define VCMD_WRITE_IPG2 0x14 +#define VCMD_READ_MCAST_FILTER 0x15 +#define VCMD_WRITE_MCAST_FILTER 0x16 +#define VCMD_READ_NODE_ID 0x17 +#define VCMD_READ_PHY_IDS 0x19 +#define VCMD_READ_MEDIUM_STATUS 0x1a +#define VCMD_WRITE_MEDIUM_STATUS 0x1b +#define VCMD_SET_MONITOR_MODE 0x1c +#define VCMD_GET_MONITOR_MODE 0x1d +#define VCMD_READ_GPIO 0x1e +#define VCMD_WRITE_GPIO 0x1f + +/* ax88772 only, currently not supported */ +#define VCMD_WRITE_IPGS_88772 0x12 +#define VCMD_READ_NODE_ID_88772 0x13 +#define VCMD_WRITE_NODE_ID_88772 0x14 +#define VCMD_WRITE_TEST_REG_88772 0x17 +#define VCMD_SOFTWARE_RESET_88772 0x20 +#define VCMD_READ_PHY_SELECT_88772 0x21 +#define VCMD_WRITE_PHY_SELECT_88772 0x22 + + +/* + * Register definitions + */ + +/* Rx control register */ +#define RCR_SO 0x80 /* Start Operation */ +#define RCR_AP_88772 0x20 /* accept physical address from mcast filter */ +#define RCR_AM 0x10 /* accept multicast address */ +#define RCR_AB 0x08 /* accept broadcast address */ +#define RCR_SEP 0x04 /* save error packet */ +#define RCR_AMALL 0x02 /* accept all multicast address */ +#define RCR_PRO 0x01 /* promiscious, all frames received */ + +#define RCR_MFB 0x0300 +#define RCR_MFB_SHIFT 8 +#define RCR_MFB_2K (0U << RCR_MFB_SHIFT) +#define RCR_MFB_4K (1U << RCR_MFB_SHIFT) +#define RCR_MFB_8K (2U 
<< RCR_MFB_SHIFT) +#define RCR_MFB_16K (3U << RCR_MFB_SHIFT) + +#define RCR_BITS \ + "\020" \ + "\010SO" \ + "\006AP" \ + "\005AM" \ + "\004AB" \ + "\003SEP" \ + "\002AMALL" \ + "\001PRO" + +/* Medium status register */ +#define MSR_SM 0x1000 /* super mac support */ +#define MSR_SBP 0x0800 /* stop backpressure */ +#define MSR_PS 0x0200 /* port speed in mii mode */ +#define MSR_RE 0x0100 /* rx enable */ +#define MSR_PF 0x0080 /* check only length/type for pause frame */ +#define MSR_JFE 0x0040 /* jumbo frame enable */ +#define MSR_TFC 0x0020 /* tx flow control enable */ +#define MSR_RFC 0x0010 /* rx flow control enable (178) */ +#define MSR_FCEN 0x0010 /* flow control enable (172/772) */ +#define MSR_ENCK 0x0008 /* Enable GTX_CLK and TXC clock output (178) */ +#define MSR_TXABT 0x0004 /* Tx abort allow, always set */ +#define MSR_FDPX 0x0002 /* full duplex */ +#define MSR_GM 0x0001 /* Gigabit mode (178) */ + +#define MSR_BITS \ + "\020" \ + "\015SM" \ + "\014SBP" \ + "\012PS" \ + "\011RE" \ + "\005FCEN" \ + "\004ENCK" \ + "\003TXABT" \ + "\002FDPX" \ + "\001GM" + +/* monitor mode register */ +#define MMR_RWMP 0x04 /* remote wakeup by magic pkt */ +#define MMR_RWLU 0x02 /* remote wakeup by linkup */ +#define MMR_MOM 0x01 /* monitor mode 1:en, 0:dis */ + +#define MMR_BITS \ + "\020" \ + "\003RWMP" \ + "\002RWLU" \ + "\001MOM" + +/* GPIO register */ +#define GPIO_RSE 0x80 /* reload serial eeprom (88772)*/ +#define GPIO_DATA2 0x20 +#define GPIO_EN2 0x10 +#define GPIO_DATA1 0x08 +#define GPIO_EN1 0x04 +#define GPIO_DATA0 0x02 +#define GPIO_EN0 0x01 + +#define GPIO_BITS \ + "\020" \ + "\010RSE" \ + "\006DATA2" \ + "\005EN2" \ + "\004DATA1" \ + "\003EN1" \ + "\002DATA0" \ + "\001EN0" + +/* Software reset register */ +#define SWRST_IPPD 0x40 /* internal phy power down control */ +#define SWRST_IPRL 0x20 /* internal phy reset control */ +#define SWRST_BZ 0x10 /* force Bulk In to return zero-length pkt */ +#define SWRST_PRL 0x08 /* external phy reset pin level */ +#define 
SWRST_PRTE 0x04 /* external phy tri-state enable */ +#define SWRST_RT 0x02 /* clear frame length error for Bulk-Out */ +#define SWRST_RR 0x01 /* clear frame length error for Bulk-In */ + +#define SWRST_BITS \ + "\020" \ + "\007IPPD" \ + "\006IPRL" \ + "\005BZ" \ + "\004PRL" \ + "\003PRTE" \ + "\002RT" \ + "\001RR" + +/* Software PHY Select Status register */ +#define SPSS_ASEL 0x02 /* 1:auto select 0:manual select */ +#define SPSS_PSEL 0x01 /* 1:intenal phy, 0:external (when ASEL=0) */ + +#endif /* __AX88172_H__ */ diff --git a/usr/src/uts/common/io/axf/axf_usbgem.c b/usr/src/uts/common/io/axf/axf_usbgem.c new file mode 100644 index 0000000000..28963f6849 --- /dev/null +++ b/usr/src/uts/common/io/axf/axf_usbgem.c @@ -0,0 +1,1539 @@ +/* + * axf_usbgem.c : ASIX AX88172/772 USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2004-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "@(#)axf_usbgem.c 1.3 12/02/09" + +/* + * Changelog: + */ + +/* + * TODO + * handle RXMODE_ENABLE in set_rx_filter() + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "ax88172reg.h" + +char ident[] = "ax88x72 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label +#define LE16P(p) ((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0]) + +#define AX88172(dp) \ + (((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88172) + +#define AX88772(dp) \ + (((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88772) + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int axf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (axf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for ax88172 + */ +/* 
timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * RX/TX buffer size + */ + +/* + * Local device definitions + */ +struct chip_info { + uint16_t vid; /* usb vendor id */ + uint16_t pid; /* usb product id */ + int type; + uint8_t gpio_reset[2]; + uint8_t gpio_speed[2]; + uint8_t gpio_duplex[2]; + char *name; +#define CHIP_TYPE_AX88172 0 +#define CHIP_TYPE_AX88772 1 +#define CHIP_TYPE_AX88178 2 +}; + +#define GPIO_DEFAULT {0x00, 0x15}, {0, 0}, {0, 0} +struct chip_info chiptbl_88x7x[] = { +/* AX88172 */ +{ + /* Planex UE2-100TX, Hawking UF200, TrendNet TU2-ET100 */ + 0x07b8, 0x420a, CHIP_TYPE_AX88172, + + /* + * the default setting covers below: + * gpio bit2 has to be 0 and gpio bit0 has to be 1 + */ + {0, 0}, + {GPIO_EN1, GPIO_DATA1 | GPIO_EN1}, + {0, 0}, + "Planex UE2-100TX", /* tested */ +}, +{ + 0x2001, 0x1a00, CHIP_TYPE_AX88172, + {0x9f, 0x9e}, {0, 0}, {0, 0}, + "D-Link dube100", /* XXX */ +}, +{ + 0x077b, 0x2226, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Linksys USB200M", +}, +{ + 0x0846, 0x1040, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Netgear FA120", +}, +{ + 0x0b95, 0x1720, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Intellinet, ST Lab USB Ethernet", +}, +{ + 0x08dd, 0x90ff, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Billionton Systems, USB2AR", +}, +{ + 0x0557, 0x2009, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "ATEN UC210T", +}, +{ + 0x0411, 0x003d, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Buffalo LUA-U2-KTX", +}, +{ + 0x6189, 0x182d, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Sitecom LN-029 USB 2.0 10/100 Ethernet adapter", +}, +{ + 0x07aa, 0x0017, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "corega FEther USB2-TX", +}, +{ + 0x1189, 0x0893, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Surecom EP-1427X-2", +}, +{ + 0x1631, 0x6200, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "goodway corp usb gwusb2e", +}, +/* AX88772 and AX88178 */ +{ + 0x13b1, 0x0018, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "Linksys USB200M rev.2", +}, +{ + 0x1557, 0x7720, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + 
"0Q0 cable ethernet", +}, +{ + 0x07d1, 0x3c05, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "DLink DUB E100 ver B1", +}, +{ + 0x2001, 0x3c05, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "DLink DUB E100 ver B1(2)", +}, +{ + 0x05ac, 0x1402, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "Apple Ethernet USB Adapter", +}, +{ + 0x1737, 0x0039, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Linksys USB1000", +}, +{ + 0x0411, 0x006e, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Buffalo LUA-U2-KGT/LUA-U2-GT", +}, +{ + 0x04bb, 0x0930, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "I/O DATA ETG-US2", +}, +{ + 0x050d, 0x5055, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Belkin F5D5055", +}, +{ + /* generic ax88772 must be the last entry */ + /* planex UE-200TX-G */ + 0x0b95, 0x7720, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "ASIX AX88772/AX88178", /* tested */ +}, +}; + +#define CHIPTABLESIZE (sizeof (chiptbl_88x7x) / sizeof (struct chip_info)) + +struct axf_dev { + /* + * Misc HW information + */ + struct chip_info *chip; + uint8_t ipg[3]; + uint8_t gpio; + uint16_t rcr; + uint16_t msr; + uint8_t last_link_state; + boolean_t phy_has_reset; +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t axf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void axf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int axf_reset_chip(struct usbgem_dev *); +static int axf_init_chip(struct usbgem_dev *); +static int axf_start_chip(struct usbgem_dev *); +static int axf_stop_chip(struct usbgem_dev *); +static int axf_set_media(struct usbgem_dev *); +static int axf_set_rx_filter(struct usbgem_dev *); +static int axf_get_stats(struct usbgem_dev *); +static void axf_interrupt(struct usbgem_dev *, mblk_t *); + +/* packet operations */ +static mblk_t *axf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *axf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* 
=============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUT(dp, req, val, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ (req), \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define IN(dp, req, val, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ (req), \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * Hardware manupilation + */ +/* =============================================================== */ +static int +axf_reset_phy(struct usbgem_dev *dp) +{ + uint8_t phys[2]; + uint8_t val8; + int err; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (AX88172(dp)) { + delay(drv_usectohz(5000)); + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: %s: gpio 0x%b", + dp->name, __func__, val8, GPIO_BITS)); + + /* reset MII PHY */ + val8 = lp->chip->gpio_reset[1] + | lp->chip->gpio_speed[dp->speed] + | lp->chip->gpio_duplex[dp->full_duplex]; + + OUT(dp, VCMD_WRITE_GPIO, + val8, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(5000)); + + val8 = lp->chip->gpio_reset[0] + | lp->chip->gpio_speed[dp->speed] + | lp->chip->gpio_duplex[dp->full_duplex]; + + OUT(dp, VCMD_WRITE_GPIO, + val8, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(5000)); + } else { + lp->gpio = GPIO_RSE | GPIO_DATA2 | GPIO_EN2; + OUT(dp, 
VCMD_WRITE_GPIO, lp->gpio, 0, + 0, NULL, &err, usberr); + drv_usecwait(1000); + + OUT(dp, VCMD_WRITE_PHY_SELECT_88772, + dp->mii_phy_addr == 16 ? 1 : 0, 0, 0, NULL, &err, usberr); + + OUT(dp, VCMD_SOFTWARE_RESET_88772, + SWRST_IPPD | SWRST_PRL, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(150*1000)); + OUT(dp, VCMD_SOFTWARE_RESET_88772, + 0, 0, 0, NULL, &err, usberr); + + OUT(dp, VCMD_SOFTWARE_RESET_88772, + dp->mii_phy_addr == 16 ? SWRST_IPRL : SWRST_PRTE, + 0, 0, NULL, &err, usberr); + delay(drv_usectohz(150*1000)); + } + + + return (USB_SUCCESS); + +usberr: + return (USB_FAILURE); +} + +static int +axf_reset_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + if (AX88172(dp)) { + /* there are no ways to reset nic */ + return (USB_SUCCESS); + } +#ifdef NEVER + OUT(dp, VCMD_SOFTWARE_RESET_88772, + SWRST_RR | SWRST_RT, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_SOFTWARE_RESET_88772, + 0, 0, 0, NULL, &err, usberr); +usberr: +#endif + return (err); +} + +/* + * Setup ax88172 + */ +static int +axf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err = USB_SUCCESS; + uint16_t reg; + uint8_t buf[2]; + uint16_t tmp16; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* rx conrol register: read default value */ + if (!AX88172(dp)) { + /* clear rx control */ + OUT(dp, VCMD_WRITE_RXCTRL, 0, 0, 0, NULL, &err, usberr); + } + + IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr); + lp->rcr = LE16P(buf); + DPRINTF(0, (CE_CONT, "!%s: %s: rcr(default):%b", + dp->name, __func__, lp->rcr, RCR_BITS)); + + lp->rcr &= ~RCR_SO; + + /* Media status register */ + if (AX88172(dp)) { +#ifdef notdef + lp->msr = MSR_TXABT; +#else + lp->msr = 0; +#endif + } else { + lp->msr = MSR_RE | MSR_TXABT; + } + DPRINTF(0, (CE_CONT, "!%s: %s: msr:%b", + dp->name, __func__, lp->msr, MSR_BITS)); + err = axf_set_media(dp); + CHECK_AND_JUMP(err, usberr); + + /* write IPG0-2 registers */ + if (AX88172(dp)) { + OUT(dp, 
VCMD_WRITE_IPG, lp->ipg[0], 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_IPG1, lp->ipg[1], 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_IPG2, lp->ipg[2], 0, 0, NULL, &err, usberr); + } else { + /* EMPTY */ + } +#ifdef ENABLE_RX_IN_INIT_CHIP + /* enable Rx */ + lp->rcr |= RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); +#endif +usberr: + /* err is an int; print it with %d and the status text with %s */ + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d (%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +/* + * Start the receiver: set RCR_SO in the cached rx control value and write it + * to the chip, unless that was already done in axf_init_chip() + * (ENABLE_RX_IN_INIT_CHIP). + */ +static int +axf_start_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; +#ifndef ENABLE_RX_IN_INIT_CHIP + /* enable Rx */ + lp->rcr |= RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); + +usberr: + /* err is an int; print it with %d and the status text with %s */ + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d (%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); +#endif + return (err); +} + +/* + * Stop the receiver and put the factory mac address back. + * Errors from the control transfers are not reported; the return value is + * that of axf_reset_chip(). 
+ */ +static int +axf_stop_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; + + /* Disable Rx */ + lp->rcr &= ~RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); + + /* + * Restore factory mac address (dev_addr) + * if we have changed current mac address (cur_addr) + */ + if (!AX88172(dp) && + bcmp(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, + ETHERADDRL) != 0) { + OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0, + ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr); + } +usberr: + return (axf_reset_chip(dp)); +} + +static int +axf_get_stats(struct usbgem_dev *dp) +{ + /* EMPTY */ + return (USB_SUCCESS); +} + +/* Multicast filter index: the top 6 bits of the big-endian ethernet CRC. */ +static uint_t +axf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_be(addr) >> (32 - 6)); +} + +static int +axf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint8_t mode; + uint8_t mhash[8]; + uint8_t buf[2]; + uint_t h; + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, 
dp->rxmode)); + + if (lp->rcr & RCR_SO) { + /* set promiscuous mode before changing it. */ + OUT(dp, VCMD_WRITE_RXCTRL, + lp->rcr | RCR_PRO, 0, 0, NULL, &err, usberr); + } + + lp->rcr &= ~(RCR_AP_88772 | RCR_AM | RCR_SEP | RCR_AMALL | RCR_PRO); + mode = RCR_AB; /* accept broadcast packets */ + + bzero(mhash, sizeof (mhash)); + + if (dp->rxmode & RXMODE_PROMISC) { + /* promiscuous mode implies all multicast and all physical */ + mode |= RCR_PRO; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) { + /* accept all multicast packets */ + mode |= RCR_AMALL; + } else if (dp->mc_count > 0) { + /* + * make hash table to select interesting + * multicast address only. + */ + mode |= RCR_AM; + for (i = 0; i < dp->mc_count; i++) { + h = dp->mc_list[i].hash; + mhash[h / 8] |= 1 << (h % 8); + } + } + if (AX88172(dp)) { + if (bcmp(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, ETHERADDRL) != 0) { + /* + * we use promiscuous mode instead of changing the + * mac address in ax88172 + */ + mode |= RCR_PRO; + } + } else { + OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0, + ETHERADDRL, dp->cur_addr.ether_addr_octet, &err, usberr); + } + lp->rcr |= mode; + + /* set multicast hash table */ + if (mode & RCR_AM) { + /* need to set up multicast hash table */ + OUT(dp, VCMD_WRITE_MCAST_FILTER, 0, 0, + sizeof (mhash), mhash, &err, usberr); + } + + /* update rcr */ + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, + 0, NULL, &err, usberr); + +#if DEBUG_LEVEL > 1 + /* verify rxctrl reg */ + IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, LE16P(buf), RCR_BITS); +#endif +usberr: + /* err is an int; print it with %d and the status text with %s */ + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d (%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +static int +axf_set_media(struct usbgem_dev *dp) +{ + uint8_t val8; + uint8_t gpio; + uint8_t gpio_old; + int err = USB_SUCCESS; + uint16_t msr; + struct axf_dev *lp = dp->private; + + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &gpio, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: %s: called, gpio:%b", + dp->name, __func__, gpio, GPIO_BITS)); + + msr = lp->msr; + gpio_old = gpio; + gpio = lp->chip->gpio_reset[0]; + + /* setup speed */ + if (AX88172(dp)) { + /* EMPTY */ + } else { + msr &= ~(MSR_PS | MSR_GM | MSR_ENCK); + + switch (dp->speed) { + case USBGEM_SPD_1000: + msr |= MSR_GM | MSR_ENCK; + break; + + case USBGEM_SPD_100: + msr |= MSR_PS; + break; + + case USBGEM_SPD_10: + break; + } + } + gpio |= lp->chip->gpio_speed[dp->speed == USBGEM_SPD_100 ? 1 : 0]; + + /* select duplex */ + msr &= ~MSR_FDPX; + if (dp->full_duplex) { + msr |= MSR_FDPX; + + /* select flow control */ + if (AX88172(dp)) { + msr &= ~MSR_FCEN; + switch (dp->flow_control) { + case FLOW_CONTROL_TX_PAUSE: + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_RX_PAUSE: + msr |= MSR_FCEN; + break; + } + } else { + msr &= ~(MSR_RFC | MSR_TFC); + switch (dp->flow_control) { + case FLOW_CONTROL_TX_PAUSE: + msr |= MSR_TFC; + break; + + case FLOW_CONTROL_SYMMETRIC: + msr |= MSR_TFC | MSR_RFC; + break; + + case FLOW_CONTROL_RX_PAUSE: + msr |= MSR_RFC; + break; + } + } + } + gpio |= lp->chip->gpio_duplex[dp->full_duplex ? 1 : 0]; + + /* update medium status register */ + lp->msr = msr; + OUT(dp, VCMD_WRITE_MEDIUM_STATUS, lp->msr, 0, + 0, NULL, &err, usberr); + + if (gpio != gpio_old) { + /* LED control required for some products */ + OUT(dp, VCMD_WRITE_GPIO, + gpio, 0, 0, NULL, &err, usberr); + } + +usberr: + /* err is an int; print it with %d and the status text with %s */ + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d (%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +#define FILL_PKT_HEADER(bp, len) { \ + (bp)[0] = (uint8_t)(len); \ + (bp)[1] = (uint8_t)((len) >> 8); \ + (bp)[2] = (uint8_t)(~(len)); \ + (bp)[3] = (uint8_t)((~(len)) >> 8); \ +} + +#define PKT_HEADER_SIZE 4 + +/* + * send/receive packet check + */ +static mblk_t * +axf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int n; + size_t len; + size_t pkt_size; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + uint_t align_mask; + size_t header_size; + int pad_size; + + len = msgdsize(mp); + + if (AX88172(dp)) { +#ifdef notdef + align_mask = 63; +#else + align_mask = 511; +#endif + header_size = 0; + + if (len >= ETHERMIN && mp->b_cont == NULL && + (len & align_mask) != 0) { + /* use the mp "as is" */ + return (mp); + } + } else { + align_mask = 511; + header_size = PKT_HEADER_SIZE; + } + + /* + * re-allocate the mp + */ + /* minimum ethernet packet size of ETHERMIN */ + pkt_size = max(len, ETHERMIN); + + if (((pkt_size + header_size) & align_mask) == 0) { + /* padding is required in usb communication */ + pad_size = PKT_HEADER_SIZE; + } else { + pad_size = 0; + } + + if ((new = allocb(header_size + pkt_size + pad_size, 0)) == NULL) { + return (NULL); + } + + bp = new->b_rptr; + if (header_size) { + uint16_t tmp; + + /* add a header */ + tmp = (uint16_t)pkt_size; + FILL_PKT_HEADER(bp, tmp); + bp += header_size; + } + + /* copy contents of the buffer */ + for (tp = mp; tp; tp = tp->b_cont) { + n = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, n); + bp += n; + } + + /* add pads for ethernet packets */ + last_pos = new->b_rptr + header_size + pkt_size; + while (bp < last_pos) { + *bp++ = 0; + } + + /* add a zero-length pad segment for usb communications */ + if (pad_size) { + /* add a dummy header for zero-length packet */ + FILL_PKT_HEADER(bp, 0); + bp += pad_size; + } + + /* close the payload of the packet */ + new->b_wptr = bp; + + return (new); +} + +static void +axf_dump_packet(struct usbgem_dev 
*dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +axf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + mblk_t *tp; + int rest; + + if (AX88172(dp)) { + return (mp); + } + + tp = mp; + rest = tp->b_wptr - tp->b_rptr; + + if (rest <= PKT_HEADER_SIZE) { + /* + * the usb bulk-in frame doesn't include any valid + * ethernet packets. + */ + return (NULL); + } + + for (; ; ) { + uint16_t len; + uint16_t cksum; + + /* analyse the header of the received usb frame */ + len = LE16P(tp->b_rptr + 0); + cksum = LE16P(tp->b_rptr + 2); + + /* test if the header is valid */ + if (len + cksum != 0xffff) { + /* discard whole the packet */ + cmn_err(CE_WARN, + "!%s: %s: corrupted header:%04x %04x", + dp->name, __func__, len, cksum); + return (NULL); + } +#if DEBUG_LEVEL > 0 + if (len < ETHERMIN || len > ETHERMAX) { + cmn_err(CE_NOTE, + "!%s: %s: incorrect pktsize:%d", + dp->name, __func__, len); + } +#endif + /* extract a ethernet packet from the bulk-in frame */ + tp->b_rptr += PKT_HEADER_SIZE; + tp->b_wptr = tp->b_rptr + len; + + if (len & 1) { + /* + * skip a tailing pad byte if the packet + * length is odd + */ + len++; + } + rest -= len + PKT_HEADER_SIZE; + + if (rest <= PKT_HEADER_SIZE) { + /* no more vaild ethernet packets */ + break; + } + +#if DEBUG_LEVEL > 10 + axf_dump_packet(dp, tp->b_wptr, 18); +#endif + /* allocate a mblk_t header for the next ethernet packet */ + tp->b_next = dupb(mp); + tp->b_next->b_rptr = tp->b_rptr + len; + tp = tp->b_next; + } + + return (mp); +} + +/* + * MII Interfaces + */ +static uint16_t +axf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint8_t buf[2]; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, index)); + + /* switch to software MII operation mode */ + OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, 
usberr); + + /* Read MII register */ + IN(dp, VCMD_READ_MII_REG, dp->mii_phy_addr, index, + 2, buf, errp, usberr); + + /* switch to hardware MII operation mode */ + OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + + return (LE16P(buf)); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +axf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + uint8_t buf[2]; + + DPRINTF(4, (CE_CONT, "!%s: %s called, reg:%x val:%x", + dp->name, __func__, index, val)); + + /* switch software MII operation mode */ + OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + + /* Write to the specified MII register */ + buf[0] = (uint8_t)val; + buf[1] = (uint8_t)(val >> 8); + OUT(dp, VCMD_WRITE_MII_REG, dp->mii_phy_addr, index, + 2, buf, errp, usberr); + + /* switch to hardware MII operation mode */ + OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + +usberr: + ; +} + +static void +axf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *bp; + struct axf_dev *lp = dp->private; + + bp = mp->b_rptr; + + DPRINTF(2, (CE_CONT, + "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x %02x %02x", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7])); + + if (lp->last_link_state ^ bp[2]) { + usbgem_mii_update_link(dp); + } + + lp->last_link_state = bp[2]; +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +#ifdef DEBUG_LEVEL +static void +axf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint8_t w0[2], w1[2], w2[2], w3[2]; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + + err = USB_SUCCESS; + + for (i = 0; i < size; i += 4) { + IN(dp, VCMD_READ_SROM, i + 0, 0, 2, w0, &err, usberr); + IN(dp, VCMD_READ_SROM, i + 1, 0, 2, w1, &err, usberr); + IN(dp, 
VCMD_READ_SROM, i + 2, 0, 2, w2, &err, usberr); + IN(dp, VCMD_READ_SROM, i + 3, 0, 2, w3, &err, usberr); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i, + (w0[1] << 8) | w0[0], + (w1[1] << 8) | w1[0], + (w2[1] << 8) | w2[0], + (w3[1] << 8) | w3[0]); + } +usberr: + ; +} +#endif + +static int +axf_attach_chip(struct usbgem_dev *dp) +{ + uint8_t phys[2]; + int err; + uint_t vcmd; + int ret; +#ifdef CONFIG_FULLSIZE_VLAN + uint8_t maxpktsize[2]; + uint16_t vlan_pktsize; +#endif +#ifdef DEBUG_LEVEL + uint8_t val8; +#endif + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__)); + + ret = USB_SUCCESS; + /* + * mac address in EEPROM has loaded to ID registers. + */ + vcmd = AX88172(dp) ? VCMD_READ_NODE_ID : VCMD_READ_NODE_ID_88772; + IN(dp, vcmd, 0, 0, + ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr); + + /* + * setup IPG values + */ + lp->ipg[0] = 0x15; + lp->ipg[1] = 0x0c; + lp->ipg[2] = 0x12; + + /* + * We cannot scan phy because the nic returns undefined + * value, i.e. remained garbage, when MII phy is not at the + * specified index. 
+ */ +#ifdef DEBUG_LEVELx + if (lp->chip->vid == 0x07b8 && lp->chip->pid == 0x420a) { + /* + * restore the original phy address of brain + * damaged Planex UE2-100TX + */ + OUT(dp, VCMD_WRITE_SROM_ENABLE, 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, 0x11, 0xe004, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, 0, 0, 0, NULL, &err, usberr); + } +#endif + if (AX88172(dp)) { + IN(dp, VCMD_READ_PHY_IDS, 0, 0, 2, &phys, &err, usberr); + dp->mii_phy_addr = phys[1]; + DPRINTF(0, (CE_CONT, "!%s: %s: phys_addr:%d %d", + dp->name, __func__, phys[0], phys[1])); + } else { + /* use built-in phy */ + dp->mii_phy_addr = 0x10; + } + + dp->misc_flag |= USBGEM_VLAN; +#ifdef CONFIG_FULLSIZE_VLAN + if (AX88172(dp) || AX88772(dp)) { + /* check max packet size in srom */ + IN(dp, VCMD_READ_SROM, 0x10, 0, 2, maxpktsize, &err, usberr); + vlan_pktsize = ETHERMAX + ETHERFCSL + 4 /* VTAG_SIZE */; + + if (LE16P(maxpktsize) < vlan_pktsize) { + cmn_err(CE_CONT, + "!%s: %s: max packet size in srom is too small, " + "changing %d -> %d, do power cycle for the device", + dp->name, __func__, + LE16P(maxpktsize), vlan_pktsize); + OUT(dp, VCMD_WRITE_SROM_ENABLE, + 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, 0x10, + vlan_pktsize, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, + 0, 0, 0, NULL, &err, usberr); + + /* need to power off the device */ + ret = USB_FAILURE; + } + } +#endif +#ifdef DEBUG_LEVEL + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr); + cmn_err(CE_CONT, + "!%s: %s: ipg 0x%02x 0x%02x 0x%02x, gpio 0x%b", + dp->name, __func__, lp->ipg[0], lp->ipg[1], lp->ipg[2], + val8, GPIO_BITS); +#endif + /* fix rx buffer size */ + if (!AX88172(dp)) { + dp->rx_buf_len = 2048; + } + +#if DEBUG_LEVEL > 0 + axf_eeprom_dump(dp, 0x20); +#endif + return (ret); + +usberr: + cmn_err(CE_WARN, "%s: %s: usb error detected (%d)", + dp->name, __func__, err); + return (USB_FAILURE); +} + +static boolean_t +axf_scan_phy(struct usbgem_dev *dp) +{ + int i; + int 
err; + uint16_t val; + int phy_addr_saved; + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + phy_addr_saved = dp->mii_phy_addr; + + /* special probe routine for unreliable MII addr */ +#define PROBE_PAT \ + (MII_ABILITY_100BASE_TX_FD | \ + MII_ABILITY_100BASE_TX | \ + MII_ABILITY_10BASE_T_FD | \ + MII_ABILITY_10BASE_T) + + for (i = 0; i < 32; i++) { + dp->mii_phy_addr = i; + axf_mii_write(dp, MII_AN_ADVERT, 0, &err); + if (err != USBGEM_SUCCESS) { + break; + } + val = axf_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + if (val != 0) { + DPRINTF(0, (CE_CONT, "!%s: %s: index:%d, val %b != 0", + dp->name, __func__, i, val, MII_ABILITY_BITS)); + continue; + } + + axf_mii_write(dp, MII_AN_ADVERT, PROBE_PAT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + val = axf_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + if ((val & MII_ABILITY_TECH) != PROBE_PAT) { + DPRINTF(0, (CE_CONT, "!%s: %s: " + "index:%d, pat:%x != val:%b", + dp->name, __func__, i, + PROBE_PAT, val, MII_ABILITY_BITS)); + continue; + } + + /* found */ + dp->mii_phy_addr = phy_addr_saved; + return (i); + } +#undef PROBE_PAT + if (i == 32) { + cmn_err(CE_CONT, "!%s: %s: no mii phy found", + dp->name, __func__); + } else { + cmn_err(CE_CONT, "!%s: %s: i/o error while scanning phy", + dp->name, __func__); + } + dp->mii_phy_addr = phy_addr_saved; + return (-1); +} + +static int +axf_mii_probe(struct usbgem_dev *dp) +{ + int my_guess; + int err; + uint8_t old_11th[2]; + uint8_t new_11th[2]; + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + (void) axf_reset_phy(dp); + lp->phy_has_reset = B_TRUE; + + if (AX88172(dp)) { + my_guess = axf_scan_phy(dp); + if (my_guess >= 0 && my_guess < 32 && + my_guess != dp->mii_phy_addr) { + /* + * phy addr in srom is wrong, need to fix it + */ + IN(dp, VCMD_READ_SROM, + 0x11, 0, 2, old_11th, &err, 
usberr); + + new_11th[0] = my_guess; + new_11th[1] = old_11th[1]; + + OUT(dp, VCMD_WRITE_SROM_ENABLE, + 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, + 0x11, LE16P(new_11th), 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, + 0, 0, 0, NULL, &err, usberr); +#if 1 + /* XXX - read back, but it doesn't work, why? */ + delay(drv_usectohz(1000*1000)); + IN(dp, VCMD_READ_SROM, + 0x11, 0, 2, new_11th, &err, usberr); +#endif + cmn_err(CE_NOTE, "!%s: %s: phy addr in srom fixed: " + "%04x -> %04x", + dp->name, __func__, + LE16P(old_11th), LE16P(new_11th)); + return (USBGEM_FAILURE); +usberr: + cmn_err(CE_NOTE, + "!%s: %s: failed to patch phy addr, " + "current: %04x", + dp->name, __func__, LE16P(old_11th)); + return (USBGEM_FAILURE); + } + } + return (usbgem_mii_probe_default(dp)); +} + +static int +axf_mii_init(struct usbgem_dev *dp) +{ + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (!lp->phy_has_reset) { + (void) axf_reset_phy(dp); + } + + /* prepare to reset phy on the next reconnect or resume */ + lp->phy_has_reset = B_FALSE; + + return (USB_SUCCESS); +} + +static int +axfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int revid; + int unit; + int vid; + int pid; + struct chip_info *p; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct axf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * Check if the chip is supported. 
+ */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-vendor-id", -1); + pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-product-id", -1); + revid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-revision-id", -1); + + for (i = 0, p = chiptbl_88x7x; i < CHIPTABLESIZE; i++, p++) { + if (p->vid == vid && p->pid == pid) { + /* found */ + cmn_err(CE_CONT, "!%s%d: %s " + "(vid: 0x%04x, did: 0x%04x, revid: 0x%02x)", + drv_name, unit, p->name, vid, pid, revid); + goto chip_found; + } + } + + /* Not found */ + cmn_err(CE_WARN, "!%s: %s: wrong usb venid/prodid (0x%x, 0x%x)", + drv_name, __func__, vid, pid); + + /* assume 88772 */ + p = &chiptbl_88x7x[CHIPTABLESIZE - 1]; +chip_found: + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + /* + * softmac requires that ppa is the instance number + * of the device, otherwise it hangs in seaching the device. + */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 64; + + ugcp->usbgc_rx_header_len = 0; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; + + /* flow control */ + /* + * XXX - flow control caused link down frequently under + * heavy traffic + */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = 0; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = 
MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + ugcp->usbgc_mii_hw_link_detection = B_TRUE; + ugcp->usbgc_mii_stop_mac_on_linkdown = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &axf_attach_chip; + ugcp->usbgc_reset_chip = &axf_reset_chip; + ugcp->usbgc_init_chip = &axf_init_chip; + ugcp->usbgc_start_chip = &axf_start_chip; + ugcp->usbgc_stop_chip = &axf_stop_chip; + ugcp->usbgc_multicast_hash = &axf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &axf_set_rx_filter; + ugcp->usbgc_set_media = &axf_set_media; + ugcp->usbgc_get_stats = &axf_get_stats; + ugcp->usbgc_interrupt = &axf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &axf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &axf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &axf_mii_probe; + ugcp->usbgc_mii_init = &axf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &axf_mii_read; + ugcp->usbgc_mii_write = &axf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct axf_dev), KM_SLEEP); + lp->chip = p; + lp->last_link_state = 0; + lp->phy_has_reset = B_FALSE; + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct axf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct axf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + + if (cmd == DDI_RESUME) { + return (usbgem_resume(dip)); + } + + return (DDI_FAILURE); +} + +static int +axfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* 
======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(axf_ops, axfattach, axfdetach); +#else +static struct module_info axfminfo = { + 0, /* mi_idnum */ + "axf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + ETHERMTU*128, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit axfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &axfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit axfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &axfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab axf_info = { + &axfrinit, /* st_rdinit */ + &axfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_axf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &axf_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + +static struct dev_ops axf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + axfattach, /* devo_attach */ + axfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_axf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif +}; +#endif + +static 
struct modldrv modldrv = { + &mod_driverops, /* Type of module. This one is a driver */ + ident, + &axf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!axf: _init: called")); + + status = usbgem_mod_init(&axf_ops, "axf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&axf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!axf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&axf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c index 6ef1b0b9f7..495ae93cf9 100644 --- a/usr/src/uts/common/io/cons.c +++ b/usr/src/uts/common/io/cons.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* @@ -53,6 +54,7 @@ #include <sys/vnode.h> #include <sys/uio.h> #include <sys/stat.h> +#include <sys/limits.h> #include <sys/console.h> #include <sys/consdev.h> @@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred) */ if (vsconsvp != NULL && vsconsvp->v_stream != NULL) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; + + if (uio->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uio->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } /* * strwrite modifies uio so need to make copy. 
*/ - (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt); (void) strwrite(vsconsvp, &uiod.d_uio, cred); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); } if (rconsvp->v_stream != NULL) diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index e00ac1d1e9..f8d9f1cff8 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -670,15 +670,26 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) uiosize = uiop->uio_resid; pollfdnum = uiosize / size; - mutex_enter(&curproc->p_lock); - if (pollfdnum > (uint_t)rctl_enforced_value( - rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { - (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], - curproc->p_rctls, curproc, RCA_SAFE); + + /* + * We want to make sure that pollfdnum isn't large enough to DoS us, + * but we also don't want to grab p_lock unnecessarily -- so we + * perform the full check against our resource limits if and only if + * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX. + */ + if (pollfdnum > UINT8_MAX) { + mutex_enter(&curproc->p_lock); + if (pollfdnum > + (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], + curproc->p_rctls, curproc)) { + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + curproc->p_rctls, curproc, RCA_SAFE); + mutex_exit(&curproc->p_lock); + return (EINVAL); + } mutex_exit(&curproc->p_lock); - return (EINVAL); } - mutex_exit(&curproc->p_lock); + /* * Copy in the pollfd array. Walk through the array and add * each polled fd to the cached set. 
@@ -1112,14 +1123,18 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) void *setp = STRUCT_FGETP(dvpoll, dp_setp); if (setp != NULL) { - if (copyin(setp, &set, sizeof (set))) { - DP_REFRELE(dpep); - return (EFAULT); + if ((mode & FKIOCTL) != 0) { + /* Use the signal set directly */ + ksetp = (k_sigset_t *)setp; + } else { + if (copyin(setp, &set, sizeof (set))) { + DP_REFRELE(dpep); + return (EFAULT); + } + sigutok(&set, &kset); + ksetp = &kset; } - sigutok(&set, &kset); - ksetp = &kset; - mutex_enter(&p->p_lock); schedctl_finish_sigblock(t); lwp->lwp_sigoldmask = t->t_hold; @@ -1268,6 +1283,10 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) DP_SIGMASK_RESTORE(ksetp); if (error == 0 && fdcnt > 0) { + /* + * It should be noted that FKIOCTL does not influence + * the copyout (vs bcopy) of dp_fds at this time. + */ if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { DP_REFRELE(dpep); diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 40cbe86170..62bc4a8ecf 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent Inc. 
*/ /* @@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1376,7 +1378,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index a438e43d91..661d8b2f4f 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -41,7 +41,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -121,6 +121,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -605,6 
+608,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + new_flags |= DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + new_flags |= DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -692,6 +703,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) { + dl_err = DL_NOTENAB; + goto failed; + } + new_flags &= ~DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) { + dl_err = DL_NOTENAB; + goto failed; + } + new_flags &= ~DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; mac_perim_exit(mph); @@ -1295,7 +1322,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1354,7 +1382,12 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1516,8 +1549,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. 
*/ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (dsp->ds_sap != ETHERTYPE_IP || - !check_mod_above(dsp->ds_rq, "ip"))) { + ((dsp->ds_sap != ETHERTYPE_IP || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1599,9 +1633,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. */ - if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) { + if ((dsp->ds_sap == ETHERTYPE_IP && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1720,3 +1760,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 6f0d0b9a6c..f5308e70ff 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ 
b/usr/src/uts/common/io/dld/dld_str.c @@ -854,6 +854,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. + */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. + */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? 
*/ + freemsg(mp); + return (NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -902,7 +973,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -948,38 +1018,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. - */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != NULL) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 92993ada58..0f8dbcb57a 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* @@ -248,19 +248,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; + uint32_t new_type = new_flags & + ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS); mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; + uint16_t mac_flags = 0; + boolean_t doremove = B_FALSE; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | - DLS_PROMISC_PHYS))); + DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS))); + + /* + * If we only have the non-data receive flags set or are only changing + * them, then there's nothing to do other than update the flags here. + * Basically when we only have something in the set of + * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's + * nothing else for us to do other than toggle it, as there's no need to + * talk to MAC and we don't have to do anything else. + */ + if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) { + dsp->ds_promisc = new_flags; + return (0); + } /* * If the user has only requested DLS_PROMISC_MULTI then we need to make * sure that they don't see all packets. */ - if (new_flags == DLS_PROMISC_MULTI) + if (new_type == DLS_PROMISC_MULTI) mptype = MAC_CLIENT_PROMISC_MULTI; + /* + * Look at new flags and figure out the correct mac promisc flags. + * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS, + * don't turn on physical promisc mode. 
+ */ + if (new_flags & DLS_PROMISC_RX_ONLY) + mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_flags & DLS_PROMISC_FIXUPS) + mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS; + if (new_type == DLS_PROMISC_SAP) + mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; + + /* + * If we're coming in and we're being asked to transition to a state + * where the only DLS flags that would be enabled are flags that change what + * we do with promiscuous packets (DLS_PROMISC_RX_ONLY and + * DLS_PROMISC_FIXUPS) and not which packets we should receive, then we + * need to remove the MAC layer promiscuous handler. + */ + if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 && + new_flags != 0) { + doremove = B_TRUE; + } + + /* + * There are three cases we care about here with respect to MAC. Going + * from nothing to something, something to nothing, something to + * something where we need to change how we're getting stuff from mac. + * In the last case, as long as they're not equal, we need to assume + * something has changed and do something about it. + */ if (dsp->ds_promisc == 0 && new_flags != 0) { /* * If only DLS_PROMISC_SAP, we don't turn on the @@ -268,9 +318,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, - (new_flags != DLS_PROMISC_SAP) ? 
0 : - MAC_PROMISC_FLAGS_NO_PHYS); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) { dsp->ds_promisc = old_flags; return (err); @@ -281,7 +329,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) mac_promisc_remove(dsp->ds_vlan_mph); dsp->ds_vlan_mph = NULL; } - } else if (dsp->ds_promisc != 0 && new_flags == 0) { + } else if (dsp->ds_promisc != 0 && + (new_flags == 0 || doremove == B_TRUE)) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); @@ -296,19 +345,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); } - } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 && - new_flags != dsp->ds_promisc) { - /* - * If the old flag is PROMISC_SAP, but the current flag has - * changed to some new non-zero value, we need to turn the - * physical promiscuous mode. - */ + } else if (new_flags != 0 && new_flags != old_flags) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); /* Honors both after-remove and before-add semantics! */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, 0); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) dsp->ds_promisc = old_flags; } else { @@ -629,6 +672,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). 
If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -659,7 +718,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -685,7 +747,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. */ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -694,7 +760,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? + DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -725,7 +792,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. 
+ */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 6b92a81e77..4a735d870e 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -34,6 +35,9 @@ #include <sys/dld_impl.h> #include <sys/sdt.h> #include <sys/atomic.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/datalink.h> static kmem_cache_t *i_dls_link_cachep; mod_hash_t *i_dls_link_hash; @@ -579,6 +583,67 @@ drop: freemsg(mp); } +/* + * We'd like to notify via sysevents that a link state change has occurred. + * There are a couple of challenges associated with this. The first is that if + * the link is flapping a lot, we may not see an accurate state when we launch + * the notification, we're told it changed, not what it changed to. + * + * The next problem is that all of the information that a user has associated + * with this device is the exact opposite of what we have on the dls_link_t. We + * have the name of the mac device, which has no bearing on what users see. + * Likewise, we don't have the datalink id either. So we're going to have to get + * this from dls. + * + * This is all further complicated by the fact that this could be going on in + * another thread at the same time as someone is tearing down the dls_link_t + * that we're associated with. 
We need to be careful not to grab the mac + * perimeter, otherwise we stand a good chance of deadlock. + */ +static void +dls_link_notify(void *arg, mac_notify_type_t type) +{ + dls_link_t *dlp = arg; + dls_dl_handle_t dhp; + nvlist_t *nvp; + sysevent_t *event; + sysevent_id_t eid; + + if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK) + return; + + /* + * If we can't find a devnet handle for this link, then there is no user + * knowable device for this at the moment and there's nothing we can + * really share with them that will make sense. + */ + if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0) + return; + + /* + * Because we're attaching this nvlist_t to the sysevent, it'll get + * cleaned up when we call sysevent_free. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID, + dls_devnet_linkid(dhp)) == 0); + VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME, + dls_devnet_link(dhp)) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID, + dls_devnet_getzid(dhp)) == 0); + + dls_devnet_rele_tmp(dhp); + + event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE, + ILLUMOS_KERN_PUB"dls", SE_SLEEP); + VERIFY(event != NULL); + (void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp); + + (void) log_sysevent(event, SE_SLEEP, &eid); + sysevent_free(event); + +} + static void i_dls_link_destroy(dls_link_t *dlp) { @@ -589,6 +654,9 @@ i_dls_link_destroy(dls_link_t *dlp) /* * Free the structure back to the cache. 
*/ + if (dlp->dl_mnh != NULL) + mac_notify_remove(dlp->dl_mnh, B_TRUE); + if (dlp->dl_mch != NULL) mac_client_close(dlp->dl_mch, 0); @@ -600,8 +668,10 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mh = NULL; dlp->dl_mch = NULL; dlp->dl_mip = NULL; + dlp->dl_mnh = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } @@ -640,6 +710,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) if (err != 0) goto bail; + dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp); + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, dlp->dl_mch); diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 049c4bd757..105c55c7ce 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -105,12 +106,13 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); /*ARGSUSED*/ @@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -526,6 +533,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -534,6 +542,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -740,12 +769,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. 
*/ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, - dls_devnet_stat_update, ddp, &ksp) == 0) { + if (dls_stat_create("link", 0, nm, zoneid, + dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -765,12 +805,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -781,15 +821,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? 
ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* @@ -878,7 +928,8 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) (void) dls_devnet_unset(macname, &linkid, B_TRUE); /* * The kstat subsystem holds its own locks (rather perimeter) @@ -887,7 +938,7 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. */ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; } @@ -924,17 +975,78 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) ASSERT(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) it's possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. 
+ * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphaned VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, reset ref to 1; + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); + if (ddp->dd_zid != GLOBAL_ZONEID) { + /* + * We need to release the dd_mutex before we try and destroy the + * stat. When we destroy it, we'll need to grab the lock for the + * kstat but if there's a concurrent reader of the kstat, we'll + * be blocked on it. This will lead to deadlock because these + * kstats employ a ks_update function (dls_devnet_stat_update) + * which needs the dd_mutex that we currently hold. + * + * Because we've already flagged the dls_devnet_t as + * DD_CONDEMNED and we still have a write lock on + * i_dls_devnet_lock, we should be able to release the dd_mutex. + */ + mutex_exit(&ddp->dd_mutex); + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + mutex_enter(&ddp->dd_mutex); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } /* * Remove this dls_devnet_t from the hash table. 
@@ -960,8 +1072,15 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); } - if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { + /* + * See the earlier call in this function for an explanation. + */ + mutex_exit(&ddp->dd_mutex); dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid); + mutex_enter(&ddp->dd_mutex); + } + ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; @@ -972,6 +1091,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) return (0); } +/* + * This is a private hold routine used when we already have the dls_link_t, thus + * we know that it cannot go away. + */ +int +dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp) +{ + int err; + dls_devnet_t *ddp = NULL; + + rw_enter(&i_dls_devnet_lock, RW_WRITER); + if ((err = mod_hash_find(i_dls_devnet_hash, + (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) { + ASSERT(err == MH_ERR_NOTFOUND); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + + mutex_enter(&ddp->dd_mutex); + ASSERT(ddp->dd_ref > 0); + if (ddp->dd_flags & DD_CONDEMNED) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + ddp->dd_tref++; + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + + *ddhp = ddp; + return (0); +} + static int dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, boolean_t tmp_hold) @@ -1111,7 +1263,7 @@ dls_devnet_rele(dls_devnet_t *ddp) } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) { char drv[MAXLINKNAMELEN]; uint_t ppa; @@ -1121,7 +1273,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) dls_dev_handle_t ddh; int err; - if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0) + if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0) 
return (dls_devnet_hold(linkid, ddpp)); /* @@ -1261,9 +1413,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zoneinit parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1313,13 +1471,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * is currently accessing the link kstats, or if the link is on-loan * to a non-global zone. Then set the DD_KSTAT_CHANGING flag to * prevent any access to the kstats while we delete and recreate - * kstats below. + * kstats below. However, we skip this check if we're renaming the + * vnic as part of bringing it up for a zone. 
*/ mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } ddp->dd_flags |= DD_KSTAT_CHANGING; @@ -1333,7 +1494,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1406,7 +1575,7 @@ done: */ rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (clear_dd_flag) { mutex_enter(&ddp->dd_mutex); @@ -1421,7 +1590,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1454,6 +1624,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1468,7 +1639,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1490,7 +1661,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1507,7 +1678,7 @@ 
dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1545,15 +1716,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. */ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1586,6 +1761,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ @@ -1765,6 +1946,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid) } const char * +dls_devnet_link(dls_dl_handle_t ddh) +{ + return (ddh->dd_linkname); +} + +const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t 
dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. @@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); 
+ dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, 
"brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI 
Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..5b1dc82938 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5506 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ 
+}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = 
scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. 
+ */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + 
(pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ 
+ instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, &reglength) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + /* + * Unwind through fail_attach so the FMA state, the PCI + * config handle and the soft state set up above are + * released rather than left behind. + */ + goto fail_attach; + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. + * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + 
DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + 
con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + /* + * func_ptr is allocated before any jump to fail_attach, so release + * it here in the same order the early-exit paths use; otherwise it + * is lost when the soft state is freed. + */ + kmem_free(instance->func_ptr, sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = 
DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + /* + * Report the instance number decoded from the minor; the + * local "instance" pointer is never assigned on this path. + */ + *resultp = (void *)(intptr_t)MINOR2INST(drsas_minor); + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, 
instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + 
break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d 
l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + /* dr_ld_list holds MRDRV_MAX_LD entries; only index it in range */ + if (tgt < MRDRV_MAX_LD) { + con_log(CL_ANN1, (CE_NOTE, + "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, + (void *)tgt_dip)); + } + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == 
-1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. + */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + 
con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + 
(void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, 
"chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. + */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) 
ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) 
ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. 
+ */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + 
cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + 
uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + 
con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return (DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. 
+ */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 
0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + 
&abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? 
+ DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + 
MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW 
state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return 
(ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). 
+ */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + 
DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, "start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = 
instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. 
online + * or offline. + */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + 
instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. + */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if 
(drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + 
case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case 
MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + 
NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags 
|= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = 
i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. 
+ */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? 
MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | 
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through For all non-rd/wr cmds */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 
1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < 
xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t *)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, 
&pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not 
allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + 
struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = 
(void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from 
user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + 
con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct 
drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if 
(ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, 
&stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + 
int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 
0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + 
con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the new + * AEN request we currently have. If it is, then we don't have + * to do anything. In other words, whichever events the current + * AEN request is subscribing to, have already been subscribed + * to. + * + * If the old_cmd is _not_ inclusive, then we have to abort + * that command, form a class_locale that is superset of both + * old and current and re-issue to the FW + */ + + curr_aen.word = class_locale_word; + aen_cmd = instance->aen_cmd; + if (aen_cmd) { + prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle, + &aen_cmd->frame->dcmd.mbox.w[1]); + + /* + * A class whose enum value is smaller is inclusive of all + * higher values. If a PROGRESS (= -1) was previously + * registered, then a new registration requests for higher + * classes need not be sent to FW. They are automatically + * included. + * + * Locale numbers don't have such hierarchy. 
They are bitmap + * values + */ + if ((prev_aen.members.class <= curr_aen.members.class) && + !((prev_aen.members.locale & curr_aen.members.locale) ^ + curr_aen.members.locale)) { + /* + * Previously issued event registration includes + * current request. Nothing to do. + */ + + return (0); + } else { + curr_aen.members.locale |= prev_aen.members.locale; + + if (prev_aen.members.class < curr_aen.members.class) + curr_aen.members.class = prev_aen.members.class; + + ret_val = abort_aen_cmd(instance, aen_cmd); + + if (ret_val) { + con_log(CL_ANN, (CE_WARN, "register_mfi_aen: " + "failed to abort prevous AEN command")); + + return (ret_val); + } + } + } else { + curr_aen.word = class_locale_word; + } + + cmd = get_mfi_pkt(instance); + + if (!cmd) + return (ENOMEM); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + /* Prepare DCMD for aen registration */ + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_detail)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_WAIT); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1], + curr_aen.word); + ddi_put32(cmd->frame_dma_obj.acc_handle, 
&dcmd->sgl.sge32[0].phys_addr, + instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_detail)); + + instance->aen_seq_num = seq_num; + + + /* + * Store reference to the cmd used to register for AEN. When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? 
scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + 
MFI_CMD_STATUS_POLL_MODE); + flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int 
+intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, 
ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get 
number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. + */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. 
+ */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, 
"drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) 
drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
Free buffer manually */
+	if (sd->sd_inq) {
+		kmem_free(sd->sd_inq, SUN_INQSIZE);
+		sd->sd_inq = (struct scsi_inquiry *)NULL;
+	}
+
+	kmem_free(sd, sizeof (struct scsi_device));
+	con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d",
+	    rval));
+	return (rval);
+}
+
+/*
+ * drsas_config_scsi_device - create and online a child node for a device
+ * @param instance : HBA soft state; its dip becomes the child's parent
+ * @param sd       : probed scsi_device (sd_inq must be valid)
+ * @param dipp     : if non-NULL, receives the new child, or NULL on failure
+ *
+ * Allocates a devinfo node, attaches the "target", "lun" and "compatible"
+ * properties and brings the node online.  On any failure after the node
+ * was allocated, its properties are removed and the node is freed, so the
+ * caller never receives a pointer to a released or half-built node.
+ * Returns NDI_SUCCESS or an NDI failure code.
+ */
+static int
+drsas_config_scsi_device(struct drsas_instance *instance,
+    struct scsi_device *sd, dev_info_t **dipp)
+{
+	char *nodename = NULL;
+	char **compatible = NULL;
+	int ncompatible = 0;
+	char *childname;
+	dev_info_t *ldip = NULL;
+	int tgt = sd->sd_address.a_target;
+	int lun = sd->sd_address.a_lun;
+	int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK;
+	int rval;
+
+	con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun));
+	scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype,
+	    NULL, &nodename, &compatible, &ncompatible);
+
+	if (nodename == NULL) {
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver "
+		    "for t%dL%d", tgt, lun));
+		rval = NDI_FAILURE;
+		goto finish;
+	}
+
+	childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename;
+	con_log(CL_ANN1, (CE_WARN,
+	    "dr_sas: Childname = %2s nodename = %s", childname, nodename));
+
+	/* Create a dev node */
+	rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip);
+	con_log(CL_ANN1, (CE_WARN,
+	    "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval));
+	if (rval == NDI_SUCCESS) {
+		if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) !=
+		    DDI_PROP_SUCCESS) {
+			con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+			    "property for t%dl%d target", tgt, lun));
+			rval = NDI_FAILURE;
+			goto finish;
+		}
+		if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) !=
+		    DDI_PROP_SUCCESS) {
+			con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+			    "property for t%dl%d lun", tgt, lun));
+			rval = NDI_FAILURE;
+			goto finish;
+		}
+
+		if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip,
+		    "compatible", compatible, ncompatible) !=
+		    DDI_PROP_SUCCESS) {
+			con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+			    "property for t%dl%d compatible", tgt, lun));
+			rval = NDI_FAILURE;
+			goto finish;
+		}
+
+		rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH);
+		if (rval != NDI_SUCCESS) {
+			con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online "
+			    "t%dl%d", tgt, lun));
+		} else {
+			con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :"
+			    "0 t%dl%d", tgt, lun));
+		}
+
+	}
+finish:
+	/*
+	 * Single cleanup point: every failure that happens after the node
+	 * was allocated (property update or online) releases it here and
+	 * clears ldip so *dipp is never left pointing at a freed node.
+	 */
+	if (rval != NDI_SUCCESS && ldip != NULL) {
+		ndi_prop_remove_all(ldip);
+		(void) ndi_devi_free(ldip);
+		ldip = NULL;
+	}
+	if (dipp) {
+		*dipp = ldip;
+	}
+
+	con_log(CL_DLEVEL1, (CE_WARN,
+	    "dr_sas: config_scsi_device rval = %d t%dL%d",
+	    rval, tgt, lun));
+	scsi_hba_nodename_compatible_free(nodename, compatible);
+	return (rval);
+}
+
+/*ARGSUSED*/
+static int
+drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event,
+    uint64_t wwn)
+{
+	struct drsas_eventinfo *mrevt = NULL;
+
+	con_log(CL_ANN1, (CE_NOTE,
+	    "drsas_service_evt called for t%dl%d event = %d",
+	    tgt, lun, event));
+
+	if ((instance->taskq == NULL) || (mrevt =
+	    kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) {
+ return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo *mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == 
NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. +# Use is subject to license terms. +# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) 
+ * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + 
MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + 
DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t 
isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
+ *
+ *
+ * console messages debug levels
+ */
+#define	CL_NONE		0	/* No debug information */
+#define	CL_ANN		1	/* print unconditionally, announcements */
+#define	CL_ANN1		2	/* No o/p */
+#define	CL_DLEVEL1	3	/* debug level 1, informative */
+#define	CL_DLEVEL2	4	/* debug level 2, verbose */
+#define	CL_DLEVEL3	5	/* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define	__func__ ""
+#endif
+
+#define	con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
+
+/*
+ * ### SCSA definitions ###
+ *
+ * Conversions between scsi_pkt, scsi_address, scsi_hba_tran_t, the
+ * per-packet scsa_cmd and the per-HBA drsas_instance.  Every macro
+ * expands to one fully parenthesized expression, so the macros can be
+ * nested and used inside larger expressions.
+ */
+#define	PKT2TGT(pkt)		((pkt)->pkt_address.a_target)
+#define	PKT2LUN(pkt)		((pkt)->pkt_address.a_lun)
+#define	PKT2TRAN(pkt)		((pkt)->pkt_address.a_hba_tran)
+#define	ADDR2TRAN(ap)		((ap)->a_hba_tran)
+
+#define	TRAN2MR(tran)		\
+	((struct drsas_instance *)(tran)->tran_hba_private)
+#define	ADDR2MR(ap)		(TRAN2MR(ADDR2TRAN(ap)))
+
+#define	PKT2CMD(pkt)		((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define	CMD2PKT(sp)		((sp)->cmd_pkt)
+#define	PKT2REQ(pkt)		(&(PKT2CMD(pkt)->request))
+
+#define	CMD2ADDR(cmd)		(&CMD2PKT(cmd)->pkt_address)
+#define	CMD2TRAN(cmd)		(CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define	CMD2MR(cmd)		(TRAN2MR(CMD2TRAN(cmd)))
+
+#define	CFLAG_DMAVALID		0x0001	/* requires a dma operation */
+#define	CFLAG_DMASEND		0x0002	/* Transfer from the device */
+#define	CFLAG_CONSISTENT	0x0040	/* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl interface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define	UIOC_RD		0x00001
+#define	UIOC_WR		0x00002
+
+#define	SCP2HOST(scp)		(scp)->device->host	/* to host */
+#define	SCP2HOSTDATA(scp)	SCP2HOST(scp)->hostdata	/* to soft state */
+#define	SCP2CHANNEL(scp)	(scp)->device->channel	/* to channel */
+#define	SCP2TARGET(scp)		(scp)->device->id	/* to target */
+#define	SCP2LUN(scp)		(scp)->device->lun	/* to LUN */
+
+#define	SCSIHOST2ADAP(host)	(((caddr_t *)(host->hostdata))[0])
+#define	SCP2ADAPTER(scp)	\
+	(struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+#define
MRDRV_IS_LOGICAL_SCSA(instance, acmd) \ + (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0 +#define MRDRV_IS_LOGICAL(ap) \ + ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + 
mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. 
All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All strings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + 
uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * ================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. 
We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) 
ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is also used + * in enable_intr_ppc. Hence bit 2, i.e. 0x4, is set in + * MFI_REPLY_2108_MESSAGE_INTR_MASK along with bit 0 (0x1). + */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + ((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t 
reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t 
msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + 
uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + 
struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum 
number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + 
uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t 
interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, 
int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct 
drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void 
drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. 
+ */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. 
+ */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c index 634de6c6dd..87105e779d 100644 --- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c +++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c @@ -24,6 +24,7 @@ */ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ /* * Fibre channel Transport Library (fctl) @@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count) maxPorts ++; } + if (maxPorts == 0) { + mutex_exit(&fctl_port_lock); + return (0); + } + /* Now allocate a buffer to store all the pointers for comparisons */ portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP); diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c new file mode 100644 index 0000000000..b484b16142 --- /dev/null +++ b/usr/src/uts/common/io/gsqueue/gsqueue.c @@ -0,0 +1,612 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Serialization queues are a technique used in illumos to provide what's + * commonly known as a 'vertical' perimeter. The idea (described a bit in + * uts/common/inet/squeue.c) is to provide a means to make sure that message + * blocks (mblk_t) are processed in a specific order. 
Subsystems like ip and vnd + * consume these on different policies, ip on a conn_t basis, vnd on a per + * device basis, and use this to ensure that only one packet is being processed + * at a given time. + * + * Serialization queues were originally used by ip. As part of that + * implementation, many of the details of ip were baked into it. That includes + * things like conn_t, ip receive attributes, and the notion of sets. While an + * individual serialization queue, or gsqueue_t, is a useful level of + * abstraction, it isn't the basis on which most consumers want to manage them. + * Instead, we have the notion of a set of serialization queues. These sets are + * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a + * gsqueue_t per CPU to fanout on without managing them all itself. In the + * original implementation, this existed, but they were heavily tied into the + * infrastructure of IP, and its notion of polling on the underlying MAC + * devices. + * + * The result of that past is a new interface to serialization queues and a + * similar, but slightly different, abstraction to sets of these + * (gsqueue_set_t). When designing this there are two different approaches that + * one could consider. The first is that the system has one gsqueue_set_t that + * the entire world shares, whether IP or some other consumer. The other is that + * every consumer has their own set. + * + * The trade offs between these two approaches are the pathological failure + * modes. There is no guarantee that any two consumers here are equivalent. In + * fact, they very likely have very different latency profiles. If they are + * being processed in the same queue, that can lead to very odd behaviors. More + * generally, if we have a series of processing functions from one consumer + * which are generally short, and another which are generally long, that'll + * cause undue latency that's harder to observe. 
If we instead take the approach + * that each consumer should have its own set that it fans out over then we + * won't end up with the problem that a given serialization queue will have + * multiple latency profiles, but instead we'll see cpu contention for the bound + * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker + * thread is bound and it is in fact possible for it to be processed by other + * threads on other CPUs. + * + * We've opted to go down the second path, so each consumer has its own + * independent set of serialization queues that it is bound over. + * + * Structure Hierarchies + * --------------------- + * + * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t + * encapsulates all the per-CPU gsqueue_t that exist in the form of + * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could + * accommodate more than one gsqueue_t, but today there is a one to one mapping. + * + * We maintain two different lists of gsqueue_cpu_t, the active and defunct + * sets. The active set is maintained in the array `gs_cpus`. There are NCPU + * entries available in `gs_cpus` with the total number of currently active cpus + * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When + * there is no longer a need for a given binding (see the following section for + * more explanation on when this is the case) then we move the entry to the + * `gs_defunct` list which is just a singly linked list of gsqueue_cpu_t. + * + * In addition, each gsqueue_set_t can have a series of callbacks registered + * with it. These are described in the following section. Graphically, a given + * gsqueue_set_t looks roughly like the following: + * + * +---------------+ + * | gsqueue_set_t | + * +---------------+ + * | | | + * | | * . . . gs_cpus + * | | | + * | | | +-------------------------------------------------+ + * | | +----->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |... 
+ * | | +-------------------------------------------------+ + * | | + * | * . . . gs_defunct + * | | + * | | +---------------+ +---------------+ +---------------+ + * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |... + * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... + * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_set_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uninterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. 
+ * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback. + * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. 
+ */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + struct gsqueue_cpu *gqc_next; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + uint_t gs_wwait; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + gsqueue_cpu_t *gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(uint_t wwait, pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + scp->gqc_next = NULL; + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wwait, wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(uint_t wwait, pri_t wpri) +{ + int i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wwait = wwait; + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits. 
+ */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wwait, wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. + */ + mutex_enter(&cpu_lock); + mutex_enter(&gsqueue_lock); + list_remove(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + + mutex_enter(&gssp->gs_lock); + + for (i = 0; i < gssp->gs_ncpus; i++) { + scp = gssp->gs_cpus[i]; + squeue_unbind(scp->gqc_head); + scp->gqc_next = gssp->gs_defunct; + gssp->gs_defunct = scp; + gssp->gs_cpus[i] = NULL; + } + gssp->gs_ncpus = 0; + + mutex_exit(&gssp->gs_lock); + mutex_exit(&cpu_lock); + + while (gssp->gs_defunct != NULL) { + gsqueue_cpu_t *scp; + + scp = gssp->gs_defunct; + gssp->gs_defunct = scp->gqc_next; + gsqueue_cpu_destroy(scp); + } + + while (gssp->gs_cbs != NULL) { + gsqueue_cb_t *cbp; + + cbp = gssp->gs_cbs; + gssp->gs_cbs = cbp->gcb_next; + kmem_cache_free(gsqueue_cb_cache, cbp); + } + + ASSERT(gssp->gs_ncpus == 0); + ASSERT(gssp->gs_defunct == NULL); + ASSERT(gssp->gs_cbs == NULL); + kmem_cache_free(gsqueue_set_cache, gssp); +} + +gsqueue_t * +gsqueue_set_get(gsqueue_set_t *gssp, uint_t index) +{ + squeue_t *sqp; + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_cpus[index % gssp->gs_ncpus]; + sqp = scp->gqc_head; + mutex_exit(&gssp->gs_lock); + return ((gsqueue_t *)sqp); +} + +uintptr_t +gsqueue_set_cb_add(gsqueue_set_t 
*gssp, gsqueue_cb_f cb, void *arg) +{ + gsqueue_cb_t *cbp; + + cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP); + cbp->gcb_func = cb; + cbp->gcb_arg = arg; + + mutex_enter(&gssp->gs_lock); + cbp->gcb_next = gssp->gs_cbs; + gssp->gs_cbs = cbp; + mutex_exit(&gssp->gs_lock); + return ((uintptr_t)cbp); +} + +int +gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id) +{ + gsqueue_cb_t *cbp, *prev; + mutex_enter(&gssp->gs_lock); + cbp = gssp->gs_cbs; + prev = NULL; + while (cbp != NULL) { + if ((uintptr_t)cbp != id) { + prev = cbp; + cbp = cbp->gcb_next; + continue; + } + + if (prev == NULL) { + gssp->gs_cbs = cbp->gcb_next; + } else { + prev->gcb_next = cbp->gcb_next; + } + + mutex_exit(&gssp->gs_lock); + kmem_cache_free(gsqueue_cb_cache, cbp); + return (0); + } + mutex_exit(&gssp->gs_lock); + return (-1); +} + +void +gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg, + int flags, uint8_t tag) +{ + squeue_t *sqp = (squeue_t *)gsp; + + ASSERT(mp->b_next == NULL); + ASSERT(mp->b_prev == NULL); + mp->b_queue = (queue_t *)func; + mp->b_prev = arg; + sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag); +} + +static void +gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online) +{ + gsqueue_cb_t *cbp; + + ASSERT(MUTEX_HELD(&gssp->gs_lock)); + cbp = gssp->gs_cbs; + while (cbp != NULL) { + cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online); + cbp = cbp->gcb_next; + } + +} + +/* + * When we online a processor we need to go through and either bind a defunct + * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the + * defunct list that used to be on that processor. If no such gsqueue_cpu_t + * exists, then we'll create a new one. We'd rather avoid taking over an + * existing defunct one that used to be on another CPU, as its not unreasonable + * to believe that its CPU will come back. More CPUs are offlined and onlined by + * the administrator or by creating cpu sets than actually get offlined by FMA. 
+ */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_defunct; + while (scp != NULL) { + if (scp->gqc_cpuid == id) + break; + scp = scp->gqc_next; + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wwait, + gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + scp->gqc_next = gssp->gs_defunct; + gssp->gs_defunct = scp; + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + gsqueue_handle_offline(cp->cpu_id); + break; + default: + 
break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + gssp->gs_ncpus = 0; + gssp->gs_defunct = NULL; + gssp->gs_cbs = NULL; + + return (0); +} + +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); + mutex_exit(&gsqueue_lock); + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + kmem_cache_destroy(gsqueue_set_cache); + kmem_cache_destroy(gsqueue_cpu_cache); + kmem_cache_destroy(gsqueue_cb_cache); + + mutex_destroy(&gsqueue_lock); + + return (0); +} + +static struct modlmisc gsqueue_modmisc = { + &mod_miscops, + "gsqueue" +}; + +static struct modlinkage gsqueue_modlinkage = { 
+ MODREV_1, + &gsqueue_modmisc, + NULL +}; + +int +_init(void) +{ + int ret; + + gsqueue_ddiinit(); + if ((ret = mod_install(&gsqueue_modlinkage)) != 0) { + VERIFY(gsqueue_ddifini() == 0); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&gsqueue_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = gsqueue_ddifini()) != 0) + return (ret); + + if ((ret = mod_remove(&gsqueue_modlinkage)) != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/io/i40e/core/README b/usr/src/uts/common/io/i40e/core/README new file mode 100644 index 0000000000..dc0149ce62 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/README @@ -0,0 +1,410 @@ + ixl FreeBSD* Base Driver and ixlv VF Driver for the + Intel XL710 Ethernet Controller Family + +/*$FreeBSD$*/ +================================================================ + +August 26, 2014 + + +Contents +======== + +- Overview +- Supported Adapters +- The VF Driver +- Building and Installation +- Additional Configurations +- Known Limitations + + +Overview +======== + +This file describes the IXL FreeBSD* Base driver and the IXLV VF Driver +for the XL710 Ethernet Family of Adapters. The Driver has been developed +for use with FreeBSD 10.0 or later, but should be compatible with any +supported release. + +For questions related to hardware requirements, refer to the documentation +supplied with your Intel XL710 adapter. All hardware requirements listed +apply for use with FreeBSD. + + +Supported Adapters +================== + +The drivers in this release are compatible with XL710 and X710-based +Intel Ethernet Network Connections. 
+ + +SFP+ Devices with Pluggable Optics +---------------------------------- + +SR Modules +---------- + Intel DUAL RATE 1G/10G SFP+ SR (bailed) FTLX8571D3BCV-IT + Intel DUAL RATE 1G/10G SFP+ SR (bailed) AFBR-703SDZ-IN2 + +LR Modules +---------- + Intel DUAL RATE 1G/10G SFP+ LR (bailed) FTLX1471D3BCV-IT + Intel DUAL RATE 1G/10G SFP+ LR (bailed) AFCT-701SDZ-IN2 + +QSFP+ Modules +------------- + Intel TRIPLE RATE 1G/10G/40G QSFP+ SR (bailed) E40GQSFPSR + Intel TRIPLE RATE 1G/10G/40G QSFP+ LR (bailed) E40GQSFPLR + QSFP+ 1G speed is not supported on XL710 based devices. + +X710/XL710 Based SFP+ adapters support all passive and active limiting direct +attach cables that comply with SFF-8431 v4.1 and SFF-8472 v10.4 specifications. + +The VF Driver +================== +The VF driver is normally used in a virtualized environment where a host +driver manages SRIOV, and provides a VF device to the guest. With this +first release the only host environment tested was using Linux QEMU/KVM. +Support is planned for Xen and VMWare hosts at a later time. + +In the FreeBSD guest the IXLV driver would be loaded and will function +using the VF device assigned to it. + +The VF driver provides most of the same functionality as the CORE driver, +but is actually a slave to the Host, access to many controls are actually +accomplished by a request to the Host via what is called the "Admin queue". +These are startup and initialization events however, once in operation +the device is self-contained and should achieve near native performance. + +Some notable limitations of the VF environment: for security reasons +the driver is never permitted to be promiscuous, therefore a tcpdump +will not behave the same with the interface. Second, media info is not +available from the PF, so it will always appear as auto. + +Tarball Building and Installation +========================= + +NOTE: You must have kernel sources installed to compile the driver tarball. 
+ +These instructions assume a standalone driver tarball, building the driver +already in the kernel source is simply a matter of adding the device entry +to the kernel config file, or building in the ixl or ixlv module directory. + +In the instructions below, x.x.x is the driver version +as indicated in the name of the driver tarball. The example is +for ixl, the same procedure applies for ixlv. + +1. Move the base driver tar file to the directory of your choice. + For example, use /home/username/ixl or /usr/local/src/ixl. + +2. Untar/unzip the archive: + tar xfz ixl-x.x.x.tar.gz + +3. To install man page: + cd ixl-x.x.x + gzip -c ixl.4 > /usr/share/man/man4/ixl.4.gz + +4. To load the driver onto a running system: + cd ixl-x.x.x/src + make load + +5. To assign an IP address to the interface, enter the following: + ifconfig ixl<interface_num> <IP_address> + +6. Verify that the interface works. Enter the following, where <IP_address> + is the IP address for another machine on the same subnet as the interface + that is being tested: + + ping <IP_address> + +7. If you want the driver to load automatically when the system is booted: + + cd ixl-x.x.x/src + make + make install + + Edit /boot/loader.conf, and add the following line: + if_ixl_load="YES" + + Edit /etc/rc.conf, and create the appropriate + ifconfig_ixl<interface_num> entry: + + ifconfig_ixl<interface_num>="<ifconfig_settings>" + + Example usage: + + ifconfig_ixl0="inet 192.168.10.1 netmask 255.255.255.0" + + NOTE: For assistance, see the ifconfig man page. + + + +Configuration and Tuning +========================= + +Both drivers support Transmit/Receive Checksum Offload for IPv4 and IPv6, +TSO for IPv4 and IPv6, LRO, and Jumbo Frames on all 40 Gigabit adapters. + + Jumbo Frames + ------------ + To enable Jumbo Frames, use the ifconfig utility to increase + the MTU beyond 1500 bytes. + + - The Jumbo Frames setting on the switch must be set to at least + 22 bytes larger than that of the adapter. 
+ + - The maximum MTU setting for Jumbo Frames is 9706. This value + coincides with the maximum jumbo frames size of 9728. + To modify the setting, enter the following: + + ifconfig ixl<interface_num> <hostname or IP address> mtu 9000 + + - To confirm an interface's MTU value, use the ifconfig command. + To confirm the MTU used between two specific devices, use: + + route get <destination_IP_address> + + VLANs + ----- + To create a new VLAN pseudo-interface: + + ifconfig <vlan_name> create + + To associate the VLAN pseudo-interface with a physical interface + and assign a VLAN ID, IP address, and netmask: + + ifconfig <vlan_name> <ip_address> netmask <subnet_mask> vlan + <vlan_id> vlandev <physical_interface> + + Example: + + ifconfig vlan10 10.0.0.1 netmask 255.255.255.0 vlan 10 vlandev ixl0 + + In this example, all packets will be marked on egress with + 802.1Q VLAN tags, specifying a VLAN ID of 10. + + To remove a VLAN pseudo-interface: + + ifconfig <vlan_name> destroy + + + Checksum Offload + ---------------- + + Checksum offloading supports IPv4 and IPv6 with TCP and UDP packets + and is supported for both transmit and receive. Checksum offloading + for transmit and receive is enabled by default for both IPv4 and IPv6. + + Checksum offloading can be enabled or disabled using ifconfig. + Transmit and receive offloading for IPv4 and IPv6 are enabled + and disabled separately. + + NOTE: TSO requires Tx checksum, so when Tx checksum + is disabled, TSO will also be disabled. 
+ + To enable Tx checksum offloading for ipv4: + + ifconfig ixl<interface_num> txcsum4 + + To disable Tx checksum offloading for ipv4: + + ifconfig ixl<interface_num> -txcsum4 + (NOTE: This will disable TSO4) + + To enable Rx checksum offloading for ipv6: + + ifconfig ixl<interface_num> rxcsum6 + + To disable Rx checksum offloading for ipv6: + + ifconfig ixl<interface_num> -rxcsum6 + (NOTE: This will disable TSO6) + + + To confirm the current settings: + + ifconfig ixl<interface_num> + + + TSO + --- + + TSO supports both IPv4 and IPv6 and is enabled by default. TSO can + be disabled and enabled using the ifconfig utility. + + NOTE: TSO requires Tx checksum, so when Tx checksum is + disabled, TSO will also be disabled. + + To disable TSO IPv4: + + ifconfig ixl<interface_num> -tso4 + + To enable TSO IPv4: + + ifconfig ixl<interface_num> tso4 + + To disable TSO IPv6: + + ifconfig ixl<interface_num> -tso6 + + To enable TSO IPv6: + + ifconfig ixl<interface_num> tso6 + + To disable BOTH TSO IPv4 and IPv6: + + ifconfig ixl<interface_num> -tso + + To enable BOTH TSO IPv4 and IPv6: + + ifconfig ixl<interface_num> tso + + + LRO + --- + + Large Receive Offload is enabled by default. It can be enabled + or disabled by using the ifconfig utility. + + NOTE: LRO should be disabled when forwarding packets. + + To disable LRO: + + ifconfig ixl<interface_num> -lro + + To enable LRO: + + ifconfig ixl<interface_num> lro + + +Flow Control (IXL only) +------------ +Flow control is disabled by default. To change flow control settings use sysctl. + +To enable flow control to Rx pause frames: + + sysctl dev.ixl.<interface_num>.fc=1 + +To enable flow control to Tx pause frames: + + sysctl dev.ixl.<interface_num>.fc=2 + +To enable flow control to Rx and Tx pause frames: + + sysctl dev.ixl.<interface_num>.fc=3 + +To disable flow control: + + sysctl dev.ixl.<interface_num>.fc=0 + + +NOTE: You must have a flow control capable link partner. 
+ +NOTE: The VF driver does not have access to flow control, it must be + managed from the host side. + + + Important system configuration changes: + ======================================= + +-Change the file /etc/sysctl.conf, and add the line: + + hw.intr_storm_threshold: 0 (the default is 1000) + +-Best throughput results are seen with a large MTU; use 9706 if possible. + +-The default number of descriptors per ring is 1024, increasing this may +improve performance depending on the use case. + +-The VF driver uses a relatively large buf ring, this was found to eliminate + UDP transmit errors, it is a tuneable, and if no UDP traffic is used it can + be reduced. It is memory used per queue. + + +Known Limitations +================= + +Network Memory Buffer allocation +-------------------------------- + FreeBSD may have a low number of network memory buffers (mbufs) by default. +If your mbuf value is too low, it may cause the driver to fail to initialize +and/or cause the system to become unresponsive. You can check to see if the +system is mbuf-starved by running 'netstat -m'. Increase the number of mbufs +by editing the lines below in /etc/sysctl.conf: + + kern.ipc.nmbclusters + kern.ipc.nmbjumbop + kern.ipc.nmbjumbo9 + kern.ipc.nmbjumbo16 + kern.ipc.nmbufs + +The amount of memory that you allocate is system specific, and may +require some trial and error. + +Also, increasing the following in /etc/sysctl.conf could help increase +network performance: + + kern.ipc.maxsockbuf + net.inet.tcp.sendspace + net.inet.tcp.recvspace + net.inet.udp.maxdgram + net.inet.udp.recvspace + + +UDP Stress Test Dropped Packet Issue +------------------------------------ +Under small packet UDP stress test with the ixl driver, the FreeBSD system +may drop UDP packets due to the fullness of socket buffers. You may want to +change the driver's Flow Control variables to the minimum value for +controlling packet reception. 
+ + +Disable LRO when routing/bridging +--------------------------------- +LRO must be turned off when forwarding traffic. + + +Lower than expected performance +------------------------------- +Some PCIe x8 slots are actually configured as x4 slots. These slots have +insufficient bandwidth for full line rate with dual port and quad port +devices. + +In addition, if you put a PCIe Generation 3-capable adapter into a PCIe +Generation 2 slot, you cannot get full bandwidth. The driver detects this +situation and writes the following message in the system log: + + "PCI-Express bandwidth available for this card is not sufficient for + optimal performance. For optimal performance a x8 PCI-Express slot + is required." + +If this error occurs, moving your adapter to a true PCIe Generation 3 x8 +slot will resolve the issue. + + +Support +======= + +For general information and support, go to the Intel support website at: + + http://support.intel.com + +If an issue is identified with the released source code on the supported kernel +with a supported adapter, email the specific information related to the issue +to freebsdnic@mailbox.intel.com. + + +License +======= + +This software program is released under the terms of a license agreement +between you ('Licensee') and Intel. Do not use or load this software or any +associated materials (collectively, the 'Software') until you have carefully +read the full terms and conditions of the LICENSE located in this software +package. By loading or using the Software, you agree to the terms of this +Agreement. If you do not agree with the terms of this Agreement, do not +install or use the Software. + +* Other names and brands may be claimed as the property of others. 
+ + diff --git a/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE new file mode 100644 index 0000000000..04c551f1b2 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE @@ -0,0 +1,29 @@ + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..7a9537b10e --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +i40e DRIVER diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq.c b/usr/src/uts/common/io/i40e/core/i40e_adminq.c new file mode 100644 index 0000000000..67b72fd9f2 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_adminq.c @@ -0,0 +1,1101 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_adminq.c 284049 2015-06-05 22:52:42Z jfv $*/ + +#include "i40e_status.h" +#include "i40e_type.h" +#include "i40e_register.h" +#include "i40e_adminq.h" +#include "i40e_prototype.h" + +/** + * i40e_is_nvm_update_op - return TRUE if this is an NVM update operation + * @desc: API request descriptor + **/ +static INLINE bool i40e_is_nvm_update_op(struct i40e_aq_desc *desc) +{ + return (desc->opcode == CPU_TO_LE16(i40e_aqc_opc_nvm_erase) || + desc->opcode == CPU_TO_LE16(i40e_aqc_opc_nvm_update)); +} + +/** + * i40e_adminq_init_regs - Initialize AdminQ registers + * @hw: pointer to the hardware structure + * + * This assumes the alloc_asq and alloc_arq functions have already been called + **/ +static void i40e_adminq_init_regs(struct i40e_hw *hw) +{ + /* set head and tail registers in our local struct */ + if (i40e_is_vf(hw)) { + hw->aq.asq.tail = I40E_VF_ATQT1; + hw->aq.asq.head = I40E_VF_ATQH1; + hw->aq.asq.len = I40E_VF_ATQLEN1; + hw->aq.asq.bal = I40E_VF_ATQBAL1; + hw->aq.asq.bah = I40E_VF_ATQBAH1; + hw->aq.arq.tail = I40E_VF_ARQT1; + hw->aq.arq.head = I40E_VF_ARQH1; + hw->aq.arq.len = I40E_VF_ARQLEN1; + hw->aq.arq.bal = I40E_VF_ARQBAL1; + hw->aq.arq.bah = I40E_VF_ARQBAH1; + } else { + hw->aq.asq.tail = I40E_PF_ATQT; + hw->aq.asq.head = I40E_PF_ATQH; + hw->aq.asq.len = I40E_PF_ATQLEN; + hw->aq.asq.bal = I40E_PF_ATQBAL; + 
hw->aq.asq.bah = I40E_PF_ATQBAH; + hw->aq.arq.tail = I40E_PF_ARQT; + hw->aq.arq.head = I40E_PF_ARQH; + hw->aq.arq.len = I40E_PF_ARQLEN; + hw->aq.arq.bal = I40E_PF_ARQBAL; + hw->aq.arq.bah = I40E_PF_ARQBAH; + } +} + +/** + * i40e_alloc_adminq_asq_ring - Allocate Admin Queue send rings + * @hw: pointer to the hardware structure + **/ +enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code; + + ret_code = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf, + i40e_mem_atq_ring, + (hw->aq.num_asq_entries * + sizeof(struct i40e_aq_desc)), + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + return ret_code; + + ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.cmd_buf, + (hw->aq.num_asq_entries * + sizeof(struct i40e_asq_cmd_details))); + if (ret_code) { + i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf); + return ret_code; + } + + return ret_code; +} + +/** + * i40e_alloc_adminq_arq_ring - Allocate Admin Queue receive rings + * @hw: pointer to the hardware structure + **/ +enum i40e_status_code i40e_alloc_adminq_arq_ring(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code; + + ret_code = i40e_allocate_dma_mem(hw, &hw->aq.arq.desc_buf, + i40e_mem_arq_ring, + (hw->aq.num_arq_entries * + sizeof(struct i40e_aq_desc)), + I40E_ADMINQ_DESC_ALIGNMENT); + + return ret_code; +} + +/** + * i40e_free_adminq_asq - Free Admin Queue send rings + * @hw: pointer to the hardware structure + * + * Releases both allocations made by i40e_alloc_adminq_asq_ring(): the + * command details array (cmd_buf) and the descriptor ring (desc_buf). 
+ * + * This assumes the posted send buffers have already been cleaned + * and de-allocated + **/ +void i40e_free_adminq_asq(struct i40e_hw *hw) +{ + /* cmd_buf is allocated together with desc_buf; do not leak it */ + i40e_free_virt_mem(hw, &hw->aq.asq.cmd_buf); + i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf); +} + +/** + * i40e_free_adminq_arq - Free Admin Queue receive rings + * @hw: pointer to the hardware structure + * + * This assumes the posted receive buffers have already been cleaned + * and de-allocated + **/ +void i40e_free_adminq_arq(struct i40e_hw *hw) +{ + i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf); +} + +/** + * i40e_alloc_arq_bufs - Allocate pre-posted buffers 
for the receive queue + * @hw: pointer to the hardware structure + **/ +static enum i40e_status_code i40e_alloc_arq_bufs(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code; + struct i40e_aq_desc *desc; + struct i40e_dma_mem *bi; + int i; + + /* We'll be allocating the buffer info memory first, then we can + * allocate the mapped buffers for the event processing + */ + + /* buffer_info structures do not need alignment */ + ret_code = i40e_allocate_virt_mem(hw, &hw->aq.arq.dma_head, + (hw->aq.num_arq_entries * sizeof(struct i40e_dma_mem))); + if (ret_code) + goto alloc_arq_bufs; + hw->aq.arq.r.arq_bi = (struct i40e_dma_mem *)hw->aq.arq.dma_head.va; + + /* allocate the mapped buffers */ + for (i = 0; i < hw->aq.num_arq_entries; i++) { + bi = &hw->aq.arq.r.arq_bi[i]; + ret_code = i40e_allocate_dma_mem(hw, bi, + i40e_mem_arq_buf, + hw->aq.arq_buf_size, + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + goto unwind_alloc_arq_bufs; + + /* now configure the descriptors for use */ + desc = I40E_ADMINQ_DESC(hw->aq.arq, i); + + desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_BUF); + if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF) + desc->flags |= CPU_TO_LE16(I40E_AQ_FLAG_LB); + desc->opcode = 0; + /* This is in accordance with Admin queue design, there is no + * register for buffer size configuration + */ + desc->datalen = CPU_TO_LE16((u16)bi->size); + desc->retval = 0; + desc->cookie_high = 0; + desc->cookie_low = 0; + desc->params.external.addr_high = + CPU_TO_LE32(I40E_HI_DWORD(bi->pa)); + desc->params.external.addr_low = + CPU_TO_LE32(I40E_LO_DWORD(bi->pa)); + desc->params.external.param0 = 0; + desc->params.external.param1 = 0; + } + +alloc_arq_bufs: + return ret_code; + +unwind_alloc_arq_bufs: + /* don't try to free the one that failed... 
*/ + i--; + for (; i >= 0; i--) + i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[i]); + i40e_free_virt_mem(hw, &hw->aq.arq.dma_head); + + return ret_code; +} + +/** + * i40e_alloc_asq_bufs - Allocate empty buffer structs for the send queue + * @hw: pointer to the hardware structure + **/ +static enum i40e_status_code i40e_alloc_asq_bufs(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code; + struct i40e_dma_mem *bi; + int i; + + /* No mapped memory needed yet, just the buffer info structures */ + ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.dma_head, + (hw->aq.num_asq_entries * sizeof(struct i40e_dma_mem))); + if (ret_code) + goto alloc_asq_bufs; + hw->aq.asq.r.asq_bi = (struct i40e_dma_mem *)hw->aq.asq.dma_head.va; + + /* allocate the mapped buffers */ + for (i = 0; i < hw->aq.num_asq_entries; i++) { + bi = &hw->aq.asq.r.asq_bi[i]; + ret_code = i40e_allocate_dma_mem(hw, bi, + i40e_mem_asq_buf, + hw->aq.asq_buf_size, + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + goto unwind_alloc_asq_bufs; + } +alloc_asq_bufs: + return ret_code; + +unwind_alloc_asq_bufs: + /* don't try to free the one that failed... 
*/ + i--; + for (; i >= 0; i--) + i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]); + i40e_free_virt_mem(hw, &hw->aq.asq.dma_head); + + return ret_code; +} + +/** + * i40e_free_arq_bufs - Free receive queue buffer info elements + * @hw: pointer to the hardware structure + **/ +static void i40e_free_arq_bufs(struct i40e_hw *hw) +{ + int i; + + /* free descriptors */ + for (i = 0; i < hw->aq.num_arq_entries; i++) + i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[i]); + + /* free the descriptor memory */ + i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf); + + /* free the dma header */ + i40e_free_virt_mem(hw, &hw->aq.arq.dma_head); +} + +/** + * i40e_free_asq_bufs - Free send queue buffer info elements + * @hw: pointer to the hardware structure + **/ +static void i40e_free_asq_bufs(struct i40e_hw *hw) +{ + int i; + + /* only unmap if the address is non-NULL */ + for (i = 0; i < hw->aq.num_asq_entries; i++) + if (hw->aq.asq.r.asq_bi[i].pa) + i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]); + + /* free the buffer info list */ + i40e_free_virt_mem(hw, &hw->aq.asq.cmd_buf); + + /* free the descriptor memory */ + i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf); + + /* free the dma header */ + i40e_free_virt_mem(hw, &hw->aq.asq.dma_head); +} + +/** + * i40e_config_asq_regs - configure ASQ registers + * @hw: pointer to the hardware structure + * + * Configure base address and length registers for the transmit queue + **/ +static enum i40e_status_code i40e_config_asq_regs(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u32 reg = 0; + + /* Clear Head and Tail */ + wr32(hw, hw->aq.asq.head, 0); + wr32(hw, hw->aq.asq.tail, 0); + + /* set starting point */ + if (!i40e_is_vf(hw)) + wr32(hw, hw->aq.asq.len, (hw->aq.num_asq_entries | + I40E_PF_ATQLEN_ATQENABLE_MASK)); + if (i40e_is_vf(hw)) + wr32(hw, hw->aq.asq.len, (hw->aq.num_asq_entries | + I40E_VF_ATQLEN1_ATQENABLE_MASK)); + wr32(hw, hw->aq.asq.bal, I40E_LO_DWORD(hw->aq.asq.desc_buf.pa)); + wr32(hw, hw->aq.asq.bah, 
I40E_HI_DWORD(hw->aq.asq.desc_buf.pa)); + + /* Check one register to verify that config was applied */ + reg = rd32(hw, hw->aq.asq.bal); + if (reg != I40E_LO_DWORD(hw->aq.asq.desc_buf.pa)) + ret_code = I40E_ERR_ADMIN_QUEUE_ERROR; + + return ret_code; +} + +/** + * i40e_config_arq_regs - ARQ register configuration + * @hw: pointer to the hardware structure + * + * Configure base address and length registers for the receive (event queue) + **/ +static enum i40e_status_code i40e_config_arq_regs(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u32 reg = 0; + + /* Clear Head and Tail */ + wr32(hw, hw->aq.arq.head, 0); + wr32(hw, hw->aq.arq.tail, 0); + + /* set starting point */ + if (!i40e_is_vf(hw)) + wr32(hw, hw->aq.arq.len, (hw->aq.num_arq_entries | + I40E_PF_ARQLEN_ARQENABLE_MASK)); + if (i40e_is_vf(hw)) + wr32(hw, hw->aq.arq.len, (hw->aq.num_arq_entries | + I40E_VF_ARQLEN1_ARQENABLE_MASK)); + wr32(hw, hw->aq.arq.bal, I40E_LO_DWORD(hw->aq.arq.desc_buf.pa)); + wr32(hw, hw->aq.arq.bah, I40E_HI_DWORD(hw->aq.arq.desc_buf.pa)); + + /* Update tail in the HW to post pre-allocated buffers */ + wr32(hw, hw->aq.arq.tail, hw->aq.num_arq_entries - 1); + + /* Check one register to verify that config was applied */ + reg = rd32(hw, hw->aq.arq.bal); + if (reg != I40E_LO_DWORD(hw->aq.arq.desc_buf.pa)) + ret_code = I40E_ERR_ADMIN_QUEUE_ERROR; + + return ret_code; +} + +/** + * i40e_init_asq - main initialization routine for ASQ + * @hw: pointer to the hardware structure + * + * This is the main initialization routine for the Admin Send Queue. 
+ * Prior to calling this function, drivers *MUST* set the following fields + * in the hw->aq structure (both are validated as non-zero on entry): + * - hw->aq.num_asq_entries + * - hw->aq.asq_buf_size + * + * Do *NOT* hold the lock when calling this as the memory allocation routines + * called are not going to be atomic context safe + **/ +enum i40e_status_code i40e_init_asq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if 
(hw->aq.asq.count > 0) { + /* queue already initialized */ + ret_code = I40E_ERR_NOT_READY; + goto init_adminq_exit; + } + + /* verify input for valid configuration */ + if ((hw->aq.num_asq_entries == 0) || + (hw->aq.asq_buf_size == 0)) { + ret_code = I40E_ERR_CONFIG; + goto init_adminq_exit; + } + + hw->aq.asq.next_to_use = 0; + hw->aq.asq.next_to_clean = 0; + + /* allocate the ring memory */ + ret_code = i40e_alloc_adminq_asq_ring(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_exit; + + /* allocate buffers in the rings */ + ret_code = i40e_alloc_asq_bufs(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_free_rings; + + /* initialize base registers */ + ret_code = i40e_config_asq_regs(hw); + if (ret_code != I40E_SUCCESS) { + /* rings and buffers all exist here; release every one of them */ + i40e_free_asq_bufs(hw); + goto init_adminq_exit; + } + + /* + * success! A non-zero count is what marks the queue as initialized + * (see the check on entry and in i40e_shutdown_asq), so it is only + * set once nothing can fail any more. + */ + hw->aq.asq.count = hw->aq.num_asq_entries; + goto init_adminq_exit; + +init_adminq_free_rings: + i40e_free_adminq_asq(hw); + +init_adminq_exit: + return ret_code; +} + +/** + * i40e_init_arq - initialize ARQ + * @hw: pointer to the hardware structure + * + * The main initialization routine for the Admin Receive (Event) Queue. 
+ * Prior to calling this function, drivers *MUST* set the following fields + * in the hw->aq structure (both are validated as non-zero on entry): + * - hw->aq.num_arq_entries + * - hw->aq.arq_buf_size + * + * Do *NOT* hold the lock when calling this as the memory allocation routines + * called are not going to be atomic context safe + **/ +enum i40e_status_code i40e_init_arq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (hw->aq.arq.count > 0) { + /* queue already initialized */ + ret_code = I40E_ERR_NOT_READY; + goto init_adminq_exit; + } + + /* verify input for valid configuration */ + if ((hw->aq.num_arq_entries == 0) || + (hw->aq.arq_buf_size == 0)) { + ret_code = I40E_ERR_CONFIG; + goto init_adminq_exit; + } + + hw->aq.arq.next_to_use = 0; + hw->aq.arq.next_to_clean = 0; + + /* allocate the ring memory */ + ret_code = i40e_alloc_adminq_arq_ring(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_exit; + + /* allocate buffers in the rings */ + ret_code = i40e_alloc_arq_bufs(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_free_rings; + + /* initialize base registers */ + ret_code = i40e_config_arq_regs(hw); + if (ret_code != I40E_SUCCESS) { + /* ring and buffers all exist here; release every one of them */ + i40e_free_arq_bufs(hw); + goto init_adminq_exit; + } + + /* + * A non-zero count is what marks the queue as initialized, so it is + * only set once nothing can fail any more. + */ + hw->aq.arq.count = hw->aq.num_arq_entries; + + /* success! 
*/ + goto init_adminq_exit; + +init_adminq_free_rings: + i40e_free_adminq_arq(hw); + +init_adminq_exit: + return ret_code; +} + +/** + * i40e_shutdown_asq - shutdown the ASQ + * @hw: pointer to the hardware structure + * + * The main shutdown routine for the Admin Send Queue + **/ +enum i40e_status_code i40e_shutdown_asq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (hw->aq.asq.count == 0) + return I40E_ERR_NOT_READY; + + /* Stop firmware AdminQ processing */ + wr32(hw, hw->aq.asq.head, 0); + wr32(hw, hw->aq.asq.tail, 0); + wr32(hw, hw->aq.asq.len, 0); + wr32(hw, hw->aq.asq.bal, 0); + wr32(hw, hw->aq.asq.bah, 0); + + /* make sure spinlock is available */ + i40e_acquire_spinlock(&hw->aq.asq_spinlock); + + hw->aq.asq.count = 0; /* to indicate uninitialized queue */ + + /* free ring buffers */ + i40e_free_asq_bufs(hw); + + i40e_release_spinlock(&hw->aq.asq_spinlock); + + return ret_code; +} + +/** + * i40e_shutdown_arq - shutdown ARQ + * @hw: pointer to the hardware structure + * + * The main shutdown routine for the Admin Receive Queue + **/ +enum i40e_status_code i40e_shutdown_arq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (hw->aq.arq.count == 0) + return I40E_ERR_NOT_READY; + + /* Stop firmware AdminQ processing */ + wr32(hw, hw->aq.arq.head, 0); + wr32(hw, hw->aq.arq.tail, 0); + wr32(hw, hw->aq.arq.len, 0); + wr32(hw, hw->aq.arq.bal, 0); + wr32(hw, hw->aq.arq.bah, 0); + + /* make sure spinlock is available */ + i40e_acquire_spinlock(&hw->aq.arq_spinlock); + + hw->aq.arq.count = 0; /* to indicate uninitialized queue */ + + /* free ring buffers */ + i40e_free_arq_bufs(hw); + + i40e_release_spinlock(&hw->aq.arq_spinlock); + + return ret_code; +} + +/** + * i40e_init_adminq - main initialization routine for Admin Queue + * @hw: pointer to the hardware structure + * + * Prior to calling this function, drivers *MUST* set the following fields + * in the hw->aq structure: + * - hw->aq.num_asq_entries + * 
- hw->aq.num_arq_entries + * - hw->aq.arq_buf_size + * - hw->aq.asq_buf_size + **/ +enum i40e_status_code i40e_init_adminq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code; + u16 eetrack_lo, eetrack_hi; + u16 cfg_ptr, oem_hi, oem_lo; + int retry = 0; + /* verify input for valid configuration */ + if ((hw->aq.num_arq_entries == 0) || + (hw->aq.num_asq_entries == 0) || + (hw->aq.arq_buf_size == 0) || + (hw->aq.asq_buf_size == 0)) { + ret_code = I40E_ERR_CONFIG; + goto init_adminq_exit; + } + + /* initialize spin locks */ + i40e_init_spinlock(&hw->aq.asq_spinlock); + i40e_init_spinlock(&hw->aq.arq_spinlock); + + /* Set up register offsets */ + i40e_adminq_init_regs(hw); + + /* setup ASQ command write back timeout */ + hw->aq.asq_cmd_timeout = I40E_ASQ_CMD_TIMEOUT; + + /* allocate the ASQ */ + ret_code = i40e_init_asq(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_destroy_spinlocks; + + /* allocate the ARQ */ + ret_code = i40e_init_arq(hw); + if (ret_code != I40E_SUCCESS) + goto init_adminq_free_asq; + + /* VF has no need of firmware */ + if (i40e_is_vf(hw)) + goto init_adminq_exit; + /* There are some cases where the firmware may not be quite ready + * for AdminQ operations, so we retry the AdminQ setup a few times + * if we see timeouts in this first AQ call. 
+ */ + do { + ret_code = i40e_aq_get_firmware_version(hw, + &hw->aq.fw_maj_ver, + &hw->aq.fw_min_ver, + &hw->aq.fw_build, + &hw->aq.api_maj_ver, + &hw->aq.api_min_ver, + NULL); + if (ret_code != I40E_ERR_ADMIN_QUEUE_TIMEOUT) + break; + retry++; + i40e_msec_delay(100); + i40e_resume_aq(hw); + } while (retry < 10); + if (ret_code != I40E_SUCCESS) + goto init_adminq_free_arq; + + /* get the NVM version info */ + i40e_read_nvm_word(hw, I40E_SR_NVM_DEV_STARTER_VERSION, + &hw->nvm.version); + i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_LO, &eetrack_lo); + i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_HI, &eetrack_hi); + hw->nvm.eetrack = (eetrack_hi << 16) | eetrack_lo; + i40e_read_nvm_word(hw, I40E_SR_BOOT_CONFIG_PTR, &cfg_ptr); + i40e_read_nvm_word(hw, (cfg_ptr + I40E_NVM_OEM_VER_OFF), + &oem_hi); + i40e_read_nvm_word(hw, (cfg_ptr + (I40E_NVM_OEM_VER_OFF + 1)), + &oem_lo); + hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo; + + if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) { + ret_code = I40E_ERR_FIRMWARE_API_VERSION; + goto init_adminq_free_arq; + } + + /* pre-emptive resource lock release */ + i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL); + hw->aq.nvm_release_on_done = FALSE; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + + ret_code = i40e_aq_set_hmc_resource_profile(hw, + I40E_HMC_PROFILE_DEFAULT, + 0, + NULL); + ret_code = I40E_SUCCESS; + + /* success! 
*/ + goto init_adminq_exit; + +init_adminq_free_arq: + i40e_shutdown_arq(hw); +init_adminq_free_asq: + i40e_shutdown_asq(hw); +init_adminq_destroy_spinlocks: + i40e_destroy_spinlock(&hw->aq.asq_spinlock); + i40e_destroy_spinlock(&hw->aq.arq_spinlock); + +init_adminq_exit: + return ret_code; +} + +/** + * i40e_shutdown_adminq - shutdown routine for the Admin Queue + * @hw: pointer to the hardware structure + **/ +enum i40e_status_code i40e_shutdown_adminq(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (i40e_check_asq_alive(hw)) + i40e_aq_queue_shutdown(hw, TRUE); + + i40e_shutdown_asq(hw); + i40e_shutdown_arq(hw); + + /* destroy the spinlocks */ + i40e_destroy_spinlock(&hw->aq.asq_spinlock); + i40e_destroy_spinlock(&hw->aq.arq_spinlock); + + if (hw->nvm_buff.va) + i40e_free_virt_mem(hw, &hw->nvm_buff); + + return ret_code; +} + +/** + * i40e_clean_asq - cleans Admin send queue + * @hw: pointer to the hardware structure + * + * returns the number of free desc + **/ +u16 i40e_clean_asq(struct i40e_hw *hw) +{ + struct i40e_adminq_ring *asq = &(hw->aq.asq); + struct i40e_asq_cmd_details *details; + u16 ntc = asq->next_to_clean; + struct i40e_aq_desc desc_cb; + struct i40e_aq_desc *desc; + + desc = I40E_ADMINQ_DESC(*asq, ntc); + details = I40E_ADMINQ_DETAILS(*asq, ntc); + + while (rd32(hw, hw->aq.asq.head) != ntc) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "ntc %d head %d.\n", ntc, rd32(hw, hw->aq.asq.head)); + + if (details->callback) { + I40E_ADMINQ_CALLBACK cb_func = + (I40E_ADMINQ_CALLBACK)details->callback; + i40e_memcpy(&desc_cb, desc, sizeof(struct i40e_aq_desc), + I40E_DMA_TO_DMA); + cb_func(hw, &desc_cb); + } + i40e_memset(desc, 0, sizeof(*desc), I40E_DMA_MEM); + i40e_memset(details, 0, sizeof(*details), I40E_NONDMA_MEM); + ntc++; + if (ntc == asq->count) + ntc = 0; + desc = I40E_ADMINQ_DESC(*asq, ntc); + details = I40E_ADMINQ_DETAILS(*asq, ntc); + } + + asq->next_to_clean = ntc; + + return I40E_DESC_UNUSED(asq); +} + +/** + * 
i40e_asq_done - check if FW has processed the Admin Send Queue + * @hw: pointer to the hw struct + * + * Returns TRUE if the firmware has processed all descriptors on the + * admin send queue. Returns FALSE if there are still requests pending. + **/ +bool i40e_asq_done(struct i40e_hw *hw) +{ + /* AQ designers suggest use of head for better + * timing reliability than DD bit + */ + return rd32(hw, hw->aq.asq.head) == hw->aq.asq.next_to_use; + +} + +/** + * i40e_asq_send_command - send command to Admin Queue + * @hw: pointer to the hw struct + * @desc: prefilled descriptor describing the command (non DMA mem) + * @buff: buffer to use for indirect commands + * @buff_size: size of buffer for indirect commands + * @cmd_details: pointer to command details structure + * + * This is the main send command driver routine for the Admin Queue send + * queue. It runs the queue, cleans the queue, etc + **/ +enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw, + struct i40e_aq_desc *desc, + void *buff, /* can be NULL */ + u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + enum i40e_status_code status = I40E_SUCCESS; + struct i40e_dma_mem *dma_buff = NULL; + struct i40e_asq_cmd_details *details; + struct i40e_aq_desc *desc_on_ring; + bool cmd_completed = FALSE; + u16 retval = 0; + u32 val = 0; + + hw->aq.asq_last_status = I40E_AQ_RC_OK; + + val = rd32(hw, hw->aq.asq.head); + if (val >= hw->aq.num_asq_entries) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: head overrun at %d\n", val); + status = I40E_ERR_QUEUE_EMPTY; + goto asq_send_command_exit; + } + + if (hw->aq.asq.count == 0) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: Admin queue not initialized.\n"); + status = I40E_ERR_QUEUE_EMPTY; + goto asq_send_command_exit; + } + + details = I40E_ADMINQ_DETAILS(hw->aq.asq, hw->aq.asq.next_to_use); + if (cmd_details) { + i40e_memcpy(details, + cmd_details, + sizeof(struct i40e_asq_cmd_details), + I40E_NONDMA_TO_NONDMA); + + /* If the cmd_details are 
defined copy the cookie. The + * CPU_TO_LE32 is not needed here because the data is ignored + * by the FW, only used by the driver + */ + if (details->cookie) { + desc->cookie_high = + CPU_TO_LE32(I40E_HI_DWORD(details->cookie)); + desc->cookie_low = + CPU_TO_LE32(I40E_LO_DWORD(details->cookie)); + } + } else { + i40e_memset(details, 0, + sizeof(struct i40e_asq_cmd_details), + I40E_NONDMA_MEM); + } + + /* clear requested flags and then set additional flags if defined */ + desc->flags &= ~CPU_TO_LE16(details->flags_dis); + desc->flags |= CPU_TO_LE16(details->flags_ena); + + i40e_acquire_spinlock(&hw->aq.asq_spinlock); + + if (buff_size > hw->aq.asq_buf_size) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Invalid buffer size: %d.\n", + buff_size); + status = I40E_ERR_INVALID_SIZE; + goto asq_send_command_error; + } + + if (details->postpone && !details->async) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Async flag not set along with postpone flag"); + status = I40E_ERR_PARAM; + goto asq_send_command_error; + } + + /* call clean and check queue available function to reclaim the + * descriptors that were processed by FW, the function returns the + * number of desc available + */ + /* the clean function called here could be called in a separate thread + * in case of asynchronous completions + */ + if (i40e_clean_asq(hw) == 0) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Error queue is full.\n"); + status = I40E_ERR_ADMIN_QUEUE_FULL; + goto asq_send_command_error; + } + + /* initialize the temp desc pointer with the right desc */ + desc_on_ring = I40E_ADMINQ_DESC(hw->aq.asq, hw->aq.asq.next_to_use); + + /* if the desc is available copy the temp desc to the right place */ + i40e_memcpy(desc_on_ring, desc, sizeof(struct i40e_aq_desc), + I40E_NONDMA_TO_DMA); + + /* if buff is not NULL assume indirect command */ + if (buff != NULL) { + dma_buff = &(hw->aq.asq.r.asq_bi[hw->aq.asq.next_to_use]); + /* copy the user buff into the respective DMA buff */ + 
i40e_memcpy(dma_buff->va, buff, buff_size, + I40E_NONDMA_TO_DMA); + desc_on_ring->datalen = CPU_TO_LE16(buff_size); + + /* Update the address values in the desc with the pa value + * for respective buffer + */ + desc_on_ring->params.external.addr_high = + CPU_TO_LE32(I40E_HI_DWORD(dma_buff->pa)); + desc_on_ring->params.external.addr_low = + CPU_TO_LE32(I40E_LO_DWORD(dma_buff->pa)); + } + + /* bump the tail */ + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQTX: desc and buffer:\n"); + i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc_on_ring, + buff, buff_size); + (hw->aq.asq.next_to_use)++; + if (hw->aq.asq.next_to_use == hw->aq.asq.count) + hw->aq.asq.next_to_use = 0; + if (!details->postpone) + wr32(hw, hw->aq.asq.tail, hw->aq.asq.next_to_use); + + /* if cmd_details are not defined or async flag is not set, + * we need to wait for desc write back + */ + if (!details->async && !details->postpone) { + u32 total_delay = 0; + + do { + /* AQ designers suggest use of head for better + * timing reliability than DD bit + */ + if (i40e_asq_done(hw)) + break; + /* ugh! 
delay while spin_lock */ + i40e_msec_delay(1); + total_delay++; + } while (total_delay < hw->aq.asq_cmd_timeout); + } + + /* if ready, copy the desc back to temp */ + if (i40e_asq_done(hw)) { + i40e_memcpy(desc, desc_on_ring, sizeof(struct i40e_aq_desc), + I40E_DMA_TO_NONDMA); + if (buff != NULL) + i40e_memcpy(buff, dma_buff->va, buff_size, + I40E_DMA_TO_NONDMA); + retval = LE16_TO_CPU(desc->retval); + if (retval != 0) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Command completed with error 0x%X.\n", + retval); + + /* strip off FW internal code */ + retval &= 0xff; + } + cmd_completed = TRUE; + if ((enum i40e_admin_queue_err)retval == I40E_AQ_RC_OK) + status = I40E_SUCCESS; + else + status = I40E_ERR_ADMIN_QUEUE_ERROR; + hw->aq.asq_last_status = (enum i40e_admin_queue_err)retval; + } + + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: desc and buffer writeback:\n"); + i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, buff, buff_size); + + /* save writeback aq if requested */ + if (details->wb_desc) + i40e_memcpy(details->wb_desc, desc_on_ring, + sizeof(struct i40e_aq_desc), I40E_DMA_TO_NONDMA); + + /* update the error if time out occurred */ + if ((!cmd_completed) && + (!details->async && !details->postpone)) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Writeback timeout.\n"); + status = I40E_ERR_ADMIN_QUEUE_TIMEOUT; + } + +asq_send_command_error: + i40e_release_spinlock(&hw->aq.asq_spinlock); +asq_send_command_exit: + return status; +} + +/** + * i40e_fill_default_direct_cmd_desc - AQ descriptor helper function + * @desc: pointer to the temp descriptor (non DMA mem) + * @opcode: the opcode can be used to decide which flags to turn off or on + * + * Fill the desc with default values + **/ +void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, + u16 opcode) +{ + /* zero out the desc */ + i40e_memset((void *)desc, 0, sizeof(struct i40e_aq_desc), + I40E_NONDMA_MEM); + desc->opcode = CPU_TO_LE16(opcode); + desc->flags = 
CPU_TO_LE16(I40E_AQ_FLAG_SI); +} + +/** + * i40e_clean_arq_element + * @hw: pointer to the hw struct + * @e: event info from the receive descriptor, includes any buffers + * @pending: number of events that could be left to process + * + * This function cleans one Admin Receive Queue element and returns + * the contents through e. It can also return how many events are + * left to process through 'pending' + **/ +enum i40e_status_code i40e_clean_arq_element(struct i40e_hw *hw, + struct i40e_arq_event_info *e, + u16 *pending) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u16 ntc = hw->aq.arq.next_to_clean; + struct i40e_aq_desc *desc; + struct i40e_dma_mem *bi; + u16 desc_idx; + u16 datalen; + u16 flags; + u16 ntu; + + /* take the lock before we start messing with the ring */ + i40e_acquire_spinlock(&hw->aq.arq_spinlock); + + /* set next_to_use to head */ + if (!i40e_is_vf(hw)) + ntu = (rd32(hw, hw->aq.arq.head) & I40E_PF_ARQH_ARQH_MASK); + if (i40e_is_vf(hw)) + ntu = (rd32(hw, hw->aq.arq.head) & I40E_VF_ARQH1_ARQH_MASK); + if (ntu == ntc) { + /* nothing to do - shouldn't need to update ring's values */ + ret_code = I40E_ERR_ADMIN_QUEUE_NO_WORK; + goto clean_arq_element_out; + } + + /* now clean the next descriptor */ + desc = I40E_ADMINQ_DESC(hw->aq.arq, ntc); + desc_idx = ntc; + + flags = LE16_TO_CPU(desc->flags); + if (flags & I40E_AQ_FLAG_ERR) { + ret_code = I40E_ERR_ADMIN_QUEUE_ERROR; + hw->aq.arq_last_status = + (enum i40e_admin_queue_err)LE16_TO_CPU(desc->retval); + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQRX: Event received with error 0x%X.\n", + hw->aq.arq_last_status); + } + + i40e_memcpy(&e->desc, desc, sizeof(struct i40e_aq_desc), + I40E_DMA_TO_NONDMA); + datalen = LE16_TO_CPU(desc->datalen); + e->msg_len = min(datalen, e->buf_len); + if (e->msg_buf != NULL && (e->msg_len != 0)) + i40e_memcpy(e->msg_buf, + hw->aq.arq.r.arq_bi[desc_idx].va, + e->msg_len, I40E_DMA_TO_NONDMA); + + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQRX: desc and buffer:\n"); 
+ i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, e->msg_buf, + hw->aq.arq_buf_size); + + /* Restore the original datalen and buffer address in the desc, + * FW updates datalen to indicate the event message + * size + */ + bi = &hw->aq.arq.r.arq_bi[ntc]; + i40e_memset((void *)desc, 0, sizeof(struct i40e_aq_desc), I40E_DMA_MEM); + + desc->flags = CPU_TO_LE16(I40E_AQ_FLAG_BUF); + if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF) + desc->flags |= CPU_TO_LE16(I40E_AQ_FLAG_LB); + desc->datalen = CPU_TO_LE16((u16)bi->size); + desc->params.external.addr_high = CPU_TO_LE32(I40E_HI_DWORD(bi->pa)); + desc->params.external.addr_low = CPU_TO_LE32(I40E_LO_DWORD(bi->pa)); + + /* set tail = the last cleaned desc index. */ + wr32(hw, hw->aq.arq.tail, ntc); + /* ntc is updated to tail + 1 */ + ntc++; + if (ntc == hw->aq.num_arq_entries) + ntc = 0; + hw->aq.arq.next_to_clean = ntc; + hw->aq.arq.next_to_use = ntu; + +clean_arq_element_out: + /* Set pending if needed, unlock and return */ + if (pending != NULL) + *pending = (ntc > ntu ? 
hw->aq.arq.count : 0) + (ntu - ntc); + i40e_release_spinlock(&hw->aq.arq_spinlock); + + if (i40e_is_nvm_update_op(&e->desc)) { + if (hw->aq.nvm_release_on_done) { + i40e_release_nvm(hw); + hw->aq.nvm_release_on_done = FALSE; + } + + switch (hw->nvmupd_state) { + case I40E_NVMUPD_STATE_INIT_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + break; + + case I40E_NVMUPD_STATE_WRITE_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITING; + break; + + default: + break; + } + } + + return ret_code; +} + +void i40e_resume_aq(struct i40e_hw *hw) +{ + /* Registers are reset after PF reset */ + hw->aq.asq.next_to_use = 0; + hw->aq.asq.next_to_clean = 0; + + i40e_config_asq_regs(hw); + + hw->aq.arq.next_to_use = 0; + hw->aq.arq.next_to_clean = 0; + + i40e_config_arq_regs(hw); +} diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq.h b/usr/src/uts/common/io/i40e/core/i40e_adminq.h new file mode 100644 index 0000000000..e20d6893ed --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_adminq.h @@ -0,0 +1,125 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_adminq.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_ADMINQ_H_ +#define _I40E_ADMINQ_H_ + +#include "i40e_osdep.h" +#include "i40e_status.h" +#include "i40e_adminq_cmd.h" +/* I40E_ADMINQ_DESC(R, i): address of descriptor i, treating ring R's desc_buf.va as an array of struct i40e_aq_desc. No bounds check is performed on i. */ +#define I40E_ADMINQ_DESC(R, i) \ + (&(((struct i40e_aq_desc *)((R).desc_buf.va))[i])) + +#define I40E_ADMINQ_DESC_ALIGNMENT 4096 +/* State of one admin queue ring. struct i40e_adminq_info embeds two of these: arq (receive queue) and asq (send queue). The union r holds the per-descriptor buffer array under the name matching the ring's direction. */ +struct i40e_adminq_ring { + struct i40e_virt_mem dma_head; /* space for dma structures */ + struct i40e_dma_mem desc_buf; /* descriptor ring memory */ + struct i40e_virt_mem cmd_buf; /* command buffer memory */ + + union { + struct i40e_dma_mem *asq_bi; + struct i40e_dma_mem *arq_bi; + } r; + + u16 count; /* Number of descriptors */ + u16 rx_buf_len; /* Admin Receive Queue buffer length */ + + /* used for interrupt processing */ + u16 next_to_use; + u16 next_to_clean; + + /* used for queue tracking; i40e_clean_arq_element() passes head to rd32() and tail to wr32() as the register argument */ + u32 head; + u32 tail; + u32 len; + u32 bah; + u32 bal; +}; + +/* ASQ transaction details */ +struct i40e_asq_cmd_details { + void *callback; /* cast from type I40E_ADMINQ_CALLBACK */ + u64 cookie; + u16 flags_ena; + u16 flags_dis; + bool async; + bool postpone; + struct
i40e_aq_desc *wb_desc; +}; +/* I40E_ADMINQ_DETAILS(R, i): address of entry i, treating ring R's cmd_buf.va as an array of struct i40e_asq_cmd_details. No bounds check is performed on i. */ +#define I40E_ADMINQ_DETAILS(R, i) \ + (&(((struct i40e_asq_cmd_details *)((R).cmd_buf.va))[i])) + +/* ARQ event information, as filled in by i40e_clean_arq_element(): desc receives a copy of the dequeued descriptor; msg_len is set to min(descriptor datalen, buf_len); buf_len is only read, as the upper bound on msg_len (presumably the capacity of msg_buf - confirm with callers); msg_buf, when non-NULL and msg_len is non-zero, receives msg_len bytes of the event buffer. */ +struct i40e_arq_event_info { + struct i40e_aq_desc desc; + u16 msg_len; + u16 buf_len; + u8 *msg_buf; +}; + +/* Admin Queue information */ +struct i40e_adminq_info { + struct i40e_adminq_ring arq; /* receive queue */ + struct i40e_adminq_ring asq; /* send queue */ + u32 asq_cmd_timeout; /* send queue cmd write back timeout */ + u16 num_arq_entries; /* receive queue depth */ + u16 num_asq_entries; /* send queue depth */ + u16 arq_buf_size; /* receive queue buffer size */ + u16 asq_buf_size; /* send queue buffer size */ + u16 fw_maj_ver; /* firmware major version */ + u16 fw_min_ver; /* firmware minor version */ + u32 fw_build; /* firmware build number */ + u16 api_maj_ver; /* api major version */ + u16 api_min_ver; /* api minor version */ + bool nvm_release_on_done; +/* nvm_release_on_done (above): when set, i40e_clean_arq_element() calls i40e_release_nvm() on an NVM update event and then clears the flag. */ + struct i40e_spinlock asq_spinlock; /* Send queue spinlock */ + struct i40e_spinlock arq_spinlock; /* Receive queue spinlock */ + + /* last status values on send and receive queues */ + enum i40e_admin_queue_err asq_last_status; + enum i40e_admin_queue_err arq_last_status; +}; + +/* general information; i40e_clean_arq_element() sets I40E_AQ_FLAG_LB on a re-armed receive descriptor when arq_buf_size exceeds I40E_AQ_LARGE_BUF */ +#define I40E_AQ_LARGE_BUF 512 +#define I40E_ASQ_CMD_TIMEOUT 250 /* msecs */ + +void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, + u16 opcode); + +#endif /* _I40E_ADMINQ_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h b/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h new file mode 100644 index 0000000000..af9f107597 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_adminq_cmd.h @@ -0,0 +1,2424 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1.
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_adminq_cmd.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_ADMINQ_CMD_H_ +#define _I40E_ADMINQ_CMD_H_ + +/* This header file defines the i40e Admin Queue commands and is shared between + * i40e Firmware and Software. + * + * This file needs to comply with the Linux Kernel coding style. 
+ */ + +#define I40E_FW_API_VERSION_MAJOR 0x0001 +#ifdef X722_SUPPORT +#define I40E_FW_API_VERSION_MINOR 0x0003 +#else +#define I40E_FW_API_VERSION_MINOR 0x0004 +#endif +/* Admin queue descriptor: four 16-bit header fields (flags, opcode, datalen, retval), cookie_high/cookie_low, and a 16-byte params union that can be viewed as four internal 32-bit params, as two params plus an external buffer address (addr_high/addr_low), or as raw bytes. Fields are declared __le16/__le32 and are converted with LE16_TO_CPU()/CPU_TO_LE16()/CPU_TO_LE32() by i40e_clean_arq_element(). */ +struct i40e_aq_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 retval; + __le32 cookie_high; + __le32 cookie_low; + union { + struct { + __le32 param0; + __le32 param1; + __le32 param2; + __le32 param3; + } internal; + struct { + __le32 param0; + __le32 param1; + __le32 addr_high; + __le32 addr_low; + } external; + u8 raw[16]; + } params; +}; + +/* Flags sub-structure + * |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 | + * |DD |CMP|ERR|VFE| * * RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE | + */ + +/* command flags and offsets: each I40E_AQ_FLAG_x below is (1 << I40E_AQ_FLAG_x_SHIFT) */ +#define I40E_AQ_FLAG_DD_SHIFT 0 +#define I40E_AQ_FLAG_CMP_SHIFT 1 +#define I40E_AQ_FLAG_ERR_SHIFT 2 +#define I40E_AQ_FLAG_VFE_SHIFT 3 +#define I40E_AQ_FLAG_LB_SHIFT 9 +#define I40E_AQ_FLAG_RD_SHIFT 10 +#define I40E_AQ_FLAG_VFC_SHIFT 11 +#define I40E_AQ_FLAG_BUF_SHIFT 12 +#define I40E_AQ_FLAG_SI_SHIFT 13 +#define I40E_AQ_FLAG_EI_SHIFT 14 +#define I40E_AQ_FLAG_FE_SHIFT 15 + +#define I40E_AQ_FLAG_DD (1 << I40E_AQ_FLAG_DD_SHIFT) /* 0x1 */ +#define I40E_AQ_FLAG_CMP (1 << I40E_AQ_FLAG_CMP_SHIFT) /* 0x2 */ +#define I40E_AQ_FLAG_ERR (1 << I40E_AQ_FLAG_ERR_SHIFT) /* 0x4 */ +#define I40E_AQ_FLAG_VFE (1 << I40E_AQ_FLAG_VFE_SHIFT) /* 0x8 */ +#define I40E_AQ_FLAG_LB (1 << I40E_AQ_FLAG_LB_SHIFT) /* 0x200 */ +#define I40E_AQ_FLAG_RD (1 << I40E_AQ_FLAG_RD_SHIFT) /* 0x400 */ +#define I40E_AQ_FLAG_VFC (1 << I40E_AQ_FLAG_VFC_SHIFT) /* 0x800 */ +#define I40E_AQ_FLAG_BUF (1 << I40E_AQ_FLAG_BUF_SHIFT) /* 0x1000 */ +#define I40E_AQ_FLAG_SI (1 << I40E_AQ_FLAG_SI_SHIFT) /* 0x2000 */ +#define I40E_AQ_FLAG_EI (1 << I40E_AQ_FLAG_EI_SHIFT) /* 0x4000 */ +#define I40E_AQ_FLAG_FE (1 << I40E_AQ_FLAG_FE_SHIFT) /* 0x8000 */ + +/* error codes; i40e_clean_arq_element() stores a descriptor's retval, cast to this enum, in hw->aq.arq_last_status when I40E_AQ_FLAG_ERR is set */ +enum i40e_admin_queue_err { + I40E_AQ_RC_OK = 0, /* success */ + I40E_AQ_RC_EPERM = 1, /* Operation not permitted */ + I40E_AQ_RC_ENOENT =
2, /* No such element */ + I40E_AQ_RC_ESRCH = 3, /* Bad opcode */ + I40E_AQ_RC_EINTR = 4, /* operation interrupted */ + I40E_AQ_RC_EIO = 5, /* I/O error */ + I40E_AQ_RC_ENXIO = 6, /* No such resource */ + I40E_AQ_RC_E2BIG = 7, /* Arg too long */ + I40E_AQ_RC_EAGAIN = 8, /* Try again */ + I40E_AQ_RC_ENOMEM = 9, /* Out of memory */ + I40E_AQ_RC_EACCES = 10, /* Permission denied */ + I40E_AQ_RC_EFAULT = 11, /* Bad address */ + I40E_AQ_RC_EBUSY = 12, /* Device or resource busy */ + I40E_AQ_RC_EEXIST = 13, /* object already exists */ + I40E_AQ_RC_EINVAL = 14, /* Invalid argument */ + I40E_AQ_RC_ENOTTY = 15, /* Not a typewriter */ + I40E_AQ_RC_ENOSPC = 16, /* No space left or alloc failure */ + I40E_AQ_RC_ENOSYS = 17, /* Function not implemented */ + I40E_AQ_RC_ERANGE = 18, /* Parameter out of range */ + I40E_AQ_RC_EFLUSHED = 19, /* Cmd flushed due to prev cmd error */ + I40E_AQ_RC_BAD_ADDR = 20, /* Descriptor contains a bad pointer */ + I40E_AQ_RC_EMODE = 21, /* Op not allowed in current dev mode */ + I40E_AQ_RC_EFBIG = 22, /* File too large */ +}; + +/* Admin Queue command opcodes; the per-command comments further down in this header quote these values as "direct 0x...." or "indirect 0x...." */ +enum i40e_admin_queue_opc { + /* aq commands */ + i40e_aqc_opc_get_version = 0x0001, + i40e_aqc_opc_driver_version = 0x0002, + i40e_aqc_opc_queue_shutdown = 0x0003, + i40e_aqc_opc_set_pf_context = 0x0004, + + /* resource ownership */ + i40e_aqc_opc_request_resource = 0x0008, + i40e_aqc_opc_release_resource = 0x0009, + + i40e_aqc_opc_list_func_capabilities = 0x000A, + i40e_aqc_opc_list_dev_capabilities = 0x000B, + + /* LAA */ + i40e_aqc_opc_mac_address_read = 0x0107, + i40e_aqc_opc_mac_address_write = 0x0108, + + /* PXE */ + i40e_aqc_opc_clear_pxe_mode = 0x0110, + + /* internal switch commands */ + i40e_aqc_opc_get_switch_config = 0x0200, + i40e_aqc_opc_add_statistics = 0x0201, + i40e_aqc_opc_remove_statistics = 0x0202, + i40e_aqc_opc_set_port_parameters = 0x0203, + i40e_aqc_opc_get_switch_resource_alloc = 0x0204, + + i40e_aqc_opc_add_vsi = 0x0210, + i40e_aqc_opc_update_vsi_parameters =
0x0211, + i40e_aqc_opc_get_vsi_parameters = 0x0212, + + i40e_aqc_opc_add_pv = 0x0220, + i40e_aqc_opc_update_pv_parameters = 0x0221, + i40e_aqc_opc_get_pv_parameters = 0x0222, + + i40e_aqc_opc_add_veb = 0x0230, + i40e_aqc_opc_update_veb_parameters = 0x0231, + i40e_aqc_opc_get_veb_parameters = 0x0232, + + i40e_aqc_opc_delete_element = 0x0243, + + i40e_aqc_opc_add_macvlan = 0x0250, + i40e_aqc_opc_remove_macvlan = 0x0251, + i40e_aqc_opc_add_vlan = 0x0252, + i40e_aqc_opc_remove_vlan = 0x0253, + i40e_aqc_opc_set_vsi_promiscuous_modes = 0x0254, + i40e_aqc_opc_add_tag = 0x0255, + i40e_aqc_opc_remove_tag = 0x0256, + i40e_aqc_opc_add_multicast_etag = 0x0257, + i40e_aqc_opc_remove_multicast_etag = 0x0258, + i40e_aqc_opc_update_tag = 0x0259, + i40e_aqc_opc_add_control_packet_filter = 0x025A, + i40e_aqc_opc_remove_control_packet_filter = 0x025B, + i40e_aqc_opc_add_cloud_filters = 0x025C, + i40e_aqc_opc_remove_cloud_filters = 0x025D, + + i40e_aqc_opc_add_mirror_rule = 0x0260, + i40e_aqc_opc_delete_mirror_rule = 0x0261, + + /* DCB commands */ + i40e_aqc_opc_dcb_ignore_pfc = 0x0301, + i40e_aqc_opc_dcb_updated = 0x0302, + + /* TX scheduler */ + i40e_aqc_opc_configure_vsi_bw_limit = 0x0400, + i40e_aqc_opc_configure_vsi_ets_sla_bw_limit = 0x0406, + i40e_aqc_opc_configure_vsi_tc_bw = 0x0407, + i40e_aqc_opc_query_vsi_bw_config = 0x0408, + i40e_aqc_opc_query_vsi_ets_sla_config = 0x040A, + i40e_aqc_opc_configure_switching_comp_bw_limit = 0x0410, + + i40e_aqc_opc_enable_switching_comp_ets = 0x0413, + i40e_aqc_opc_modify_switching_comp_ets = 0x0414, + i40e_aqc_opc_disable_switching_comp_ets = 0x0415, + i40e_aqc_opc_configure_switching_comp_ets_bw_limit = 0x0416, + i40e_aqc_opc_configure_switching_comp_bw_config = 0x0417, + i40e_aqc_opc_query_switching_comp_ets_config = 0x0418, + i40e_aqc_opc_query_port_ets_config = 0x0419, + i40e_aqc_opc_query_switching_comp_bw_config = 0x041A, + i40e_aqc_opc_suspend_port_tx = 0x041B, + i40e_aqc_opc_resume_port_tx = 0x041C, +
i40e_aqc_opc_configure_partition_bw = 0x041D, + + /* hmc */ + i40e_aqc_opc_query_hmc_resource_profile = 0x0500, + i40e_aqc_opc_set_hmc_resource_profile = 0x0501, + + /* phy commands */ + i40e_aqc_opc_get_phy_abilities = 0x0600, + i40e_aqc_opc_set_phy_config = 0x0601, + i40e_aqc_opc_set_mac_config = 0x0603, + i40e_aqc_opc_set_link_restart_an = 0x0605, + i40e_aqc_opc_get_link_status = 0x0607, + i40e_aqc_opc_set_phy_int_mask = 0x0613, + i40e_aqc_opc_get_local_advt_reg = 0x0614, + i40e_aqc_opc_set_local_advt_reg = 0x0615, + i40e_aqc_opc_get_partner_advt = 0x0616, + i40e_aqc_opc_set_lb_modes = 0x0618, + i40e_aqc_opc_get_phy_wol_caps = 0x0621, + i40e_aqc_opc_set_phy_debug = 0x0622, + i40e_aqc_opc_upload_ext_phy_fm = 0x0625, + + /* NVM commands */ + i40e_aqc_opc_nvm_read = 0x0701, + i40e_aqc_opc_nvm_erase = 0x0702, + i40e_aqc_opc_nvm_update = 0x0703, + i40e_aqc_opc_nvm_config_read = 0x0704, + i40e_aqc_opc_nvm_config_write = 0x0705, + i40e_aqc_opc_oem_post_update = 0x0720, + + /* virtualization commands */ + i40e_aqc_opc_send_msg_to_pf = 0x0801, + i40e_aqc_opc_send_msg_to_vf = 0x0802, + i40e_aqc_opc_send_msg_to_peer = 0x0803, + + /* alternate structure */ + i40e_aqc_opc_alternate_write = 0x0900, + i40e_aqc_opc_alternate_write_indirect = 0x0901, + i40e_aqc_opc_alternate_read = 0x0902, + i40e_aqc_opc_alternate_read_indirect = 0x0903, + i40e_aqc_opc_alternate_write_done = 0x0904, + i40e_aqc_opc_alternate_set_mode = 0x0905, + i40e_aqc_opc_alternate_clear_port = 0x0906, + + /* LLDP commands */ + i40e_aqc_opc_lldp_get_mib = 0x0A00, + i40e_aqc_opc_lldp_update_mib = 0x0A01, + i40e_aqc_opc_lldp_add_tlv = 0x0A02, + i40e_aqc_opc_lldp_update_tlv = 0x0A03, + i40e_aqc_opc_lldp_delete_tlv = 0x0A04, + i40e_aqc_opc_lldp_stop = 0x0A05, + i40e_aqc_opc_lldp_start = 0x0A06, + i40e_aqc_opc_get_cee_dcb_cfg = 0x0A07, + i40e_aqc_opc_lldp_set_local_mib = 0x0A08, + i40e_aqc_opc_lldp_stop_start_spec_agent = 0x0A09, + + /* Tunnel commands */ + i40e_aqc_opc_add_udp_tunnel = 0x0B00, +
i40e_aqc_opc_del_udp_tunnel = 0x0B01, +#ifdef X722_SUPPORT + i40e_aqc_opc_set_rss_key = 0x0B02, + i40e_aqc_opc_set_rss_lut = 0x0B03, + i40e_aqc_opc_get_rss_key = 0x0B04, + i40e_aqc_opc_get_rss_lut = 0x0B05, +#endif + + /* Async Events */ + i40e_aqc_opc_event_lan_overflow = 0x1001, + + /* OEM commands */ + i40e_aqc_opc_oem_parameter_change = 0xFE00, + i40e_aqc_opc_oem_device_status_change = 0xFE01, + i40e_aqc_opc_oem_ocsd_initialize = 0xFE02, + i40e_aqc_opc_oem_ocbb_initialize = 0xFE03, + + /* debug commands */ + i40e_aqc_opc_debug_read_reg = 0xFF03, + i40e_aqc_opc_debug_write_reg = 0xFF04, + i40e_aqc_opc_debug_modify_reg = 0xFF07, + i40e_aqc_opc_debug_dump_internals = 0xFF08, +}; + +/* command structures and indirect data structures */ + +/* Structure naming conventions: + * - no suffix for direct command descriptor structures + * - _data for indirect sent data + * - _resp for indirect return data (data which is both will use _data) + * - _completion for direct return data + * - _element_ for repeated elements (may also be _data or _resp) + * + * Command structures are expected to overlay the params.raw member of the basic + * descriptor, and as such cannot exceed 16 bytes in length. + */ + +/* This macro is used to generate a compilation error if a structure + * is not exactly the correct length. It gives a divide by zero error if the + * structure is not of the correct size, otherwise it creates an enum that is + * never used. The struct tag X is pasted into both the enum tag and the + * enumerator name, so each struct can be checked only once per translation + * unit. + */ +#define I40E_CHECK_STRUCT_LEN(n, X) enum i40e_static_assert_enum_##X \ + { i40e_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } + +/* This macro is used extensively to ensure that command structures are 16 + * bytes in length as they have to map to the raw array of that size.
+ */ +#define I40E_CHECK_CMD_LENGTH(X) I40E_CHECK_STRUCT_LEN(16, X) + +/* internal (0x00XX) commands */ + +/* Get version (direct 0x0001) */ +struct i40e_aqc_get_version { + __le32 rom_ver; + __le32 fw_build; + __le16 fw_major; + __le16 fw_minor; + __le16 api_major; + __le16 api_minor; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_version); + +/* Send driver version (indirect 0x0002) */ +struct i40e_aqc_driver_version { + u8 driver_major_ver; + u8 driver_minor_ver; + u8 driver_build_ver; + u8 driver_subbuild_ver; + u8 reserved[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_driver_version); + +/* Queue Shutdown (direct 0x0003) */ +struct i40e_aqc_queue_shutdown { + __le32 driver_unloading; +#define I40E_AQ_DRIVER_UNLOADING 0x1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_queue_shutdown); + +/* Set PF context (0x0004, direct) */ +struct i40e_aqc_set_pf_context { + u8 pf_id; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_pf_context); + +/* Request resource ownership (direct 0x0008) + * Release resource ownership (direct 0x0009) + */ +#define I40E_AQ_RESOURCE_NVM 1 +#define I40E_AQ_RESOURCE_SDP 2 +#define I40E_AQ_RESOURCE_ACCESS_READ 1 +#define I40E_AQ_RESOURCE_ACCESS_WRITE 2 +#define I40E_AQ_RESOURCE_NVM_READ_TIMEOUT 3000 +#define I40E_AQ_RESOURCE_NVM_WRITE_TIMEOUT 180000 + +struct i40e_aqc_request_resource { + __le16 resource_id; + __le16 access_type; + __le32 timeout; + __le32 resource_number; + u8 reserved[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_request_resource); + +/* Get function capabilities (indirect 0x000A) + * Get device capabilities (indirect 0x000B) + */ +struct i40e_aqc_list_capabilites { + u8 command_flags; +#define I40E_AQ_LIST_CAP_PF_INDEX_EN 1 + u8 pf_index; + u8 reserved[2]; + __le32 count; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_list_capabilites); +/* One element of the capability list response; no length check is declared for this struct. NOTE(review): id presumably carries one of the I40E_AQ_CAP_ID_* values listed below - confirm against the code that parses the response. */ +struct i40e_aqc_list_capabilities_element_resp { + __le16 id; + u8 major_rev; + u8 minor_rev; +
__le32 number; + __le32 logical_id; + __le32 phys_id; + u8 reserved[16]; +}; + +/* list of caps */ + +#define I40E_AQ_CAP_ID_SWITCH_MODE 0x0001 +#define I40E_AQ_CAP_ID_MNG_MODE 0x0002 +#define I40E_AQ_CAP_ID_NPAR_ACTIVE 0x0003 +#define I40E_AQ_CAP_ID_OS2BMC_CAP 0x0004 +#define I40E_AQ_CAP_ID_FUNCTIONS_VALID 0x0005 +#define I40E_AQ_CAP_ID_ALTERNATE_RAM 0x0006 +#define I40E_AQ_CAP_ID_SRIOV 0x0012 +#define I40E_AQ_CAP_ID_VF 0x0013 +#define I40E_AQ_CAP_ID_VMDQ 0x0014 +#define I40E_AQ_CAP_ID_8021QBG 0x0015 +#define I40E_AQ_CAP_ID_8021QBR 0x0016 +#define I40E_AQ_CAP_ID_VSI 0x0017 +#define I40E_AQ_CAP_ID_DCB 0x0018 +#define I40E_AQ_CAP_ID_FCOE 0x0021 +#define I40E_AQ_CAP_ID_ISCSI 0x0022 +#define I40E_AQ_CAP_ID_RSS 0x0040 +#define I40E_AQ_CAP_ID_RXQ 0x0041 +#define I40E_AQ_CAP_ID_TXQ 0x0042 +#define I40E_AQ_CAP_ID_MSIX 0x0043 +#define I40E_AQ_CAP_ID_VF_MSIX 0x0044 +#define I40E_AQ_CAP_ID_FLOW_DIRECTOR 0x0045 +#define I40E_AQ_CAP_ID_1588 0x0046 +#define I40E_AQ_CAP_ID_IWARP 0x0051 +#define I40E_AQ_CAP_ID_LED 0x0061 +#define I40E_AQ_CAP_ID_SDP 0x0062 +#define I40E_AQ_CAP_ID_MDIO 0x0063 +#define I40E_AQ_CAP_ID_FLEX10 0x00F1 +#define I40E_AQ_CAP_ID_CEM 0x00F2 + +/* Set CPPM Configuration (direct 0x0103) */ +struct i40e_aqc_cppm_configuration { + __le16 command_flags; +#define I40E_AQ_CPPM_EN_LTRC 0x0800 +#define I40E_AQ_CPPM_EN_DMCTH 0x1000 +#define I40E_AQ_CPPM_EN_DMCTLX 0x2000 +#define I40E_AQ_CPPM_EN_HPTC 0x4000 +#define I40E_AQ_CPPM_EN_DMARC 0x8000 + __le16 ttlx; + __le32 dmacr; + __le16 dmcth; + u8 hptc; + u8 reserved; + __le32 pfltrc; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_cppm_configuration); + +/* Set ARP Proxy command / response (indirect 0x0104) */ +struct i40e_aqc_arp_proxy_data { + __le16 command_flags; +#define I40E_AQ_ARP_INIT_IPV4 0x0008 +#define I40E_AQ_ARP_UNSUP_CTL 0x0010 +#define I40E_AQ_ARP_ENA 0x0020 +#define I40E_AQ_ARP_ADD_IPV4 0x0040 +#define I40E_AQ_ARP_DEL_IPV4 0x0080 + __le16 table_id; + __le32 pfpm_proxyfc; + __le32 ip_addr; + u8 mac_addr[6]; + u8
reserved[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x14, i40e_aqc_arp_proxy_data); + +/* Set NS Proxy Table Entry Command (indirect 0x0105) */ +struct i40e_aqc_ns_proxy_data { + __le16 table_idx_mac_addr_0; + __le16 table_idx_mac_addr_1; + __le16 table_idx_ipv6_0; + __le16 table_idx_ipv6_1; + __le16 control; +#define I40E_AQ_NS_PROXY_ADD_0 0x0100 +#define I40E_AQ_NS_PROXY_DEL_0 0x0200 +#define I40E_AQ_NS_PROXY_ADD_1 0x0400 +#define I40E_AQ_NS_PROXY_DEL_1 0x0800 +#define I40E_AQ_NS_PROXY_ADD_IPV6_0 0x1000 +#define I40E_AQ_NS_PROXY_DEL_IPV6_0 0x2000 +#define I40E_AQ_NS_PROXY_ADD_IPV6_1 0x4000 +#define I40E_AQ_NS_PROXY_DEL_IPV6_1 0x8000 +#define I40E_AQ_NS_PROXY_COMMAND_SEQ 0x0001 +#define I40E_AQ_NS_PROXY_INIT_IPV6_TBL 0x0002 +#define I40E_AQ_NS_PROXY_INIT_MAC_TBL 0x0004 + u8 mac_addr_0[6]; + u8 mac_addr_1[6]; + u8 local_mac_addr[6]; + u8 ipv6_addr_0[16]; /* Warning! spec specifies BE byte order */ + u8 ipv6_addr_1[16]; +}; + +I40E_CHECK_STRUCT_LEN(0x3c, i40e_aqc_ns_proxy_data); + +/* Manage LAA Command (0x0106) - obsolete */ +struct i40e_aqc_mng_laa { + __le16 command_flags; +#define I40E_AQ_LAA_FLAG_WR 0x8000 + u8 reserved[2]; + __le32 sal; + __le16 sah; + u8 reserved2[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mng_laa); + +/* Manage MAC Address Read Command (indirect 0x0107) */ +struct i40e_aqc_mac_address_read { + __le16 command_flags; +#define I40E_AQC_LAN_ADDR_VALID 0x10 +#define I40E_AQC_SAN_ADDR_VALID 0x20 +#define I40E_AQC_PORT_ADDR_VALID 0x40 +#define I40E_AQC_WOL_ADDR_VALID 0x80 +#define I40E_AQC_MC_MAG_EN_VALID 0x100 +#define I40E_AQC_ADDR_VALID_MASK 0x1F0 + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_read); +/* Four consecutive 6-byte MAC addresses, 24 bytes in total (enforced by the length check below). NOTE(review): presumably this is the indirect buffer of the MAC address read command and each address is meaningful only when the matching I40E_AQC_*_ADDR_VALID bit is set - confirm against the caller. */ +struct i40e_aqc_mac_address_read_data { + u8 pf_lan_mac[6]; + u8 pf_san_mac[6]; + u8 port_mac[6]; + u8 pf_wol_mac[6]; +}; + +I40E_CHECK_STRUCT_LEN(24, i40e_aqc_mac_address_read_data); + +/* Manage MAC Address Write Command (0x0108) */ +struct i40e_aqc_mac_address_write { + __le16 command_flags;
+#define I40E_AQC_WRITE_TYPE_LAA_ONLY 0x0000 +#define I40E_AQC_WRITE_TYPE_LAA_WOL 0x4000 +#define I40E_AQC_WRITE_TYPE_PORT 0x8000 +#define I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG 0xC000 +#define I40E_AQC_WRITE_TYPE_MASK 0xC000 + + __le16 mac_sah; + __le32 mac_sal; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_write); + +/* PXE commands (0x011x) */ + +/* Clear PXE Command and response (direct 0x0110) */ +struct i40e_aqc_clear_pxe { + u8 rx_cnt; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_clear_pxe); + +/* Switch configuration commands (0x02xx) */ + +/* Used by many indirect commands that only pass an seid and a buffer in the + * command + */ +struct i40e_aqc_switch_seid { + __le16 seid; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_switch_seid); + +/* Get Switch Configuration command (indirect 0x0200) + * uses i40e_aqc_switch_seid for the descriptor + */ +struct i40e_aqc_get_switch_config_header_resp { + __le16 num_reported; + __le16 num_total; + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_config_header_resp); + +struct i40e_aqc_switch_config_element_resp { + u8 element_type; +#define I40E_AQ_SW_ELEM_TYPE_MAC 1 +#define I40E_AQ_SW_ELEM_TYPE_PF 2 +#define I40E_AQ_SW_ELEM_TYPE_VF 3 +#define I40E_AQ_SW_ELEM_TYPE_EMP 4 +#define I40E_AQ_SW_ELEM_TYPE_BMC 5 +#define I40E_AQ_SW_ELEM_TYPE_PV 16 +#define I40E_AQ_SW_ELEM_TYPE_VEB 17 +#define I40E_AQ_SW_ELEM_TYPE_PA 18 +#define I40E_AQ_SW_ELEM_TYPE_VSI 19 + u8 revision; +#define I40E_AQ_SW_ELEM_REV_1 1 + __le16 seid; + __le16 uplink_seid; + __le16 downlink_seid; + u8 reserved[3]; + u8 connection_type; +#define I40E_AQ_CONN_TYPE_REGULAR 0x1 +#define I40E_AQ_CONN_TYPE_DEFAULT 0x2 +#define I40E_AQ_CONN_TYPE_CASCADED 0x3 + __le16 scheduler_id; + __le16 element_info; +}; + +I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_config_element_resp); + +/* Get Switch Configuration (indirect 0x0200) + * an array of elements is returned in the
response buffer + * the first in the array is the header, remainder are elements + * (element[] is declared with a single entry; the length check below pins + * the struct at 0x20 bytes, i.e. the 16-byte header plus one 16-byte element) + */ +struct i40e_aqc_get_switch_config_resp { + struct i40e_aqc_get_switch_config_header_resp header; + struct i40e_aqc_switch_config_element_resp element[1]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_switch_config_resp); + +/* Add Statistics (direct 0x0201) + * Remove Statistics (direct 0x0202) + */ +struct i40e_aqc_add_remove_statistics { + __le16 seid; + __le16 vlan; + __le16 stat_index; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_statistics); + +/* Set Port Parameters command (direct 0x0203) */ +struct i40e_aqc_set_port_parameters { + __le16 command_flags; +#define I40E_AQ_SET_P_PARAMS_SAVE_BAD_PACKETS 1 +#define I40E_AQ_SET_P_PARAMS_PAD_SHORT_PACKETS 2 /* must set! */ +#define I40E_AQ_SET_P_PARAMS_DOUBLE_VLAN_ENA 4 + __le16 bad_frame_vsi; + __le16 default_seid; /* reserved for command */ + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_port_parameters); + +/* Get Switch Resource Allocation (indirect 0x0204) */ +struct i40e_aqc_get_switch_resource_alloc { + u8 num_entries; /* reserved for command */ + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_resource_alloc); + +/* expect an array of these structs in the response buffer */ +struct i40e_aqc_switch_resource_alloc_element_resp { + u8 resource_type; +#define I40E_AQ_RESOURCE_TYPE_VEB 0x0 +#define I40E_AQ_RESOURCE_TYPE_VSI 0x1 +#define I40E_AQ_RESOURCE_TYPE_MACADDR 0x2 +#define I40E_AQ_RESOURCE_TYPE_STAG 0x3 +#define I40E_AQ_RESOURCE_TYPE_ETAG 0x4 +#define I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH 0x5 +#define I40E_AQ_RESOURCE_TYPE_UNICAST_HASH 0x6 +#define I40E_AQ_RESOURCE_TYPE_VLAN 0x7 +#define I40E_AQ_RESOURCE_TYPE_VSI_LIST_ENTRY 0x8 +#define I40E_AQ_RESOURCE_TYPE_ETAG_LIST_ENTRY 0x9 +#define I40E_AQ_RESOURCE_TYPE_VLAN_STAT_POOL 0xA +#define I40E_AQ_RESOURCE_TYPE_MIRROR_RULE 0xB +#define I40E_AQ_RESOURCE_TYPE_QUEUE_SETS 0xC +#define
I40E_AQ_RESOURCE_TYPE_VLAN_FILTERS 0xD +#define I40E_AQ_RESOURCE_TYPE_INNER_MAC_FILTERS 0xF +#define I40E_AQ_RESOURCE_TYPE_IP_FILTERS 0x10 +#define I40E_AQ_RESOURCE_TYPE_GRE_VN_KEYS 0x11 +#define I40E_AQ_RESOURCE_TYPE_VN2_KEYS 0x12 +#define I40E_AQ_RESOURCE_TYPE_TUNNEL_PORTS 0x13 + u8 reserved1; + __le16 guaranteed; + __le16 total; + __le16 used; + __le16 total_unalloced; + u8 reserved2[6]; +}; + +I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_resource_alloc_element_resp); + +/* Add VSI (indirect 0x0210) + * this indirect command uses struct i40e_aqc_vsi_properties_data + * as the indirect buffer (128 bytes) + * + * Update VSI (indirect 0x0211) + * uses the same data structure as Add VSI + * + * Get VSI (indirect 0x0212) + * uses the same completion and data structure as Add VSI + */ +struct i40e_aqc_add_get_update_vsi { + __le16 uplink_seid; + u8 connection_type; +#define I40E_AQ_VSI_CONN_TYPE_NORMAL 0x1 +#define I40E_AQ_VSI_CONN_TYPE_DEFAULT 0x2 +#define I40E_AQ_VSI_CONN_TYPE_CASCADED 0x3 + u8 reserved1; + u8 vf_id; + u8 reserved2; + __le16 vsi_flags; +#define I40E_AQ_VSI_TYPE_SHIFT 0x0 +#define I40E_AQ_VSI_TYPE_MASK (0x3 << I40E_AQ_VSI_TYPE_SHIFT) +#define I40E_AQ_VSI_TYPE_VF 0x0 +#define I40E_AQ_VSI_TYPE_VMDQ2 0x1 +#define I40E_AQ_VSI_TYPE_PF 0x2 +#define I40E_AQ_VSI_TYPE_EMP_MNG 0x3 +#define I40E_AQ_VSI_FLAG_CASCADED_PV 0x4 + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi); +/* 16-byte completion layout named for the Add/Get/Update VSI commands described above; like the command struct it ends with addr_high/addr_low. */ +struct i40e_aqc_add_get_update_vsi_completion { + __le16 seid; + __le16 vsi_number; + __le16 vsi_used; + __le16 vsi_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi_completion); +/* Indirect buffer for Add/Update/Get VSI; its size is pinned at 128 bytes by the I40E_CHECK_STRUCT_LEN that follows the struct. The #defines interleaved with the fields give the bit values for the field declared immediately above them. */ +struct i40e_aqc_vsi_properties_data { + /* first 96 bytes are written by SW */ + __le16 valid_sections; +#define I40E_AQ_VSI_PROP_SWITCH_VALID 0x0001 +#define I40E_AQ_VSI_PROP_SECURITY_VALID 0x0002 +#define I40E_AQ_VSI_PROP_VLAN_VALID 0x0004 +#define I40E_AQ_VSI_PROP_CAS_PV_VALID 0x0008 +#define
I40E_AQ_VSI_PROP_INGRESS_UP_VALID 0x0010 +#define I40E_AQ_VSI_PROP_EGRESS_UP_VALID 0x0020 +#define I40E_AQ_VSI_PROP_QUEUE_MAP_VALID 0x0040 +#define I40E_AQ_VSI_PROP_QUEUE_OPT_VALID 0x0080 +#define I40E_AQ_VSI_PROP_OUTER_UP_VALID 0x0100 +#define I40E_AQ_VSI_PROP_SCHED_VALID 0x0200 + /* switch section */ + __le16 switch_id; /* 12bit id combined with flags below */ +#define I40E_AQ_VSI_SW_ID_SHIFT 0x0000 +#define I40E_AQ_VSI_SW_ID_MASK (0xFFF << I40E_AQ_VSI_SW_ID_SHIFT) +#define I40E_AQ_VSI_SW_ID_FLAG_NOT_STAG 0x1000 +#define I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB 0x2000 +#define I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB 0x4000 + u8 sw_reserved[2]; + /* security section */ + u8 sec_flags; +#define I40E_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD 0x01 +#define I40E_AQ_VSI_SEC_FLAG_ENABLE_VLAN_CHK 0x02 +#define I40E_AQ_VSI_SEC_FLAG_ENABLE_MAC_CHK 0x04 + u8 sec_reserved; + /* VLAN section */ + __le16 pvid; /* VLANS include priority bits */ + __le16 fcoe_pvid; + u8 port_vlan_flags; +#define I40E_AQ_VSI_PVLAN_MODE_SHIFT 0x00 +#define I40E_AQ_VSI_PVLAN_MODE_MASK (0x03 << \ + I40E_AQ_VSI_PVLAN_MODE_SHIFT) +#define I40E_AQ_VSI_PVLAN_MODE_TAGGED 0x01 +#define I40E_AQ_VSI_PVLAN_MODE_UNTAGGED 0x02 +#define I40E_AQ_VSI_PVLAN_MODE_ALL 0x03 +#define I40E_AQ_VSI_PVLAN_INSERT_PVID 0x04 +#define I40E_AQ_VSI_PVLAN_EMOD_SHIFT 0x03 +#define I40E_AQ_VSI_PVLAN_EMOD_MASK (0x3 << \ + I40E_AQ_VSI_PVLAN_EMOD_SHIFT) +#define I40E_AQ_VSI_PVLAN_EMOD_STR_BOTH 0x0 +#define I40E_AQ_VSI_PVLAN_EMOD_STR_UP 0x08 +#define I40E_AQ_VSI_PVLAN_EMOD_STR 0x10 +#define I40E_AQ_VSI_PVLAN_EMOD_NOTHING 0x18 + u8 pvlan_reserved[3]; + /* ingress egress up sections */ + __le32 ingress_table; /* bitmap, 3 bits per up */ +#define I40E_AQ_VSI_UP_TABLE_UP0_SHIFT 0 +#define I40E_AQ_VSI_UP_TABLE_UP0_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP0_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP1_SHIFT 3 +#define I40E_AQ_VSI_UP_TABLE_UP1_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP1_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP2_SHIFT 6 +#define I40E_AQ_VSI_UP_TABLE_UP2_MASK
(0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP2_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP3_SHIFT 9 +#define I40E_AQ_VSI_UP_TABLE_UP3_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP3_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP4_SHIFT 12 +#define I40E_AQ_VSI_UP_TABLE_UP4_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP4_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP5_SHIFT 15 +#define I40E_AQ_VSI_UP_TABLE_UP5_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP5_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP6_SHIFT 18 +#define I40E_AQ_VSI_UP_TABLE_UP6_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP6_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP7_SHIFT 21 +#define I40E_AQ_VSI_UP_TABLE_UP7_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP7_SHIFT) + __le32 egress_table; /* same defines as for ingress table */ + /* cascaded PV section */ + __le16 cas_pv_tag; + u8 cas_pv_flags; +#define I40E_AQ_VSI_CAS_PV_TAGX_SHIFT 0x00 +#define I40E_AQ_VSI_CAS_PV_TAGX_MASK (0x03 << \ + I40E_AQ_VSI_CAS_PV_TAGX_SHIFT) +#define I40E_AQ_VSI_CAS_PV_TAGX_LEAVE 0x00 +#define I40E_AQ_VSI_CAS_PV_TAGX_REMOVE 0x01 +#define I40E_AQ_VSI_CAS_PV_TAGX_COPY 0x02 +#define I40E_AQ_VSI_CAS_PV_INSERT_TAG 0x10 +#define I40E_AQ_VSI_CAS_PV_ETAG_PRUNE 0x20 +#define I40E_AQ_VSI_CAS_PV_ACCEPT_HOST_TAG 0x40 + u8 cas_pv_reserved; + /* queue mapping section */ + __le16 mapping_flags; +#define I40E_AQ_VSI_QUE_MAP_CONTIG 0x0 +#define I40E_AQ_VSI_QUE_MAP_NONCONTIG 0x1 + __le16 queue_mapping[16]; +#define I40E_AQ_VSI_QUEUE_SHIFT 0x0 +#define I40E_AQ_VSI_QUEUE_MASK (0x7FF << I40E_AQ_VSI_QUEUE_SHIFT) + __le16 tc_mapping[8]; +#define I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT 0 +#define I40E_AQ_VSI_TC_QUE_OFFSET_MASK (0x1FF << \ + I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) +#define I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT 9 +#define I40E_AQ_VSI_TC_QUE_NUMBER_MASK (0x7 << \ + I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) + /* queueing option section */ + u8 queueing_opt_flags; +#ifdef X722_SUPPORT +#define I40E_AQ_VSI_QUE_OPT_MULTICAST_UDP_ENA 0x04 +#define I40E_AQ_VSI_QUE_OPT_UNICAST_UDP_ENA 0x08 +#endif +#define I40E_AQ_VSI_QUE_OPT_TCP_ENA
0x10 +#define I40E_AQ_VSI_QUE_OPT_FCOE_ENA 0x20 +#ifdef X722_SUPPORT +#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_PF 0x00 +#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_VSI 0x40 +#endif + u8 queueing_opt_reserved[3]; + /* scheduler section */ + u8 up_enable_bits; + u8 sched_reserved; + /* outer up section */ + __le32 outer_up_table; /* same structure and defines as ingress table */ + u8 cmd_reserved[8]; + /* last 32 bytes are written by FW */ + __le16 qs_handle[8]; +#define I40E_AQ_VSI_QS_HANDLE_INVALID 0xFFFF + __le16 stat_counter_idx; + __le16 sched_id; + u8 resp_reserved[12]; +}; + +I40E_CHECK_STRUCT_LEN(128, i40e_aqc_vsi_properties_data); + +/* Add Port Virtualizer (direct 0x0220) + * also used for update PV (direct 0x0221) but only flags are used + * (IS_CTRL_PORT only works on add PV) + */ +struct i40e_aqc_add_update_pv { + __le16 command_flags; +#define I40E_AQC_PV_FLAG_PV_TYPE 0x1 +#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_STAG_EN 0x2 +#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_ETAG_EN 0x4 +#define I40E_AQC_PV_FLAG_IS_CTRL_PORT 0x8 + __le16 uplink_seid; + __le16 connected_seid; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv); + +struct i40e_aqc_add_update_pv_completion { + /* reserved for update; for add also encodes error if rc == ENOSPC */ + __le16 pv_seid; +#define I40E_AQC_PV_ERR_FLAG_NO_PV 0x1 +#define I40E_AQC_PV_ERR_FLAG_NO_SCHED 0x2 +#define I40E_AQC_PV_ERR_FLAG_NO_COUNTER 0x4 +#define I40E_AQC_PV_ERR_FLAG_NO_ENTRY 0x8 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv_completion); + +/* Get PV Params (direct 0x0222) + * uses i40e_aqc_switch_seid for the descriptor + */ + +struct i40e_aqc_get_pv_params_completion { + __le16 seid; + __le16 default_stag; + __le16 pv_flags; /* same flags as add_pv */ +#define I40E_AQC_GET_PV_PV_TYPE 0x1 +#define I40E_AQC_GET_PV_FRWD_UNKNOWN_STAG 0x2 +#define I40E_AQC_GET_PV_FRWD_UNKNOWN_ETAG 0x4 + u8 reserved[8]; + __le16 default_port_seid; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_pv_params_completion);
+ +/* Add VEB (direct 0x0230) */ +struct i40e_aqc_add_veb { + __le16 uplink_seid; + __le16 downlink_seid; + __le16 veb_flags; +#define I40E_AQC_ADD_VEB_FLOATING 0x1 +#define I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT 1 +#define I40E_AQC_ADD_VEB_PORT_TYPE_MASK (0x3 << \ + I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT) +#define I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT 0x2 +#define I40E_AQC_ADD_VEB_PORT_TYPE_DATA 0x4 +#define I40E_AQC_ADD_VEB_ENABLE_L2_FILTER 0x8 + u8 enable_tcs; + u8 reserved[9]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb); + +struct i40e_aqc_add_veb_completion { + u8 reserved[6]; + __le16 switch_seid; + /* also encodes error if rc == ENOSPC; codes are the same as add_pv */ + __le16 veb_seid; +#define I40E_AQC_VEB_ERR_FLAG_NO_VEB 0x1 +#define I40E_AQC_VEB_ERR_FLAG_NO_SCHED 0x2 +#define I40E_AQC_VEB_ERR_FLAG_NO_COUNTER 0x4 +#define I40E_AQC_VEB_ERR_FLAG_NO_ENTRY 0x8 + __le16 statistic_index; + __le16 vebs_used; + __le16 vebs_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb_completion); + +/* Get VEB Parameters (direct 0x0232) + * uses i40e_aqc_switch_seid for the descriptor + */ +struct i40e_aqc_get_veb_parameters_completion { + __le16 seid; + __le16 switch_id; + __le16 veb_flags; /* only the first/last flags from 0x0230 is valid */ + __le16 statistic_index; + __le16 vebs_used; + __le16 vebs_free; + u8 reserved[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_veb_parameters_completion); + +/* Delete Element (direct 0x0243) + * uses the generic i40e_aqc_switch_seid + */ + +/* Add MAC-VLAN (indirect 0x0250) */ + +/* used for the command for most vlan commands */ +struct i40e_aqc_macvlan { + __le16 num_addresses; + __le16 seid[3]; +#define I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_MACVLAN_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT) +#define I40E_AQC_MACVLAN_CMD_SEID_VALID 0x8000 + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_macvlan); + +/* indirect data for command and response */ +struct 
i40e_aqc_add_macvlan_element_data { + u8 mac_addr[6]; + __le16 vlan_tag; + __le16 flags; +#define I40E_AQC_MACVLAN_ADD_PERFECT_MATCH 0x0001 +#define I40E_AQC_MACVLAN_ADD_HASH_MATCH 0x0002 +#define I40E_AQC_MACVLAN_ADD_IGNORE_VLAN 0x0004 +#define I40E_AQC_MACVLAN_ADD_TO_QUEUE 0x0008 + __le16 queue_number; +/* queue_number field layout: the mask is built from its own QUEUE shift */ +#define I40E_AQC_MACVLAN_CMD_QUEUE_SHIFT 0 +#define I40E_AQC_MACVLAN_CMD_QUEUE_MASK (0x7FF << \ + I40E_AQC_MACVLAN_CMD_QUEUE_SHIFT) + /* response section */ + u8 match_method; +#define I40E_AQC_MM_PERFECT_MATCH 0x01 +#define I40E_AQC_MM_HASH_MATCH 0x02 +#define I40E_AQC_MM_ERR_NO_RES 0xFF + u8 reserved1[3]; +}; + +struct i40e_aqc_add_remove_macvlan_completion { + __le16 perfect_mac_used; + __le16 perfect_mac_free; + __le16 unicast_hash_free; + __le16 multicast_hash_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_macvlan_completion); + +/* Remove MAC-VLAN (indirect 0x0251) + * uses i40e_aqc_macvlan for the descriptor + * data points to an array of num_addresses of elements + */ + +struct i40e_aqc_remove_macvlan_element_data { + u8 mac_addr[6]; + __le16 vlan_tag; + u8 flags; +#define I40E_AQC_MACVLAN_DEL_PERFECT_MATCH 0x01 +#define I40E_AQC_MACVLAN_DEL_HASH_MATCH 0x02 +#define I40E_AQC_MACVLAN_DEL_IGNORE_VLAN 0x08 +#define I40E_AQC_MACVLAN_DEL_ALL_VSIS 0x10 + u8 reserved[3]; + /* reply section */ + u8 error_code; +#define I40E_AQC_REMOVE_MACVLAN_SUCCESS 0x0 +#define I40E_AQC_REMOVE_MACVLAN_FAIL 0xFF + u8 reply_reserved[3]; +}; + +/* Add VLAN (indirect 0x0252) + * Remove VLAN (indirect 0x0253) + * use the generic i40e_aqc_macvlan for the command + */ +struct i40e_aqc_add_remove_vlan_element_data { + __le16 vlan_tag; + u8 vlan_flags; +/* flags for add VLAN */ +#define I40E_AQC_ADD_VLAN_LOCAL 0x1 +#define I40E_AQC_ADD_PVLAN_TYPE_SHIFT 1 +#define I40E_AQC_ADD_PVLAN_TYPE_MASK (0x3 << I40E_AQC_ADD_PVLAN_TYPE_SHIFT) +#define I40E_AQC_ADD_PVLAN_TYPE_REGULAR 0x0 +#define I40E_AQC_ADD_PVLAN_TYPE_PRIMARY 0x2 +#define 
I40E_AQC_ADD_PVLAN_TYPE_SECONDARY 0x4 +#define I40E_AQC_VLAN_PTYPE_SHIFT 3 +#define I40E_AQC_VLAN_PTYPE_MASK (0x3 << I40E_AQC_VLAN_PTYPE_SHIFT) +#define I40E_AQC_VLAN_PTYPE_REGULAR_VSI 0x0 +#define I40E_AQC_VLAN_PTYPE_PROMISC_VSI 0x8 +#define I40E_AQC_VLAN_PTYPE_COMMUNITY_VSI 0x10 +#define I40E_AQC_VLAN_PTYPE_ISOLATED_VSI 0x18 +/* flags for remove VLAN */ +#define I40E_AQC_REMOVE_VLAN_ALL 0x1 + u8 reserved; + u8 result; +/* flags for add VLAN */ +#define I40E_AQC_ADD_VLAN_SUCCESS 0x0 +#define I40E_AQC_ADD_VLAN_FAIL_REQUEST 0xFE +#define I40E_AQC_ADD_VLAN_FAIL_RESOURCE 0xFF +/* flags for remove VLAN */ +#define I40E_AQC_REMOVE_VLAN_SUCCESS 0x0 +#define I40E_AQC_REMOVE_VLAN_FAIL 0xFF + u8 reserved1[3]; +}; + +struct i40e_aqc_add_remove_vlan_completion { + u8 reserved[4]; + __le16 vlans_used; + __le16 vlans_free; + __le32 addr_high; + __le32 addr_low; +}; + +/* Set VSI Promiscuous Modes (direct 0x0254) */ +struct i40e_aqc_set_vsi_promiscuous_modes { + __le16 promiscuous_flags; + __le16 valid_flags; +/* flags used for both fields above */ +#define I40E_AQC_SET_VSI_PROMISC_UNICAST 0x01 +#define I40E_AQC_SET_VSI_PROMISC_MULTICAST 0x02 +#define I40E_AQC_SET_VSI_PROMISC_BROADCAST 0x04 +#define I40E_AQC_SET_VSI_DEFAULT 0x08 +#define I40E_AQC_SET_VSI_PROMISC_VLAN 0x10 + __le16 seid; +#define I40E_AQC_VSI_PROM_CMD_SEID_MASK 0x3FF + __le16 vlan_tag; +#define I40E_AQC_SET_VSI_VLAN_MASK 0x0FFF +#define I40E_AQC_SET_VSI_VLAN_VALID 0x8000 + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_vsi_promiscuous_modes); + +/* Add S/E-tag command (direct 0x0255) + * Uses generic i40e_aqc_add_remove_tag_completion for completion + */ +struct i40e_aqc_add_tag { + __le16 flags; +#define I40E_AQC_ADD_TAG_FLAG_TO_QUEUE 0x0001 + __le16 seid; +#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT) + __le16 tag; + __le16 queue_number; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_tag); + 
+struct i40e_aqc_add_remove_tag_completion { + u8 reserved[12]; + __le16 tags_used; + __le16 tags_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_tag_completion); + +/* Remove S/E-tag command (direct 0x0256) + * Uses generic i40e_aqc_add_remove_tag_completion for completion + */ +struct i40e_aqc_remove_tag { + __le16 seid; +#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT) + __le16 tag; + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_tag); + +/* Add multicast E-Tag (direct 0x0257) + * Delete multicast E-Tag (direct 0x0258) only uses pv_seid and etag fields + * and no external data + */ +struct i40e_aqc_add_remove_mcast_etag { + __le16 pv_seid; + __le16 etag; + u8 num_unicast_etags; + u8 reserved[3]; + __le32 addr_high; /* address of array of 2-byte s-tags */ + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag); + +struct i40e_aqc_add_remove_mcast_etag_completion { + u8 reserved[4]; + __le16 mcast_etags_used; + __le16 mcast_etags_free; + __le32 addr_high; + __le32 addr_low; + +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag_completion); + +/* Update S/E-Tag (direct 0x0259) */ +struct i40e_aqc_update_tag { + __le16 seid; +#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT) + __le16 old_tag; + __le16 new_tag; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag); + +struct i40e_aqc_update_tag_completion { + u8 reserved[12]; + __le16 tags_used; + __le16 tags_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag_completion); + +/* Add Control Packet filter (direct 0x025A) + * Remove Control Packet filter (direct 0x025B) + * uses the i40e_aqc_add_remove_control_packet_filter, + * and the generic direct completion structure + */ +struct i40e_aqc_add_remove_control_packet_filter { + u8 mac[6]; + __le16 etype; + __le16 
flags; +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC 0x0001 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_DROP 0x0002 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TO_QUEUE 0x0004 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX 0x0008 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX 0x0000 + __le16 seid; +#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT) + __le16 queue; + u8 reserved[2]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter); + +struct i40e_aqc_add_remove_control_packet_filter_completion { + __le16 mac_etype_used; + __le16 etype_used; + __le16 mac_etype_free; + __le16 etype_free; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter_completion); + +/* Add Cloud filters (indirect 0x025C) + * Remove Cloud filters (indirect 0x025D) + * uses the i40e_aqc_add_remove_cloud_filters, + * and the generic indirect completion structure + */ +struct i40e_aqc_add_remove_cloud_filters { + u8 num_filters; + u8 reserved; + __le16 seid; +#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT) + u8 reserved2[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters); + +struct i40e_aqc_add_remove_cloud_filters_element_data { + u8 outer_mac[6]; + u8 inner_mac[6]; + __le16 inner_vlan; + union { + struct { + u8 reserved[12]; + u8 data[4]; + } v4; + struct { + u8 data[16]; + } v6; + } ipaddr; + __le16 flags; +#define I40E_AQC_ADD_CLOUD_FILTER_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_FILTER_MASK (0x3F << \ + I40E_AQC_ADD_CLOUD_FILTER_SHIFT) +/* 0x0000 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_OIP 0x0001 +/* 0x0002 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN 0x0003 +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN_TEN_ID 0x0004 +/* 0x0005 reserved */ 
+#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_TEN_ID 0x0006 +/* 0x0007 reserved */ +/* 0x0008 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_OMAC 0x0009 +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A +#define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B +#define I40E_AQC_ADD_CLOUD_FILTER_IIP 0x000C + +#define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE 0x0080 +#define I40E_AQC_ADD_CLOUD_VNK_SHIFT 6 +#define I40E_AQC_ADD_CLOUD_VNK_MASK 0x00C0 +#define I40E_AQC_ADD_CLOUD_FLAGS_IPV4 0 +#define I40E_AQC_ADD_CLOUD_FLAGS_IPV6 0x0100 + +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT 9 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_MASK 0x1E00 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_XVLAN 0 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_NVGRE_OMAC 1 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_NGE 2 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_IP 3 + + __le32 tenant_id; + u8 reserved[4]; + __le16 queue_number; +#define I40E_AQC_ADD_CLOUD_QUEUE_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_QUEUE_MASK (0x7FF << \ + I40E_AQC_ADD_CLOUD_QUEUE_SHIFT) + u8 reserved2[14]; + /* response section */ + u8 allocation_result; +#define I40E_AQC_ADD_CLOUD_FILTER_SUCCESS 0x0 +#define I40E_AQC_ADD_CLOUD_FILTER_FAIL 0xFF + u8 response_reserved[7]; +}; + +struct i40e_aqc_remove_cloud_filters_completion { + __le16 perfect_ovlan_used; + __le16 perfect_ovlan_free; + __le16 vlan_used; + __le16 vlan_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion); + +/* Add Mirror Rule (indirect or direct 0x0260) + * Delete Mirror Rule (indirect or direct 0x0261) + * note: some rule types (4,5) do not use an external buffer. + * take care to set the flags correctly. 
+ */ +struct i40e_aqc_add_delete_mirror_rule { + __le16 seid; + __le16 rule_type; +#define I40E_AQC_MIRROR_RULE_TYPE_SHIFT 0 +#define I40E_AQC_MIRROR_RULE_TYPE_MASK (0x7 << \ + I40E_AQC_MIRROR_RULE_TYPE_SHIFT) +#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_INGRESS 1 +#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_EGRESS 2 +#define I40E_AQC_MIRROR_RULE_TYPE_VLAN 3 +#define I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS 4 +#define I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS 5 + __le16 num_entries; + __le16 destination; /* VSI for add, rule id for delete */ + __le32 addr_high; /* address of array of 2-byte VSI or VLAN ids */ + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule); + +struct i40e_aqc_add_delete_mirror_rule_completion { + u8 reserved[2]; + __le16 rule_id; /* only used on add */ + __le16 mirror_rules_used; + __le16 mirror_rules_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule_completion); + +/* DCB 0x03xx*/ + +/* PFC Ignore (direct 0x0301) + * the command and response use the same descriptor structure + */ +struct i40e_aqc_pfc_ignore { + u8 tc_bitmap; + u8 command_flags; /* unused on response */ +#define I40E_AQC_PFC_IGNORE_SET 0x80 +#define I40E_AQC_PFC_IGNORE_CLEAR 0x0 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_pfc_ignore); + +/* DCB Update (direct 0x0302) uses the i40e_aq_desc structure + * with no parameters + */ + +/* TX scheduler 0x04xx */ + +/* Almost all the indirect commands use + * this generic struct to pass the SEID in param0 + */ +struct i40e_aqc_tx_sched_ind { + __le16 vsi_seid; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_tx_sched_ind); + +/* Several commands respond with a set of queue set handles */ +struct i40e_aqc_qs_handles_resp { + __le16 qs_handles[8]; +}; + +/* Configure VSI BW limits (direct 0x0400) */ +struct i40e_aqc_configure_vsi_bw_limit { + __le16 vsi_seid; + u8 reserved[2]; + __le16 credit; + u8 reserved1[2]; 
+ u8 max_credit; /* 0-3, limit = 2^max */ + u8 reserved2[7]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_vsi_bw_limit); + +/* Configure VSI Bandwidth Limit per Traffic Type (indirect 0x0406) + * responds with i40e_aqc_qs_handles_resp + */ +struct i40e_aqc_configure_vsi_ets_sla_bw_data { + u8 tc_valid_bits; + u8 reserved[15]; + __le16 tc_bw_credits[8]; /* FW writesback QS handles here */ + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved1[28]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_configure_vsi_ets_sla_bw_data); + +/* Configure VSI Bandwidth Allocation per Traffic Type (indirect 0x0407) + * responds with i40e_aqc_qs_handles_resp + */ +struct i40e_aqc_configure_vsi_tc_bw_data { + u8 tc_valid_bits; + u8 reserved[3]; + u8 tc_bw_credits[8]; + u8 reserved1[4]; + __le16 qs_handles[8]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_vsi_tc_bw_data); + +/* Query vsi bw configuration (indirect 0x0408) */ +struct i40e_aqc_query_vsi_bw_config_resp { + u8 tc_valid_bits; + u8 tc_suspended_bits; + u8 reserved[14]; + __le16 qs_handles[8]; + u8 reserved1[4]; + __le16 port_bw_limit; + u8 reserved2[2]; + u8 max_bw; /* 0-3, limit = 2^max */ + u8 reserved3[23]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_vsi_bw_config_resp); + +/* Query VSI Bandwidth Allocation per Traffic Type (indirect 0x040A) */ +struct i40e_aqc_query_vsi_ets_sla_config_resp { + u8 tc_valid_bits; + u8 reserved[3]; + u8 share_credits[8]; + __le16 credits[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_vsi_ets_sla_config_resp); + +/* Configure Switching Component Bandwidth Limit (direct 0x0410) */ +struct i40e_aqc_configure_switching_comp_bw_limit { + __le16 seid; + u8 reserved[2]; + __le16 credit; + u8 reserved1[2]; + u8 max_bw; /* 0-3, limit = 2^max */ + u8 reserved2[7]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_switching_comp_bw_limit); + +/* 
Enable Physical Port ETS (indirect 0x0413) + * Modify Physical Port ETS (indirect 0x0414) + * Disable Physical Port ETS (indirect 0x0415) + */ +struct i40e_aqc_configure_switching_comp_ets_data { + u8 reserved[4]; + u8 tc_valid_bits; + u8 seepage; +#define I40E_AQ_ETS_SEEPAGE_EN_MASK 0x1 + u8 tc_strict_priority_flags; + u8 reserved1[17]; + u8 tc_bw_share_credits[8]; + u8 reserved2[96]; +}; + +I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_configure_switching_comp_ets_data); + +/* Configure Switching Component Bandwidth Limits per Tc (indirect 0x0416) */ +struct i40e_aqc_configure_switching_comp_ets_bw_limit_data { + u8 tc_valid_bits; + u8 reserved[15]; + __le16 tc_bw_credit[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved1[28]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_configure_switching_comp_ets_bw_limit_data); + +/* Configure Switching Component Bandwidth Allocation per Tc + * (indirect 0x0417) + */ +struct i40e_aqc_configure_switching_comp_bw_config_data { + u8 tc_valid_bits; + u8 reserved[2]; + u8 absolute_credits; /* bool */ + u8 tc_bw_share_credits[8]; + u8 reserved1[20]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_switching_comp_bw_config_data); + +/* Query Switching Component Configuration (indirect 0x0418) */ +struct i40e_aqc_query_switching_comp_ets_config_resp { + u8 tc_valid_bits; + u8 reserved[35]; + __le16 port_bw_limit; + u8 reserved1[2]; + u8 tc_bw_max; /* 0-3, limit = 2^max */ + u8 reserved2[23]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_switching_comp_ets_config_resp); + +/* Query PhysicalPort ETS Configuration (indirect 0x0419) */ +struct i40e_aqc_query_port_ets_config_resp { + u8 reserved[4]; + u8 tc_valid_bits; + u8 reserved1; + u8 tc_strict_priority_bits; + u8 reserved2; + u8 tc_bw_share_credits[8]; + __le16 tc_bw_limits[8]; + + /* 4 bits per tc 0-7, 4th bit reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved3[32]; +}; + +I40E_CHECK_STRUCT_LEN(0x44, 
i40e_aqc_query_port_ets_config_resp); + +/* Query Switching Component Bandwidth Allocation per Traffic Type + * (indirect 0x041A) + */ +struct i40e_aqc_query_switching_comp_bw_config_resp { + u8 tc_valid_bits; + u8 reserved[2]; + u8 absolute_credits_enable; /* bool */ + u8 tc_bw_share_credits[8]; + __le16 tc_bw_limits[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_switching_comp_bw_config_resp); + +/* Suspend/resume port TX traffic + * (direct 0x041B and 0x041C) uses the generic SEID struct + */ + +/* Configure partition BW + * (indirect 0x041D) + */ +struct i40e_aqc_configure_partition_bw_data { + __le16 pf_valid_bits; + u8 min_bw[16]; /* guaranteed bandwidth */ + u8 max_bw[16]; /* bandwidth limit */ +}; + +I40E_CHECK_STRUCT_LEN(0x22, i40e_aqc_configure_partition_bw_data); + +/* Get and set the active HMC resource profile and status. + * (direct 0x0500) and (direct 0x0501) + */ +struct i40e_aq_get_set_hmc_resource_profile { + u8 pm_profile; + u8 pe_vf_enabled; + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_get_set_hmc_resource_profile); + +enum i40e_aq_hmc_profile { + /* I40E_HMC_PROFILE_NO_CHANGE = 0, reserved */ + I40E_HMC_PROFILE_DEFAULT = 1, + I40E_HMC_PROFILE_FAVOR_VF = 2, + I40E_HMC_PROFILE_EQUAL = 3, +}; + +#define I40E_AQ_GET_HMC_RESOURCE_PROFILE_PM_MASK 0xF +#define I40E_AQ_GET_HMC_RESOURCE_PROFILE_COUNT_MASK 0x3F + +/* Get PHY Abilities (indirect 0x0600) uses the generic indirect struct */ + +/* set in param0 for get phy abilities to report qualified modules */ +#define I40E_AQ_PHY_REPORT_QUALIFIED_MODULES 0x0001 +#define I40E_AQ_PHY_REPORT_INITIAL_VALUES 0x0002 + +enum i40e_aq_phy_type { + I40E_PHY_TYPE_SGMII = 0x0, + I40E_PHY_TYPE_1000BASE_KX = 0x1, + I40E_PHY_TYPE_10GBASE_KX4 = 0x2, + I40E_PHY_TYPE_10GBASE_KR = 0x3, + I40E_PHY_TYPE_40GBASE_KR4 = 0x4, + I40E_PHY_TYPE_XAUI = 0x5, + I40E_PHY_TYPE_XFI = 0x6, + I40E_PHY_TYPE_SFI = 0x7, + 
I40E_PHY_TYPE_XLAUI = 0x8, + I40E_PHY_TYPE_XLPPI = 0x9, + I40E_PHY_TYPE_40GBASE_CR4_CU = 0xA, + I40E_PHY_TYPE_10GBASE_CR1_CU = 0xB, + I40E_PHY_TYPE_10GBASE_AOC = 0xC, + I40E_PHY_TYPE_40GBASE_AOC = 0xD, + I40E_PHY_TYPE_100BASE_TX = 0x11, + I40E_PHY_TYPE_1000BASE_T = 0x12, + I40E_PHY_TYPE_10GBASE_T = 0x13, + I40E_PHY_TYPE_10GBASE_SR = 0x14, + I40E_PHY_TYPE_10GBASE_LR = 0x15, + I40E_PHY_TYPE_10GBASE_SFPP_CU = 0x16, + I40E_PHY_TYPE_10GBASE_CR1 = 0x17, + I40E_PHY_TYPE_40GBASE_CR4 = 0x18, + I40E_PHY_TYPE_40GBASE_SR4 = 0x19, + I40E_PHY_TYPE_40GBASE_LR4 = 0x1A, + I40E_PHY_TYPE_1000BASE_SX = 0x1B, + I40E_PHY_TYPE_1000BASE_LX = 0x1C, + I40E_PHY_TYPE_1000BASE_T_OPTICAL = 0x1D, + I40E_PHY_TYPE_20GBASE_KR2 = 0x1E, + I40E_PHY_TYPE_MAX +}; + +#define I40E_LINK_SPEED_100MB_SHIFT 0x1 +#define I40E_LINK_SPEED_1000MB_SHIFT 0x2 +#define I40E_LINK_SPEED_10GB_SHIFT 0x3 +#define I40E_LINK_SPEED_40GB_SHIFT 0x4 +#define I40E_LINK_SPEED_20GB_SHIFT 0x5 + +enum i40e_aq_link_speed { + I40E_LINK_SPEED_UNKNOWN = 0, + I40E_LINK_SPEED_100MB = (1 << I40E_LINK_SPEED_100MB_SHIFT), + I40E_LINK_SPEED_1GB = (1 << I40E_LINK_SPEED_1000MB_SHIFT), + I40E_LINK_SPEED_10GB = (1 << I40E_LINK_SPEED_10GB_SHIFT), + I40E_LINK_SPEED_40GB = (1 << I40E_LINK_SPEED_40GB_SHIFT), + I40E_LINK_SPEED_20GB = (1 << I40E_LINK_SPEED_20GB_SHIFT) +}; + +struct i40e_aqc_module_desc { + u8 oui[3]; + u8 reserved1; + u8 part_number[16]; + u8 revision[4]; + u8 reserved2[8]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_module_desc); + +struct i40e_aq_get_phy_abilities_resp { + __le32 phy_type; /* bitmap using the above enum for offsets */ + u8 link_speed; /* bitmap using the above enum bit patterns */ + u8 abilities; +#define I40E_AQ_PHY_FLAG_PAUSE_TX 0x01 +#define I40E_AQ_PHY_FLAG_PAUSE_RX 0x02 +#define I40E_AQ_PHY_FLAG_LOW_POWER 0x04 +#define I40E_AQ_PHY_LINK_ENABLED 0x08 +#define I40E_AQ_PHY_AN_ENABLED 0x10 +#define I40E_AQ_PHY_FLAG_MODULE_QUAL 0x20 + __le16 eee_capability; +#define I40E_AQ_EEE_100BASE_TX 0x0002 +#define 
I40E_AQ_EEE_1000BASE_T 0x0004 +#define I40E_AQ_EEE_10GBASE_T 0x0008 +#define I40E_AQ_EEE_1000BASE_KX 0x0010 +#define I40E_AQ_EEE_10GBASE_KX4 0x0020 +#define I40E_AQ_EEE_10GBASE_KR 0x0040 + __le32 eeer_val; + u8 d3_lpan; +#define I40E_AQ_SET_PHY_D3_LPAN_ENA 0x01 + u8 reserved[3]; + u8 phy_id[4]; + u8 module_type[3]; + u8 qualified_module_count; +#define I40E_AQ_PHY_MAX_QMS 16 + struct i40e_aqc_module_desc qualified_module[I40E_AQ_PHY_MAX_QMS]; +}; + +I40E_CHECK_STRUCT_LEN(0x218, i40e_aq_get_phy_abilities_resp); + +/* Set PHY Config (direct 0x0601) */ +struct i40e_aq_set_phy_config { /* same bits as above in all */ + __le32 phy_type; + u8 link_speed; + u8 abilities; +/* bits 0-2 use the values from get_phy_abilities_resp */ +#define I40E_AQ_PHY_ENABLE_LINK 0x08 +#define I40E_AQ_PHY_ENABLE_AN 0x10 +#define I40E_AQ_PHY_ENABLE_ATOMIC_LINK 0x20 + __le16 eee_capability; + __le32 eeer; + u8 low_power_ctrl; + u8 reserved[3]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config); + +/* Set MAC Config command data structure (direct 0x0603) */ +struct i40e_aq_set_mac_config { + __le16 max_frame_size; + u8 params; +#define I40E_AQ_SET_MAC_CONFIG_CRC_EN 0x04 +#define I40E_AQ_SET_MAC_CONFIG_PACING_MASK 0x78 +#define I40E_AQ_SET_MAC_CONFIG_PACING_SHIFT 3 +#define I40E_AQ_SET_MAC_CONFIG_PACING_NONE 0x0 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1B_13TX 0xF +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_9TX 0x9 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_4TX 0x8 +#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_7TX 0x7 +#define I40E_AQ_SET_MAC_CONFIG_PACING_2DW_3TX 0x6 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_1TX 0x5 +#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_2TX 0x4 +#define I40E_AQ_SET_MAC_CONFIG_PACING_7DW_3TX 0x3 +#define I40E_AQ_SET_MAC_CONFIG_PACING_4DW_1TX 0x2 +#define I40E_AQ_SET_MAC_CONFIG_PACING_9DW_1TX 0x1 + u8 tx_timer_priority; /* bitmap */ + __le16 tx_timer_value; + __le16 fc_refresh_threshold; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_set_mac_config); + +/* Restart 
Auto-Negotiation (direct 0x605) */ +struct i40e_aqc_set_link_restart_an { + u8 command; +#define I40E_AQ_PHY_RESTART_AN 0x02 +#define I40E_AQ_PHY_LINK_ENABLE 0x04 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_link_restart_an); + +/* Get Link Status cmd & response data structure (direct 0x0607) */ +struct i40e_aqc_get_link_status { + __le16 command_flags; /* only field set on command */ +#define I40E_AQ_LSE_MASK 0x3 +#define I40E_AQ_LSE_NOP 0x0 +#define I40E_AQ_LSE_DISABLE 0x2 +#define I40E_AQ_LSE_ENABLE 0x3 +/* only response uses this flag */ +#define I40E_AQ_LSE_IS_ENABLED 0x1 + u8 phy_type; /* i40e_aq_phy_type */ + u8 link_speed; /* i40e_aq_link_speed */ + u8 link_info; +#define I40E_AQ_LINK_UP 0x01 /* obsolete */ +#define I40E_AQ_LINK_UP_FUNCTION 0x01 +#define I40E_AQ_LINK_FAULT 0x02 +#define I40E_AQ_LINK_FAULT_TX 0x04 +#define I40E_AQ_LINK_FAULT_RX 0x08 +#define I40E_AQ_LINK_FAULT_REMOTE 0x10 +#define I40E_AQ_LINK_UP_PORT 0x20 +#define I40E_AQ_MEDIA_AVAILABLE 0x40 +#define I40E_AQ_SIGNAL_DETECT 0x80 + u8 an_info; +#define I40E_AQ_AN_COMPLETED 0x01 +#define I40E_AQ_LP_AN_ABILITY 0x02 +#define I40E_AQ_PD_FAULT 0x04 +#define I40E_AQ_FEC_EN 0x08 +#define I40E_AQ_PHY_LOW_POWER 0x10 +#define I40E_AQ_LINK_PAUSE_TX 0x20 +#define I40E_AQ_LINK_PAUSE_RX 0x40 +#define I40E_AQ_QUALIFIED_MODULE 0x80 + u8 ext_info; +#define I40E_AQ_LINK_PHY_TEMP_ALARM 0x01 +#define I40E_AQ_LINK_XCESSIVE_ERRORS 0x02 +#define I40E_AQ_LINK_TX_SHIFT 0x02 +#define I40E_AQ_LINK_TX_MASK (0x03 << I40E_AQ_LINK_TX_SHIFT) +#define I40E_AQ_LINK_TX_ACTIVE 0x00 +#define I40E_AQ_LINK_TX_DRAINED 0x01 +#define I40E_AQ_LINK_TX_FLUSHED 0x03 +#define I40E_AQ_LINK_FORCED_40G 0x10 + u8 loopback; /* use defines from i40e_aqc_set_lb_mode */ + __le16 max_frame_size; + u8 config; +#define I40E_AQ_CONFIG_CRC_ENA 0x04 +#define I40E_AQ_CONFIG_PACING_MASK 0x78 + u8 reserved[5]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status); + +/* Set event mask command (direct 0x613) */ +struct 
i40e_aqc_set_phy_int_mask { + u8 reserved[8]; + __le16 event_mask; +#define I40E_AQ_EVENT_LINK_UPDOWN 0x0002 +#define I40E_AQ_EVENT_MEDIA_NA 0x0004 +#define I40E_AQ_EVENT_LINK_FAULT 0x0008 +#define I40E_AQ_EVENT_PHY_TEMP_ALARM 0x0010 +#define I40E_AQ_EVENT_EXCESSIVE_ERRORS 0x0020 +#define I40E_AQ_EVENT_SIGNAL_DETECT 0x0040 +#define I40E_AQ_EVENT_AN_COMPLETED 0x0080 +#define I40E_AQ_EVENT_MODULE_QUAL_FAIL 0x0100 +#define I40E_AQ_EVENT_PORT_TX_SUSPENDED 0x0200 + u8 reserved1[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_int_mask); + +/* Get Local AN advt register (direct 0x0614) + * Set Local AN advt register (direct 0x0615) + * Get Link Partner AN advt register (direct 0x0616) + */ +struct i40e_aqc_an_advt_reg { + __le32 local_an_reg0; + __le16 local_an_reg1; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_an_advt_reg); + +/* Set Loopback mode (0x0618) */ +struct i40e_aqc_set_lb_mode { + __le16 lb_mode; +#define I40E_AQ_LB_PHY_LOCAL 0x01 +#define I40E_AQ_LB_PHY_REMOTE 0x02 +#define I40E_AQ_LB_MAC_LOCAL 0x04 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_lb_mode); + +/* Set PHY Debug command (0x0622) */ +struct i40e_aqc_set_phy_debug { + u8 command_flags; +#define I40E_AQ_PHY_DEBUG_RESET_INTERNAL 0x02 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT 2 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_MASK (0x03 << \ + I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT) +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_NONE 0x00 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_HARD 0x01 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SOFT 0x02 +#define I40E_AQ_PHY_DEBUG_DISABLE_LINK_FW 0x10 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_debug); + +enum i40e_aq_phy_reg_type { + I40E_AQC_PHY_REG_INTERNAL = 0x1, + I40E_AQC_PHY_REG_EXERNAL_BASET = 0x2, + I40E_AQC_PHY_REG_EXERNAL_MODULE = 0x3 +}; + +/* NVM Read command (indirect 0x0701) + * NVM Erase commands (direct 0x0702) + * NVM Update commands (indirect 0x0703) + */ +struct i40e_aqc_nvm_update { + u8 command_flags; 
+#define I40E_AQ_NVM_LAST_CMD 0x01 +#define I40E_AQ_NVM_FLASH_ONLY 0x80 + u8 module_pointer; + __le16 length; + __le32 offset; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_update); + +/* NVM Config Read (indirect 0x0704) */ +struct i40e_aqc_nvm_config_read { + __le16 cmd_flags; +#define I40E_AQ_ANVM_SINGLE_OR_MULTIPLE_FEATURES_MASK 1 +#define I40E_AQ_ANVM_READ_SINGLE_FEATURE 0 +#define I40E_AQ_ANVM_READ_MULTIPLE_FEATURES 1 + __le16 element_count; + __le16 element_id; /* Feature/field ID */ + __le16 element_id_msw; /* MSWord of field ID */ + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_read); + +/* NVM Config Write (indirect 0x0705) */ +struct i40e_aqc_nvm_config_write { + __le16 cmd_flags; + __le16 element_count; + u8 reserved[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_write); + +/* Used for 0x0704 as well as for 0x0705 commands: + * bit 1 distinguishes a feature (0) from an immediate field (1); + * presumably applied to cmd_flags -- TODO confirm + */ +#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT 1 +#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_MASK (1 << I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT) +#define I40E_AQ_ANVM_FEATURE 0 +#define I40E_AQ_ANVM_IMMEDIATE_FIELD (1 << I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT) +struct i40e_aqc_nvm_config_data_feature { + __le16 feature_id; +#define I40E_AQ_ANVM_FEATURE_OPTION_OEM_ONLY 0x01 +#define I40E_AQ_ANVM_FEATURE_OPTION_DWORD_MAP 0x08 +#define I40E_AQ_ANVM_FEATURE_OPTION_POR_CSR 0x10 + __le16 feature_options; + __le16 feature_selection; +}; + +I40E_CHECK_STRUCT_LEN(0x6, i40e_aqc_nvm_config_data_feature); + +struct i40e_aqc_nvm_config_data_immediate_field { + __le32 field_id; + __le32 field_value; + __le16 field_options; + __le16 reserved; +}; + +I40E_CHECK_STRUCT_LEN(0xc, i40e_aqc_nvm_config_data_immediate_field); + +/* OEM Post Update (indirect 0x0720) + * no command data struct used + */ + struct i40e_aqc_nvm_oem_post_update { +#define I40E_AQ_NVM_OEM_POST_UPDATE_EXTERNAL_DATA 0x01 + u8 sel_data; + u8 reserved[7]; +}; + 
+I40E_CHECK_STRUCT_LEN(0x8, i40e_aqc_nvm_oem_post_update); + +struct i40e_aqc_nvm_oem_post_update_buffer { + u8 str_len; + u8 dev_addr; + __le16 eeprom_addr; + u8 data[36]; +}; + +I40E_CHECK_STRUCT_LEN(0x28, i40e_aqc_nvm_oem_post_update_buffer); + +/* Send to PF command (indirect 0x0801) id is only used by PF + * Send to VF command (indirect 0x0802) id is only used by PF + * Send to Peer PF command (indirect 0x0803) + */ +struct i40e_aqc_pf_vf_message { + __le32 id; + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_pf_vf_message); + +/* Alternate structure */ + +/* Direct write (direct 0x0900) + * Direct read (direct 0x0902) + */ +struct i40e_aqc_alternate_write { + __le32 address0; + __le32 data0; + __le32 address1; + __le32 data1; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write); + +/* Indirect write (indirect 0x0901) + * Indirect read (indirect 0x0903) + */ + +struct i40e_aqc_alternate_ind_write { + __le32 address; + __le32 length; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_ind_write); + +/* Done alternate write (direct 0x0904) + * uses i40e_aq_desc + */ +struct i40e_aqc_alternate_write_done { + __le16 cmd_flags; +#define I40E_AQ_ALTERNATE_MODE_BIOS_MASK 1 +#define I40E_AQ_ALTERNATE_MODE_BIOS_LEGACY 0 +#define I40E_AQ_ALTERNATE_MODE_BIOS_UEFI 1 +#define I40E_AQ_ALTERNATE_RESET_NEEDED 2 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write_done); + +/* Set OEM mode (direct 0x0905) */ +struct i40e_aqc_alternate_set_mode { + __le32 mode; +#define I40E_AQ_ALTERNATE_MODE_NONE 0 +#define I40E_AQ_ALTERNATE_MODE_OEM 1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_set_mode); + +/* Clear port Alternate RAM (direct 0x0906) uses i40e_aq_desc */ + +/* async events 0x10xx */ + +/* Lan Queue Overflow Event (direct, 0x1001) */ +struct i40e_aqc_lan_overflow { + __le32 prtdcb_rupto; + __le32 otx_ctl; + u8 reserved[8]; +}; + 
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lan_overflow); + +/* Get LLDP MIB (indirect 0x0A00) */ +struct i40e_aqc_lldp_get_mib { + u8 type; + u8 reserved1; +#define I40E_AQ_LLDP_MIB_TYPE_MASK 0x3 +#define I40E_AQ_LLDP_MIB_LOCAL 0x0 +#define I40E_AQ_LLDP_MIB_REMOTE 0x1 +#define I40E_AQ_LLDP_MIB_LOCAL_AND_REMOTE 0x2 +#define I40E_AQ_LLDP_BRIDGE_TYPE_MASK 0xC +#define I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT 0x2 +#define I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE 0x0 +#define I40E_AQ_LLDP_BRIDGE_TYPE_NON_TPMR 0x1 +#define I40E_AQ_LLDP_TX_SHIFT 0x4 +#define I40E_AQ_LLDP_TX_MASK (0x03 << I40E_AQ_LLDP_TX_SHIFT) +/* TX pause flags use I40E_AQ_LINK_TX_* above */ + __le16 local_len; + __le16 remote_len; + u8 reserved2[2]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_get_mib); + +/* Configure LLDP MIB Change Event (direct 0x0A01) + * also used for the event (with type in the command field) + */ +struct i40e_aqc_lldp_update_mib { + u8 command; +#define I40E_AQ_LLDP_MIB_UPDATE_ENABLE 0x0 +#define I40E_AQ_LLDP_MIB_UPDATE_DISABLE 0x1 + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_mib); + +/* Add LLDP TLV (indirect 0x0A02) + * Delete LLDP TLV (indirect 0x0A04) + */ +struct i40e_aqc_lldp_add_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved1[1]; + __le16 len; + u8 reserved2[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_add_tlv); + +/* Update LLDP TLV (indirect 0x0A03) */ +struct i40e_aqc_lldp_update_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved; + __le16 old_len; + __le16 new_offset; + __le16 new_len; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_tlv); + +/* Stop LLDP (direct 0x0A05) */ +struct i40e_aqc_lldp_stop { + u8 command; +#define I40E_AQ_LLDP_AGENT_STOP 0x0 +#define I40E_AQ_LLDP_AGENT_SHUTDOWN 0x1 + u8 reserved[15]; +}; + 
+I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop); + +/* Start LLDP (direct 0x0A06) */ + +struct i40e_aqc_lldp_start { + u8 command; +#define I40E_AQ_LLDP_AGENT_START 0x1 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start); + +/* Get CEE DCBX Oper Config (0x0A07) + * uses the generic descriptor struct + * returns below as indirect response + */ + +#define I40E_AQC_CEE_APP_FCOE_SHIFT 0x0 +#define I40E_AQC_CEE_APP_FCOE_MASK (0x7 << I40E_AQC_CEE_APP_FCOE_SHIFT) +#define I40E_AQC_CEE_APP_ISCSI_SHIFT 0x3 +#define I40E_AQC_CEE_APP_ISCSI_MASK (0x7 << I40E_AQC_CEE_APP_ISCSI_SHIFT) +#define I40E_AQC_CEE_APP_FIP_SHIFT 0x8 +#define I40E_AQC_CEE_APP_FIP_MASK (0x7 << I40E_AQC_CEE_APP_FIP_SHIFT) + +#define I40E_AQC_CEE_PG_STATUS_SHIFT 0x0 +#define I40E_AQC_CEE_PG_STATUS_MASK (0x7 << I40E_AQC_CEE_PG_STATUS_SHIFT) +#define I40E_AQC_CEE_PFC_STATUS_SHIFT 0x3 +#define I40E_AQC_CEE_PFC_STATUS_MASK (0x7 << I40E_AQC_CEE_PFC_STATUS_SHIFT) +#define I40E_AQC_CEE_APP_STATUS_SHIFT 0x8 +#define I40E_AQC_CEE_APP_STATUS_MASK (0x7 << I40E_AQC_CEE_APP_STATUS_SHIFT) +#define I40E_AQC_CEE_FCOE_STATUS_SHIFT 0x8 +#define I40E_AQC_CEE_FCOE_STATUS_MASK (0x7 << I40E_AQC_CEE_FCOE_STATUS_SHIFT) +#define I40E_AQC_CEE_ISCSI_STATUS_SHIFT 0xB +#define I40E_AQC_CEE_ISCSI_STATUS_MASK (0x7 << I40E_AQC_CEE_ISCSI_STATUS_SHIFT) +#define I40E_AQC_CEE_FIP_STATUS_SHIFT 0x10 +#define I40E_AQC_CEE_FIP_STATUS_MASK (0x7 << I40E_AQC_CEE_FIP_STATUS_SHIFT) + +/* struct i40e_aqc_get_cee_dcb_cfg_v1_resp was originally defined with + * word boundary layout issues, which the Linux compilers silently deal + * with by adding padding, making the actual struct larger than designed. + * However, the FW compiler for the NIC is less lenient and complains + * about the struct. Hence, the struct defined here has an extra byte in + * fields reserved3 and reserved4 to directly acknowledge that padding, + * and the new length is used in the length check macro. 
+ */ +struct i40e_aqc_get_cee_dcb_cfg_v1_resp { + u8 reserved1; + u8 oper_num_tc; + u8 oper_prio_tc[4]; + u8 reserved2; + u8 oper_tc_bw[8]; + u8 oper_pfc_en; + u8 reserved3[2]; + __le16 oper_app_prio; + u8 reserved4[2]; + __le16 tlv_status; +}; + +I40E_CHECK_STRUCT_LEN(0x18, i40e_aqc_get_cee_dcb_cfg_v1_resp); + +struct i40e_aqc_get_cee_dcb_cfg_resp { + u8 oper_num_tc; + u8 oper_prio_tc[4]; + u8 oper_tc_bw[8]; + u8 oper_pfc_en; + __le16 oper_app_prio; + __le32 tlv_status; + u8 reserved[12]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_cee_dcb_cfg_resp); + +/* Set Local LLDP MIB (indirect 0x0A08) + * Used to replace the local MIB of a given LLDP agent. e.g. DCBx + */ +struct i40e_aqc_lldp_set_local_mib { +#define SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT 0 +#define SET_LOCAL_MIB_AC_TYPE_DCBX_MASK (1 << SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT) + u8 type; + u8 reserved0; + __le16 length; + u8 reserved1[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_set_local_mib); + +/* Stop/Start LLDP Agent (direct 0x0A09) + * Used for stopping/starting specific LLDP agent. e.g. 
DCBx + */ +struct i40e_aqc_lldp_stop_start_specific_agent { +#define I40E_AQC_START_SPECIFIC_AGENT_SHIFT 0 +#define I40E_AQC_START_SPECIFIC_AGENT_MASK (1 << I40E_AQC_START_SPECIFIC_AGENT_SHIFT) + u8 command; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop_start_specific_agent); + +/* Add Udp Tunnel command and completion (direct 0x0B00) */ +struct i40e_aqc_add_udp_tunnel { + __le16 udp_port; + u8 reserved0[3]; + u8 protocol_type; +#define I40E_AQC_TUNNEL_TYPE_VXLAN 0x00 +#define I40E_AQC_TUNNEL_TYPE_NGE 0x01 +#define I40E_AQC_TUNNEL_TYPE_TEREDO 0x10 + u8 reserved1[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel); + +struct i40e_aqc_add_udp_tunnel_completion { + __le16 udp_port; + u8 filter_entry_index; + u8 multiple_pfs; +#define I40E_AQC_SINGLE_PF 0x0 +#define I40E_AQC_MULTIPLE_PFS 0x1 + u8 total_filters; + u8 reserved[11]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel_completion); + +/* remove UDP Tunnel command (0x0B01) */ +struct i40e_aqc_remove_udp_tunnel { + u8 reserved[2]; + u8 index; /* 0 to 15 */ + u8 reserved2[13]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_udp_tunnel); + +struct i40e_aqc_del_udp_tunnel_completion { + __le16 udp_port; + u8 index; /* 0 to 15 */ + u8 multiple_pfs; + u8 total_filters_used; + u8 reserved1[11]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_del_udp_tunnel_completion); +#ifdef X722_SUPPORT + +struct i40e_aqc_get_set_rss_key { +#define I40E_AQC_SET_RSS_KEY_VSI_VALID (0x1 << 15) +#define I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT 0 +#define I40E_AQC_SET_RSS_KEY_VSI_ID_MASK (0x3FF << \ + I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT) + __le16 vsi_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_key); + +struct i40e_aqc_get_set_rss_key_data { + u8 standard_rss_key[0x28]; + u8 extended_hash_key[0xc]; +}; + +I40E_CHECK_STRUCT_LEN(0x34, i40e_aqc_get_set_rss_key_data); + +struct i40e_aqc_get_set_rss_lut { +#define I40E_AQC_SET_RSS_LUT_VSI_VALID (0x1 << 15) +#define 
I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT 0 +#define I40E_AQC_SET_RSS_LUT_VSI_ID_MASK (0x3FF << \ + I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT) + __le16 vsi_id; +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT 0 +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK (0x1 << \ + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) + +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI 0 +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF 1 + __le16 flags; + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_lut); +#endif + +/* tunnel key structure 0x0B10 */ + +struct i40e_aqc_tunnel_key_structure { + u8 key1_off; + u8 key2_off; + u8 key1_len; /* 0 to 15 */ + u8 key2_len; /* 0 to 15 */ + u8 flags; +#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDE 0x01 +/* response flags */ +#define I40E_AQC_TUNNEL_KEY_STRUCT_SUCCESS 0x01 +#define I40E_AQC_TUNNEL_KEY_STRUCT_MODIFIED 0x02 +#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDDEN 0x03 + u8 network_key_index; +#define I40E_AQC_NETWORK_KEY_INDEX_VXLAN 0x0 +#define I40E_AQC_NETWORK_KEY_INDEX_NGE 0x1 +#define I40E_AQC_NETWORK_KEY_INDEX_FLEX_MAC_IN_UDP 0x2 +#define I40E_AQC_NETWORK_KEY_INDEX_GRE 0x3 + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_tunnel_key_structure); + +/* OEM mode commands (direct 0xFE0x) */ +struct i40e_aqc_oem_param_change { + __le32 param_type; +#define I40E_AQ_OEM_PARAM_TYPE_PF_CTL 0 +#define I40E_AQ_OEM_PARAM_TYPE_BW_CTL 1 +#define I40E_AQ_OEM_PARAM_MAC 2 + __le32 param_value1; + __le16 param_value2; + u8 reserved[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_param_change); + +struct i40e_aqc_oem_state_change { + __le32 state; +#define I40E_AQ_OEM_STATE_LINK_DOWN 0x0 +#define I40E_AQ_OEM_STATE_LINK_UP 0x1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_state_change); + +/* Initialize OCSD (0xFE02, direct) */ +struct i40e_aqc_opc_oem_ocsd_initialize { + u8 type_status; + u8 reserved1[3]; + __le32 ocsd_memory_block_addr_high; + __le32 ocsd_memory_block_addr_low; + __le32 requested_update_interval; 
+}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocsd_initialize); + +/* Initialize OCBB (0xFE03, direct) */ +struct i40e_aqc_opc_oem_ocbb_initialize { + u8 type_status; + u8 reserved1[3]; + __le32 ocbb_memory_block_addr_high; + __le32 ocbb_memory_block_addr_low; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocbb_initialize); + +/* debug commands */ + +/* get device id (0xFF00) uses the generic structure */ + +/* set test mode (0xFF01, internal) + * NOTE(review): the struct tag below is spelled "acq", not the "aqc" prefix + * used by the other command structs in this header. + */ + +struct i40e_acq_set_test_mode { + u8 mode; +#define I40E_AQ_TEST_PARTIAL 0 +#define I40E_AQ_TEST_FULL 1 +#define I40E_AQ_TEST_NVM 2 + u8 reserved[3]; + u8 command; +#define I40E_AQ_TEST_OPEN 0 +#define I40E_AQ_TEST_CLOSE 1 +#define I40E_AQ_TEST_INC 2 + u8 reserved2[3]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_acq_set_test_mode); + +/* Debug Read Register command (0xFF03) + * Debug Write Register command (0xFF04) + */ +struct i40e_aqc_debug_reg_read_write { + __le32 reserved; + __le32 address; + __le32 value_high; + __le32 value_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_reg_read_write); + +/* Scatter/gather Reg Read (indirect 0xFF05) + * Scatter/gather Reg Write (indirect 0xFF06) + */ + +/* i40e_aq_desc is used for the command */ +struct i40e_aqc_debug_reg_sg_element_data { + __le32 address; + __le32 value; +}; + +/* Debug Modify register (direct 0xFF07) */ +struct i40e_aqc_debug_modify_reg { + __le32 address; + __le32 value; + __le32 clear_mask; + __le32 set_mask; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_reg); + +/* dump internal data (0xFF08, indirect) */ + +#define I40E_AQ_CLUSTER_ID_AUX 0 +#define I40E_AQ_CLUSTER_ID_SWITCH_FLU 1 +#define I40E_AQ_CLUSTER_ID_TXSCHED 2 +#define I40E_AQ_CLUSTER_ID_HMC 3 +#define I40E_AQ_CLUSTER_ID_MAC0 4 +#define I40E_AQ_CLUSTER_ID_MAC1 5 +#define I40E_AQ_CLUSTER_ID_MAC2 6 +#define I40E_AQ_CLUSTER_ID_MAC3 7 +#define I40E_AQ_CLUSTER_ID_DCB 8 +#define I40E_AQ_CLUSTER_ID_EMP_MEM 9 +#define I40E_AQ_CLUSTER_ID_PKT_BUF 10
+#define I40E_AQ_CLUSTER_ID_ALTRAM 11 + +struct i40e_aqc_debug_dump_internals { + u8 cluster_id; + u8 table_id; + __le16 data_size; + __le32 idx; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_dump_internals); + +struct i40e_aqc_debug_modify_internals { + u8 cluster_id; + u8 cluster_specific_params[7]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_internals); + +#endif diff --git a/usr/src/uts/common/io/i40e/core/i40e_alloc.h b/usr/src/uts/common/io/i40e/core/i40e_alloc.h new file mode 100644 index 0000000000..4428287f83 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_alloc.h @@ -0,0 +1,66 @@ +/****************************************************************************** + + Copyright (c) 2013-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_alloc.h 283119 2015-05-19 18:35:18Z jhb $*/ + +#ifndef _I40E_ALLOC_H_ +#define _I40E_ALLOC_H_ + +struct i40e_hw; + +/* Memory allocation types */ +enum i40e_memory_type { + i40e_mem_arq_buf = 0, /* ARQ indirect command buffer */ + i40e_mem_asq_buf = 1, + i40e_mem_atq_buf = 2, /* ATQ indirect command buffer */ + i40e_mem_arq_ring = 3, /* ARQ descriptor ring */ + i40e_mem_atq_ring = 4, /* ATQ descriptor ring */ + i40e_mem_pd = 5, /* Page Descriptor */ + i40e_mem_bp = 6, /* Backing Page - 4KB */ + i40e_mem_bp_jumbo = 7, /* Backing Page - > 4KB */ + i40e_mem_reserved +}; + +/* prototype for functions used for dynamic memory allocation */ +enum i40e_status_code i40e_allocate_dma_mem(struct i40e_hw *hw, + struct i40e_dma_mem *mem, + enum i40e_memory_type type, + u64 size, u32 alignment); +enum i40e_status_code i40e_free_dma_mem(struct i40e_hw *hw, + struct i40e_dma_mem *mem); +enum i40e_status_code i40e_allocate_virt_mem(struct i40e_hw *hw, + struct i40e_virt_mem *mem, + u32 size); +enum i40e_status_code i40e_free_virt_mem(struct i40e_hw *hw, + struct i40e_virt_mem *mem); + +#endif /* _I40E_ALLOC_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_common.c b/usr/src/uts/common/io/i40e/core/i40e_common.c new file mode 100644 index 0000000000..c58eb9de1e --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_common.c @@ -0,0 +1,5708 
@@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_common.c 284049 2015-06-05 22:52:42Z jfv $*/ + +#include "i40e_type.h" +#include "i40e_adminq.h" +#include "i40e_prototype.h" +#include "i40e_virtchnl.h" + + +/** + * i40e_set_mac_type - Sets MAC type + * @hw: pointer to the HW structure + * + * This function sets the mac type of the adapter based on the + * vendor ID and device ID stored in the hw structure. + * + * Returns I40E_SUCCESS when the vendor ID is I40E_INTEL_VENDOR_ID (device IDs + * not listed in the switch fall back to I40E_MAC_GENERIC); otherwise returns + * I40E_ERR_DEVICE_NOT_SUPPORTED and does not write hw->mac.type. + **/ +enum i40e_status_code i40e_set_mac_type(struct i40e_hw *hw) +{ + enum i40e_status_code status = I40E_SUCCESS; + + DEBUGFUNC("i40e_set_mac_type\n"); + + if (hw->vendor_id == I40E_INTEL_VENDOR_ID) { + switch (hw->device_id) { + case I40E_DEV_ID_SFP_XL710: + case I40E_DEV_ID_QEMU: + case I40E_DEV_ID_KX_A: + case I40E_DEV_ID_KX_B: + case I40E_DEV_ID_KX_C: + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_10G_BASE_T4: + case I40E_DEV_ID_20G_KR2: + case I40E_DEV_ID_20G_KR2_A: + hw->mac.type = I40E_MAC_XL710; + break; +#ifdef X722_SUPPORT + case I40E_DEV_ID_SFP_X722: + case I40E_DEV_ID_1G_BASE_T_X722: + case I40E_DEV_ID_10G_BASE_T_X722: + hw->mac.type = I40E_MAC_X722; + break; +#endif +#ifdef X722_SUPPORT + case I40E_DEV_ID_X722_VF: + case I40E_DEV_ID_X722_VF_HV: + hw->mac.type = I40E_MAC_X722_VF; + break; +#endif + case I40E_DEV_ID_VF: + case I40E_DEV_ID_VF_HV: + hw->mac.type = I40E_MAC_VF; + break; + default: + hw->mac.type = I40E_MAC_GENERIC; + break; + } + } else { + status = I40E_ERR_DEVICE_NOT_SUPPORTED; + } + + DEBUGOUT2("i40e_set_mac_type found mac: %d, returns: %d\n", + hw->mac.type, status); + return status; +} + +/** + * i40e_aq_str - convert AQ err code to a string + * @hw: pointer to the HW structure + * @aq_err: the AQ error code to convert + * + * Codes listed in the switch map to string literals; any other value is + * formatted as a decimal number into hw->err_str and that buffer is returned. + **/ +char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err) +{ + switch (aq_err) { + case I40E_AQ_RC_OK: + return "OK"; + case I40E_AQ_RC_EPERM:
+ return "I40E_AQ_RC_EPERM"; + case I40E_AQ_RC_ENOENT: + return "I40E_AQ_RC_ENOENT"; + case I40E_AQ_RC_ESRCH: + return "I40E_AQ_RC_ESRCH"; + case I40E_AQ_RC_EINTR: + return "I40E_AQ_RC_EINTR"; + case I40E_AQ_RC_EIO: + return "I40E_AQ_RC_EIO"; + case I40E_AQ_RC_ENXIO: + return "I40E_AQ_RC_ENXIO"; + case I40E_AQ_RC_E2BIG: + return "I40E_AQ_RC_E2BIG"; + case I40E_AQ_RC_EAGAIN: + return "I40E_AQ_RC_EAGAIN"; + case I40E_AQ_RC_ENOMEM: + return "I40E_AQ_RC_ENOMEM"; + case I40E_AQ_RC_EACCES: + return "I40E_AQ_RC_EACCES"; + case I40E_AQ_RC_EFAULT: + return "I40E_AQ_RC_EFAULT"; + case I40E_AQ_RC_EBUSY: + return "I40E_AQ_RC_EBUSY"; + case I40E_AQ_RC_EEXIST: + return "I40E_AQ_RC_EEXIST"; + case I40E_AQ_RC_EINVAL: + return "I40E_AQ_RC_EINVAL"; + case I40E_AQ_RC_ENOTTY: + return "I40E_AQ_RC_ENOTTY"; + case I40E_AQ_RC_ENOSPC: + return "I40E_AQ_RC_ENOSPC"; + case I40E_AQ_RC_ENOSYS: + return "I40E_AQ_RC_ENOSYS"; + case I40E_AQ_RC_ERANGE: + return "I40E_AQ_RC_ERANGE"; + case I40E_AQ_RC_EFLUSHED: + return "I40E_AQ_RC_EFLUSHED"; + case I40E_AQ_RC_BAD_ADDR: + return "I40E_AQ_RC_BAD_ADDR"; + case I40E_AQ_RC_EMODE: + return "I40E_AQ_RC_EMODE"; + case I40E_AQ_RC_EFBIG: + return "I40E_AQ_RC_EFBIG"; + } + /* no case matched: report the numeric code via the buffer in *hw */ + snprintf(hw->err_str, sizeof(hw->err_str), "%d", aq_err); + return hw->err_str; +} + +/** + * i40e_stat_str - convert status err code to a string + * @hw: pointer to the HW structure + * @stat_err: the status error code to convert + * + * Codes listed in the switch map to string literals named after the enum + * constant (I40E_SUCCESS maps to "OK"). + **/ +char *i40e_stat_str(struct i40e_hw *hw, enum i40e_status_code stat_err) +{ + switch (stat_err) { + case I40E_SUCCESS: + return "OK"; + case I40E_ERR_NVM: + return "I40E_ERR_NVM"; + case I40E_ERR_NVM_CHECKSUM: + return "I40E_ERR_NVM_CHECKSUM"; + case I40E_ERR_PHY: + return "I40E_ERR_PHY"; + case I40E_ERR_CONFIG: + return "I40E_ERR_CONFIG"; + case I40E_ERR_PARAM: + return "I40E_ERR_PARAM"; + case I40E_ERR_MAC_TYPE: + return "I40E_ERR_MAC_TYPE"; + case I40E_ERR_UNKNOWN_PHY: + return "I40E_ERR_UNKNOWN_PHY"; + case I40E_ERR_LINK_SETUP: + return
"I40E_ERR_LINK_SETUP"; + case I40E_ERR_ADAPTER_STOPPED: + return "I40E_ERR_ADAPTER_STOPPED"; + case I40E_ERR_INVALID_MAC_ADDR: + return "I40E_ERR_INVALID_MAC_ADDR"; + case I40E_ERR_DEVICE_NOT_SUPPORTED: + return "I40E_ERR_DEVICE_NOT_SUPPORTED"; + case I40E_ERR_MASTER_REQUESTS_PENDING: + return "I40E_ERR_MASTER_REQUESTS_PENDING"; + case I40E_ERR_INVALID_LINK_SETTINGS: + return "I40E_ERR_INVALID_LINK_SETTINGS"; + case I40E_ERR_AUTONEG_NOT_COMPLETE: + return "I40E_ERR_AUTONEG_NOT_COMPLETE"; + case I40E_ERR_RESET_FAILED: + return "I40E_ERR_RESET_FAILED"; + case I40E_ERR_SWFW_SYNC: + return "I40E_ERR_SWFW_SYNC"; + case I40E_ERR_NO_AVAILABLE_VSI: + return "I40E_ERR_NO_AVAILABLE_VSI"; + case I40E_ERR_NO_MEMORY: + return "I40E_ERR_NO_MEMORY"; + case I40E_ERR_BAD_PTR: + return "I40E_ERR_BAD_PTR"; + case I40E_ERR_RING_FULL: + return "I40E_ERR_RING_FULL"; + case I40E_ERR_INVALID_PD_ID: + return "I40E_ERR_INVALID_PD_ID"; + case I40E_ERR_INVALID_QP_ID: + return "I40E_ERR_INVALID_QP_ID"; + case I40E_ERR_INVALID_CQ_ID: + return "I40E_ERR_INVALID_CQ_ID"; + case I40E_ERR_INVALID_CEQ_ID: + return "I40E_ERR_INVALID_CEQ_ID"; + case I40E_ERR_INVALID_AEQ_ID: + return "I40E_ERR_INVALID_AEQ_ID"; + case I40E_ERR_INVALID_SIZE: + return "I40E_ERR_INVALID_SIZE"; + case I40E_ERR_INVALID_ARP_INDEX: + return "I40E_ERR_INVALID_ARP_INDEX"; + case I40E_ERR_INVALID_FPM_FUNC_ID: + return "I40E_ERR_INVALID_FPM_FUNC_ID"; + case I40E_ERR_QP_INVALID_MSG_SIZE: + return "I40E_ERR_QP_INVALID_MSG_SIZE"; + case I40E_ERR_QP_TOOMANY_WRS_POSTED: + return "I40E_ERR_QP_TOOMANY_WRS_POSTED"; + case I40E_ERR_INVALID_FRAG_COUNT: + return "I40E_ERR_INVALID_FRAG_COUNT"; + case I40E_ERR_QUEUE_EMPTY: + return "I40E_ERR_QUEUE_EMPTY"; + case I40E_ERR_INVALID_ALIGNMENT: + return "I40E_ERR_INVALID_ALIGNMENT"; + case I40E_ERR_FLUSHED_QUEUE: + return "I40E_ERR_FLUSHED_QUEUE"; + case I40E_ERR_INVALID_PUSH_PAGE_INDEX: + return "I40E_ERR_INVALID_PUSH_PAGE_INDEX"; + case I40E_ERR_INVALID_IMM_DATA_SIZE: + return 
"I40E_ERR_INVALID_IMM_DATA_SIZE"; + case I40E_ERR_TIMEOUT: + return "I40E_ERR_TIMEOUT"; + case I40E_ERR_OPCODE_MISMATCH: + return "I40E_ERR_OPCODE_MISMATCH"; + case I40E_ERR_CQP_COMPL_ERROR: + return "I40E_ERR_CQP_COMPL_ERROR"; + case I40E_ERR_INVALID_VF_ID: + return "I40E_ERR_INVALID_VF_ID"; + case I40E_ERR_INVALID_HMCFN_ID: + return "I40E_ERR_INVALID_HMCFN_ID"; + case I40E_ERR_BACKING_PAGE_ERROR: + return "I40E_ERR_BACKING_PAGE_ERROR"; + case I40E_ERR_NO_PBLCHUNKS_AVAILABLE: + return "I40E_ERR_NO_PBLCHUNKS_AVAILABLE"; + case I40E_ERR_INVALID_PBLE_INDEX: + return "I40E_ERR_INVALID_PBLE_INDEX"; + case I40E_ERR_INVALID_SD_INDEX: + return "I40E_ERR_INVALID_SD_INDEX"; + case I40E_ERR_INVALID_PAGE_DESC_INDEX: + return "I40E_ERR_INVALID_PAGE_DESC_INDEX"; + case I40E_ERR_INVALID_SD_TYPE: + return "I40E_ERR_INVALID_SD_TYPE"; + case I40E_ERR_MEMCPY_FAILED: + return "I40E_ERR_MEMCPY_FAILED"; + case I40E_ERR_INVALID_HMC_OBJ_INDEX: + return "I40E_ERR_INVALID_HMC_OBJ_INDEX"; + case I40E_ERR_INVALID_HMC_OBJ_COUNT: + return "I40E_ERR_INVALID_HMC_OBJ_COUNT"; + case I40E_ERR_INVALID_SRQ_ARM_LIMIT: + return "I40E_ERR_INVALID_SRQ_ARM_LIMIT"; + case I40E_ERR_SRQ_ENABLED: + return "I40E_ERR_SRQ_ENABLED"; + case I40E_ERR_ADMIN_QUEUE_ERROR: + return "I40E_ERR_ADMIN_QUEUE_ERROR"; + case I40E_ERR_ADMIN_QUEUE_TIMEOUT: + return "I40E_ERR_ADMIN_QUEUE_TIMEOUT"; + case I40E_ERR_BUF_TOO_SHORT: + return "I40E_ERR_BUF_TOO_SHORT"; + case I40E_ERR_ADMIN_QUEUE_FULL: + return "I40E_ERR_ADMIN_QUEUE_FULL"; + case I40E_ERR_ADMIN_QUEUE_NO_WORK: + return "I40E_ERR_ADMIN_QUEUE_NO_WORK"; + case I40E_ERR_BAD_IWARP_CQE: + return "I40E_ERR_BAD_IWARP_CQE"; + case I40E_ERR_NVM_BLANK_MODE: + return "I40E_ERR_NVM_BLANK_MODE"; + case I40E_ERR_NOT_IMPLEMENTED: + return "I40E_ERR_NOT_IMPLEMENTED"; + case I40E_ERR_PE_DOORBELL_NOT_ENABLED: + return "I40E_ERR_PE_DOORBELL_NOT_ENABLED"; + case I40E_ERR_DIAG_TEST_FAILED: + return "I40E_ERR_DIAG_TEST_FAILED"; + case I40E_ERR_NOT_READY: + return "I40E_ERR_NOT_READY"; + case 
I40E_NOT_SUPPORTED: + return "I40E_NOT_SUPPORTED"; + case I40E_ERR_FIRMWARE_API_VERSION: + return "I40E_ERR_FIRMWARE_API_VERSION"; + } + + snprintf(hw->err_str, sizeof(hw->err_str), "%d", stat_err); + return hw->err_str; +} + +/** + * i40e_debug_aq + * @hw: pointer to the hw struct + * @mask: debug mask + * @desc: pointer to admin queue descriptor + * @buffer: pointer to command buffer + * @buf_len: max length of buffer + * + * Dumps debug log about adminq command with descriptor contents. + * Nothing is logged when @mask is not enabled in hw->debug_mask or when + * @desc is NULL; at most @buf_len bytes of @buffer are dumped. + **/ +void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, void *desc, + void *buffer, u16 buf_len) +{ + struct i40e_aq_desc *aq_desc = (struct i40e_aq_desc *)desc; + u16 len; + u8 *buf = (u8 *)buffer; + u16 i = 0; + + if ((!(mask & hw->debug_mask)) || (desc == NULL)) + return; + + /* read the descriptor only after the NULL check above */ + len = LE16_TO_CPU(aq_desc->datalen); + + i40e_debug(hw, mask, + "AQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", + LE16_TO_CPU(aq_desc->opcode), + LE16_TO_CPU(aq_desc->flags), + LE16_TO_CPU(aq_desc->datalen), + LE16_TO_CPU(aq_desc->retval)); + i40e_debug(hw, mask, "\tcookie (h,l) 0x%08X 0x%08X\n", + LE32_TO_CPU(aq_desc->cookie_high), + LE32_TO_CPU(aq_desc->cookie_low)); + i40e_debug(hw, mask, "\tparam (0,1) 0x%08X 0x%08X\n", + LE32_TO_CPU(aq_desc->params.internal.param0), + LE32_TO_CPU(aq_desc->params.internal.param1)); + i40e_debug(hw, mask, "\taddr (h,l) 0x%08X 0x%08X\n", + LE32_TO_CPU(aq_desc->params.external.addr_high), + LE32_TO_CPU(aq_desc->params.external.addr_low)); + + if ((buffer != NULL) && (aq_desc->datalen != 0)) { + i40e_debug(hw, mask, "AQ CMD Buffer:\n"); + if (buf_len < len) + len = buf_len; + /* write the full 16-byte chunks */ + for (i = 0; i < (len - 16); i += 16) + i40e_debug(hw, mask, + "\t0x%04X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n", + i, buf[i], buf[i+1], buf[i+2], buf[i+3], + buf[i+4], buf[i+5], buf[i+6], buf[i+7], + buf[i+8], buf[i+9], buf[i+10], buf[i+11], + buf[i+12], buf[i+13], buf[i+14],
buf[i+15]); + /* write whatever's left over without overrunning the buffer */ + if (i < len) { + char d_buf[80]; + int j = 0; + + memset(d_buf, 0, sizeof(d_buf)); +#ifdef I40E_ILLUMOS + /* + * Sigh. + * + * The illumos DDI (inherited from OpenSolaris) says + * sprintf() returns the pointer to its first + * argument, NOT the length of bytes printed. A better + * solution would be to have the kernel provide + * something like real_sprintf() but for now, we + * hack around it. + */ + (void) sprintf(d_buf, "\t0x%04X ", i); + j += strlen(d_buf); + /* Bounds-check at 77, because " XX" emits 4 chars. */ + while (i < len && j < 77) { + (void) sprintf(&d_buf[j], " %02X", buf[i++]); + j += strlen(&d_buf[j]); + } +#else + j += sprintf(d_buf, "\t0x%04X ", i); + while (i < len) + j += sprintf(&d_buf[j], " %02X", buf[i++]); +#endif + i40e_debug(hw, mask, "%s\n", d_buf); + } + } +} + +/** + * i40e_check_asq_alive + * @hw: pointer to the hw struct + * + * Returns TRUE if Queue is enabled else FALSE. + * hw->aq.asq.len is the register passed to rd32(); when it is zero the + * function returns FALSE without reading any register. + **/ +bool i40e_check_asq_alive(struct i40e_hw *hw) +{ + /* + * Both register reads belong under the asq.len test. Without braces + * only the PF read was guarded, so a VF with asq.len == 0 read + * register offset 0. + */ + if (hw->aq.asq.len) { + if (!i40e_is_vf(hw)) + return !!(rd32(hw, hw->aq.asq.len) & + I40E_PF_ATQLEN_ATQENABLE_MASK); + else + return !!(rd32(hw, hw->aq.asq.len) & + I40E_VF_ATQLEN1_ATQENABLE_MASK); + } + return FALSE; +} + +/** + * i40e_aq_queue_shutdown + * @hw: pointer to the hw struct + * @unloading: is the driver unloading itself + * + * Tell the Firmware that we're shutting down the AdminQ and whether + * or not the driver is unloading as well.
+ **/ +enum i40e_status_code i40e_aq_queue_shutdown(struct i40e_hw *hw, + bool unloading) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_queue_shutdown *cmd = + (struct i40e_aqc_queue_shutdown *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_queue_shutdown); + + if (unloading) + cmd->driver_unloading = CPU_TO_LE32(I40E_AQ_DRIVER_UNLOADING); + status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL); + + return status; +} +#ifdef X722_SUPPORT + +/** + * i40e_aq_get_set_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set TRUE, for VSI table set FALSE + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * @set: set TRUE to set the table, FALSE to get the table + * + * Internal function to get or set RSS look up table + **/ +static enum i40e_status_code i40e_aq_get_set_rss_lut(struct i40e_hw *hw, + u16 vsi_id, bool pf_lut, + u8 *lut, u16 lut_size, + bool set) +{ + enum i40e_status_code status; + struct i40e_aq_desc desc; + struct i40e_aqc_get_set_rss_lut *cmd_resp = + (struct i40e_aqc_get_set_rss_lut *)&desc.params.raw; + + if (set) + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_rss_lut); + else + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_rss_lut); + + /* Indirect command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD); + + cmd_resp->vsi_id = + CPU_TO_LE16((u16)((vsi_id << + I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT) & + I40E_AQC_SET_RSS_LUT_VSI_ID_MASK)); + cmd_resp->vsi_id |= CPU_TO_LE16((u16)I40E_AQC_SET_RSS_LUT_VSI_VALID); + + if (pf_lut) + cmd_resp->flags |= CPU_TO_LE16((u16) + ((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF << + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) & + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK)); + else + cmd_resp->flags |= CPU_TO_LE16((u16) + ((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI << + 
I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) & + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK)); + + /* addr_high is a 32-bit field paired with I40E_LO_DWORD: use the upper dword */ + cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_DWORD((u64)lut)); + cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((u64)lut)); + + status = i40e_asq_send_command(hw, &desc, lut, lut_size, NULL); + + return status; +} + +/** + * i40e_aq_get_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set TRUE, for VSI table set FALSE + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * + * get the RSS lookup table, PF or VSI type + **/ +enum i40e_status_code i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 vsi_id, + bool pf_lut, u8 *lut, u16 lut_size) +{ + return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, + FALSE); +} + +/** + * i40e_aq_set_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set TRUE, for VSI table set FALSE + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * + * set the RSS lookup table, PF or VSI type + **/ +enum i40e_status_code i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 vsi_id, + bool pf_lut, u8 *lut, u16 lut_size) +{ + return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, TRUE); +} + +/** + * i40e_aq_get_set_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * @set: set TRUE to set the key, FALSE to get the key + * + * Internal function to get or set the RSS key per VSI + **/ +static enum i40e_status_code i40e_aq_get_set_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key, + bool set) +{ + enum i40e_status_code status; + struct i40e_aq_desc desc; + struct i40e_aqc_get_set_rss_key *cmd_resp = + (struct i40e_aqc_get_set_rss_key *)&desc.params.raw; + u16 key_size = sizeof(struct i40e_aqc_get_set_rss_key_data); + + if (set) + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_rss_key); +
else + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_rss_key); + + /* Indirect command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD); + + cmd_resp->vsi_id = + CPU_TO_LE16((u16)((vsi_id << + I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT) & + I40E_AQC_SET_RSS_KEY_VSI_ID_MASK)); + cmd_resp->vsi_id |= CPU_TO_LE16((u16)I40E_AQC_SET_RSS_KEY_VSI_VALID); + /* addr_high is a 32-bit field paired with I40E_LO_DWORD: use the upper dword */ + cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_DWORD((u64)key)); + cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((u64)key)); + + status = i40e_asq_send_command(hw, &desc, key, key_size, NULL); + + return status; +} + +/** + * i40e_aq_get_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * + * get the RSS key per VSI + **/ +enum i40e_status_code i40e_aq_get_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key) +{ + return i40e_aq_get_set_rss_key(hw, vsi_id, key, FALSE); +} + +/** + * i40e_aq_set_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * + * set the RSS key per VSI + **/ +enum i40e_status_code i40e_aq_set_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key) +{ + return i40e_aq_get_set_rss_key(hw, vsi_id, key, TRUE); +} +#endif /* X722_SUPPORT */ + +/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE.
+ * + * Typical work flow: + * + * IF NOT i40e_ptype_lookup[ptype].known + * THEN + * Packet is unknown + * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum i40e_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short */ +#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ + { PTYPE, \ + 1, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ + I40E_RX_PTYPE_##OUTER_FRAG, \ + I40E_RX_PTYPE_TUNNEL_##T, \ + I40E_RX_PTYPE_TUNNEL_END_##TE, \ + I40E_RX_PTYPE_##TEF, \ + I40E_RX_PTYPE_INNER_PROT_##I, \ + I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } + +#define I40E_PTT_UNUSED_ENTRY(PTYPE) \ + { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + +/* shorter macros makes the table fit but are terse */ +#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG +#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG +#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC + +/* Lookup table mapping the HW PTYPE to the bit field for decoding */ +struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = { + /* L2 Packet types */ + I40E_PTT_UNUSED_ENTRY(0), + I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), + I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(4), + I40E_PTT_UNUSED_ENTRY(5), + I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(8), + I40E_PTT_UNUSED_ENTRY(9), + I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, 
PAY3), + I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + + /* Non Tunneled IPv4 */ + I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(25), + I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(32), + I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(39), + I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(47), + I40E_PTT(48, IP, IPV4, NOF, 
IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(54), + I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(62), + I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(69), + I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, 
PAY3), + I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(77), + I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(84), + I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(91), + I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(98), + I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(105), + I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(107, 
IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(113), + I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(120), + I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(128), + I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(135), + 
I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(143), + I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(150), + I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* unused entries */ + I40E_PTT_UNUSED_ENTRY(154), + I40E_PTT_UNUSED_ENTRY(155), + I40E_PTT_UNUSED_ENTRY(156), + I40E_PTT_UNUSED_ENTRY(157), + I40E_PTT_UNUSED_ENTRY(158), + I40E_PTT_UNUSED_ENTRY(159), + + I40E_PTT_UNUSED_ENTRY(160), + I40E_PTT_UNUSED_ENTRY(161), + I40E_PTT_UNUSED_ENTRY(162), + I40E_PTT_UNUSED_ENTRY(163), + I40E_PTT_UNUSED_ENTRY(164), + I40E_PTT_UNUSED_ENTRY(165), + I40E_PTT_UNUSED_ENTRY(166), + I40E_PTT_UNUSED_ENTRY(167), + I40E_PTT_UNUSED_ENTRY(168), + I40E_PTT_UNUSED_ENTRY(169), + + I40E_PTT_UNUSED_ENTRY(170), + I40E_PTT_UNUSED_ENTRY(171), + I40E_PTT_UNUSED_ENTRY(172), + 
I40E_PTT_UNUSED_ENTRY(173), + I40E_PTT_UNUSED_ENTRY(174), + I40E_PTT_UNUSED_ENTRY(175), + I40E_PTT_UNUSED_ENTRY(176), + I40E_PTT_UNUSED_ENTRY(177), + I40E_PTT_UNUSED_ENTRY(178), + I40E_PTT_UNUSED_ENTRY(179), + + I40E_PTT_UNUSED_ENTRY(180), + I40E_PTT_UNUSED_ENTRY(181), + I40E_PTT_UNUSED_ENTRY(182), + I40E_PTT_UNUSED_ENTRY(183), + I40E_PTT_UNUSED_ENTRY(184), + I40E_PTT_UNUSED_ENTRY(185), + I40E_PTT_UNUSED_ENTRY(186), + I40E_PTT_UNUSED_ENTRY(187), + I40E_PTT_UNUSED_ENTRY(188), + I40E_PTT_UNUSED_ENTRY(189), + + I40E_PTT_UNUSED_ENTRY(190), + I40E_PTT_UNUSED_ENTRY(191), + I40E_PTT_UNUSED_ENTRY(192), + I40E_PTT_UNUSED_ENTRY(193), + I40E_PTT_UNUSED_ENTRY(194), + I40E_PTT_UNUSED_ENTRY(195), + I40E_PTT_UNUSED_ENTRY(196), + I40E_PTT_UNUSED_ENTRY(197), + I40E_PTT_UNUSED_ENTRY(198), + I40E_PTT_UNUSED_ENTRY(199), + + I40E_PTT_UNUSED_ENTRY(200), + I40E_PTT_UNUSED_ENTRY(201), + I40E_PTT_UNUSED_ENTRY(202), + I40E_PTT_UNUSED_ENTRY(203), + I40E_PTT_UNUSED_ENTRY(204), + I40E_PTT_UNUSED_ENTRY(205), + I40E_PTT_UNUSED_ENTRY(206), + I40E_PTT_UNUSED_ENTRY(207), + I40E_PTT_UNUSED_ENTRY(208), + I40E_PTT_UNUSED_ENTRY(209), + + I40E_PTT_UNUSED_ENTRY(210), + I40E_PTT_UNUSED_ENTRY(211), + I40E_PTT_UNUSED_ENTRY(212), + I40E_PTT_UNUSED_ENTRY(213), + I40E_PTT_UNUSED_ENTRY(214), + I40E_PTT_UNUSED_ENTRY(215), + I40E_PTT_UNUSED_ENTRY(216), + I40E_PTT_UNUSED_ENTRY(217), + I40E_PTT_UNUSED_ENTRY(218), + I40E_PTT_UNUSED_ENTRY(219), + + I40E_PTT_UNUSED_ENTRY(220), + I40E_PTT_UNUSED_ENTRY(221), + I40E_PTT_UNUSED_ENTRY(222), + I40E_PTT_UNUSED_ENTRY(223), + I40E_PTT_UNUSED_ENTRY(224), + I40E_PTT_UNUSED_ENTRY(225), + I40E_PTT_UNUSED_ENTRY(226), + I40E_PTT_UNUSED_ENTRY(227), + I40E_PTT_UNUSED_ENTRY(228), + I40E_PTT_UNUSED_ENTRY(229), + + I40E_PTT_UNUSED_ENTRY(230), + I40E_PTT_UNUSED_ENTRY(231), + I40E_PTT_UNUSED_ENTRY(232), + I40E_PTT_UNUSED_ENTRY(233), + I40E_PTT_UNUSED_ENTRY(234), + I40E_PTT_UNUSED_ENTRY(235), + I40E_PTT_UNUSED_ENTRY(236), + I40E_PTT_UNUSED_ENTRY(237), + I40E_PTT_UNUSED_ENTRY(238), + 
I40E_PTT_UNUSED_ENTRY(239), + + I40E_PTT_UNUSED_ENTRY(240), + I40E_PTT_UNUSED_ENTRY(241), + I40E_PTT_UNUSED_ENTRY(242), + I40E_PTT_UNUSED_ENTRY(243), + I40E_PTT_UNUSED_ENTRY(244), + I40E_PTT_UNUSED_ENTRY(245), + I40E_PTT_UNUSED_ENTRY(246), + I40E_PTT_UNUSED_ENTRY(247), + I40E_PTT_UNUSED_ENTRY(248), + I40E_PTT_UNUSED_ENTRY(249), + + I40E_PTT_UNUSED_ENTRY(250), + I40E_PTT_UNUSED_ENTRY(251), + I40E_PTT_UNUSED_ENTRY(252), + I40E_PTT_UNUSED_ENTRY(253), + I40E_PTT_UNUSED_ENTRY(254), + I40E_PTT_UNUSED_ENTRY(255) +}; + + +/** + * i40e_validate_mac_addr - Validate unicast MAC address + * @mac_addr: pointer to MAC address + * + * Tests a MAC address to ensure it is a valid Individual Address + **/ +enum i40e_status_code i40e_validate_mac_addr(u8 *mac_addr) +{ + enum i40e_status_code status = I40E_SUCCESS; + + DEBUGFUNC("i40e_validate_mac_addr"); + + /* Broadcast addresses ARE multicast addresses + * Make sure it is not a multicast address + * Reject the zero address + */ + if (I40E_IS_MULTICAST(mac_addr) || + (mac_addr[0] == 0 && mac_addr[1] == 0 && mac_addr[2] == 0 && + mac_addr[3] == 0 && mac_addr[4] == 0 && mac_addr[5] == 0)) + status = I40E_ERR_INVALID_MAC_ADDR; + + return status; +} + +/** + * i40e_init_shared_code - Initialize the shared code + * @hw: pointer to hardware structure + * + * This assigns the MAC type and PHY code and inits the NVM. + * Does not touch the hardware. This function must be called prior to any + * other function in the shared code. The i40e_hw structure should be + * memset to 0 prior to calling this function. 
The following fields in + * hw structure should be filled in prior to calling this function: + * hw_addr, back, device_id, vendor_id, subsystem_device_id, + * subsystem_vendor_id, and revision_id + **/ +enum i40e_status_code i40e_init_shared_code(struct i40e_hw *hw) +{ + enum i40e_status_code status = I40E_SUCCESS; + u32 port, ari, func_rid; + + DEBUGFUNC("i40e_init_shared_code"); + + i40e_set_mac_type(hw); + + switch (hw->mac.type) { + case I40E_MAC_XL710: +#ifdef X722_SUPPORT + case I40E_MAC_X722: +#endif + break; + default: + return I40E_ERR_DEVICE_NOT_SUPPORTED; + } + + hw->phy.get_link_info = TRUE; + + /* Determine port number and PF number*/ + port = (rd32(hw, I40E_PFGEN_PORTNUM) & I40E_PFGEN_PORTNUM_PORT_NUM_MASK) + >> I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT; + hw->port = (u8)port; + ari = (rd32(hw, I40E_GLPCI_CAPSUP) & I40E_GLPCI_CAPSUP_ARI_EN_MASK) >> + I40E_GLPCI_CAPSUP_ARI_EN_SHIFT; + func_rid = rd32(hw, I40E_PF_FUNC_RID); + if (ari) + hw->pf_id = (u8)(func_rid & 0xff); + else + hw->pf_id = (u8)(func_rid & 0x7); + + status = i40e_init_nvm(hw); + return status; +} + +/** + * i40e_aq_mac_address_read - Retrieve the MAC addresses + * @hw: pointer to the hw struct + * @flags: a return indicator of what addresses were added to the addr store + * @addrs: the requestor's mac addr store + * @cmd_details: pointer to command details structure or NULL + **/ +static enum i40e_status_code i40e_aq_mac_address_read(struct i40e_hw *hw, + u16 *flags, + struct i40e_aqc_mac_address_read_data *addrs, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_mac_address_read *cmd_data = + (struct i40e_aqc_mac_address_read *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_mac_address_read); + desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF); + + status = i40e_asq_send_command(hw, &desc, addrs, + sizeof(*addrs), cmd_details); + *flags = LE16_TO_CPU(cmd_data->command_flags); + + return status; 
+} + +/** + * i40e_aq_mac_address_write - Change the MAC addresses + * @hw: pointer to the hw struct + * @flags: indicates which MAC to be written + * @mac_addr: address to write + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_mac_address_write(struct i40e_hw *hw, + u16 flags, u8 *mac_addr, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_mac_address_write *cmd_data = + (struct i40e_aqc_mac_address_write *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_mac_address_write); + cmd_data->command_flags = CPU_TO_LE16(flags); + cmd_data->mac_sah = CPU_TO_LE16((u16)mac_addr[0] << 8 | mac_addr[1]); + cmd_data->mac_sal = CPU_TO_LE32(((u32)mac_addr[2] << 24) | + ((u32)mac_addr[3] << 16) | + ((u32)mac_addr[4] << 8) | + mac_addr[5]); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_get_mac_addr - get MAC address + * @hw: pointer to the HW structure + * @mac_addr: pointer to MAC address + * + * Reads the adapter's MAC address from register + **/ +enum i40e_status_code i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr) +{ + struct i40e_aqc_mac_address_read_data addrs; + enum i40e_status_code status; + u16 flags = 0; + + status = i40e_aq_mac_address_read(hw, &flags, &addrs, NULL); + + if (flags & I40E_AQC_LAN_ADDR_VALID) + memcpy(mac_addr, &addrs.pf_lan_mac, sizeof(addrs.pf_lan_mac)); + + return status; +} + +/** + * i40e_get_port_mac_addr - get Port MAC address + * @hw: pointer to the HW structure + * @mac_addr: pointer to Port MAC address + * + * Reads the adapter's Port MAC address + **/ +enum i40e_status_code i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr) +{ + struct i40e_aqc_mac_address_read_data addrs; + enum i40e_status_code status; + u16 flags = 0; + + status = i40e_aq_mac_address_read(hw, &flags, &addrs, NULL); + if (status) + return status; + + if 
(flags & I40E_AQC_PORT_ADDR_VALID) + memcpy(mac_addr, &addrs.port_mac, sizeof(addrs.port_mac)); + else + status = I40E_ERR_INVALID_MAC_ADDR; + + return status; +} + +/** + * i40e_pre_tx_queue_cfg - pre tx queue configure + * @hw: pointer to the HW structure + * @queue: target pf queue index + * @enable: state change request + * + * Handles hw requirement to indicate intention to enable + * or disable target queue. + **/ +void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable) +{ + u32 abs_queue_idx = hw->func_caps.base_queue + queue; + u32 reg_block = 0; + u32 reg_val; + + if (abs_queue_idx >= 128) { + reg_block = abs_queue_idx / 128; + abs_queue_idx %= 128; + } + + reg_val = rd32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block)); + reg_val &= ~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK; + reg_val |= (abs_queue_idx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT); + + if (enable) + reg_val |= I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK; + else + reg_val |= I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK; + + wr32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block), reg_val); +} + +/** + * i40e_read_pba_string - Reads part number string from EEPROM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the EEPROM + * @pba_num_size: part number string buffer length + * + * Reads the part number string from the EEPROM. 
+ **/ +enum i40e_status_code i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num, + u32 pba_num_size) +{ + enum i40e_status_code status = I40E_SUCCESS; + u16 pba_word = 0; + u16 pba_size = 0; + u16 pba_ptr = 0; + u16 i = 0; + + status = i40e_read_nvm_word(hw, I40E_SR_PBA_FLAGS, &pba_word); + if ((status != I40E_SUCCESS) || (pba_word != 0xFAFA)) { + DEBUGOUT("Failed to read PBA flags or flag is invalid.\n"); + return status; + } + + status = i40e_read_nvm_word(hw, I40E_SR_PBA_BLOCK_PTR, &pba_ptr); + if (status != I40E_SUCCESS) { + DEBUGOUT("Failed to read PBA Block pointer.\n"); + return status; + } + + status = i40e_read_nvm_word(hw, pba_ptr, &pba_size); + if (status != I40E_SUCCESS) { + DEBUGOUT("Failed to read PBA Block size.\n"); + return status; + } + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size) + */ + pba_size--; + if (pba_num_size < (((u32)pba_size * 2) + 1)) { + DEBUGOUT("Buffer to small for PBA data.\n"); + return I40E_ERR_PARAM; + } + + for (i = 0; i < pba_size; i++) { + status = i40e_read_nvm_word(hw, (pba_ptr + 1) + i, &pba_word); + if (status != I40E_SUCCESS) { + DEBUGOUT1("Failed to read PBA Block word %d.\n", i); + return status; + } + + pba_num[(i * 2)] = (pba_word >> 8) & 0xFF; + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + pba_num[(pba_size * 2)] = '\0'; + + return status; +} + +/** + * i40e_get_media_type - Gets media type + * @hw: pointer to the hardware structure + **/ +static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw) +{ + enum i40e_media_type media; + + switch (hw->phy.link_info.phy_type) { + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + case I40E_PHY_TYPE_1000BASE_SX: + case I40E_PHY_TYPE_1000BASE_LX: + case I40E_PHY_TYPE_40GBASE_SR4: + case I40E_PHY_TYPE_40GBASE_LR4: + media = I40E_MEDIA_TYPE_FIBER; + break; + case I40E_PHY_TYPE_100BASE_TX: + case I40E_PHY_TYPE_1000BASE_T: + case I40E_PHY_TYPE_10GBASE_T: + media = I40E_MEDIA_TYPE_BASET; + break; + case 
I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_40GBASE_CR4_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + case I40E_PHY_TYPE_40GBASE_CR4: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + case I40E_PHY_TYPE_40GBASE_AOC: + case I40E_PHY_TYPE_10GBASE_AOC: + media = I40E_MEDIA_TYPE_DA; + break; + case I40E_PHY_TYPE_1000BASE_KX: + case I40E_PHY_TYPE_10GBASE_KX4: + case I40E_PHY_TYPE_10GBASE_KR: + case I40E_PHY_TYPE_40GBASE_KR4: + case I40E_PHY_TYPE_20GBASE_KR2: + media = I40E_MEDIA_TYPE_BACKPLANE; + break; + case I40E_PHY_TYPE_SGMII: + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_XLAUI: + case I40E_PHY_TYPE_XLPPI: + default: + media = I40E_MEDIA_TYPE_UNKNOWN; + break; + } + + return media; +} + +#define I40E_PF_RESET_WAIT_COUNT 200 +/** + * i40e_pf_reset - Reset the PF + * @hw: pointer to the hardware structure + * + * Assuming someone else has triggered a global reset, + * assure the global reset is complete and then reset the PF + **/ +enum i40e_status_code i40e_pf_reset(struct i40e_hw *hw) +{ + u32 cnt = 0; + u32 cnt1 = 0; + u32 reg = 0; + u32 grst_del; + + /* Poll for Global Reset steady state in case of recent GRST. + * The grst delay value is in 100ms units, and we'll wait a + * couple counts longer to be sure we don't just miss the end. 
+ */ + grst_del = (rd32(hw, I40E_GLGEN_RSTCTL) & + I40E_GLGEN_RSTCTL_GRSTDEL_MASK) >> + I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT; + for (cnt = 0; cnt < grst_del + 10; cnt++) { + reg = rd32(hw, I40E_GLGEN_RSTAT); + if (!(reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK)) + break; + i40e_msec_delay(100); + } + if (reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK) { + DEBUGOUT("Global reset polling failed to complete.\n"); + return I40E_ERR_RESET_FAILED; + } + + /* Now Wait for the FW to be ready */ + for (cnt1 = 0; cnt1 < I40E_PF_RESET_WAIT_COUNT; cnt1++) { + reg = rd32(hw, I40E_GLNVM_ULD); + reg &= (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK); + if (reg == (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK)) { + DEBUGOUT1("Core and Global modules ready %d\n", cnt1); + break; + } + i40e_msec_delay(10); + } + if (!(reg & (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK))) { + DEBUGOUT("wait for FW Reset complete timedout\n"); + DEBUGOUT1("I40E_GLNVM_ULD = 0x%x\n", reg); + return I40E_ERR_RESET_FAILED; + } + + /* If there was a Global Reset in progress when we got here, + * we don't need to do the PF Reset + */ + if (!cnt) { + reg = rd32(hw, I40E_PFGEN_CTRL); + wr32(hw, I40E_PFGEN_CTRL, + (reg | I40E_PFGEN_CTRL_PFSWR_MASK)); + for (cnt = 0; cnt < I40E_PF_RESET_WAIT_COUNT; cnt++) { + reg = rd32(hw, I40E_PFGEN_CTRL); + if (!(reg & I40E_PFGEN_CTRL_PFSWR_MASK)) + break; + i40e_msec_delay(1); + } + if (reg & I40E_PFGEN_CTRL_PFSWR_MASK) { + DEBUGOUT("PF reset polling failed to complete.\n"); + return I40E_ERR_RESET_FAILED; + } + } + + i40e_clear_pxe_mode(hw); + + + return I40E_SUCCESS; +} + +/** + * i40e_clear_hw - clear out any left over hw state + * @hw: pointer to the hw struct + * + * Clear queues and interrupts, typically called at init time, + * but after the capabilities have been found so we know how many + * queues and msix vectors have been allocated. 
+ **/ +void i40e_clear_hw(struct i40e_hw *hw) +{ + u32 num_queues, base_queue; + u32 num_pf_int; + u32 num_vf_int; + u32 num_vfs; + u32 i, j; + u32 val; + u32 eol = 0x7ff; + + /* get number of interrupts, queues, and vfs */ + val = rd32(hw, I40E_GLPCI_CNF2); + num_pf_int = (val & I40E_GLPCI_CNF2_MSI_X_PF_N_MASK) >> + I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT; + num_vf_int = (val & I40E_GLPCI_CNF2_MSI_X_VF_N_MASK) >> + I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT; + + val = rd32(hw, I40E_PFLAN_QALLOC); + base_queue = (val & I40E_PFLAN_QALLOC_FIRSTQ_MASK) >> + I40E_PFLAN_QALLOC_FIRSTQ_SHIFT; + j = (val & I40E_PFLAN_QALLOC_LASTQ_MASK) >> + I40E_PFLAN_QALLOC_LASTQ_SHIFT; + if (val & I40E_PFLAN_QALLOC_VALID_MASK) + num_queues = (j - base_queue) + 1; + else + num_queues = 0; + + val = rd32(hw, I40E_PF_VT_PFALLOC); + i = (val & I40E_PF_VT_PFALLOC_FIRSTVF_MASK) >> + I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT; + j = (val & I40E_PF_VT_PFALLOC_LASTVF_MASK) >> + I40E_PF_VT_PFALLOC_LASTVF_SHIFT; + if (val & I40E_PF_VT_PFALLOC_VALID_MASK) + num_vfs = (j - i) + 1; + else + num_vfs = 0; + + /* stop all the interrupts */ + wr32(hw, I40E_PFINT_ICR0_ENA, 0); + val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_DYN_CTLN(i), val); + + /* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */ + val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; + wr32(hw, I40E_PFINT_LNKLST0, val); + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_LNKLSTN(i), val); + val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT; + for (i = 0; i < num_vfs; i++) + wr32(hw, I40E_VPINT_LNKLST0(i), val); + for (i = 0; i < num_vf_int - 2; i++) + wr32(hw, I40E_VPINT_LNKLSTN(i), val); + + /* warn the HW of the coming Tx disables */ + for (i = 0; i < num_queues; i++) { + u32 abs_queue_idx = base_queue + i; + u32 reg_block = 0; + + if (abs_queue_idx >= 128) { + reg_block = abs_queue_idx / 128; + abs_queue_idx %= 128; + } + + val = rd32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block)); + val &= 
~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK; + val |= (abs_queue_idx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT); + val |= I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK; + + wr32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block), val); + } + i40e_usec_delay(400); + + /* stop all the queues */ + for (i = 0; i < num_queues; i++) { + wr32(hw, I40E_QINT_TQCTL(i), 0); + wr32(hw, I40E_QTX_ENA(i), 0); + wr32(hw, I40E_QINT_RQCTL(i), 0); + wr32(hw, I40E_QRX_ENA(i), 0); + } + + /* short wait for all queue disables to settle */ + i40e_usec_delay(50); +} + +/** + * i40e_clear_pxe_mode - clear pxe operations mode + * @hw: pointer to the hw struct + * + * Make sure all PXE mode settings are cleared, including things + * like descriptor fetch/write-back mode. + **/ +void i40e_clear_pxe_mode(struct i40e_hw *hw) +{ + if (i40e_check_asq_alive(hw)) + i40e_aq_clear_pxe_mode(hw, NULL); +} + +/** + * i40e_led_is_mine - helper to find matching led + * @hw: pointer to the hw struct + * @idx: index into GPIO registers + * + * returns: 0 if no match, otherwise the value of the GPIO_CTL register + */ +static u32 i40e_led_is_mine(struct i40e_hw *hw, int idx) +{ + u32 gpio_val = 0; + u32 port; + + if (!hw->func_caps.led[idx]) + return 0; + + gpio_val = rd32(hw, I40E_GLGEN_GPIO_CTL(idx)); + port = (gpio_val & I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK) >> + I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT; + + /* if PRT_NUM_NA is 1 then this LED is not port specific, OR + * if it is not our port then ignore + */ + if ((gpio_val & I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK) || + (port != hw->port)) + return 0; + + return gpio_val; +} + +#define I40E_COMBINED_ACTIVITY 0xA +#define I40E_FILTER_ACTIVITY 0xE +#define I40E_LINK_ACTIVITY 0xC +#define I40E_MAC_ACTIVITY 0xD +#define I40E_LED0 22 + +/** + * i40e_led_get - return current on/off mode + * @hw: pointer to the hw struct + * + * The value returned is the 'mode' field as defined in the + * GPIO register definitions: 0x0 = off, 0xf = on, and other + * values are variations of possible behaviors relating to + * 
blink, link, and wire. + **/ +u32 i40e_led_get(struct i40e_hw *hw) +{ + u32 current_mode = 0; + u32 mode = 0; + int i; + + /* as per the documentation GPIO 22-29 are the LED + * GPIO pins named LED0..LED7 + */ + for (i = I40E_LED0; i <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; i++) { + u32 gpio_val = i40e_led_is_mine(hw, i); + + if (!gpio_val) + continue; + + /* ignore gpio LED src mode entries related to the activity + * LEDs + */ + current_mode = ((gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) + >> I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT); + switch (current_mode) { + case I40E_COMBINED_ACTIVITY: + case I40E_FILTER_ACTIVITY: + case I40E_MAC_ACTIVITY: + continue; + default: + break; + } + + mode = (gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) >> + I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT; + break; + } + + return mode; +} + +/** + * i40e_led_set - set new on/off mode + * @hw: pointer to the hw struct + * @mode: 0=off, 0xf=on (else see manual for mode details) + * @blink: TRUE if the LED should blink when on, FALSE if steady + * + * if this function is used to turn on the blink it should + * be used to disable the blink when restoring the original state. 
+ *
+ * Only the first LED that i40e_led_is_mine() accepts and that is not in a
+ * combined/filter/MAC activity mode is rewritten. A mode with bits set
+ * above the low nibble is logged but not rejected; it is masked to the
+ * LED_MODE field before being written. Blink is forced off when mode is
+ * I40E_LINK_ACTIVITY.
+ **/
+void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink)
+{
+	u32 current_mode = 0;
+	int i;
+
+	if (mode & 0xfffffff0) {
+		DEBUGOUT1("invalid mode passed in %X\n", mode);
+	}
+
+	/* as per the documentation GPIO 22-29 are the LED
+	 * GPIO pins named LED0..LED7
+	 */
+	for (i = I40E_LED0; i <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; i++) {
+		u32 gpio_val = i40e_led_is_mine(hw, i);
+
+		if (!gpio_val)
+			continue;
+
+		/* ignore gpio LED src mode entries related to the activity
+		 * LEDs
+		 */
+		current_mode = ((gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK)
+				>> I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT);
+		switch (current_mode) {
+		case I40E_COMBINED_ACTIVITY:
+		case I40E_FILTER_ACTIVITY:
+		case I40E_MAC_ACTIVITY:
+			continue;
+		default:
+			break;
+		}
+
+		gpio_val &= ~I40E_GLGEN_GPIO_CTL_LED_MODE_MASK;
+		/* this & is a bit of paranoia, but serves as a range check */
+		gpio_val |= ((mode << I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT) &
+			     I40E_GLGEN_GPIO_CTL_LED_MODE_MASK);
+
+		if (mode == I40E_LINK_ACTIVITY)
+			blink = FALSE;
+
+		if (blink)
+			gpio_val |= BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT);
+		else
+			gpio_val &= ~BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT);
+
+		wr32(hw, I40E_GLGEN_GPIO_CTL(i), gpio_val);
+		break; /* stop after the first LED that was updated */
+	}
+}
+
+/* Admin command wrappers */
+
+/**
+ * i40e_aq_get_phy_capabilities
+ * @hw: pointer to the hw struct
+ * @abilities: structure for PHY capabilities to be filled
+ * @qualified_modules: report Qualified Modules
+ * @report_init: report init capabilities (active are default)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns the various PHY abilities supported on the Port.
+ *
+ * Returns I40E_ERR_PARAM when @abilities is NULL. After the command is
+ * sent, an admin queue last-status of I40E_AQ_RC_EIO replaces the send
+ * status with I40E_ERR_UNKNOWN_PHY.
+ **/
+enum i40e_status_code i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
+			bool qualified_modules, bool report_init,
+			struct i40e_aq_get_phy_abilities_resp *abilities,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	enum i40e_status_code status;
+	u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp);
+
+	if (!abilities)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_phy_abilities);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (abilities_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	if (qualified_modules)
+		desc.params.external.param0 |=
+			CPU_TO_LE32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES);
+
+	if (report_init)
+		desc.params.external.param0 |=
+			CPU_TO_LE32(I40E_AQ_PHY_REPORT_INITIAL_VALUES);
+
+	status = i40e_asq_send_command(hw, &desc, abilities, abilities_size,
+				    cmd_details);
+
+	if (hw->aq.asq_last_status == I40E_AQ_RC_EIO)
+		status = I40E_ERR_UNKNOWN_PHY;
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_phy_config
+ * @hw: pointer to the hw struct
+ * @config: structure with PHY configuration to be set
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set the various PHY configuration parameters
+ * supported on the Port. One or more of the Set PHY config parameters may be
+ * ignored in an MFP mode as the PF may not have the privilege to set some
+ * of the PHY Config parameters. This status will be indicated by the
+ * command response.
+ *
+ * Returns I40E_ERR_PARAM when @config is NULL. The caller's structure is
+ * copied as-is into the descriptor parameter area.
+ **/
+enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
+				struct i40e_aq_set_phy_config *config,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aq_set_phy_config *cmd =
+		(struct i40e_aq_set_phy_config *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (!config)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_config);
+
+	*cmd = *config;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_set_fc
+ * @hw: pointer to the hw struct
+ * @aq_failures: out: bitmask of I40E_SET_FC_AQ_FAIL_* flags recording which
+ *               admin queue step failed; cleared on entry and dereferenced
+ *               unconditionally, so it must not be NULL
+ * @atomic_restart: when set, I40E_AQ_PHY_ENABLE_ATOMIC_LINK is added to the
+ *                  new configuration so the link restarts with it
+ *
+ * Set the requested flow control mode using set_phy_config.
+ *
+ * The mode is taken from hw->fc.requested_mode. The PHY is only
+ * reconfigured when the resulting pause bits differ from the current
+ * abilities. The returned status is that of the final link info update
+ * (or of the initial get-capabilities call when that fails); a failed
+ * set_phy_config is reported only through @aq_failures.
+ **/
+enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
+				  bool atomic_restart)
+{
+	enum i40e_fc_mode fc_mode = hw->fc.requested_mode;
+	struct i40e_aq_get_phy_abilities_resp abilities;
+	struct i40e_aq_set_phy_config config;
+	enum i40e_status_code status;
+	u8 pause_mask = 0x0;
+
+	*aq_failures = 0x0;
+
+	switch (fc_mode) {
+	case I40E_FC_FULL:
+		pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_TX;
+		pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_RX;
+		break;
+	case I40E_FC_RX_PAUSE:
+		pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_RX;
+		break;
+	case I40E_FC_TX_PAUSE:
+		pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_TX;
+		break;
+	default:
+		break; /* any other mode leaves both pause bits clear */
+	}
+
+	/* Get the current phy config */
+	status = i40e_aq_get_phy_capabilities(hw, FALSE, false, &abilities,
+					      NULL);
+	if (status) {
+		*aq_failures |= I40E_SET_FC_AQ_FAIL_GET;
+		return status;
+	}
+
+	memset(&config, 0, sizeof(config));
+	/* clear the old pause settings */
+	config.abilities = abilities.abilities & ~(I40E_AQ_PHY_FLAG_PAUSE_TX) &
+			   ~(I40E_AQ_PHY_FLAG_PAUSE_RX);
+	/* set the new abilities */
+	config.abilities |= pause_mask;
+	/* If the abilities have changed, then set the new config */
+	if (config.abilities != abilities.abilities) {
+		/* Auto restart link so settings take effect */
+		if (atomic_restart)
+			config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
+		/* Copy over all the old settings */
+		config.phy_type = abilities.phy_type;
+		config.link_speed = abilities.link_speed;
+		config.eee_capability = abilities.eee_capability;
+		config.eeer = abilities.eeer_val;
+		config.low_power_ctrl = abilities.d3_lpan;
+		status = i40e_aq_set_phy_config(hw, &config, NULL);
+
+		if (status)
+			*aq_failures |= I40E_SET_FC_AQ_FAIL_SET;
+	}
+	/* Update the link info */
+	status = i40e_update_link_info(hw); /* overwrites any set_phy_config status */
+	if (status) {
+		/* Wait a little bit (on 40G cards it sometimes takes a really
+		 * long time for link to come back from the atomic reset)
+		 * and try once more
+		 */
+		i40e_msec_delay(1000);
+		status = i40e_update_link_info(hw);
+	}
+	if (status)
+		*aq_failures |= I40E_SET_FC_AQ_FAIL_UPDATE;
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_mac_config
+ * @hw: pointer to the hw struct
+ * @max_frame_size: Maximum Frame Size to be supported by the port
+ * @crc_en: Tell HW to append a CRC to outgoing frames
+ * @pacing: Pacing configurations
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Configure MAC settings for frame size, jumbo frame support and the
+ * addition of a CRC by the hardware.
+ *
+ * Returns I40E_ERR_PARAM when @max_frame_size is 0. Only the low four
+ * bits of @pacing are used; they are placed at bits 3..6 of the params
+ * byte.
+ **/
+enum i40e_status_code i40e_aq_set_mac_config(struct i40e_hw *hw,
+				u16 max_frame_size,
+				bool crc_en, u16 pacing,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aq_set_mac_config *cmd =
+		(struct i40e_aq_set_mac_config *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (max_frame_size == 0)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_mac_config);
+
+	cmd->max_frame_size = CPU_TO_LE16(max_frame_size);
+	cmd->params = ((u8)pacing & 0x0F) << 3;
+	if (crc_en)
+		cmd->params |= I40E_AQ_SET_MAC_CONFIG_CRC_EN;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_clear_pxe_mode
+ * @hw: pointer to the hw struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Tell the firmware that the driver is taking over from PXE
+ *
+ * I40E_GLLAN_RCTL_0 is written with 0x1 after the command is sent,
+ * whether or not the command succeeded.
+ **/
+enum i40e_status_code i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	enum i40e_status_code status;
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_clear_pxe *cmd =
+		(struct i40e_aqc_clear_pxe *)&desc.params.raw;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_clear_pxe_mode);
+
+	cmd->rx_cnt = 0x2;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	wr32(hw, I40E_GLLAN_RCTL_0, 0x1);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_link_restart_an
+ * @hw: pointer to the hw struct
+ * @enable_link: if TRUE: enable link, if FALSE: disable link
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Sets up the link and restarts the Auto-Negotiation over the link.
+ **/
+enum i40e_status_code i40e_aq_set_link_restart_an(struct i40e_hw *hw,
+		bool enable_link, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_link_restart_an *cmd =
+		(struct i40e_aqc_set_link_restart_an *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_link_restart_an);
+
+	cmd->command = I40E_AQ_PHY_RESTART_AN;
+	if (enable_link)
+		cmd->command |= I40E_AQ_PHY_LINK_ENABLE;
+	else
+		cmd->command &= ~I40E_AQ_PHY_LINK_ENABLE;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_link_info
+ * @hw: pointer to the hw struct
+ * @enable_lse: enable/disable LinkStatusEvent reporting
+ * @link: pointer to link status structure - optional
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Returns the link status of the adapter.
+ *
+ * On success the previous hw->phy.link_info is saved to
+ * hw->phy.link_info_old, hw->phy.link_info, hw->phy.media_type and
+ * hw->fc.current_mode are refreshed from the response, the result is
+ * copied to @link when it is not NULL, and hw->phy.get_link_info is
+ * cleared. On failure none of that state is touched.
+ **/
+enum i40e_status_code i40e_aq_get_link_info(struct i40e_hw *hw,
+				bool enable_lse, struct i40e_link_status *link,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_get_link_status *resp =
+		(struct i40e_aqc_get_link_status *)&desc.params.raw;
+	struct i40e_link_status *hw_link_info = &hw->phy.link_info;
+	enum i40e_status_code status;
+	bool tx_pause, rx_pause;
+	u16 command_flags;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_link_status);
+
+	if (enable_lse)
+		command_flags = I40E_AQ_LSE_ENABLE;
+	else
+		command_flags = I40E_AQ_LSE_DISABLE;
+	resp->command_flags = CPU_TO_LE16(command_flags);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (status != I40E_SUCCESS)
+		goto aq_get_link_info_exit;
+
+	/* save off old link status information */
+	i40e_memcpy(&hw->phy.link_info_old, hw_link_info,
+		    sizeof(*hw_link_info), I40E_NONDMA_TO_NONDMA);
+
+	/* update link status */
+	hw_link_info->phy_type = (enum i40e_aq_phy_type)resp->phy_type;
+	hw->phy.media_type = i40e_get_media_type(hw);
+	hw_link_info->link_speed = (enum i40e_aq_link_speed)resp->link_speed;
+	hw_link_info->link_info = resp->link_info;
+	hw_link_info->an_info = resp->an_info;
+	hw_link_info->ext_info = resp->ext_info;
+	hw_link_info->loopback = resp->loopback;
+	hw_link_info->max_frame_size = LE16_TO_CPU(resp->max_frame_size);
+	hw_link_info->pacing = resp->config & I40E_AQ_CONFIG_PACING_MASK;
+
+	/* update fc info */
+	tx_pause = !!(resp->an_info & I40E_AQ_LINK_PAUSE_TX);
+	rx_pause = !!(resp->an_info & I40E_AQ_LINK_PAUSE_RX);
+	if (tx_pause & rx_pause) /* both flags were normalized with !! above */
+		hw->fc.current_mode = I40E_FC_FULL;
+	else if (tx_pause)
+		hw->fc.current_mode = I40E_FC_TX_PAUSE;
+	else if (rx_pause)
+		hw->fc.current_mode = I40E_FC_RX_PAUSE;
+	else
+		hw->fc.current_mode = I40E_FC_NONE;
+
+	if (resp->config & I40E_AQ_CONFIG_CRC_ENA)
+		hw_link_info->crc_enable = TRUE;
+	else
+		hw_link_info->crc_enable = FALSE;
+
+	if (resp->command_flags & CPU_TO_LE16(I40E_AQ_LSE_ENABLE))
+		hw_link_info->lse_enable = TRUE;
+	else
+		hw_link_info->lse_enable = FALSE;
+
+	if ((hw->aq.fw_maj_ver < 4 || (hw->aq.fw_maj_ver == 4 &&
+	     hw->aq.fw_min_ver < 40)) && hw_link_info->phy_type == 0xE)
+		hw_link_info->phy_type = I40E_PHY_TYPE_10GBASE_SFPP_CU; /* firmware older than 4.40 reporting phy type 0xE is remapped */
+
+	/* save link status information */
+	if (link)
+		i40e_memcpy(link, hw_link_info, sizeof(*hw_link_info),
+			    I40E_NONDMA_TO_NONDMA);
+
+	/* flag cleared so helper functions don't call AQ again */
+	hw->phy.get_link_info = FALSE;
+
+aq_get_link_info_exit:
+	return status;
+}
+
+/**
+ * i40e_aq_set_phy_int_mask
+ * @hw: pointer to the hw struct
+ * @mask: interrupt mask to be set
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set link interrupt mask.
+ **/ +enum i40e_status_code i40e_aq_set_phy_int_mask(struct i40e_hw *hw, + u16 mask, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_phy_int_mask *cmd = + (struct i40e_aqc_set_phy_int_mask *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_int_mask); + + cmd->event_mask = CPU_TO_LE16(mask); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_local_advt_reg + * @hw: pointer to the hw struct + * @advt_reg: local AN advertisement register value + * @cmd_details: pointer to command details structure or NULL + * + * Get the Local AN advertisement register value. + **/ +enum i40e_status_code i40e_aq_get_local_advt_reg(struct i40e_hw *hw, + u64 *advt_reg, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_an_advt_reg *resp = + (struct i40e_aqc_an_advt_reg *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_local_advt_reg); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (status != I40E_SUCCESS) + goto aq_get_local_advt_reg_exit; + + *advt_reg = (u64)(LE16_TO_CPU(resp->local_an_reg1)) << 32; + *advt_reg |= LE32_TO_CPU(resp->local_an_reg0); + +aq_get_local_advt_reg_exit: + return status; +} + +/** + * i40e_aq_set_local_advt_reg + * @hw: pointer to the hw struct + * @advt_reg: local AN advertisement register value + * @cmd_details: pointer to command details structure or NULL + * + * Get the Local AN advertisement register value. 
+ **/ +enum i40e_status_code i40e_aq_set_local_advt_reg(struct i40e_hw *hw, + u64 advt_reg, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_an_advt_reg *cmd = + (struct i40e_aqc_an_advt_reg *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_local_advt_reg); + + cmd->local_an_reg0 = CPU_TO_LE32(I40E_LO_DWORD(advt_reg)); + cmd->local_an_reg1 = CPU_TO_LE16(I40E_HI_DWORD(advt_reg)); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_partner_advt + * @hw: pointer to the hw struct + * @advt_reg: AN partner advertisement register value + * @cmd_details: pointer to command details structure or NULL + * + * Get the link partner AN advertisement register value. + **/ +enum i40e_status_code i40e_aq_get_partner_advt(struct i40e_hw *hw, + u64 *advt_reg, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_an_advt_reg *resp = + (struct i40e_aqc_an_advt_reg *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_partner_advt); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (status != I40E_SUCCESS) + goto aq_get_partner_advt_exit; + + *advt_reg = (u64)(LE16_TO_CPU(resp->local_an_reg1)) << 32; + *advt_reg |= LE32_TO_CPU(resp->local_an_reg0); + +aq_get_partner_advt_exit: + return status; +} + +/** + * i40e_aq_set_lb_modes + * @hw: pointer to the hw struct + * @lb_modes: loopback mode to be set + * @cmd_details: pointer to command details structure or NULL + * + * Sets loopback modes. 
+ **/
+enum i40e_status_code i40e_aq_set_lb_modes(struct i40e_hw *hw,
+				u16 lb_modes,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_lb_mode *cmd =
+		(struct i40e_aqc_set_lb_mode *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_lb_modes);
+
+	cmd->lb_mode = CPU_TO_LE16(lb_modes);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_phy_debug
+ * @hw: pointer to the hw struct
+ * @cmd_flags: debug command flags
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Reset the external PHY.
+ *
+ * @cmd_flags is passed to the firmware unchanged as the command_flags
+ * byte of the set-PHY-debug command.
+ **/
+enum i40e_status_code i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_phy_debug *cmd =
+		(struct i40e_aqc_set_phy_debug *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_debug);
+
+	cmd->command_flags = cmd_flags;
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_vsi
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Add a VSI context to the hardware.
+ *
+ * vsi_ctx->info is sent as the command buffer. On success the seid,
+ * vsi_number, vsis_allocated and vsis_unallocated members of @vsi_ctx are
+ * filled in from the completion; on failure they are left untouched.
+**/
+enum i40e_status_code i40e_aq_add_vsi(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_get_update_vsi *cmd =
+		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp =
+		(struct i40e_aqc_add_get_update_vsi_completion *)
+		&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_add_vsi);
+
+	cmd->uplink_seid = CPU_TO_LE16(vsi_ctx->uplink_seid);
+	cmd->connection_type = vsi_ctx->connection_type;
+	cmd->vf_id = vsi_ctx->vf_num;
+	cmd->vsi_flags = CPU_TO_LE16(vsi_ctx->flags);
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+
+	status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				    sizeof(vsi_ctx->info), cmd_details);
+
+	if (status != I40E_SUCCESS)
+		goto aq_add_vsi_exit;
+
+	vsi_ctx->seid = LE16_TO_CPU(resp->seid);
+	vsi_ctx->vsi_number = LE16_TO_CPU(resp->vsi_number);
+	vsi_ctx->vsis_allocated = LE16_TO_CPU(resp->vsi_used);
+	vsi_ctx->vsis_unallocated = LE16_TO_CPU(resp->vsi_free);
+
+aq_add_vsi_exit:
+	return status;
+}
+
+/**
+ * i40e_aq_set_default_vsi
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Sends the set-VSI-promiscuous-modes command with
+ * I40E_AQC_SET_VSI_DEFAULT set in both the promiscuous and valid flags.
+ **/
+enum i40e_status_code i40e_aq_set_default_vsi(struct i40e_hw *hw,
+				u16 seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)
+		&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	cmd->promiscuous_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_DEFAULT);
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_DEFAULT);
+	cmd->seid = CPU_TO_LE16(seid);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ *
i40e_aq_set_vsi_unicast_promiscuous
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set: set unicast promiscuous enable/disable
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Only the unicast bit is marked valid, so the command carries no value
+ * for the other promiscuous flags.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
+				u16 seid, bool set,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 flags = 0;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	if (set)
+		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
+
+	cmd->promiscuous_flags = CPU_TO_LE16(flags);
+
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
+
+	cmd->seid = CPU_TO_LE16(seid);
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_vsi_multicast_promiscuous
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set: set multicast promiscuous enable/disable
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Only the multicast bit is marked valid, so the command carries no value
+ * for the other promiscuous flags.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw,
+				u16 seid, bool set, struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 flags = 0;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	if (set)
+		flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST;
+
+	cmd->promiscuous_flags = CPU_TO_LE16(flags);
+
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_MULTICAST);
+
+	cmd->seid = CPU_TO_LE16(seid);
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_vsi_mc_promisc_on_vlan
+ * @hw: pointer to the hw struct
+ * @seid: vsi
number
+ * @enable: set MAC L2 layer multicast promiscuous enable/disable for a given VLAN
+ * @vid: The VLAN tag filter - capture any multicast packet with this VLAN tag
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * @vid is sent with I40E_AQC_SET_VSI_VLAN_VALID OR-ed in.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 flags = 0;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	if (enable)
+		flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST;
+
+	cmd->promiscuous_flags = CPU_TO_LE16(flags);
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_MULTICAST);
+	cmd->seid = CPU_TO_LE16(seid);
+	cmd->vlan_tag = CPU_TO_LE16(vid | I40E_AQC_SET_VSI_VLAN_VALID);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_vsi_uc_promisc_on_vlan
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN
+ * @vid: The VLAN tag filter - capture any unicast packet with this VLAN tag
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * @vid is sent with I40E_AQC_SET_VSI_VLAN_VALID OR-ed in.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 flags = 0;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	if (enable)
+		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
+
+	cmd->promiscuous_flags = CPU_TO_LE16(flags);
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
+	cmd->seid = CPU_TO_LE16(seid);
+	cmd->vlan_tag = CPU_TO_LE16(vid | I40E_AQC_SET_VSI_VLAN_VALID);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_vsi_broadcast
+ * @hw: pointer to the hw struct
+ * @seid: vsi number
+ * @set_filter: TRUE to set filter, FALSE to clear filter
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set or clear the broadcast promiscuous flag (filter) for a given VSI.
+ **/
+enum i40e_status_code i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
+				u16 seid, bool set_filter,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
+		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					i40e_aqc_opc_set_vsi_promiscuous_modes);
+
+	if (set_filter)
+		cmd->promiscuous_flags
+			    |= CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+	else
+		cmd->promiscuous_flags
+			    &= CPU_TO_LE16(~I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+
+	cmd->valid_flags = CPU_TO_LE16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
+	cmd->seid = CPU_TO_LE16(seid);
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_vsi_params - get VSI configuration info
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * vsi_ctx->seid selects the VSI to query and vsi_ctx->info receives the
+ * response buffer. On success the seid, vsi_number, vsis_allocated and
+ * vsis_unallocated members of @vsi_ctx are refreshed from the completion;
+ * on failure they are left untouched.
+ **/
+enum i40e_status_code i40e_aq_get_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_get_update_vsi *cmd =
+		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp =
+		(struct i40e_aqc_add_get_update_vsi_completion *)
+		&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_vsi_parameters);
+
+	cmd->uplink_seid = CPU_TO_LE16(vsi_ctx->seid);
+
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+
+	/* Honor the caller's command details like the sibling wrappers do. */
+	status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				    sizeof(vsi_ctx->info), cmd_details);
+
+	if (status != I40E_SUCCESS)
+		goto aq_get_vsi_params_exit;
+
+	vsi_ctx->seid = LE16_TO_CPU(resp->seid);
+	vsi_ctx->vsi_number = LE16_TO_CPU(resp->vsi_number);
+	vsi_ctx->vsis_allocated = LE16_TO_CPU(resp->vsi_used);
+	vsi_ctx->vsis_unallocated = LE16_TO_CPU(resp->vsi_free);
+
+aq_get_vsi_params_exit:
+	return status;
+}
+
+/**
+ * i40e_aq_update_vsi_params
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a vsi context struct
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Update a VSI context.
+ *
+ * vsi_ctx->seid selects the VSI and vsi_ctx->info is sent as the command
+ * buffer. Nothing in @vsi_ctx is written back.
+ **/
+enum i40e_status_code i40e_aq_update_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_get_update_vsi *cmd =
+		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_update_vsi_parameters);
+	cmd->uplink_seid = CPU_TO_LE16(vsi_ctx->seid);
+
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+
+	status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
+				    sizeof(vsi_ctx->info), cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_switch_config
+ * @hw: pointer to the hardware structure
+ * @buf: pointer to the result buffer
+ * @buf_size: length of input buffer
+ * @start_seid: seid to start for the report, 0 == beginning
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Fill the buf with switch configuration returned from AdminQ command
+ **/
+enum i40e_status_code i40e_aq_get_switch_config(struct i40e_hw *hw,
+				struct i40e_aqc_get_switch_config_resp *buf,
+				u16 buf_size, u16 *start_seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct
i40e_aq_desc desc;
+	struct i40e_aqc_switch_seid *scfg =
+		(struct i40e_aqc_switch_seid *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_switch_config);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF);
+	if (buf_size > I40E_AQ_LARGE_BUF)
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+	scfg->seid = CPU_TO_LE16(*start_seid);
+
+	status = i40e_asq_send_command(hw, &desc, buf, buf_size, cmd_details);
+	*start_seid = LE16_TO_CPU(scfg->seid); /* written back even when the command failed */
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_firmware_version
+ * @hw: pointer to the hw struct
+ * @fw_major_version: firmware major version
+ * @fw_minor_version: firmware minor version
+ * @fw_build: firmware build number
+ * @api_major_version: major queue version
+ * @api_minor_version: minor queue version
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Get the firmware version from the admin queue commands
+ *
+ * Every output pointer is optional; NULL ones are skipped. Outputs are
+ * only written when the command succeeds.
+ **/
+enum i40e_status_code i40e_aq_get_firmware_version(struct i40e_hw *hw,
+				u16 *fw_major_version, u16 *fw_minor_version,
+				u32 *fw_build,
+				u16 *api_major_version, u16 *api_minor_version,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_get_version *resp =
+		(struct i40e_aqc_get_version *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	if (status == I40E_SUCCESS) {
+		if (fw_major_version != NULL)
+			*fw_major_version = LE16_TO_CPU(resp->fw_major);
+		if (fw_minor_version != NULL)
+			*fw_minor_version = LE16_TO_CPU(resp->fw_minor);
+		if (fw_build != NULL)
+			*fw_build = LE32_TO_CPU(resp->fw_build);
+		if (api_major_version != NULL)
+			*api_major_version = LE16_TO_CPU(resp->api_major);
+		if (api_minor_version != NULL)
+			*api_minor_version = LE16_TO_CPU(resp->api_minor);
+
+		/* A workaround to fix the API version in SW */
+		if (api_major_version
&& api_minor_version &&
+		    fw_major_version && fw_minor_version &&
+		    ((*api_major_version == 1) && (*api_minor_version == 1)) &&
+		    (((*fw_major_version == 4) && (*fw_minor_version >= 2)) ||
+		     (*fw_major_version > 4)))
+			*api_minor_version = 2; /* API 1.1 reported by firmware 4.2 or newer is bumped to 1.2 */
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_send_driver_version
+ * @hw: pointer to the hw struct
+ * @dv: driver's major, minor version
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Send the driver version to the firmware
+ *
+ * Returns I40E_ERR_PARAM when @dv is NULL. dv->driver_string is sent as
+ * the command buffer; the length sent stops at the first NUL byte, the
+ * first byte that is not below 0x80, or the end of the array, whichever
+ * comes first.
+ **/
+enum i40e_status_code i40e_aq_send_driver_version(struct i40e_hw *hw,
+				struct i40e_driver_version *dv,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_driver_version *cmd =
+		(struct i40e_aqc_driver_version *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 len;
+
+	if (dv == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_driver_version);
+
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD);
+	cmd->driver_major_ver = dv->major_version;
+	cmd->driver_minor_ver = dv->minor_version;
+	cmd->driver_build_ver = dv->build_version;
+	cmd->driver_subbuild_ver = dv->subbuild_version;
+
+	len = 0;
+	while (len < sizeof(dv->driver_string) &&
+	       (dv->driver_string[len] < 0x80) &&
+	       dv->driver_string[len])
+		len++;
+	status = i40e_asq_send_command(hw, &desc, dv->driver_string,
+				       len, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_get_link_status - get status of the HW network link
+ * @hw: pointer to the hw struct
+ * @link_up: pointer to bool (TRUE/FALSE = linkup/linkdown)
+ *
+ * Variable link_up TRUE if link is up, FALSE if link is down.
+ * The variable link_up is invalid if returned value of status != I40E_SUCCESS
+ *
+ * Side effect: LinkStatusEvent reporting becomes enabled
+ *
+ * The link info is only refreshed when hw->phy.get_link_info is set;
+ * otherwise the cached hw->phy.link_info is used. @link_up is written on
+ * every call, including when the refresh failed.
+ **/
+enum i40e_status_code i40e_get_link_status(struct i40e_hw *hw, bool *link_up)
+{
+	enum i40e_status_code status = I40E_SUCCESS;
+
+	if (hw->phy.get_link_info) {
+		status = i40e_update_link_info(hw);
+
+		if (status != I40E_SUCCESS)
+			i40e_debug(hw, I40E_DEBUG_LINK, "get link failed: status %d\n",
+				   status);
+	}
+
+	*link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
+
+	return status;
+}
+
+/**
+ * i40e_update_link_info - update status of the HW network link
+ * @hw: pointer to the hw struct
+ *
+ * Refreshes hw->phy.link_info through i40e_aq_get_link_info() (with
+ * LinkStatusEvent reporting enabled), then queries the PHY capabilities
+ * and copies their module_type into hw->phy.link_info.module_type.
+ * Returns the first failing status, or the status of the capabilities
+ * query.
+ **/
+enum i40e_status_code i40e_update_link_info(struct i40e_hw *hw)
+{
+	struct i40e_aq_get_phy_abilities_resp abilities;
+	enum i40e_status_code status = I40E_SUCCESS;
+
+	status = i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
+	if (status)
+		return status;
+
+	status = i40e_aq_get_phy_capabilities(hw, FALSE, false, &abilities,
+					      NULL);
+	if (status)
+		return status;
+
+	memcpy(hw->phy.link_info.module_type, &abilities.module_type,
+	       sizeof(hw->phy.link_info.module_type));
+
+	return status;
+}
+
+
+/**
+ * i40e_get_link_speed
+ * @hw: pointer to the hw struct
+ *
+ * Returns the link speed of the adapter.
+ *
+ * The link info is refreshed first when hw->phy.get_link_info is set;
+ * I40E_LINK_SPEED_UNKNOWN is returned when that refresh fails. Otherwise
+ * the cached hw->phy.link_info.link_speed is returned.
+ **/
+enum i40e_aq_link_speed i40e_get_link_speed(struct i40e_hw *hw)
+{
+	enum i40e_aq_link_speed speed = I40E_LINK_SPEED_UNKNOWN;
+	enum i40e_status_code status = I40E_SUCCESS;
+
+	if (hw->phy.get_link_info) {
+		status = i40e_aq_get_link_info(hw, TRUE, NULL, NULL);
+
+		if (status != I40E_SUCCESS)
+			goto i40e_link_speed_exit;
+	}
+
+	speed = hw->phy.link_info.link_speed;
+
+i40e_link_speed_exit:
+	return speed;
+}
+
+/**
+ * i40e_aq_add_veb - Insert a VEB between the VSI and the MAC
+ * @hw: pointer to the hw struct
+ * @uplink_seid: the MAC or other gizmo SEID
+ * @downlink_seid: the VSI SEID
+ * @enabled_tc: bitmap of TCs to be enabled
+ * @default_port: TRUE for default port VSI, FALSE for control port
+ * @enable_l2_filtering: TRUE to add L2 filter table rules to regular forwarding rules for cloud support
+ * @veb_seid: pointer to where to put the resulting VEB SEID
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This asks the FW to add a VEB between the uplink and downlink
+ * elements. If the uplink SEID is 0, this will be a floating VEB.
+ **/
+enum i40e_status_code i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
+				u16 downlink_seid, u8 enabled_tc,
+				bool default_port, bool enable_l2_filtering,
+				u16 *veb_seid,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aqc_add_veb_completion *completion;
+	struct i40e_aqc_add_veb *request;
+	struct i40e_aq_desc desc;
+	enum i40e_status_code ret;
+	u16 flags;
+
+	/* Either both SEIDs are given, or neither (floating VEB). */
+	if ((uplink_seid == 0) != (downlink_seid == 0))
+		return I40E_ERR_PARAM;
+
+	/* Request and completion share the descriptor's parameter area. */
+	request = (struct i40e_aqc_add_veb *)&desc.params.raw;
+	completion = (struct i40e_aqc_add_veb_completion *)&desc.params.raw;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_veb);
+
+	flags = default_port ? I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT :
+			       I40E_AQC_ADD_VEB_PORT_TYPE_DATA;
+	if (uplink_seid == 0)
+		flags |= I40E_AQC_ADD_VEB_FLOATING;
+	if (enable_l2_filtering)
+		flags |= I40E_AQC_ADD_VEB_ENABLE_L2_FILTER;
+
+	request->uplink_seid = CPU_TO_LE16(uplink_seid);
+	request->downlink_seid = CPU_TO_LE16(downlink_seid);
+	request->enable_tcs = enabled_tc;
+	request->veb_flags = CPU_TO_LE16(flags);
+
+	ret = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	/* The new SEID is reported only on success and only if wanted. */
+	if (ret == I40E_SUCCESS && veb_seid != NULL)
+		*veb_seid = LE16_TO_CPU(completion->veb_seid);
+
+	return ret;
+}
+
+/**
+ * i40e_aq_get_veb_parameters - Retrieve VEB parameters
+ * @hw: pointer to the hw struct
+ * @veb_seid: the SEID of the VEB to query
+ * @switch_id: the uplink switch id
+ * @floating: set to TRUE if the VEB is floating
+ * @statistic_index: index of the stats counter block for this VEB
+ * @vebs_used: number of VEB's used by function
+ * @vebs_free: total VEB's not reserved by any function
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This retrieves the parameters for a particular VEB, specified by
+ * veb_seid, and returns them to the caller.
+ **/ +enum i40e_status_code i40e_aq_get_veb_parameters(struct i40e_hw *hw, + u16 veb_seid, u16 *switch_id, + bool *floating, u16 *statistic_index, + u16 *vebs_used, u16 *vebs_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_get_veb_parameters_completion *cmd_resp = + (struct i40e_aqc_get_veb_parameters_completion *) + &desc.params.raw; + enum i40e_status_code status; + + if (veb_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_veb_parameters); + cmd_resp->seid = CPU_TO_LE16(veb_seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (status) + goto get_veb_exit; + + if (switch_id) + *switch_id = LE16_TO_CPU(cmd_resp->switch_id); + if (statistic_index) + *statistic_index = LE16_TO_CPU(cmd_resp->statistic_index); + if (vebs_used) + *vebs_used = LE16_TO_CPU(cmd_resp->vebs_used); + if (vebs_free) + *vebs_free = LE16_TO_CPU(cmd_resp->vebs_free); + if (floating) { + u16 flags = LE16_TO_CPU(cmd_resp->veb_flags); + + if (flags & I40E_AQC_ADD_VEB_FLOATING) + *floating = TRUE; + else + *floating = FALSE; + } + +get_veb_exit: + return status; +} + +/** + * i40e_aq_add_macvlan + * @hw: pointer to the hw struct + * @seid: VSI for the mac address + * @mv_list: list of macvlans to be added + * @count: length of the list + * @cmd_details: pointer to command details structure or NULL + * + * Add MAC/VLAN addresses to the HW filtering + **/ +enum i40e_status_code i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_add_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + enum i40e_status_code status; + u16 buf_size; + + if (count == 0 || !mv_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*mv_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, 
i40e_aqc_opc_add_macvlan); + cmd->num_addresses = CPU_TO_LE16(count); + cmd->seid[0] = CPU_TO_LE16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid); + cmd->seid[1] = 0; + cmd->seid[2] = 0; + + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, mv_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_remove_macvlan + * @hw: pointer to the hw struct + * @seid: VSI for the mac address + * @mv_list: list of macvlans to be removed + * @count: length of the list + * @cmd_details: pointer to command details structure or NULL + * + * Remove MAC/VLAN addresses from the HW filtering + **/ +enum i40e_status_code i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_remove_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + enum i40e_status_code status; + u16 buf_size; + + if (count == 0 || !mv_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*mv_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan); + cmd->num_addresses = CPU_TO_LE16(count); + cmd->seid[0] = CPU_TO_LE16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid); + cmd->seid[1] = 0; + cmd->seid[2] = 0; + + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, mv_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_add_vlan - Add VLAN ids to the HW filtering + * @hw: pointer to the hw struct + * @seid: VSI for the vlan filters + * @v_list: list of vlan filters to be added + * @count: length of the list + * @cmd_details: pointer to command details structure or 
NULL + **/ +enum i40e_status_code i40e_aq_add_vlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_add_remove_vlan_element_data *v_list, + u8 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + enum i40e_status_code status; + u16 buf_size; + + if (count == 0 || !v_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*v_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_vlan); + cmd->num_addresses = CPU_TO_LE16(count); + cmd->seid[0] = CPU_TO_LE16(seid | I40E_AQC_MACVLAN_CMD_SEID_VALID); + cmd->seid[1] = 0; + cmd->seid[2] = 0; + + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, v_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_remove_vlan - Remove VLANs from the HW filtering + * @hw: pointer to the hw struct + * @seid: VSI for the vlan filters + * @v_list: list of macvlans to be removed + * @count: length of the list + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_remove_vlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_add_remove_vlan_element_data *v_list, + u8 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + enum i40e_status_code status; + u16 buf_size; + + if (count == 0 || !v_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*v_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_vlan); + cmd->num_addresses = CPU_TO_LE16(count); + cmd->seid[0] = CPU_TO_LE16(seid | I40E_AQC_MACVLAN_CMD_SEID_VALID); + cmd->seid[1] = 0; + cmd->seid[2] = 0; + + desc.flags |= 
CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, v_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_send_msg_to_vf + * @hw: pointer to the hardware structure + * @vfid: vf id to send msg + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cmd_details: pointer to command details + * + * send msg to vf + **/ +enum i40e_status_code i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid, + u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_pf_vf_message *cmd = + (struct i40e_aqc_pf_vf_message *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_vf); + cmd->id = CPU_TO_LE32(vfid); + desc.cookie_high = CPU_TO_LE32(v_opcode); + desc.cookie_low = CPU_TO_LE32(v_retval); + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_SI); + if (msglen) { + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | + I40E_AQ_FLAG_RD)); + if (msglen > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + desc.datalen = CPU_TO_LE16(msglen); + } + status = i40e_asq_send_command(hw, &desc, msg, msglen, cmd_details); + + return status; +} + +/** + * i40e_aq_debug_read_register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + * @cmd_details: pointer to command details structure or NULL + * + * Read the register using the admin queue commands + **/ +enum i40e_status_code i40e_aq_debug_read_register(struct i40e_hw *hw, + u32 reg_addr, u64 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_reg_read_write *cmd_resp = + (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw; + enum 
i40e_status_code status; + + if (reg_val == NULL) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_read_reg); + + cmd_resp->address = CPU_TO_LE32(reg_addr); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (status == I40E_SUCCESS) { + *reg_val = ((u64)LE32_TO_CPU(cmd_resp->value_high) << 32) | + (u64)LE32_TO_CPU(cmd_resp->value_low); + } + + return status; +} + +/** + * i40e_aq_debug_write_register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + * @cmd_details: pointer to command details structure or NULL + * + * Write to a register using the admin queue commands + **/ +enum i40e_status_code i40e_aq_debug_write_register(struct i40e_hw *hw, + u32 reg_addr, u64 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_reg_read_write *cmd = + (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_write_reg); + + cmd->address = CPU_TO_LE32(reg_addr); + cmd->value_high = CPU_TO_LE32((u32)(reg_val >> 32)); + cmd->value_low = CPU_TO_LE32((u32)(reg_val & 0xFFFFFFFF)); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_hmc_resource_profile + * @hw: pointer to the hw struct + * @profile: type of profile the HMC is to be set as + * @pe_vf_enabled_count: the number of PE enabled VFs the system has + * @cmd_details: pointer to command details structure or NULL + * + * query the HMC profile of the device. 
+ **/
+enum i40e_status_code i40e_aq_get_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile *profile,
+				u8 *pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aq_get_set_hmc_resource_profile *resp =
+		(struct i40e_aq_get_set_hmc_resource_profile *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_query_hmc_resource_profile);
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	/*
+	 * Decode the response only when the command succeeded, so a failed
+	 * query never hands the caller values taken from a descriptor that
+	 * holds no valid response.  This matches the other getters in this
+	 * file, which also leave their outputs untouched on failure.
+	 */
+	if (status != I40E_SUCCESS)
+		return status;
+
+	/* Both outputs are optional; a NULL pointer skips that field. */
+	if (profile != NULL)
+		*profile = (enum i40e_aq_hmc_profile)(resp->pm_profile &
+			I40E_AQ_GET_HMC_RESOURCE_PROFILE_PM_MASK);
+	if (pe_vf_enabled_count != NULL)
+		*pe_vf_enabled_count = resp->pe_vf_enabled &
+			I40E_AQ_GET_HMC_RESOURCE_PROFILE_COUNT_MASK;
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_hmc_resource_profile
+ * @hw: pointer to the hw struct
+ * @profile: type of profile the HMC is to be set as
+ * @pe_vf_enabled_count: the number of PE enabled VFs the system has
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * set the HMC profile of the device.
+ **/ +enum i40e_status_code i40e_aq_set_hmc_resource_profile(struct i40e_hw *hw, + enum i40e_aq_hmc_profile profile, + u8 pe_vf_enabled_count, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aq_get_set_hmc_resource_profile *cmd = + (struct i40e_aq_get_set_hmc_resource_profile *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_hmc_resource_profile); + + cmd->pm_profile = (u8)profile; + cmd->pe_vf_enabled = pe_vf_enabled_count; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_request_resource + * @hw: pointer to the hw struct + * @resource: resource id + * @access: access type + * @sdp_number: resource number + * @timeout: the maximum time in ms that the driver may hold the resource + * @cmd_details: pointer to command details structure or NULL + * + * requests common resource using the admin queue commands + **/ +enum i40e_status_code i40e_aq_request_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + enum i40e_aq_resource_access_type access, + u8 sdp_number, u64 *timeout, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_request_resource *cmd_resp = + (struct i40e_aqc_request_resource *)&desc.params.raw; + enum i40e_status_code status; + + DEBUGFUNC("i40e_aq_request_resource"); + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_request_resource); + + cmd_resp->resource_id = CPU_TO_LE16(resource); + cmd_resp->access_type = CPU_TO_LE16(access); + cmd_resp->resource_number = CPU_TO_LE32(sdp_number); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + /* The completion specifies the maximum time in ms that the driver + * may hold the resource in the Timeout field. 
+ * If the resource is held by someone else, the command completes with + * busy return value and the timeout field indicates the maximum time + * the current owner of the resource has to free it. + */ + if (status == I40E_SUCCESS || hw->aq.asq_last_status == I40E_AQ_RC_EBUSY) + *timeout = LE32_TO_CPU(cmd_resp->timeout); + + return status; +} + +/** + * i40e_aq_release_resource + * @hw: pointer to the hw struct + * @resource: resource id + * @sdp_number: resource number + * @cmd_details: pointer to command details structure or NULL + * + * release common resource using the admin queue commands + **/ +enum i40e_status_code i40e_aq_release_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + u8 sdp_number, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_request_resource *cmd = + (struct i40e_aqc_request_resource *)&desc.params.raw; + enum i40e_status_code status; + + DEBUGFUNC("i40e_aq_release_resource"); + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_release_resource); + + cmd->resource_id = CPU_TO_LE16(resource); + cmd->resource_number = CPU_TO_LE32(sdp_number); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_read_nvm + * @hw: pointer to the hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be read (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @cmd_details: pointer to command details structure or NULL + * + * Read the NVM using the admin queue commands + **/ +enum i40e_status_code i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + 
(struct i40e_aqc_nvm_update *)&desc.params.raw; + enum i40e_status_code status; + + DEBUGFUNC("i40e_aq_read_nvm"); + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_read_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_read); + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + cmd->module_pointer = module_pointer; + cmd->offset = CPU_TO_LE32(offset); + cmd->length = CPU_TO_LE16(length); + + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, data, length, cmd_details); + +i40e_aq_read_nvm_exit: + return status; +} + +/** + * i40e_aq_read_nvm_config - read an nvm config block + * @hw: pointer to the hw struct + * @cmd_flags: NVM access admin command bits + * @field_id: field or feature id + * @data: buffer for result + * @buf_size: buffer size + * @element_count: pointer to count of elements read by FW + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_read_nvm_config(struct i40e_hw *hw, + u8 cmd_flags, u32 field_id, void *data, + u16 buf_size, u16 *element_count, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_config_read *cmd = + (struct i40e_aqc_nvm_config_read *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_config_read); + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + cmd->cmd_flags = CPU_TO_LE16(cmd_flags); + cmd->element_id = CPU_TO_LE16((u16)(0xffff & field_id)); + if (cmd_flags & I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_MASK) + cmd->element_id_msw = CPU_TO_LE16((u16)(field_id >> 16)); + else + 
cmd->element_id_msw = 0; + + status = i40e_asq_send_command(hw, &desc, data, buf_size, cmd_details); + + if (!status && element_count) + *element_count = LE16_TO_CPU(cmd->element_count); + + return status; +} + +/** + * i40e_aq_write_nvm_config - write an nvm config block + * @hw: pointer to the hw struct + * @cmd_flags: NVM access admin command bits + * @data: buffer for result + * @buf_size: buffer size + * @element_count: count of elements to be written + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_write_nvm_config(struct i40e_hw *hw, + u8 cmd_flags, void *data, u16 buf_size, + u16 element_count, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_config_write *cmd = + (struct i40e_aqc_nvm_config_write *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_config_write); + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + cmd->element_count = CPU_TO_LE16(element_count); + cmd->cmd_flags = CPU_TO_LE16(cmd_flags); + status = i40e_asq_send_command(hw, &desc, data, buf_size, cmd_details); + + return status; +} + +/** + * i40e_aq_oem_post_update - triggers an OEM specific flow after update + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_oem_post_update(struct i40e_hw *hw, + void *buff, u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + enum i40e_status_code status; + + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_oem_post_update); + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (status && LE16_TO_CPU(desc.retval) == I40E_AQ_RC_ESRCH) + status = I40E_ERR_NOT_IMPLEMENTED; + + return status; +} + +/** + * i40e_aq_erase_nvm + * @hw: pointer to the 
hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in the module (expressed in 4 KB from module's beginning) + * @length: length of the section to be erased (expressed in 4 KB) + * @last_command: tells if this is the last command in a series + * @cmd_details: pointer to command details structure or NULL + * + * Erase the NVM sector using the admin queue commands + **/ +enum i40e_status_code i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, bool last_command, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + (struct i40e_aqc_nvm_update *)&desc.params.raw; + enum i40e_status_code status; + + DEBUGFUNC("i40e_aq_erase_nvm"); + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_erase_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_erase); + + /* If this is the last command in a series, set the proper flag. 
*/ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + cmd->module_pointer = module_pointer; + cmd->offset = CPU_TO_LE32(offset); + cmd->length = CPU_TO_LE16(length); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + +i40e_aq_erase_nvm_exit: + return status; +} + +#define I40E_DEV_FUNC_CAP_SWITCH_MODE 0x01 +#define I40E_DEV_FUNC_CAP_MGMT_MODE 0x02 +#define I40E_DEV_FUNC_CAP_NPAR 0x03 +#define I40E_DEV_FUNC_CAP_OS2BMC 0x04 +#define I40E_DEV_FUNC_CAP_VALID_FUNC 0x05 +#define I40E_DEV_FUNC_CAP_SRIOV_1_1 0x12 +#define I40E_DEV_FUNC_CAP_VF 0x13 +#define I40E_DEV_FUNC_CAP_VMDQ 0x14 +#define I40E_DEV_FUNC_CAP_802_1_QBG 0x15 +#define I40E_DEV_FUNC_CAP_802_1_QBH 0x16 +#define I40E_DEV_FUNC_CAP_VSI 0x17 +#define I40E_DEV_FUNC_CAP_DCB 0x18 +#define I40E_DEV_FUNC_CAP_FCOE 0x21 +#define I40E_DEV_FUNC_CAP_ISCSI 0x22 +#define I40E_DEV_FUNC_CAP_RSS 0x40 +#define I40E_DEV_FUNC_CAP_RX_QUEUES 0x41 +#define I40E_DEV_FUNC_CAP_TX_QUEUES 0x42 +#define I40E_DEV_FUNC_CAP_MSIX 0x43 +#define I40E_DEV_FUNC_CAP_MSIX_VF 0x44 +#define I40E_DEV_FUNC_CAP_FLOW_DIRECTOR 0x45 +#define I40E_DEV_FUNC_CAP_IEEE_1588 0x46 +#define I40E_DEV_FUNC_CAP_FLEX10 0xF1 +#define I40E_DEV_FUNC_CAP_CEM 0xF2 +#define I40E_DEV_FUNC_CAP_IWARP 0x51 +#define I40E_DEV_FUNC_CAP_LED 0x61 +#define I40E_DEV_FUNC_CAP_SDP 0x62 +#define I40E_DEV_FUNC_CAP_MDIO 0x63 +#define I40E_DEV_FUNC_CAP_WR_CSR_PROT 0x64 + +/** + * i40e_parse_discover_capabilities + * @hw: pointer to the hw struct + * @buff: pointer to a buffer containing device/function capability records + * @cap_count: number of capability records in the list + * @list_type_opc: type of capabilities list to parse + * + * Parse the device/function capabilities list. 
+ **/ +static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff, + u32 cap_count, + enum i40e_admin_queue_opc list_type_opc) +{ + struct i40e_aqc_list_capabilities_element_resp *cap; + u32 valid_functions, num_functions; + u32 number, logical_id, phys_id; + struct i40e_hw_capabilities *p; + u8 major_rev; + u32 i = 0; + u16 id; + + cap = (struct i40e_aqc_list_capabilities_element_resp *) buff; + + if (list_type_opc == i40e_aqc_opc_list_dev_capabilities) + p = (struct i40e_hw_capabilities *)&hw->dev_caps; + else if (list_type_opc == i40e_aqc_opc_list_func_capabilities) + p = (struct i40e_hw_capabilities *)&hw->func_caps; + else + return; + + for (i = 0; i < cap_count; i++, cap++) { + id = LE16_TO_CPU(cap->id); + number = LE32_TO_CPU(cap->number); + logical_id = LE32_TO_CPU(cap->logical_id); + phys_id = LE32_TO_CPU(cap->phys_id); + major_rev = cap->major_rev; + + switch (id) { + case I40E_DEV_FUNC_CAP_SWITCH_MODE: + p->switch_mode = number; + break; + case I40E_DEV_FUNC_CAP_MGMT_MODE: + p->management_mode = number; + break; + case I40E_DEV_FUNC_CAP_NPAR: + p->npar_enable = number; + break; + case I40E_DEV_FUNC_CAP_OS2BMC: + p->os2bmc = number; + break; + case I40E_DEV_FUNC_CAP_VALID_FUNC: + p->valid_functions = number; + break; + case I40E_DEV_FUNC_CAP_SRIOV_1_1: + if (number == 1) + p->sr_iov_1_1 = TRUE; + break; + case I40E_DEV_FUNC_CAP_VF: + p->num_vfs = number; + p->vf_base_id = logical_id; + break; + case I40E_DEV_FUNC_CAP_VMDQ: + if (number == 1) + p->vmdq = TRUE; + break; + case I40E_DEV_FUNC_CAP_802_1_QBG: + if (number == 1) + p->evb_802_1_qbg = TRUE; + break; + case I40E_DEV_FUNC_CAP_802_1_QBH: + if (number == 1) + p->evb_802_1_qbh = TRUE; + break; + case I40E_DEV_FUNC_CAP_VSI: + p->num_vsis = number; + break; + case I40E_DEV_FUNC_CAP_DCB: + if (number == 1) { + p->dcb = TRUE; + p->enabled_tcmap = logical_id; + p->maxtc = phys_id; + } + break; + case I40E_DEV_FUNC_CAP_FCOE: + if (number == 1) + p->fcoe = TRUE; + break; + case 
I40E_DEV_FUNC_CAP_ISCSI: + if (number == 1) + p->iscsi = TRUE; + break; + case I40E_DEV_FUNC_CAP_RSS: + p->rss = TRUE; + p->rss_table_size = number; + p->rss_table_entry_width = logical_id; + break; + case I40E_DEV_FUNC_CAP_RX_QUEUES: + p->num_rx_qp = number; + p->base_queue = phys_id; + break; + case I40E_DEV_FUNC_CAP_TX_QUEUES: + p->num_tx_qp = number; + p->base_queue = phys_id; + break; + case I40E_DEV_FUNC_CAP_MSIX: + p->num_msix_vectors = number; + break; + case I40E_DEV_FUNC_CAP_MSIX_VF: + p->num_msix_vectors_vf = number; + break; + case I40E_DEV_FUNC_CAP_FLEX10: + if (major_rev == 1) { + if (number == 1) { + p->flex10_enable = TRUE; + p->flex10_capable = TRUE; + } + } else { + /* Capability revision >= 2 */ + if (number & 1) + p->flex10_enable = TRUE; + if (number & 2) + p->flex10_capable = TRUE; + } + p->flex10_mode = logical_id; + p->flex10_status = phys_id; + break; + case I40E_DEV_FUNC_CAP_CEM: + if (number == 1) + p->mgmt_cem = TRUE; + break; + case I40E_DEV_FUNC_CAP_IWARP: + if (number == 1) + p->iwarp = TRUE; + break; + case I40E_DEV_FUNC_CAP_LED: + if (phys_id < I40E_HW_CAP_MAX_GPIO) + p->led[phys_id] = TRUE; + break; + case I40E_DEV_FUNC_CAP_SDP: + if (phys_id < I40E_HW_CAP_MAX_GPIO) + p->sdp[phys_id] = TRUE; + break; + case I40E_DEV_FUNC_CAP_MDIO: + if (number == 1) { + p->mdio_port_num = phys_id; + p->mdio_port_mode = logical_id; + } + break; + case I40E_DEV_FUNC_CAP_IEEE_1588: + if (number == 1) + p->ieee_1588 = TRUE; + break; + case I40E_DEV_FUNC_CAP_FLOW_DIRECTOR: + p->fd = TRUE; + p->fd_filters_guaranteed = number; + p->fd_filters_best_effort = logical_id; + break; + case I40E_DEV_FUNC_CAP_WR_CSR_PROT: + p->wr_csr_prot = (u64)number; + p->wr_csr_prot |= (u64)logical_id << 32; + break; + default: + break; + } + } + + if (p->fcoe) + i40e_debug(hw, I40E_DEBUG_ALL, "device is FCoE capable\n"); + + /* Always disable FCoE if compiled without the I40E_FCOE_ENA flag */ + p->fcoe = FALSE; + + /* count the enabled ports (aka the "not disabled" ports) */ 
+ hw->num_ports = 0; + for (i = 0; i < 4; i++) { + u32 port_cfg_reg = I40E_PRTGEN_CNF + (4 * i); + u64 port_cfg = 0; + + /* use AQ read to get the physical register offset instead + * of the port relative offset + */ + i40e_aq_debug_read_register(hw, port_cfg_reg, &port_cfg, NULL); + if (!(port_cfg & I40E_PRTGEN_CNF_PORT_DIS_MASK)) + hw->num_ports++; + } + + valid_functions = p->valid_functions; + num_functions = 0; + while (valid_functions) { + if (valid_functions & 1) + num_functions++; + valid_functions >>= 1; + } + + /* partition id is 1-based, and functions are evenly spread + * across the ports as partitions + */ + hw->partition_id = (hw->pf_id / hw->num_ports) + 1; + hw->num_partitions = num_functions / hw->num_ports; + + /* additional HW specific goodies that might + * someday be HW version specific + */ + p->rx_buf_chain_len = I40E_MAX_CHAINED_RX_BUFFERS; +} + +/** + * i40e_aq_discover_capabilities + * @hw: pointer to the hw struct + * @buff: a virtual buffer to hold the capabilities + * @buff_size: Size of the virtual buffer + * @data_size: Size of the returned data, or buff size needed if AQ err==ENOMEM + * @list_type_opc: capabilities type to discover - pass in the command opcode + * @cmd_details: pointer to command details structure or NULL + * + * Get the device capabilities descriptions from the firmware + **/ +enum i40e_status_code i40e_aq_discover_capabilities(struct i40e_hw *hw, + void *buff, u16 buff_size, u16 *data_size, + enum i40e_admin_queue_opc list_type_opc, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aqc_list_capabilites *cmd; + struct i40e_aq_desc desc; + enum i40e_status_code status = I40E_SUCCESS; + + cmd = (struct i40e_aqc_list_capabilites *)&desc.params.raw; + + if (list_type_opc != i40e_aqc_opc_list_func_capabilities && + list_type_opc != i40e_aqc_opc_list_dev_capabilities) { + status = I40E_ERR_PARAM; + goto exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, list_type_opc); + + desc.flags |= 
CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + *data_size = LE16_TO_CPU(desc.datalen); + + if (status) + goto exit; + + i40e_parse_discover_capabilities(hw, buff, LE32_TO_CPU(cmd->count), + list_type_opc); + +exit: + return status; +} + +/** + * i40e_aq_update_nvm + * @hw: pointer to the hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @cmd_details: pointer to command details structure or NULL + * + * Update the NVM using the admin queue commands + **/ +enum i40e_status_code i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + (struct i40e_aqc_nvm_update *)&desc.params.raw; + enum i40e_status_code status; + + DEBUGFUNC("i40e_aq_update_nvm"); + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_update_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update); + + /* If this is the last command in a series, set the proper flag. 
*/ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + cmd->module_pointer = module_pointer; + cmd->offset = CPU_TO_LE32(offset); + cmd->length = CPU_TO_LE16(length); + + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, data, length, cmd_details); + +i40e_aq_update_nvm_exit: + return status; +} + +/** + * i40e_aq_get_lldp_mib + * @hw: pointer to the hw struct + * @bridge_type: type of bridge requested + * @mib_type: Local, Remote or both Local and Remote MIBs + * @buff: pointer to a user supplied buffer to store the MIB block + * @buff_size: size of the buffer (in bytes) + * @local_len : length of the returned Local LLDP MIB + * @remote_len: length of the returned Remote LLDP MIB + * @cmd_details: pointer to command details structure or NULL + * + * Requests the complete LLDP MIB (entire packet). + **/ +enum i40e_status_code i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type, + u8 mib_type, void *buff, u16 buff_size, + u16 *local_len, u16 *remote_len, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_get_mib *cmd = + (struct i40e_aqc_lldp_get_mib *)&desc.params.raw; + struct i40e_aqc_lldp_get_mib *resp = + (struct i40e_aqc_lldp_get_mib *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_get_mib); + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + + cmd->type = mib_type & I40E_AQ_LLDP_MIB_TYPE_MASK; + cmd->type |= ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) & + I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + + desc.datalen = CPU_TO_LE16(buff_size); + + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = 
i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (local_len != NULL) + *local_len = LE16_TO_CPU(resp->local_len); + if (remote_len != NULL) + *remote_len = LE16_TO_CPU(resp->remote_len); + } + + return status; +} + + /** + * i40e_aq_set_lldp_mib - Set the LLDP MIB + * @hw: pointer to the hw struct + * @mib_type: Local, Remote or both Local and Remote MIBs + * @buff: pointer to a user supplied buffer to store the MIB block + * @buff_size: size of the buffer (in bytes) + * @cmd_details: pointer to command details structure or NULL + * + * Set the LLDP MIB. + **/ +enum i40e_status_code i40e_aq_set_lldp_mib(struct i40e_hw *hw, + u8 mib_type, void *buff, u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_set_local_mib *cmd = + (struct i40e_aqc_lldp_set_local_mib *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_lldp_set_local_mib); + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + desc.datalen = CPU_TO_LE16(buff_size); + + cmd->type = mib_type; + cmd->length = CPU_TO_LE16(buff_size); + cmd->address_high = CPU_TO_LE32(I40E_HI_WORD((uintptr_t)buff)); + cmd->address_low = CPU_TO_LE32(I40E_LO_DWORD((uintptr_t)buff)); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + return status; +} + +/** + * i40e_aq_cfg_lldp_mib_change_event + * @hw: pointer to the hw struct + * @enable_update: Enable or Disable event posting + * @cmd_details: pointer to command details structure or NULL + * + * Enable or Disable posting of an event on ARQ when LLDP MIB + * associated with the interface changes + **/ +enum i40e_status_code i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, + bool enable_update, + struct 
i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_update_mib *cmd = + (struct i40e_aqc_lldp_update_mib *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_mib); + + if (!enable_update) + cmd->command |= I40E_AQ_LLDP_MIB_UPDATE_DISABLE; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_add_lldp_tlv + * @hw: pointer to the hw struct + * @bridge_type: type of bridge + * @buff: buffer with TLV to add + * @buff_size: length of the buffer + * @tlv_len: length of the TLV to be added + * @mib_len: length of the LLDP MIB returned in response + * @cmd_details: pointer to command details structure or NULL + * + * Add the specified TLV to LLDP Local MIB for the given bridge type, + * it is responsibility of the caller to make sure that the TLV is not + * already present in the LLDPDU. + * In return firmware will write the complete LLDP MIB with the newly + * added TLV in the response buffer. 
+ **/ +enum i40e_status_code i40e_aq_add_lldp_tlv(struct i40e_hw *hw, u8 bridge_type, + void *buff, u16 buff_size, u16 tlv_len, + u16 *mib_len, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_add_tlv *cmd = + (struct i40e_aqc_lldp_add_tlv *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff || tlv_len == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_add_tlv); + + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + desc.datalen = CPU_TO_LE16(buff_size); + + cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) & + I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + cmd->len = CPU_TO_LE16(tlv_len); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (mib_len != NULL) + *mib_len = LE16_TO_CPU(desc.datalen); + } + + return status; +} + +/** + * i40e_aq_update_lldp_tlv + * @hw: pointer to the hw struct + * @bridge_type: type of bridge + * @buff: buffer with TLV to update + * @buff_size: size of the buffer holding original and updated TLVs + * @old_len: Length of the Original TLV + * @new_len: Length of the Updated TLV + * @offset: offset of the updated TLV in the buff + * @mib_len: length of the returned LLDP MIB + * @cmd_details: pointer to command details structure or NULL + * + * Update the specified TLV to the LLDP Local MIB for the given bridge type. + * Firmware will place the complete LLDP MIB in response buffer with the + * updated TLV. 
+ **/ +enum i40e_status_code i40e_aq_update_lldp_tlv(struct i40e_hw *hw, + u8 bridge_type, void *buff, u16 buff_size, + u16 old_len, u16 new_len, u16 offset, + u16 *mib_len, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_update_tlv *cmd = + (struct i40e_aqc_lldp_update_tlv *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff || offset == 0 || + old_len == 0 || new_len == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_tlv); + + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + desc.datalen = CPU_TO_LE16(buff_size); + + cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) & + I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + cmd->old_len = CPU_TO_LE16(old_len); + cmd->new_offset = CPU_TO_LE16(offset); + cmd->new_len = CPU_TO_LE16(new_len); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (mib_len != NULL) + *mib_len = LE16_TO_CPU(desc.datalen); + } + + return status; +} + +/** + * i40e_aq_delete_lldp_tlv + * @hw: pointer to the hw struct + * @bridge_type: type of bridge + * @buff: pointer to a user supplied buffer that has the TLV + * @buff_size: length of the buffer + * @tlv_len: length of the TLV to be deleted + * @mib_len: length of the returned LLDP MIB + * @cmd_details: pointer to command details structure or NULL + * + * Delete the specified TLV from LLDP Local MIB for the given bridge type. + * The firmware places the entire LLDP MIB in the response buffer. 
+ **/ +enum i40e_status_code i40e_aq_delete_lldp_tlv(struct i40e_hw *hw, + u8 bridge_type, void *buff, u16 buff_size, + u16 tlv_len, u16 *mib_len, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_add_tlv *cmd = + (struct i40e_aqc_lldp_add_tlv *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_delete_tlv); + + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + desc.datalen = CPU_TO_LE16(buff_size); + cmd->len = CPU_TO_LE16(tlv_len); + cmd->type = ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) & + I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (mib_len != NULL) + *mib_len = LE16_TO_CPU(desc.datalen); + } + + return status; +} + +/** + * i40e_aq_stop_lldp + * @hw: pointer to the hw struct + * @shutdown_agent: True if LLDP Agent needs to be Shutdown + * @cmd_details: pointer to command details structure or NULL + * + * Stop or Shutdown the embedded LLDP Agent + **/ +enum i40e_status_code i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_stop *cmd = + (struct i40e_aqc_lldp_stop *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_stop); + + if (shutdown_agent) + cmd->command |= I40E_AQ_LLDP_AGENT_SHUTDOWN; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_start_lldp + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + * + * Start the embedded LLDP Agent on all ports. 
+ **/ +enum i40e_status_code i40e_aq_start_lldp(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_start *cmd = + (struct i40e_aqc_lldp_start *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_start); + + cmd->command = I40E_AQ_LLDP_AGENT_START; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_cee_dcb_config + * @hw: pointer to the hw struct + * @buff: response buffer that stores CEE operational configuration + * @buff_size: size of the buffer passed + * @cmd_details: pointer to command details structure or NULL + * + * Get CEE DCBX mode operational configuration from firmware + **/ +enum i40e_status_code i40e_aq_get_cee_dcb_config(struct i40e_hw *hw, + void *buff, u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + enum i40e_status_code status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_cee_dcb_cfg); + + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + status = i40e_asq_send_command(hw, &desc, (void *)buff, buff_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_start_stop_dcbx - Start/Stop DCBx service in FW + * @hw: pointer to the hw struct + * @start_agent: True if DCBx Agent needs to be Started + * False if DCBx Agent needs to be Stopped + * @cmd_details: pointer to command details structure or NULL + * + * Start/Stop the embedded dcbx Agent + **/ +enum i40e_status_code i40e_aq_start_stop_dcbx(struct i40e_hw *hw, + bool start_agent, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_stop_start_specific_agent *cmd = + (struct i40e_aqc_lldp_stop_start_specific_agent *) + &desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + 
i40e_aqc_opc_lldp_stop_start_spec_agent); + + if (start_agent) + cmd->command = I40E_AQC_START_SPECIFIC_AGENT_MASK; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_add_udp_tunnel + * @hw: pointer to the hw struct + * @udp_port: the UDP port to add + * @header_len: length of the tunneling header length in DWords + * @protocol_index: protocol index type + * @filter_index: pointer to filter index + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_add_udp_tunnel(struct i40e_hw *hw, + u16 udp_port, u8 protocol_index, + u8 *filter_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_udp_tunnel *cmd = + (struct i40e_aqc_add_udp_tunnel *)&desc.params.raw; + struct i40e_aqc_del_udp_tunnel_completion *resp = + (struct i40e_aqc_del_udp_tunnel_completion *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_udp_tunnel); + + cmd->udp_port = CPU_TO_LE16(udp_port); + cmd->protocol_type = protocol_index; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status && filter_index) + *filter_index = resp->index; + + return status; +} + +/** + * i40e_aq_del_udp_tunnel + * @hw: pointer to the hw struct + * @index: filter index + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_remove_udp_tunnel *cmd = + (struct i40e_aqc_remove_udp_tunnel *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_del_udp_tunnel); + + cmd->index = index; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_switch_resource_alloc (0x0204) + * @hw: pointer to the hw 
struct + * @num_entries: pointer to u8 to store the number of resource entries returned + * @buf: pointer to a user supplied buffer. This buffer must be large enough + * to store the resource information for all resource types. Each + * entry is an i40e_aqc_switch_resource_alloc_element_resp structure. + * @count: number of entries the buffer can hold (count * sizeof(*buf) bytes) + * @cmd_details: pointer to command details structure or NULL + * + * Query the resources allocated to a function. + **/ +enum i40e_status_code i40e_aq_get_switch_resource_alloc(struct i40e_hw *hw, + u8 *num_entries, + struct i40e_aqc_switch_resource_alloc_element_resp *buf, + u16 count, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_get_switch_resource_alloc *cmd_resp = + (struct i40e_aqc_get_switch_resource_alloc *)&desc.params.raw; + enum i40e_status_code status; + u16 length = count * sizeof(*buf); + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_switch_resource_alloc); + + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, buf, length, cmd_details); + + if (!status && num_entries) + *num_entries = cmd_resp->num_entries; + + return status; +} + +/** + * i40e_aq_delete_element - Delete switch element + * @hw: pointer to the hw struct + * @seid: the SEID to delete from the switch + * @cmd_details: pointer to command details structure or NULL + * + * This deletes a switch element from the switch. 
+ **/ +enum i40e_status_code i40e_aq_delete_element(struct i40e_hw *hw, u16 seid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_switch_seid *cmd = + (struct i40e_aqc_switch_seid *)&desc.params.raw; + enum i40e_status_code status; + + if (seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_delete_element); + + cmd->seid = CPU_TO_LE16(seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_add_pvirt - Instantiate a Port Virtualizer on a port + * @hw: pointer to the hw struct + * @flags: component flags + * @mac_seid: uplink seid (MAC SEID) + * @vsi_seid: connected vsi seid + * @ret_seid: seid of the created PV component + * + * This instantiates an i40e port virtualizer with specified flags. + * Depending on specified flags the port virtualizer can act as an + * 802.1Qbr port virtualizer or an 802.1Qbg S-component. + */ +enum i40e_status_code i40e_aq_add_pvirt(struct i40e_hw *hw, u16 flags, + u16 mac_seid, u16 vsi_seid, + u16 *ret_seid) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_update_pv *cmd = + (struct i40e_aqc_add_update_pv *)&desc.params.raw; + struct i40e_aqc_add_update_pv_completion *resp = + (struct i40e_aqc_add_update_pv_completion *)&desc.params.raw; + enum i40e_status_code status; + + if (vsi_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_pv); + cmd->command_flags = CPU_TO_LE16(flags); + cmd->uplink_seid = CPU_TO_LE16(mac_seid); + cmd->connected_seid = CPU_TO_LE16(vsi_seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL); + if (!status && ret_seid) + *ret_seid = LE16_TO_CPU(resp->pv_seid); + + return status; +} + +/** + * i40e_aq_add_tag - Add an S/E-tag + * @hw: pointer to the hw struct + * @direct_to_queue: should s-tag direct flow to a specific queue + * @vsi_seid: VSI SEID to use this tag + * @tag: value of the tag + * @queue_num: queue 
number, only valid if direct_to_queue is TRUE + * @tags_used: return value, number of tags in use by this PF + * @tags_free: return value, number of unallocated tags + * @cmd_details: pointer to command details structure or NULL + * + * This associates an S- or E-tag to a VSI in the switch complex. It returns + * the number of tags allocated by the PF, and the number of unallocated + * tags available. + **/ +enum i40e_status_code i40e_aq_add_tag(struct i40e_hw *hw, bool direct_to_queue, + u16 vsi_seid, u16 tag, u16 queue_num, + u16 *tags_used, u16 *tags_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_tag *cmd = + (struct i40e_aqc_add_tag *)&desc.params.raw; + struct i40e_aqc_add_remove_tag_completion *resp = + (struct i40e_aqc_add_remove_tag_completion *)&desc.params.raw; + enum i40e_status_code status; + + if (vsi_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_tag); + + cmd->seid = CPU_TO_LE16(vsi_seid); + cmd->tag = CPU_TO_LE16(tag); + if (direct_to_queue) { + cmd->flags = CPU_TO_LE16(I40E_AQC_ADD_TAG_FLAG_TO_QUEUE); + cmd->queue_number = CPU_TO_LE16(queue_num); + } + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (tags_used != NULL) + *tags_used = LE16_TO_CPU(resp->tags_used); + if (tags_free != NULL) + *tags_free = LE16_TO_CPU(resp->tags_free); + } + + return status; +} + +/** + * i40e_aq_remove_tag - Remove an S- or E-tag + * @hw: pointer to the hw struct + * @vsi_seid: VSI SEID this tag is associated with + * @tag: value of the S-tag to delete + * @tags_used: return value, number of tags in use by this PF + * @tags_free: return value, number of unallocated tags + * @cmd_details: pointer to command details structure or NULL + * + * This deletes an S- or E-tag from a VSI in the switch complex. It returns + * the number of tags allocated by the PF, and the number of unallocated + * tags available. 
+ **/ +enum i40e_status_code i40e_aq_remove_tag(struct i40e_hw *hw, u16 vsi_seid, + u16 tag, u16 *tags_used, u16 *tags_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_remove_tag *cmd = + (struct i40e_aqc_remove_tag *)&desc.params.raw; + struct i40e_aqc_add_remove_tag_completion *resp = + (struct i40e_aqc_add_remove_tag_completion *)&desc.params.raw; + enum i40e_status_code status; + + if (vsi_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_tag); + + cmd->seid = CPU_TO_LE16(vsi_seid); + cmd->tag = CPU_TO_LE16(tag); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (tags_used != NULL) + *tags_used = LE16_TO_CPU(resp->tags_used); + if (tags_free != NULL) + *tags_free = LE16_TO_CPU(resp->tags_free); + } + + return status; +} + +/** + * i40e_aq_add_mcast_etag - Add a multicast E-tag + * @hw: pointer to the hw struct + * @pv_seid: Port Virtualizer of this SEID to associate E-tag with + * @etag: value of E-tag to add + * @num_tags_in_buf: number of unicast E-tags in indirect buffer + * @buf: address of indirect buffer + * @tags_used: return value, number of E-tags in use by this port + * @tags_free: return value, number of unallocated M-tags + * @cmd_details: pointer to command details structure or NULL + * + * This associates a multicast E-tag to a port virtualizer. It will return + * the number of tags allocated by the PF, and the number of unallocated + * tags available. + * + * The indirect buffer pointed to by buf is a list of 2-byte E-tags, + * num_tags_in_buf long. 
+ **/ +enum i40e_status_code i40e_aq_add_mcast_etag(struct i40e_hw *hw, u16 pv_seid, + u16 etag, u8 num_tags_in_buf, void *buf, + u16 *tags_used, u16 *tags_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_mcast_etag *cmd = + (struct i40e_aqc_add_remove_mcast_etag *)&desc.params.raw; + struct i40e_aqc_add_remove_mcast_etag_completion *resp = + (struct i40e_aqc_add_remove_mcast_etag_completion *)&desc.params.raw; + enum i40e_status_code status; + u16 length = sizeof(u16) * num_tags_in_buf; + + if ((pv_seid == 0) || (buf == NULL) || (num_tags_in_buf == 0)) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_add_multicast_etag); + + cmd->pv_seid = CPU_TO_LE16(pv_seid); + cmd->etag = CPU_TO_LE16(etag); + cmd->num_unicast_etags = num_tags_in_buf; + + desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, buf, length, cmd_details); + + if (!status) { + if (tags_used != NULL) + *tags_used = LE16_TO_CPU(resp->mcast_etags_used); + if (tags_free != NULL) + *tags_free = LE16_TO_CPU(resp->mcast_etags_free); + } + + return status; +} + +/** + * i40e_aq_remove_mcast_etag - Remove a multicast E-tag + * @hw: pointer to the hw struct + * @pv_seid: Port Virtualizer SEID this M-tag is associated with + * @etag: value of the E-tag to remove + * @tags_used: return value, number of tags in use by this port + * @tags_free: return value, number of unallocated tags + * @cmd_details: pointer to command details structure or NULL + * + * This deletes an E-tag from the port virtualizer. It will return + * the number of tags allocated by the port, and the number of unallocated + * tags available. 
+ **/ +enum i40e_status_code i40e_aq_remove_mcast_etag(struct i40e_hw *hw, u16 pv_seid, + u16 etag, u16 *tags_used, u16 *tags_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_mcast_etag *cmd = + (struct i40e_aqc_add_remove_mcast_etag *)&desc.params.raw; + struct i40e_aqc_add_remove_mcast_etag_completion *resp = + (struct i40e_aqc_add_remove_mcast_etag_completion *)&desc.params.raw; + enum i40e_status_code status; + + + if (pv_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_remove_multicast_etag); + + cmd->pv_seid = CPU_TO_LE16(pv_seid); + cmd->etag = CPU_TO_LE16(etag); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (tags_used != NULL) + *tags_used = LE16_TO_CPU(resp->mcast_etags_used); + if (tags_free != NULL) + *tags_free = LE16_TO_CPU(resp->mcast_etags_free); + } + + return status; +} + +/** + * i40e_aq_update_tag - Update an S/E-tag + * @hw: pointer to the hw struct + * @vsi_seid: VSI SEID using this S-tag + * @old_tag: old tag value + * @new_tag: new tag value + * @tags_used: return value, number of tags in use by this PF + * @tags_free: return value, number of unallocated tags + * @cmd_details: pointer to command details structure or NULL + * + * This updates the value of the tag currently attached to this VSI + * in the switch complex. It will return the number of tags allocated + * by the PF, and the number of unallocated tags available. 
+ **/ +enum i40e_status_code i40e_aq_update_tag(struct i40e_hw *hw, u16 vsi_seid, + u16 old_tag, u16 new_tag, u16 *tags_used, + u16 *tags_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_update_tag *cmd = + (struct i40e_aqc_update_tag *)&desc.params.raw; + struct i40e_aqc_update_tag_completion *resp = + (struct i40e_aqc_update_tag_completion *)&desc.params.raw; + enum i40e_status_code status; + + if (vsi_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_update_tag); + + cmd->seid = CPU_TO_LE16(vsi_seid); + cmd->old_tag = CPU_TO_LE16(old_tag); + cmd->new_tag = CPU_TO_LE16(new_tag); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (tags_used != NULL) + *tags_used = LE16_TO_CPU(resp->tags_used); + if (tags_free != NULL) + *tags_free = LE16_TO_CPU(resp->tags_free); + } + + return status; +} + +/** + * i40e_aq_dcb_ignore_pfc - Ignore PFC for given TCs + * @hw: pointer to the hw struct + * @tcmap: TC map for request/release any ignore PFC condition + * @request: request or release ignore PFC condition + * @tcmap_ret: return TCs for which PFC is currently ignored + * @cmd_details: pointer to command details structure or NULL + * + * This sends out request/release to ignore PFC condition for a TC. + * It will return the TCs for which PFC is currently ignored. 
+ **/ +enum i40e_status_code i40e_aq_dcb_ignore_pfc(struct i40e_hw *hw, u8 tcmap, + bool request, u8 *tcmap_ret, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_pfc_ignore *cmd_resp = + (struct i40e_aqc_pfc_ignore *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_dcb_ignore_pfc); + + if (request) + cmd_resp->command_flags = I40E_AQC_PFC_IGNORE_SET; + + cmd_resp->tc_bitmap = tcmap; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (tcmap_ret != NULL) + *tcmap_ret = cmd_resp->tc_bitmap; + } + + return status; +} + +/** + * i40e_aq_dcb_updated - DCB Updated Command + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + * + * When LLDP is handled in PF this command is used by the PF + * to notify EMP that a DCB setting is modified. + * When LLDP is handled in EMP this command is used by the PF + * to notify EMP whenever one of the following parameters get + * modified: + * - PFCLinkDelayAllowance in PRTDCB_GENC.PFCLDA + * - PCIRTT in PRTDCB_GENC.PCIRTT + * - Maximum Frame Size for non-FCoE TCs set by PRTDCB_TDPUC.MAX_TXFRAME. + * EMP will return when the shared RPB settings have been + * recomputed and modified. The retval field in the descriptor + * will be set to 0 when RPB is modified. + **/ +enum i40e_status_code i40e_aq_dcb_updated(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_dcb_updated); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_add_statistics - Add a statistics block to a VLAN in a switch. 
+ * @hw: pointer to the hw struct + * @seid: defines the SEID of the switch for which the stats are requested + * @vlan_id: the VLAN ID for which the statistics are requested + * @stat_index: index of the statistics counters block assigned to this VLAN + * @cmd_details: pointer to command details structure or NULL + * + * XL710 supports 128 smonVlanStats counters. This command is used to + * allocate a set of smonVlanStats counters to a specific VLAN in a specific + * switch. + **/ +enum i40e_status_code i40e_aq_add_statistics(struct i40e_hw *hw, u16 seid, + u16 vlan_id, u16 *stat_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_statistics *cmd_resp = + (struct i40e_aqc_add_remove_statistics *)&desc.params.raw; + enum i40e_status_code status; + + if ((seid == 0) || (stat_index == NULL)) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_statistics); + + cmd_resp->seid = CPU_TO_LE16(seid); + cmd_resp->vlan = CPU_TO_LE16(vlan_id); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status && stat_index) + *stat_index = LE16_TO_CPU(cmd_resp->stat_index); + + return status; +} + +/** + * i40e_aq_remove_statistics - Remove a statistics block from a VLAN in a switch. + * @hw: pointer to the hw struct + * @seid: defines the SEID of the switch for which the stats are requested + * @vlan_id: the VLAN ID for which the statistics are requested + * @stat_index: index of the statistics counters block assigned to this VLAN + * @cmd_details: pointer to command details structure or NULL + * + * XL710 supports 128 smonVlanStats counters. This command is used to + * deallocate a set of smonVlanStats counters from a specific VLAN in a specific + * switch. 
+ **/ +enum i40e_status_code i40e_aq_remove_statistics(struct i40e_hw *hw, u16 seid, + u16 vlan_id, u16 stat_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_statistics *cmd = + (struct i40e_aqc_add_remove_statistics *)&desc.params.raw; + enum i40e_status_code status; + + if (seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_remove_statistics); + + cmd->seid = CPU_TO_LE16(seid); + cmd->vlan = CPU_TO_LE16(vlan_id); + cmd->stat_index = CPU_TO_LE16(stat_index); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_port_parameters - set physical port parameters. + * @hw: pointer to the hw struct + * @bad_frame_vsi: defines the VSI to which bad frames are forwarded + * @save_bad_pac: if set packets with errors are forwarded to the bad frames VSI + * @pad_short_pac: if set transmit packets smaller than 60 bytes are padded + * @double_vlan: if set double VLAN is enabled + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_set_port_parameters(struct i40e_hw *hw, + u16 bad_frame_vsi, bool save_bad_pac, + bool pad_short_pac, bool double_vlan, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aqc_set_port_parameters *cmd; + enum i40e_status_code status; + struct i40e_aq_desc desc; + u16 command_flags = 0; + + cmd = (struct i40e_aqc_set_port_parameters *)&desc.params.raw; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_port_parameters); + + cmd->bad_frame_vsi = CPU_TO_LE16(bad_frame_vsi); + if (save_bad_pac) + command_flags |= I40E_AQ_SET_P_PARAMS_SAVE_BAD_PACKETS; + if (pad_short_pac) + command_flags |= I40E_AQ_SET_P_PARAMS_PAD_SHORT_PACKETS; + if (double_vlan) + command_flags |= I40E_AQ_SET_P_PARAMS_DOUBLE_VLAN_ENA; + cmd->command_flags = CPU_TO_LE16(command_flags); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + 
return status; +} + +/** + * i40e_aq_tx_sched_cmd - generic Tx scheduler AQ command handler + * @hw: pointer to the hw struct + * @seid: seid for the physical port/switching component/vsi + * @buff: Indirect buffer to hold data parameters and response + * @buff_size: Indirect buffer size + * @opcode: Tx scheduler AQ command opcode + * @cmd_details: pointer to command details structure or NULL + * + * Generic command handler for Tx scheduler AQ commands + **/ +static enum i40e_status_code i40e_aq_tx_sched_cmd(struct i40e_hw *hw, u16 seid, + void *buff, u16 buff_size, + enum i40e_admin_queue_opc opcode, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_tx_sched_ind *cmd = + (struct i40e_aqc_tx_sched_ind *)&desc.params.raw; + enum i40e_status_code status; + bool cmd_param_flag = FALSE; + + switch (opcode) { + case i40e_aqc_opc_configure_vsi_ets_sla_bw_limit: + case i40e_aqc_opc_configure_vsi_tc_bw: + case i40e_aqc_opc_enable_switching_comp_ets: + case i40e_aqc_opc_modify_switching_comp_ets: + case i40e_aqc_opc_disable_switching_comp_ets: + case i40e_aqc_opc_configure_switching_comp_ets_bw_limit: + case i40e_aqc_opc_configure_switching_comp_bw_config: + cmd_param_flag = TRUE; + break; + case i40e_aqc_opc_query_vsi_bw_config: + case i40e_aqc_opc_query_vsi_ets_sla_config: + case i40e_aqc_opc_query_switching_comp_ets_config: + case i40e_aqc_opc_query_port_ets_config: + case i40e_aqc_opc_query_switching_comp_bw_config: + cmd_param_flag = FALSE; + break; + default: + return I40E_ERR_PARAM; + } + + i40e_fill_default_direct_cmd_desc(&desc, opcode); + + /* Indirect command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (cmd_param_flag) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + desc.datalen = CPU_TO_LE16(buff_size); + + cmd->vsi_seid = CPU_TO_LE16(seid); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, 
cmd_details); + + return status; +} + +/** + * i40e_aq_config_vsi_bw_limit - Configure VSI BW Limit + * @hw: pointer to the hw struct + * @seid: VSI seid + * @credit: BW limit credits (0 = disabled) + * @max_credit: Max BW limit credits + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw, + u16 seid, u16 credit, u8 max_credit, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_configure_vsi_bw_limit *cmd = + (struct i40e_aqc_configure_vsi_bw_limit *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_configure_vsi_bw_limit); + + cmd->vsi_seid = CPU_TO_LE16(seid); + cmd->credit = CPU_TO_LE16(credit); + cmd->max_credit = max_credit; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_config_switch_comp_bw_limit - Configure Switching component BW Limit + * @hw: pointer to the hw struct + * @seid: switching component seid + * @credit: BW limit credits (0 = disabled) + * @max_bw: Max BW limit credits + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_config_switch_comp_bw_limit(struct i40e_hw *hw, + u16 seid, u16 credit, u8 max_bw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_configure_switching_comp_bw_limit *cmd = + (struct i40e_aqc_configure_switching_comp_bw_limit *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_configure_switching_comp_bw_limit); + + cmd->seid = CPU_TO_LE16(seid); + cmd->credit = CPU_TO_LE16(credit); + cmd->max_bw = max_bw; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_config_vsi_ets_sla_bw_limit - Config VSI BW Limit per TC + * @hw: pointer to the hw struct + * @seid: VSI seid + * 
@bw_data: Buffer holding enabled TCs, per TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Passes sizeof(*bw_data) bytes at @bw_data to i40e_aq_tx_sched_cmd() and
+ * returns its status.
+ **/
+enum i40e_status_code i40e_aq_config_vsi_ets_sla_bw_limit(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_ets_sla_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_configure_vsi_ets_sla_bw_limit,
+				    cmd_details);
+}
+
+/**
+ * i40e_aq_config_vsi_tc_bw - Config VSI BW Allocation per TC
+ * @hw: pointer to the hw struct
+ * @seid: VSI seid
+ * @bw_data: Buffer holding enabled TCs, relative TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Passes sizeof(*bw_data) bytes at @bw_data to i40e_aq_tx_sched_cmd() and
+ * returns its status.
+ **/
+enum i40e_status_code i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_tc_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_configure_vsi_tc_bw,
+				    cmd_details);
+}
+
+/**
+ * i40e_aq_config_switch_comp_ets_bw_limit - Config Switch comp BW Limit per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer holding enabled TCs, per TC BW limit/credits
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Passes sizeof(*bw_data) bytes at @bw_data to i40e_aq_tx_sched_cmd() and
+ * returns its status.
+ **/
+enum i40e_status_code i40e_aq_config_switch_comp_ets_bw_limit(
+	struct i40e_hw *hw, u16 seid,
+	struct i40e_aqc_configure_switching_comp_ets_bw_limit_data *bw_data,
+	struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+			    i40e_aqc_opc_configure_switching_comp_ets_bw_limit,
+			    cmd_details);
+}
+
+/**
+ * i40e_aq_query_vsi_bw_config - Query VSI BW configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI
+ * @bw_data: Buffer to hold VSI BW configuration
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Hands @bw_data (sizeof(*bw_data) bytes) to i40e_aq_tx_sched_cmd() as the
+ * response buffer and returns its status.
+ **/
+enum i40e_status_code i40e_aq_query_vsi_bw_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_bw_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_query_vsi_bw_config,
+				    cmd_details);
+}
+
+/**
+ * i40e_aq_query_vsi_ets_sla_config - Query VSI BW configuration per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI
+ * @bw_data: Buffer to hold VSI BW configuration per TC
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Hands @bw_data (sizeof(*bw_data) bytes) to i40e_aq_tx_sched_cmd() as the
+ * response buffer and returns its status.
+ **/
+enum i40e_status_code i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_query_vsi_ets_sla_config,
+				    cmd_details);
+}
+
+/**
+ * i40e_aq_query_switch_comp_ets_config - Query Switch comp BW config per TC
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer to hold switching component's per TC BW config
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Hands @bw_data (sizeof(*bw_data) bytes) to i40e_aq_tx_sched_cmd() as the
+ * response buffer and returns its status.
+ **/
+enum i40e_status_code i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				   i40e_aqc_opc_query_switching_comp_ets_config,
+				   cmd_details);
+}
+
+/**
+ * i40e_aq_query_port_ets_config - Query Physical Port ETS configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the VSI or switching component connected to Physical Port
+ * @bw_data: Buffer to hold current ETS configuration for the Physical Port
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Hands @bw_data (sizeof(*bw_data) bytes) to i40e_aq_tx_sched_cmd() as the
+ * response buffer and returns its status.
+ **/
+enum i40e_status_code i40e_aq_query_port_ets_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_port_ets_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_query_port_ets_config,
+				    cmd_details);
+}
+
+/**
+ * i40e_aq_query_switch_comp_bw_config - Query Switch comp BW configuration
+ * @hw: pointer to the hw struct
+ * @seid: seid of the switching component
+ * @bw_data: Buffer to hold switching component's BW configuration
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Hands @bw_data (sizeof(*bw_data) bytes) to i40e_aq_tx_sched_cmd() as the
+ * response buffer and returns its status.
+ **/
+enum i40e_status_code i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details)
+{
+	return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data),
+				    i40e_aqc_opc_query_switching_comp_bw_config,
+				    cmd_details);
+}
+
+/**
+ * i40e_validate_filter_settings
+ * @hw: pointer to the hardware structure
+ * @settings: Filter control settings
+ *
+ * Check and validate the filter control settings passed.
+ * The function checks for the valid filter/context sizes being
+ * passed for FCoE and PE.
+ *
+ * Returns I40E_SUCCESS if the values passed are valid and within
+ * range else returns an error.
+ *
+ * Unsupported size encodings yield I40E_ERR_PARAM. I40E_ERR_INVALID_SIZE is
+ * returned when the FCoE filter and context sizes together exceed the
+ * PMFCOEFMAX field read from I40E_GLHMC_FCOEFMAX.
+ **/
+static enum i40e_status_code i40e_validate_filter_settings(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings)
+{
+	u32 fcoe_cntx_size, fcoe_filt_size;
+	u32 pe_cntx_size, pe_filt_size;
+	u32 fcoe_fmax;
+
+	u32 val;
+
+	/* Validate FCoE settings passed */
+	switch (settings->fcoe_filt_num) {
+	case I40E_HASH_FILTER_SIZE_1K:
+	case I40E_HASH_FILTER_SIZE_2K:
+	case I40E_HASH_FILTER_SIZE_4K:
+	case I40E_HASH_FILTER_SIZE_8K:
+	case I40E_HASH_FILTER_SIZE_16K:
+	case I40E_HASH_FILTER_SIZE_32K:
+		/* size = base size shifted left by the enum value */
+		fcoe_filt_size = I40E_HASH_FILTER_BASE_SIZE;
+		fcoe_filt_size <<= (u32)settings->fcoe_filt_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	switch (settings->fcoe_cntx_num) {
+	case I40E_DMA_CNTX_SIZE_512:
+	case I40E_DMA_CNTX_SIZE_1K:
+	case I40E_DMA_CNTX_SIZE_2K:
+	case I40E_DMA_CNTX_SIZE_4K:
+		fcoe_cntx_size = I40E_DMA_CNTX_BASE_SIZE;
+		fcoe_cntx_size <<= (u32)settings->fcoe_cntx_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	/*
+	 * Validate PE settings passed. pe_filt_size and pe_cntx_size are
+	 * computed but not used further in this function; these two switches
+	 * only reject unsupported encodings.
+	 */
+	switch (settings->pe_filt_num) {
+	case I40E_HASH_FILTER_SIZE_1K:
+	case I40E_HASH_FILTER_SIZE_2K:
+	case I40E_HASH_FILTER_SIZE_4K:
+	case I40E_HASH_FILTER_SIZE_8K:
+	case I40E_HASH_FILTER_SIZE_16K:
+	case I40E_HASH_FILTER_SIZE_32K:
+	case I40E_HASH_FILTER_SIZE_64K:
+	case I40E_HASH_FILTER_SIZE_128K:
+	case I40E_HASH_FILTER_SIZE_256K:
+	case I40E_HASH_FILTER_SIZE_512K:
+	case I40E_HASH_FILTER_SIZE_1M:
+		pe_filt_size = I40E_HASH_FILTER_BASE_SIZE;
+		pe_filt_size <<= (u32)settings->pe_filt_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	switch (settings->pe_cntx_num) {
+	case I40E_DMA_CNTX_SIZE_512:
+	case I40E_DMA_CNTX_SIZE_1K:
+	case I40E_DMA_CNTX_SIZE_2K:
+	case I40E_DMA_CNTX_SIZE_4K:
+	case I40E_DMA_CNTX_SIZE_8K:
+	case I40E_DMA_CNTX_SIZE_16K:
+	case I40E_DMA_CNTX_SIZE_32K:
+	case I40E_DMA_CNTX_SIZE_64K:
+	case I40E_DMA_CNTX_SIZE_128K:
+	case I40E_DMA_CNTX_SIZE_256K:
+		pe_cntx_size = I40E_DMA_CNTX_BASE_SIZE;
+		pe_cntx_size <<= (u32)settings->pe_cntx_num;
+		break;
+	default:
+		return I40E_ERR_PARAM;
+	}
+
+	/* FCHSIZE + FCDSIZE should not be greater than PMFCOEFMAX */
+	val = rd32(hw, I40E_GLHMC_FCOEFMAX);
+	fcoe_fmax = (val & I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK)
+		     >> I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT;
+	if (fcoe_filt_size + fcoe_cntx_size > fcoe_fmax)
+		return I40E_ERR_INVALID_SIZE;
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_set_filter_control
+ * @hw: pointer to the hardware structure
+ * @settings: Filter control settings
+ *
+ * Set the Queue Filters for PE/FCoE and enable filters required
+ * for a single PF. It is expected that these settings are programmed
+ * at the driver initialization time.
+ *
+ * Performs a read-modify-write of I40E_PFQF_CTL_0. Returns I40E_ERR_PARAM
+ * for a NULL @settings, the error from i40e_validate_filter_settings() if
+ * validation fails, and I40E_SUCCESS otherwise.
+ **/
+enum i40e_status_code i40e_set_filter_control(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings)
+{
+	enum i40e_status_code ret = I40E_SUCCESS;
+	u32 hash_lut_size = 0;
+	u32 val;
+
+	if (!settings)
+		return I40E_ERR_PARAM;
+
+	/* Validate the input settings */
+	ret = i40e_validate_filter_settings(hw, settings);
+	if (ret)
+		return ret;
+
+	/* Read the PF Queue Filter control register */
+	val = rd32(hw, I40E_PFQF_CTL_0);
+
+	/* Program required PE hash buckets for the PF */
+	val &= ~I40E_PFQF_CTL_0_PEHSIZE_MASK;
+	val |= ((u32)settings->pe_filt_num << I40E_PFQF_CTL_0_PEHSIZE_SHIFT) &
+		I40E_PFQF_CTL_0_PEHSIZE_MASK;
+	/* Program required PE contexts for the PF */
+	val &= ~I40E_PFQF_CTL_0_PEDSIZE_MASK;
+	val |= ((u32)settings->pe_cntx_num << I40E_PFQF_CTL_0_PEDSIZE_SHIFT) &
+		I40E_PFQF_CTL_0_PEDSIZE_MASK;
+
+	/* Program required FCoE hash buckets for the PF */
+	val &= ~I40E_PFQF_CTL_0_PFFCHSIZE_MASK;
+	val |= ((u32)settings->fcoe_filt_num <<
+			I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT) &
+		I40E_PFQF_CTL_0_PFFCHSIZE_MASK;
+	/* Program required FCoE DDP contexts for the PF */
+	val &= ~I40E_PFQF_CTL_0_PFFCDSIZE_MASK;
+	val |= ((u32)settings->fcoe_cntx_num <<
+			I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT) &
+		I40E_PFQF_CTL_0_PFFCDSIZE_MASK;
+
+	/*
+	 * Program Hash LUT size for the PF: field value 1 selects
+	 * I40E_HASH_LUT_SIZE_512, any other setting leaves it at 0.
+	 */
+	val &= ~I40E_PFQF_CTL_0_HASHLUTSIZE_MASK;
+	if (settings->hash_lut_size == I40E_HASH_LUT_SIZE_512)
+		hash_lut_size = 1;
+	val |= (hash_lut_size << I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT) &
+		I40E_PFQF_CTL_0_HASHLUTSIZE_MASK;
+
+	/*
+	 * Enable FDIR, Ethertype and MACVLAN filters for PF and VFs.
+	 * The enable bits are only ever ORed in: a bit already set in the
+	 * register stays set even when the matching setting is false.
+	 */
+	if (settings->enable_fdir)
+		val |= I40E_PFQF_CTL_0_FD_ENA_MASK;
+	if (settings->enable_ethtype)
+		val |= I40E_PFQF_CTL_0_ETYPE_ENA_MASK;
+	if (settings->enable_macvlan)
+		val |= I40E_PFQF_CTL_0_MACVLAN_ENA_MASK;
+
+	wr32(hw, I40E_PFQF_CTL_0, val);
+
+	return I40E_SUCCESS;
+}
+
+/**
+ * i40e_aq_add_rem_control_packet_filter - Add or Remove Control Packet Filter
+ * @hw: pointer to the hw struct
+ * @mac_addr: MAC address to use in the filter (may be NULL)
+ * @ethtype: Ethertype to use in the filter
+ * @flags: Flags that need to be applied to the filter
+ * @vsi_seid: seid of the control VSI (must be non-zero)
+ * @queue: VSI queue number to send the packet to (used only when adding)
+ * @is_add: Add control packet filter if True else remove
+ * @stats: Structure to hold information on control filter counts (may be NULL)
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * This command will Add or Remove control packet filter for a control VSI.
+ * In return it will update the total number of perfect filter count in
+ * the stats member.
+ *
+ * Returns I40E_ERR_PARAM when the VSI seid is 0, otherwise the status of
+ * i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
+				u8 *mac_addr, u16 ethtype, u16 flags,
+				u16 vsi_seid, u16 queue, bool is_add,
+				struct i40e_control_filter_stats *stats,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	/* cmd and resp are two views of the same desc.params.raw storage */
+	struct i40e_aqc_add_remove_control_packet_filter *cmd =
+		(struct i40e_aqc_add_remove_control_packet_filter *)
+		&desc.params.raw;
+	struct i40e_aqc_add_remove_control_packet_filter_completion *resp =
+		(struct i40e_aqc_add_remove_control_packet_filter_completion *)
+		&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (vsi_seid == 0)
+		return I40E_ERR_PARAM;
+
+	if (is_add) {
+		i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_add_control_packet_filter);
+		/* the destination queue is only programmed when adding */
+		cmd->queue = CPU_TO_LE16(queue);
+	} else {
+		i40e_fill_default_direct_cmd_desc(&desc,
+				i40e_aqc_opc_remove_control_packet_filter);
+	}
+
+	/*
+	 * Without a MAC address, cmd->mac keeps whatever
+	 * i40e_fill_default_direct_cmd_desc() left in the descriptor.
+	 */
+	if (mac_addr)
+		i40e_memcpy(cmd->mac, mac_addr, I40E_ETH_LENGTH_OF_ADDRESS,
+			    I40E_NONDMA_TO_NONDMA);
+
+	cmd->etype = CPU_TO_LE16(ethtype);
+	cmd->flags = CPU_TO_LE16(flags);
+	cmd->seid = CPU_TO_LE16(vsi_seid);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	/* counters are copied out only on success and when requested */
+	if (!status && stats) {
+		stats->mac_etype_used = LE16_TO_CPU(resp->mac_etype_used);
+		stats->etype_used = LE16_TO_CPU(resp->etype_used);
+		stats->mac_etype_free = LE16_TO_CPU(resp->mac_etype_free);
+		stats->etype_free = LE16_TO_CPU(resp->etype_free);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_add_cloud_filters
+ * @hw: pointer to the hardware structure
+ * @seid: VSI seid to add cloud filters to
+ * @filters: Buffer which contains the filters to be added
+ * @filter_count: number of filters contained in the buffer
+ *
+ * Set the cloud filters for a given VSI.  The contents of the
+ * i40e_aqc_add_remove_cloud_filters_element_data are filled
+ * in by the caller of the function.
+ *
+ * The command is sent with NULL command details. Returns the status of
+ * i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_add_cloud_filters(struct i40e_hw *hw,
+	u16 seid,
+	struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+	u8 filter_count)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_remove_cloud_filters *cmd =
+	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	u16 buff_len;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_add_cloud_filters);
+
+	/*
+	 * NOTE(review): unlike the other indirect commands in this file,
+	 * I40E_AQ_FLAG_LB is never set here, even if buff_len exceeds
+	 * I40E_AQ_LARGE_BUF -- confirm this is intended.
+	 */
+	buff_len = filter_count * sizeof(*filters);
+	desc.datalen = CPU_TO_LE16(buff_len);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	cmd->num_filters = filter_count;
+	cmd->seid = CPU_TO_LE16(seid);
+
+	status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_remove_cloud_filters
+ * @hw: pointer to the hardware structure
+ * @seid: VSI seid to remove cloud filters from
+ * @filters: Buffer which contains the filters to be removed
+ * @filter_count: number of filters contained in the buffer
+ *
+ * Remove the cloud filters for a given VSI.  The contents of the
+ * i40e_aqc_add_remove_cloud_filters_element_data are filled
+ * in by the caller of the function.
+ *
+ * The command is sent with NULL command details. Returns the status of
+ * i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_remove_cloud_filters(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_add_remove_cloud_filters *cmd =
+	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	enum i40e_status_code status;
+	u16 buff_len;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_remove_cloud_filters);
+
+	/*
+	 * NOTE(review): unlike the other indirect commands in this file,
+	 * I40E_AQ_FLAG_LB is never set here, even if buff_len exceeds
+	 * I40E_AQ_LARGE_BUF -- confirm this is intended.
+	 */
+	buff_len = filter_count * sizeof(*filters);
+	desc.datalen = CPU_TO_LE16(buff_len);
+	desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	cmd->num_filters = filter_count;
+	cmd->seid = CPU_TO_LE16(seid);
+
+	status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_alternate_write
+ * @hw: pointer to the hardware structure
+ * @reg_addr0: address of first dword to be written
+ * @reg_val0: value to be written under 'reg_addr0'
+ * @reg_addr1: address of second dword to be written
+ * @reg_val1: value to be written under 'reg_addr1'
+ *
+ * Write one or two dwords to alternate structure. Fields are indicated
+ * by 'reg_addr0' and 'reg_addr1' register numbers.
+ * + **/ +enum i40e_status_code i40e_aq_alternate_write(struct i40e_hw *hw, + u32 reg_addr0, u32 reg_val0, + u32 reg_addr1, u32 reg_val1) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_alternate_write *cmd_resp = + (struct i40e_aqc_alternate_write *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_write); + cmd_resp->address0 = CPU_TO_LE32(reg_addr0); + cmd_resp->address1 = CPU_TO_LE32(reg_addr1); + cmd_resp->data0 = CPU_TO_LE32(reg_val0); + cmd_resp->data1 = CPU_TO_LE32(reg_val1); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL); + + return status; +} + +/** + * i40e_aq_alternate_write_indirect + * @hw: pointer to the hardware structure + * @addr: address of a first register to be modified + * @dw_count: number of alternate structure fields to write + * @buffer: pointer to the command buffer + * + * Write 'dw_count' dwords from 'buffer' to alternate structure + * starting at 'addr'. + * + **/ +enum i40e_status_code i40e_aq_alternate_write_indirect(struct i40e_hw *hw, + u32 addr, u32 dw_count, void *buffer) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_alternate_ind_write *cmd_resp = + (struct i40e_aqc_alternate_ind_write *)&desc.params.raw; + enum i40e_status_code status; + + if (buffer == NULL) + return I40E_ERR_PARAM; + + /* Indirect command */ + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_alternate_write_indirect); + + desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_RD); + desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF); + if (dw_count > (I40E_AQ_LARGE_BUF/4)) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + cmd_resp->address = CPU_TO_LE32(addr); + cmd_resp->length = CPU_TO_LE32(dw_count); + cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_WORD((uintptr_t)buffer)); + cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((uintptr_t)buffer)); + + status = i40e_asq_send_command(hw, &desc, buffer, + I40E_LO_DWORD(4*dw_count), NULL); + + return status; +} + +/** + * i40e_aq_alternate_read 
+ * @hw: pointer to the hardware structure
+ * @reg_addr0: address of first dword to be read
+ * @reg_val0: pointer for data read from 'reg_addr0' (must not be NULL)
+ * @reg_addr1: address of second dword to be read
+ * @reg_val1: pointer for data read from 'reg_addr1' (may be NULL)
+ *
+ * Read one or two dwords from alternate structure. Fields are indicated
+ * by 'reg_addr0' and 'reg_addr1' register numbers. Both addresses are
+ * always placed in the command; if 'reg_val1' pointer is not passed then
+ * only the value for 'reg_addr0' is stored.
+ *
+ * Returns I40E_ERR_PARAM when 'reg_val0' is NULL, otherwise the status of
+ * i40e_asq_send_command(). The output values are written only on
+ * I40E_SUCCESS.
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_read(struct i40e_hw *hw,
+				u32 reg_addr0, u32 *reg_val0,
+				u32 reg_addr1, u32 *reg_val1)
+{
+	struct i40e_aq_desc desc;
+	/* the read command reuses the alternate-write parameter layout */
+	struct i40e_aqc_alternate_write *cmd_resp =
+		(struct i40e_aqc_alternate_write *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (reg_val0 == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_read);
+	cmd_resp->address0 = CPU_TO_LE32(reg_addr0);
+	cmd_resp->address1 = CPU_TO_LE32(reg_addr1);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+
+	/* the results come back in the data fields of the same descriptor */
+	if (status == I40E_SUCCESS) {
+		*reg_val0 = LE32_TO_CPU(cmd_resp->data0);
+
+		if (reg_val1 != NULL)
+			*reg_val1 = LE32_TO_CPU(cmd_resp->data1);
+	}
+
+	return status;
+}
+
+/**
+ * i40e_aq_alternate_read_indirect
+ * @hw: pointer to the hardware structure
+ * @addr: address of the alternate structure field
+ * @dw_count: number of alternate structure fields to read
+ * @buffer: pointer to the command buffer
+ *
+ * Read 'dw_count' dwords from alternate structure starting at 'addr' and
+ * place them in 'buffer'. The buffer should be allocated by caller.
+ *
+ * Returns I40E_ERR_PARAM when 'buffer' is NULL, otherwise the status of
+ * i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_alternate_read_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer)
+{
+	struct i40e_aq_desc desc;
+	/* the read command reuses the indirect-write parameter layout */
+	struct i40e_aqc_alternate_ind_write *cmd_resp =
+		(struct i40e_aqc_alternate_ind_write *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (buffer == NULL)
+		return I40E_ERR_PARAM;
+
+	/* Indirect command */
+	i40e_fill_default_direct_cmd_desc(&desc,
+		i40e_aqc_opc_alternate_read_indirect);
+
+	/*
+	 * NOTE(review): I40E_AQ_FLAG_RD is set although this is a read into
+	 * 'buffer'; the query opcodes in i40e_aq_tx_sched_cmd() leave it
+	 * clear -- confirm this is intended.
+	 */
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_RD);
+	desc.flags |= CPU_TO_LE16(I40E_AQ_FLAG_BUF);
+	/* dw_count is in dwords, I40E_AQ_LARGE_BUF in bytes */
+	if (dw_count > (I40E_AQ_LARGE_BUF/4))
+		desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+
+	cmd_resp->address = CPU_TO_LE32(addr);
+	cmd_resp->length = CPU_TO_LE32(dw_count);
+	cmd_resp->addr_high = CPU_TO_LE32(I40E_HI_DWORD((uintptr_t)buffer));
+	cmd_resp->addr_low = CPU_TO_LE32(I40E_LO_DWORD((uintptr_t)buffer));
+
+	status = i40e_asq_send_command(hw, &desc, buffer,
+				       I40E_LO_DWORD(4*dw_count), NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_alternate_clear
+ * @hw: pointer to the HW structure.
+ *
+ * Clear the alternate structures of the port from which the function
+ * is called.
+ *
+ * Sent as a direct command with no parameters and NULL command details.
+ * Returns the status of i40e_asq_send_command().
+ *
+ **/
+enum i40e_status_code i40e_aq_alternate_clear(struct i40e_hw *hw)
+{
+	struct i40e_aq_desc desc;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_alternate_clear_port);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_alternate_write_done
+ * @hw: pointer to the HW structure.
+ * @bios_mode: indicates whether the command is executed by UEFI or legacy BIOS
+ * @reset_needed: indicates the SW should trigger GLOBAL reset
+ *
+ * Indicates to the FW that alternate structures have been changed.
+ *
+ * Returns I40E_ERR_PARAM when the reset_needed pointer is NULL, otherwise
+ * the status of i40e_asq_send_command(). The reset_needed output is
+ * written only on success.
+ **/
+enum i40e_status_code i40e_aq_alternate_write_done(struct i40e_hw *hw,
+		u8 bios_mode, bool *reset_needed)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_alternate_write_done *cmd =
+		(struct i40e_aqc_alternate_write_done *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	if (reset_needed == NULL)
+		return I40E_ERR_PARAM;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_alternate_write_done);
+
+	cmd->cmd_flags = CPU_TO_LE16(bios_mode);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+	/*
+	 * cmd_flags is read back from the same descriptor after the command.
+	 * The reset_needed test repeats the NULL check made on entry.
+	 */
+	if (!status && reset_needed)
+		*reset_needed = ((LE16_TO_CPU(cmd->cmd_flags) &
+				 I40E_AQ_ALTERNATE_RESET_NEEDED) != 0);
+
+	return status;
+}
+
+/**
+ * i40e_aq_set_oem_mode
+ * @hw: pointer to the HW structure.
+ * @oem_mode: the OEM mode to be used
+ *
+ * Sets the device to a specific operating mode. Currently the only supported
+ * mode is no_clp, which causes FW to refrain from using Alternate RAM.
+ *
+ * Returns the status of i40e_asq_send_command().
+ *
+ **/
+enum i40e_status_code i40e_aq_set_oem_mode(struct i40e_hw *hw,
+		u8 oem_mode)
+{
+	struct i40e_aq_desc desc;
+	/* reuses the write_done parameter layout for its cmd_flags field */
+	struct i40e_aqc_alternate_write_done *cmd =
+		(struct i40e_aqc_alternate_write_done *)&desc.params.raw;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_alternate_set_mode);
+
+	cmd->cmd_flags = CPU_TO_LE16(oem_mode);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
+
+	return status;
+}
+
+/**
+ * i40e_aq_resume_port_tx
+ * @hw: pointer to the hardware structure
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Resume port's Tx traffic
+ *
+ * Sent as a direct command with no parameters. Returns the status of
+ * i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_resume_port_tx(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_resume_port_tx);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_set_pci_config_data - store PCI bus info
+ * @hw: pointer
to hardware structure
+ * @link_status: the link status word from PCI config space
+ *
+ * Stores the PCI bus info (speed, width, type) within the i40e_hw structure
+ *
+ * The bus type is always recorded as PCI Express. Width and speed
+ * encodings that are not recognised are stored as the *_unknown values.
+ **/
+void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status)
+{
+	hw->bus.type = i40e_bus_type_pci_express;
+
+	/* negotiated link width field */
+	switch (link_status & I40E_PCI_LINK_WIDTH) {
+	case I40E_PCI_LINK_WIDTH_1:
+		hw->bus.width = i40e_bus_width_pcie_x1;
+		break;
+	case I40E_PCI_LINK_WIDTH_2:
+		hw->bus.width = i40e_bus_width_pcie_x2;
+		break;
+	case I40E_PCI_LINK_WIDTH_4:
+		hw->bus.width = i40e_bus_width_pcie_x4;
+		break;
+	case I40E_PCI_LINK_WIDTH_8:
+		hw->bus.width = i40e_bus_width_pcie_x8;
+		break;
+	default:
+		hw->bus.width = i40e_bus_width_unknown;
+		break;
+	}
+
+	/* link speed field */
+	switch (link_status & I40E_PCI_LINK_SPEED) {
+	case I40E_PCI_LINK_SPEED_2500:
+		hw->bus.speed = i40e_bus_speed_2500;
+		break;
+	case I40E_PCI_LINK_SPEED_5000:
+		hw->bus.speed = i40e_bus_speed_5000;
+		break;
+	case I40E_PCI_LINK_SPEED_8000:
+		hw->bus.speed = i40e_bus_speed_8000;
+		break;
+	default:
+		hw->bus.speed = i40e_bus_speed_unknown;
+		break;
+	}
+}
+
+/**
+ * i40e_aq_debug_dump
+ * @hw: pointer to the hardware structure
+ * @cluster_id: specific cluster to dump
+ * @table_id: table id within cluster
+ * @start_index: index of line in the block to read
+ * @buff_size: dump buffer size
+ * @buff: dump buffer
+ * @ret_buff_size: actual buffer size returned
+ * @ret_next_table: next block to read
+ * @ret_next_index: next index to read
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Dump internal FW/HW data for debug purposes.
+ * + **/ +enum i40e_status_code i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id, + u8 table_id, u32 start_index, u16 buff_size, + void *buff, u16 *ret_buff_size, + u8 *ret_next_table, u32 *ret_next_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_dump_internals *cmd = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + struct i40e_aqc_debug_dump_internals *resp = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + enum i40e_status_code status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_debug_dump_internals); + /* Indirect Command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + cmd->cluster_id = cluster_id; + cmd->table_id = table_id; + cmd->idx = CPU_TO_LE32(start_index); + + desc.datalen = CPU_TO_LE16(buff_size); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (ret_buff_size != NULL) + *ret_buff_size = LE16_TO_CPU(desc.datalen); + if (ret_next_table != NULL) + *ret_next_table = resp->table_id; + if (ret_next_index != NULL) + *ret_next_index = LE32_TO_CPU(resp->idx); + } + + return status; +} + +/** + * i40e_read_bw_from_alt_ram + * @hw: pointer to the hardware structure + * @max_bw: pointer for max_bw read + * @min_bw: pointer for min_bw read + * @min_valid: pointer for bool that is TRUE if min_bw is a valid value + * @max_valid: pointer for bool that is TRUE if max_bw is a valid value + * + * Read bw from the alternate ram for the given pf + **/ +enum i40e_status_code i40e_read_bw_from_alt_ram(struct i40e_hw *hw, + u32 *max_bw, u32 *min_bw, + bool *min_valid, bool *max_valid) +{ + enum i40e_status_code status; + u32 max_bw_addr, min_bw_addr; + + /* Calculate the address of the min/max bw registers */ + max_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET + + 
I40E_ALT_STRUCT_MAX_BW_OFFSET + + (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id); + min_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET + + I40E_ALT_STRUCT_MIN_BW_OFFSET + + (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id); + + /* Read the bandwidths from alt ram */ + status = i40e_aq_alternate_read(hw, max_bw_addr, max_bw, + min_bw_addr, min_bw); + + if (*min_bw & I40E_ALT_BW_VALID_MASK) + *min_valid = TRUE; + else + *min_valid = FALSE; + + if (*max_bw & I40E_ALT_BW_VALID_MASK) + *max_valid = TRUE; + else + *max_valid = FALSE; + + return status; +} + +/** + * i40e_aq_configure_partition_bw + * @hw: pointer to the hardware structure + * @bw_data: Buffer holding valid pfs and bw limits + * @cmd_details: pointer to command details + * + * Configure partitions guaranteed/max bw + **/ +enum i40e_status_code i40e_aq_configure_partition_bw(struct i40e_hw *hw, + struct i40e_aqc_configure_partition_bw_data *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + enum i40e_status_code status; + struct i40e_aq_desc desc; + u16 bwd_size = sizeof(*bw_data); + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_configure_partition_bw); + + /* Indirect command */ + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_RD); + + if (bwd_size > I40E_AQ_LARGE_BUF) + desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB); + + desc.datalen = CPU_TO_LE16(bwd_size); + + status = i40e_asq_send_command(hw, &desc, bw_data, bwd_size, cmd_details); + + return status; +} + +/** + * i40e_aq_send_msg_to_pf + * @hw: pointer to the hardware structure + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cmd_details: pointer to command details + * + * Send message to PF driver using admin queue. By default, this message + * is sent asynchronously, i.e. i40e_asq_send_command() does not wait for + * completion before returning. 
+ *
+ * The asynchronous default applies only when no command details are
+ * supplied by the caller. Returns the status of i40e_asq_send_command().
+ **/
+enum i40e_status_code i40e_aq_send_msg_to_pf(struct i40e_hw *hw,
+				enum i40e_virtchnl_ops v_opcode,
+				enum i40e_status_code v_retval,
+				u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_asq_cmd_details details;
+	enum i40e_status_code status;
+
+	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_pf);
+	desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_SI);
+	/* the virtchnl opcode and return value travel in the cookie fields */
+	desc.cookie_high = CPU_TO_LE32(v_opcode);
+	desc.cookie_low = CPU_TO_LE32(v_retval);
+	/* a buffer is attached only when there is a payload */
+	if (msglen) {
+		desc.flags |= CPU_TO_LE16((u16)(I40E_AQ_FLAG_BUF
+						| I40E_AQ_FLAG_RD));
+		if (msglen > I40E_AQ_LARGE_BUF)
+			desc.flags |= CPU_TO_LE16((u16)I40E_AQ_FLAG_LB);
+		desc.datalen = CPU_TO_LE16(msglen);
+	}
+	/* no details from the caller: use a zeroed local one with async set */
+	if (!cmd_details) {
+		i40e_memset(&details, 0, sizeof(details), I40E_NONDMA_MEM);
+		details.async = TRUE;
+		cmd_details = &details;
+	}
+	status = i40e_asq_send_command(hw, (struct i40e_aq_desc *)&desc, msg,
+				       msglen, cmd_details);
+	return status;
+}
+
+/**
+ * i40e_vf_parse_hw_config
+ * @hw: pointer to the hardware structure
+ * @msg: pointer to the virtual channel VF resource structure
+ *
+ * Given a VF resource message from the PF, populate the hw struct
+ * with appropriate information.
+ *
+ * The Rx and Tx queue pair counts are both taken from msg->num_queue_pairs.
+ * Every VSI of type I40E_VSI_SRIOV overwrites hw->mac.perm_addr and
+ * hw->mac.addr, so the last such VSI in the message determines them.
+ **/
+void i40e_vf_parse_hw_config(struct i40e_hw *hw,
+			     struct i40e_virtchnl_vf_resource *msg)
+{
+	struct i40e_virtchnl_vsi_resource *vsi_res;
+	int i;
+
+	vsi_res = &msg->vsi_res[0];
+
+	hw->dev_caps.num_vsis = msg->num_vsis;
+	hw->dev_caps.num_rx_qp = msg->num_queue_pairs;
+	hw->dev_caps.num_tx_qp = msg->num_queue_pairs;
+	hw->dev_caps.num_msix_vectors_vf = msg->max_vectors;
+	/* dcb receives the raw masked flag value, not a 0/1 like the others */
+	hw->dev_caps.dcb = msg->vf_offload_flags &
+			   I40E_VIRTCHNL_VF_OFFLOAD_L2;
+	hw->dev_caps.fcoe = (msg->vf_offload_flags &
+			     I40E_VIRTCHNL_VF_OFFLOAD_FCOE) ? 1 : 0;
+	hw->dev_caps.iwarp = (msg->vf_offload_flags &
+			      I40E_VIRTCHNL_VF_OFFLOAD_IWARP) ? 1 : 0;
+	for (i = 0; i < msg->num_vsis; i++) {
+		if (vsi_res->vsi_type == I40E_VSI_SRIOV) {
+			i40e_memcpy(hw->mac.perm_addr,
+				    vsi_res->default_mac_addr,
+				    I40E_ETH_LENGTH_OF_ADDRESS,
+				    I40E_NONDMA_TO_NONDMA);
+			i40e_memcpy(hw->mac.addr, vsi_res->default_mac_addr,
+				    I40E_ETH_LENGTH_OF_ADDRESS,
+				    I40E_NONDMA_TO_NONDMA);
+		}
+		vsi_res++;
+	}
+}
+
+/**
+ * i40e_vf_reset
+ * @hw: pointer to the hardware structure
+ *
+ * Send a VF_RESET message to the PF. Does not wait for response from PF
+ * as none will be forthcoming. Immediately after calling this function,
+ * the admin queue should be shut down and (optionally) reinitialized.
+ *
+ * The message carries no payload and uses the default command details of
+ * i40e_aq_send_msg_to_pf(), whose status is returned.
+ **/
+enum i40e_status_code i40e_vf_reset(struct i40e_hw *hw)
+{
+	return i40e_aq_send_msg_to_pf(hw, I40E_VIRTCHNL_OP_RESET_VF,
+				      I40E_SUCCESS, NULL, 0, NULL);
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_devids.h b/usr/src/uts/common/io/i40e/core/i40e_devids.h
new file mode 100644
index 0000000000..5b927bed9f
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_devids.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of the Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_devids.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_DEVIDS_H_ +#define _I40E_DEVIDS_H_ + +/* Vendor ID */ +#define I40E_INTEL_VENDOR_ID 0x8086 + +/* Device IDs */ +#define I40E_DEV_ID_SFP_XL710 0x1572 +#define I40E_DEV_ID_QEMU 0x1574 +#define I40E_DEV_ID_KX_A 0x157F +#define I40E_DEV_ID_KX_B 0x1580 +#define I40E_DEV_ID_KX_C 0x1581 +#define I40E_DEV_ID_QSFP_A 0x1583 +#define I40E_DEV_ID_QSFP_B 0x1584 +#define I40E_DEV_ID_QSFP_C 0x1585 +#define I40E_DEV_ID_10G_BASE_T 0x1586 +#define I40E_DEV_ID_20G_KR2 0x1587 +#define I40E_DEV_ID_20G_KR2_A 0x1588 +#define I40E_DEV_ID_10G_BASE_T4 0x1589 +#define I40E_DEV_ID_VF 0x154C +#define I40E_DEV_ID_VF_HV 0x1571 +#ifdef X722_SUPPORT +#define I40E_DEV_ID_SFP_X722 0x37D0 +#define I40E_DEV_ID_1G_BASE_T_X722 0x37D1 +#define I40E_DEV_ID_10G_BASE_T_X722 0x37D2 +#define I40E_DEV_ID_X722_VF 0x37CD +#define I40E_DEV_ID_X722_VF_HV 0x37D9 +#endif /* X722_SUPPORT */ + +#define i40e_is_40G_device(d) ((d) == I40E_DEV_ID_QSFP_A || \ + (d) == I40E_DEV_ID_QSFP_B || \ + (d) == I40E_DEV_ID_QSFP_C) + +#endif /* _I40E_DEVIDS_H_ */ diff --git 
a/usr/src/uts/common/io/i40e/core/i40e_hmc.c b/usr/src/uts/common/io/i40e/core/i40e_hmc.c new file mode 100644 index 0000000000..3f0e6e8d5b --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_hmc.c @@ -0,0 +1,373 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_hmc.c 284049 2015-06-05 22:52:42Z jfv $*/ + +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_status.h" +#include "i40e_alloc.h" +#include "i40e_hmc.h" +#ifndef I40E_NO_TYPE_HEADER +#include "i40e_type.h" +#endif + +/** + * i40e_add_sd_table_entry - Adds a segment descriptor to the table + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information struct + * @sd_index: segment descriptor index to manipulate + * @type: what type of segment descriptor we're manipulating + * @direct_mode_sz: size to alloc in direct mode + **/ +enum i40e_status_code i40e_add_sd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 sd_index, + enum i40e_sd_entry_type type, + u64 direct_mode_sz) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_sd_entry *sd_entry; + enum i40e_memory_type mem_type; + bool dma_mem_alloc_done = FALSE; + struct i40e_dma_mem mem; + u64 alloc_len; + + if (NULL == hmc_info->sd_table.sd_entry) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_add_sd_table_entry: bad sd_entry\n"); + goto exit; + } + + if (sd_index >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + DEBUGOUT("i40e_add_sd_table_entry: bad sd_index\n"); + goto exit; + } + + sd_entry = &hmc_info->sd_table.sd_entry[sd_index]; + if (!sd_entry->valid) { + if (I40E_SD_TYPE_PAGED == type) { + mem_type = i40e_mem_pd; + alloc_len = I40E_HMC_PAGED_BP_SIZE; + } else { + mem_type = i40e_mem_bp_jumbo; + alloc_len = direct_mode_sz; + } + + /* allocate a 4K pd page or 2M backing page */ + ret_code = i40e_allocate_dma_mem(hw, &mem, mem_type, alloc_len, + I40E_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + goto exit; + dma_mem_alloc_done = TRUE; + if (I40E_SD_TYPE_PAGED == type) { + ret_code = i40e_allocate_virt_mem(hw, + &sd_entry->u.pd_table.pd_entry_virt_mem, + sizeof(struct 
i40e_hmc_pd_entry) * 512); + if (ret_code) + goto exit; + sd_entry->u.pd_table.pd_entry = + (struct i40e_hmc_pd_entry *) + sd_entry->u.pd_table.pd_entry_virt_mem.va; + i40e_memcpy(&sd_entry->u.pd_table.pd_page_addr, + &mem, sizeof(struct i40e_dma_mem), + I40E_NONDMA_TO_NONDMA); + } else { + i40e_memcpy(&sd_entry->u.bp.addr, + &mem, sizeof(struct i40e_dma_mem), + I40E_NONDMA_TO_NONDMA); + sd_entry->u.bp.sd_pd_index = sd_index; + } + /* initialize the sd entry */ + hmc_info->sd_table.sd_entry[sd_index].entry_type = type; + + /* increment the ref count */ + I40E_INC_SD_REFCNT(&hmc_info->sd_table); + } + /* Increment backing page reference count */ + if (I40E_SD_TYPE_DIRECT == sd_entry->entry_type) + I40E_INC_BP_REFCNT(&sd_entry->u.bp); +exit: + if (I40E_SUCCESS != ret_code) + if (dma_mem_alloc_done) + i40e_free_dma_mem(hw, &mem); + + return ret_code; +} + +/** + * i40e_add_pd_table_entry - Adds page descriptor to the specified table + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @pd_index: which page descriptor index to manipulate + * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one. + * + * This function: + * 1. Initializes the pd entry + * 2. Adds pd_entry in the pd_table + * 3. Mark the entry valid in i40e_hmc_pd_entry structure + * 4. Initializes the pd_entry's ref count to 1 + * assumptions: + * 1. The memory for pd should be pinned down, physically contiguous and + * aligned on 4K boundary and zeroed memory. + * 2. It should be 4K in size. 
+ **/ +enum i40e_status_code i40e_add_pd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 pd_index, + struct i40e_dma_mem *rsrc_pg) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_pd_table *pd_table; + struct i40e_hmc_pd_entry *pd_entry; + struct i40e_dma_mem mem; + struct i40e_dma_mem *page = &mem; + u32 sd_idx, rel_pd_idx; + u64 *pd_addr; + u64 page_desc; + + if (pd_index / I40E_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_PAGE_DESC_INDEX; + DEBUGOUT("i40e_add_pd_table_entry: bad pd_index\n"); + goto exit; + } + + /* find corresponding sd */ + sd_idx = (pd_index / I40E_HMC_PD_CNT_IN_SD); + if (I40E_SD_TYPE_PAGED != + hmc_info->sd_table.sd_entry[sd_idx].entry_type) + goto exit; + + rel_pd_idx = (pd_index % I40E_HMC_PD_CNT_IN_SD); + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + if (!pd_entry->valid) { + if (rsrc_pg) { + pd_entry->rsrc_pg = TRUE; + page = rsrc_pg; + } else { + /* allocate a 4K backing page */ + ret_code = i40e_allocate_dma_mem(hw, page, i40e_mem_bp, + I40E_HMC_PAGED_BP_SIZE, + I40E_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + goto exit; + pd_entry->rsrc_pg = FALSE; + } + + i40e_memcpy(&pd_entry->bp.addr, page, + sizeof(struct i40e_dma_mem), I40E_NONDMA_TO_NONDMA); + pd_entry->bp.sd_pd_index = pd_index; + pd_entry->bp.entry_type = I40E_SD_TYPE_PAGED; + /* Set page address and valid bit */ + page_desc = page->pa | 0x1; + + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + + /* Add the backing page physical address in the pd entry */ + i40e_memcpy(pd_addr, &page_desc, sizeof(u64), + I40E_NONDMA_TO_DMA); + + pd_entry->sd_index = sd_idx; + pd_entry->valid = TRUE; + I40E_INC_PD_REFCNT(pd_table); + } + I40E_INC_BP_REFCNT(&pd_entry->bp); +exit: + return ret_code; +} + +/** + * i40e_remove_pd_bp - remove a backing page from a page descriptor + * @hw: pointer to our HW structure + * @hmc_info: pointer 
to the HMC configuration information structure + * @idx: the page index + * @is_pf: distinguishes a VF from a PF + * + * This function: + * 1. Marks the entry in pd tabe (for paged address mode) or in sd table + * (for direct address mode) invalid. + * 2. Write to register PMPDINV to invalidate the backing page in FV cache + * 3. Decrement the ref count for the pd _entry + * assumptions: + * 1. Caller can deallocate the memory used by backing storage after this + * function returns. + **/ +enum i40e_status_code i40e_remove_pd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_pd_entry *pd_entry; + struct i40e_hmc_pd_table *pd_table; + struct i40e_hmc_sd_entry *sd_entry; + u32 sd_idx, rel_pd_idx; + u64 *pd_addr; + + /* calculate index */ + sd_idx = idx / I40E_HMC_PD_CNT_IN_SD; + rel_pd_idx = idx % I40E_HMC_PD_CNT_IN_SD; + if (sd_idx >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_PAGE_DESC_INDEX; + DEBUGOUT("i40e_remove_pd_bp: bad idx\n"); + goto exit; + } + sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; + if (I40E_SD_TYPE_PAGED != sd_entry->entry_type) { + ret_code = I40E_ERR_INVALID_SD_TYPE; + DEBUGOUT("i40e_remove_pd_bp: wrong sd_entry type\n"); + goto exit; + } + /* get the entry and decrease its ref counter */ + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + I40E_DEC_BP_REFCNT(&pd_entry->bp); + if (pd_entry->bp.ref_cnt) + goto exit; + + /* mark the entry invalid */ + pd_entry->valid = FALSE; + I40E_DEC_PD_REFCNT(pd_table); + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + i40e_memset(pd_addr, 0, sizeof(u64), I40E_DMA_MEM); + I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, idx); + + /* free memory here */ + if (!pd_entry->rsrc_pg) + ret_code = i40e_free_dma_mem(hw, &(pd_entry->bp.addr)); + if (I40E_SUCCESS != ret_code) + goto exit; + if (!pd_table->ref_cnt) + i40e_free_virt_mem(hw, 
&pd_table->pd_entry_virt_mem); +exit: + return ret_code; +} + +/** + * i40e_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + **/ +enum i40e_status_code i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info, + u32 idx) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_sd_entry *sd_entry; + + /* get the entry and decrease its ref counter */ + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_DEC_BP_REFCNT(&sd_entry->u.bp); + if (sd_entry->u.bp.ref_cnt) { + ret_code = I40E_ERR_NOT_READY; + goto exit; + } + I40E_DEC_SD_REFCNT(&hmc_info->sd_table); + + /* mark the entry invalid */ + sd_entry->valid = FALSE; +exit: + return ret_code; +} + +/** + * i40e_remove_sd_bp_new - Removes a backing page from a segment descriptor + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * @is_pf: used to distinguish between VF and PF + **/ +enum i40e_status_code i40e_remove_sd_bp_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf) +{ + struct i40e_hmc_sd_entry *sd_entry; + + if (!is_pf) + return I40E_NOT_SUPPORTED; + + /* get the entry and decrease its ref counter */ + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_DIRECT); + + return i40e_free_dma_mem(hw, &(sd_entry->u.bp.addr)); +} + +/** + * i40e_prep_remove_pd_page - Prepares to remove a PD page from sd entry. 
+ * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + **/ +enum i40e_status_code i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info, + u32 idx) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_sd_entry *sd_entry; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + + if (sd_entry->u.pd_table.ref_cnt) { + ret_code = I40E_ERR_NOT_READY; + goto exit; + } + + /* mark the entry invalid */ + sd_entry->valid = FALSE; + + I40E_DEC_SD_REFCNT(&hmc_info->sd_table); +exit: + return ret_code; +} + +/** + * i40e_remove_pd_page_new - Removes a PD page from sd entry. + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + * @is_pf: used to distinguish between VF and PF + **/ +enum i40e_status_code i40e_remove_pd_page_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf) +{ + struct i40e_hmc_sd_entry *sd_entry; + + if (!is_pf) + return I40E_NOT_SUPPORTED; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_PAGED); + + return i40e_free_dma_mem(hw, &(sd_entry->u.pd_table.pd_page_addr)); +} diff --git a/usr/src/uts/common/io/i40e/core/i40e_hmc.h b/usr/src/uts/common/io/i40e/core/i40e_hmc.h new file mode 100644 index 0000000000..d6e1f93421 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_hmc.h @@ -0,0 +1,246 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_hmc.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_HMC_H_ +#define _I40E_HMC_H_ + +#define I40E_HMC_MAX_BP_COUNT 512 + +/* forward-declare the HW struct for the compiler */ +struct i40e_hw; + +#define I40E_HMC_INFO_SIGNATURE 0x484D5347 /* HMSG */ +#define I40E_HMC_PD_CNT_IN_SD 512 +#define I40E_HMC_DIRECT_BP_SIZE 0x200000 /* 2M */ +#define I40E_HMC_PAGED_BP_SIZE 4096 +#define I40E_HMC_PD_BP_BUF_ALIGNMENT 4096 +#define I40E_FIRST_VF_FPM_ID 16 + +struct i40e_hmc_obj_info { + u64 base; /* base addr in FPM */ + u32 max_cnt; /* max count available for this hmc func */ + u32 cnt; /* count of objects driver actually wants to create */ + u64 size; /* size in bytes of one object */ +}; + +enum i40e_sd_entry_type { + I40E_SD_TYPE_INVALID = 0, + I40E_SD_TYPE_PAGED = 1, + I40E_SD_TYPE_DIRECT = 2 +}; + +struct i40e_hmc_bp { + enum i40e_sd_entry_type entry_type; + struct i40e_dma_mem addr; /* populate to be used by hw */ + u32 sd_pd_index; + u32 ref_cnt; +}; + +struct i40e_hmc_pd_entry { + struct i40e_hmc_bp bp; + u32 sd_index; + bool rsrc_pg; + bool valid; +}; + +struct i40e_hmc_pd_table { + struct i40e_dma_mem pd_page_addr; /* populate to be used by hw */ + struct i40e_hmc_pd_entry *pd_entry; /* [512] for sw book keeping */ + struct i40e_virt_mem pd_entry_virt_mem; /* virt mem for pd_entry */ + + u32 ref_cnt; + u32 sd_index; +}; + +struct i40e_hmc_sd_entry { + enum i40e_sd_entry_type entry_type; + bool valid; + + union { + struct i40e_hmc_pd_table pd_table; + struct i40e_hmc_bp bp; + } u; +}; + +struct i40e_hmc_sd_table { + struct i40e_virt_mem addr; /* used to track sd_entry allocations */ + u32 sd_cnt; + u32 ref_cnt; + struct i40e_hmc_sd_entry *sd_entry; /* (sd_cnt*512) entries max */ +}; + +struct i40e_hmc_info { + u32 signature; + /* equals to pci func num for PF and dynamically allocated for VFs */ + u8 hmc_fn_id; + u16 first_sd_index; /* index of 
the first available SD */ + + /* hmc objects */ + struct i40e_hmc_obj_info *hmc_obj; + struct i40e_virt_mem hmc_obj_virt_mem; + struct i40e_hmc_sd_table sd_table; +}; + +#define I40E_INC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt++) +#define I40E_INC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt++) +#define I40E_INC_BP_REFCNT(bp) ((bp)->ref_cnt++) + +#define I40E_DEC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt--) +#define I40E_DEC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt--) +#define I40E_DEC_BP_REFCNT(bp) ((bp)->ref_cnt--) + +/** + * I40E_SET_PF_SD_ENTRY - marks the sd entry as valid in the hardware + * @hw: pointer to our hw struct + * @pa: pointer to physical address + * @sd_index: segment descriptor index + * @type: if sd entry is direct or paged + **/ +#define I40E_SET_PF_SD_ENTRY(hw, pa, sd_index, type) \ +{ \ + u32 val1, val2, val3; \ + val1 = (u32)(I40E_HI_DWORD(pa)); \ + val2 = (u32)(pa) | (I40E_HMC_MAX_BP_COUNT << \ + I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | \ + ((((type) == I40E_SD_TYPE_PAGED) ? 0 : 1) << \ + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) | \ + BIT(I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT); \ + val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT); \ + wr32((hw), I40E_PFHMC_SDDATAHIGH, val1); \ + wr32((hw), I40E_PFHMC_SDDATALOW, val2); \ + wr32((hw), I40E_PFHMC_SDCMD, val3); \ +} + +/** + * I40E_CLEAR_PF_SD_ENTRY - marks the sd entry as invalid in the hardware + * @hw: pointer to our hw struct + * @sd_index: segment descriptor index + * @type: if sd entry is direct or paged + **/ +#define I40E_CLEAR_PF_SD_ENTRY(hw, sd_index, type) \ +{ \ + u32 val2, val3; \ + val2 = (I40E_HMC_MAX_BP_COUNT << \ + I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | \ + ((((type) == I40E_SD_TYPE_PAGED) ? 
0 : 1) << \ + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT); \ + val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT); \ + wr32((hw), I40E_PFHMC_SDDATAHIGH, 0); \ + wr32((hw), I40E_PFHMC_SDDATALOW, val2); \ + wr32((hw), I40E_PFHMC_SDCMD, val3); \ +} + +/** + * I40E_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware + * @hw: pointer to our hw struct + * @sd_idx: segment descriptor index + * @pd_idx: page descriptor index + **/ +#define I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx) \ + wr32((hw), I40E_PFHMC_PDINV, \ + (((sd_idx) << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) | \ + ((pd_idx) << I40E_PFHMC_PDINV_PMPDIDX_SHIFT))) + +/** + * I40E_FIND_SD_INDEX_LIMIT - finds segment descriptor index limit + * @hmc_info: pointer to the HMC configuration information structure + * @type: type of HMC resources we're searching + * @index: starting index for the object + * @cnt: number of objects we're trying to create + * @sd_idx: pointer to return index of the segment descriptor in question + * @sd_limit: pointer to return the maximum number of segment descriptors + * + * This function calculates the segment descriptor index and index limit + * for the resource defined by i40e_hmc_rsrc_type. 
+ **/ +#define I40E_FIND_SD_INDEX_LIMIT(hmc_info, type, index, cnt, sd_idx, sd_limit)\ +{ \ + u64 fpm_addr, fpm_limit; \ + fpm_addr = (hmc_info)->hmc_obj[(type)].base + \ + (hmc_info)->hmc_obj[(type)].size * (index); \ + fpm_limit = fpm_addr + (hmc_info)->hmc_obj[(type)].size * (cnt);\ + *(sd_idx) = (u32)(fpm_addr / I40E_HMC_DIRECT_BP_SIZE); \ + *(sd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_DIRECT_BP_SIZE); \ + /* add one more to the limit to correct our range */ \ + *(sd_limit) += 1; \ +} + +/** + * I40E_FIND_PD_INDEX_LIMIT - finds page descriptor index limit + * @hmc_info: pointer to the HMC configuration information struct + * @type: HMC resource type we're examining + * @idx: starting index for the object + * @cnt: number of objects we're trying to create + * @pd_index: pointer to return page descriptor index + * @pd_limit: pointer to return page descriptor index limit + * + * Calculates the page descriptor index and index limit for the resource + * defined by i40e_hmc_rsrc_type. + **/ +#define I40E_FIND_PD_INDEX_LIMIT(hmc_info, type, idx, cnt, pd_index, pd_limit)\ +{ \ + u64 fpm_adr, fpm_limit; \ + fpm_adr = (hmc_info)->hmc_obj[(type)].base + \ + (hmc_info)->hmc_obj[(type)].size * (idx); \ + fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt); \ + *(pd_index) = (u32)(fpm_adr / I40E_HMC_PAGED_BP_SIZE); \ + *(pd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_PAGED_BP_SIZE); \ + /* add one more to the limit to correct our range */ \ + *(pd_limit) += 1; \ +} +enum i40e_status_code i40e_add_sd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 sd_index, + enum i40e_sd_entry_type type, + u64 direct_mode_sz); + +enum i40e_status_code i40e_add_pd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 pd_index, + struct i40e_dma_mem *rsrc_pg); +enum i40e_status_code i40e_remove_pd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx); +enum i40e_status_code i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info, 
+ u32 idx); +enum i40e_status_code i40e_remove_sd_bp_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf); +enum i40e_status_code i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info, + u32 idx); +enum i40e_status_code i40e_remove_pd_page_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf); + +#endif /* _I40E_HMC_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c new file mode 100644 index 0000000000..2b2fa4f8f9 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.c @@ -0,0 +1,1412 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_lan_hmc.c 284049 2015-06-05 22:52:42Z jfv $*/ + +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_type.h" +#include "i40e_hmc.h" +#include "i40e_lan_hmc.h" +#include "i40e_prototype.h" + +/* lan specific interface functions */ + +/** + * i40e_align_l2obj_base - aligns base object pointer to 512 bytes + * @offset: base address offset needing alignment + * + * Aligns the layer 2 function private memory so it's 512-byte aligned. + **/ +static u64 i40e_align_l2obj_base(u64 offset) +{ + u64 aligned_offset = offset; + + if ((offset % I40E_HMC_L2OBJ_BASE_ALIGNMENT) > 0) + aligned_offset += (I40E_HMC_L2OBJ_BASE_ALIGNMENT - + (offset % I40E_HMC_L2OBJ_BASE_ALIGNMENT)); + + return aligned_offset; +} + +/** + * i40e_calculate_l2fpm_size - calculates layer 2 FPM memory size + * @txq_num: number of Tx queues needing backing context + * @rxq_num: number of Rx queues needing backing context + * @fcoe_cntx_num: amount of FCoE statefull contexts needing backing context + * @fcoe_filt_num: number of FCoE filters needing backing context + * + * Calculates the maximum amount of memory for the function required, based + * on the number of resources it must provide context for. 
+ **/ +u64 i40e_calculate_l2fpm_size(u32 txq_num, u32 rxq_num, + u32 fcoe_cntx_num, u32 fcoe_filt_num) +{ + u64 fpm_size = 0; + + fpm_size = txq_num * I40E_HMC_OBJ_SIZE_TXQ; + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (rxq_num * I40E_HMC_OBJ_SIZE_RXQ); + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (fcoe_cntx_num * I40E_HMC_OBJ_SIZE_FCOE_CNTX); + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (fcoe_filt_num * I40E_HMC_OBJ_SIZE_FCOE_FILT); + fpm_size = i40e_align_l2obj_base(fpm_size); + + return fpm_size; +} + +/** + * i40e_init_lan_hmc - initialize i40e_hmc_info struct + * @hw: pointer to the HW structure + * @txq_num: number of Tx queues needing backing context + * @rxq_num: number of Rx queues needing backing context + * @fcoe_cntx_num: amount of FCoE statefull contexts needing backing context + * @fcoe_filt_num: number of FCoE filters needing backing context + * + * This function will be called once per physical function initialization. + * It will fill out the i40e_hmc_obj_info structure for LAN objects based on + * the driver's provided input, as well as information from the HMC itself + * loaded from NVRAM. + * + * Assumptions: + * - HMC Resource Profile has been selected before calling this function. 
+ **/ +enum i40e_status_code i40e_init_lan_hmc(struct i40e_hw *hw, u32 txq_num, + u32 rxq_num, u32 fcoe_cntx_num, + u32 fcoe_filt_num) +{ + struct i40e_hmc_obj_info *obj, *full_obj; + enum i40e_status_code ret_code = I40E_SUCCESS; + u64 l2fpm_size; + u32 size_exp; + + hw->hmc.signature = I40E_HMC_INFO_SIGNATURE; + hw->hmc.hmc_fn_id = hw->pf_id; + + /* allocate memory for hmc_obj */ + ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem, + sizeof(struct i40e_hmc_obj_info) * I40E_HMC_LAN_MAX); + if (ret_code) + goto init_lan_hmc_out; + hw->hmc.hmc_obj = (struct i40e_hmc_obj_info *) + hw->hmc.hmc_obj_virt_mem.va; + + /* The full object will be used to create the LAN HMC SD */ + full_obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_FULL]; + full_obj->max_cnt = 0; + full_obj->cnt = 0; + full_obj->base = 0; + full_obj->size = 0; + + /* Tx queue context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX); + obj->cnt = txq_num; + obj->base = 0; + size_exp = rd32(hw, I40E_GLHMC_LANTXOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (txq_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT3("i40e_init_lan_hmc: Tx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + txq_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* Rx queue context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX); + obj->cnt = rxq_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_TX].base + + (hw->hmc.hmc_obj[I40E_HMC_LAN_TX].cnt * + hw->hmc.hmc_obj[I40E_HMC_LAN_TX].size); + obj->base = i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_LANRXOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed 
HMC capacity */ + if (rxq_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT3("i40e_init_lan_hmc: Rx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + rxq_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* FCoE context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEMAX); + obj->cnt = fcoe_cntx_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_RX].base + + (hw->hmc.hmc_obj[I40E_HMC_LAN_RX].cnt * + hw->hmc.hmc_obj[I40E_HMC_LAN_RX].size); + obj->base = i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_FCOEDDPOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (fcoe_cntx_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT3("i40e_init_lan_hmc: FCoE context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + fcoe_cntx_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* FCoE filter information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT]; + obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEFMAX); + obj->cnt = fcoe_filt_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].base + + (hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].cnt * + hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].size); + obj->base = i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_FCOEFOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (fcoe_filt_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT3("i40e_init_lan_hmc: FCoE filter: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + fcoe_filt_num, obj->max_cnt, 
ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + hw->hmc.first_sd_index = 0; + hw->hmc.sd_table.ref_cnt = 0; + l2fpm_size = i40e_calculate_l2fpm_size(txq_num, rxq_num, fcoe_cntx_num, + fcoe_filt_num); + if (NULL == hw->hmc.sd_table.sd_entry) { + hw->hmc.sd_table.sd_cnt = (u32) + (l2fpm_size + I40E_HMC_DIRECT_BP_SIZE - 1) / + I40E_HMC_DIRECT_BP_SIZE; + + /* allocate the sd_entry members in the sd_table */ + ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.sd_table.addr, + (sizeof(struct i40e_hmc_sd_entry) * + hw->hmc.sd_table.sd_cnt)); + if (ret_code) + goto init_lan_hmc_out; + hw->hmc.sd_table.sd_entry = + (struct i40e_hmc_sd_entry *)hw->hmc.sd_table.addr.va; + } + /* store in the LAN full object for later */ + full_obj->size = l2fpm_size; + +init_lan_hmc_out: + return ret_code; +} + +/** + * i40e_remove_pd_page - Remove a page from the page descriptor table + * @hw: pointer to the HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + * + * This function: + * 1. Marks the entry in pd table (for paged address mode) invalid + * 2. write to register PMPDINV to invalidate the backing page in FV cache + * 3. Decrement the ref count for pd_entry + * assumptions: + * 1. caller can deallocate the memory used by pd after this function + * returns. 
+ **/ +static enum i40e_status_code i40e_remove_pd_page(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (i40e_prep_remove_pd_page(hmc_info, idx) == I40E_SUCCESS) + ret_code = i40e_remove_pd_page_new(hw, hmc_info, idx, TRUE); + + return ret_code; +} + +/** + * i40e_remove_sd_bp - remove a backing page from a segment descriptor + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * + * This function: + * 1. Marks the entry in sd table (for direct address mode) invalid + * 2. write to register PMSDCMD, PMSDDATALOW(PMSDDATALOW.PMSDVALID set + * to 0) and PMSDDATAHIGH to invalidate the sd page + * 3. Decrement the ref count for the sd_entry + * assumptions: + * 1. caller can deallocate the memory used by backing storage after this + * function returns. + **/ +static enum i40e_status_code i40e_remove_sd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + + if (i40e_prep_remove_sd_bp(hmc_info, idx) == I40E_SUCCESS) + ret_code = i40e_remove_sd_bp_new(hw, hmc_info, idx, TRUE); + + return ret_code; +} + +/** + * i40e_create_lan_hmc_object - allocate backing store for hmc objects + * @hw: pointer to the HW structure + * @info: pointer to i40e_hmc_create_obj_info struct + * + * This will allocate memory for PDs and backing pages and populate + * the sd and pd entries. 
+ **/ +enum i40e_status_code i40e_create_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_create_obj_info *info) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_sd_entry *sd_entry; + u32 pd_idx1 = 0, pd_lmt1 = 0; + u32 pd_idx = 0, pd_lmt = 0; + bool pd_error = FALSE; + u32 sd_idx, sd_lmt; + u64 sd_size; + u32 i, j; + + if (NULL == info) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_create_lan_hmc_object: bad info ptr\n"); + goto exit; + } + if (NULL == info->hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_create_lan_hmc_object: bad hmc_info ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_create_lan_hmc_object: bad signature\n"); + goto exit; + } + + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + DEBUGOUT1("i40e_create_lan_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT1("i40e_create_lan_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, + &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > info->hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + goto exit; + } + /* find pd index */ + I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, + &pd_lmt); + + /* This is to cover for cases where you may not want to have an SD with + * the full 2M memory but something smaller. By not filling out any + * size, the function will default the SD size to be 2M. 
+ */ + if (info->direct_mode_sz == 0) + sd_size = I40E_HMC_DIRECT_BP_SIZE; + else + sd_size = info->direct_mode_sz; + + /* check if all the sds are valid. If not, allocate a page and + * initialize it. + */ + for (j = sd_idx; j < sd_lmt; j++) { + /* update the sd table entry */ + ret_code = i40e_add_sd_table_entry(hw, info->hmc_info, j, + info->entry_type, + sd_size); + if (I40E_SUCCESS != ret_code) + goto exit_sd_error; + sd_entry = &info->hmc_info->sd_table.sd_entry[j]; + if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) { + /* check if all the pds in this sd are valid. If not, + * allocate a page and initialize it. + */ + + /* find pd_idx and pd_lmt in this sd */ + pd_idx1 = max(pd_idx, (j * I40E_HMC_MAX_BP_COUNT)); + pd_lmt1 = min(pd_lmt, + ((j + 1) * I40E_HMC_MAX_BP_COUNT)); + for (i = pd_idx1; i < pd_lmt1; i++) { + /* update the pd table entry */ + ret_code = i40e_add_pd_table_entry(hw, + info->hmc_info, + i, NULL); + if (I40E_SUCCESS != ret_code) { + pd_error = TRUE; + break; + } + } + if (pd_error) { + /* remove the backing pages from pd_idx1 to i */ + while (i && (i > pd_idx1)) { + i40e_remove_pd_bp(hw, info->hmc_info, + (i - 1)); + i--; + } + } + } + if (!sd_entry->valid) { + sd_entry->valid = TRUE; + switch (sd_entry->entry_type) { + case I40E_SD_TYPE_PAGED: + I40E_SET_PF_SD_ENTRY(hw, + sd_entry->u.pd_table.pd_page_addr.pa, + j, sd_entry->entry_type); + break; + case I40E_SD_TYPE_DIRECT: + I40E_SET_PF_SD_ENTRY(hw, sd_entry->u.bp.addr.pa, + j, sd_entry->entry_type); + break; + default: + ret_code = I40E_ERR_INVALID_SD_TYPE; + goto exit; + } + } + } + goto exit; + +exit_sd_error: + /* cleanup for sd entries from j to sd_idx */ + while (j && (j > sd_idx)) { + sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1]; + switch (sd_entry->entry_type) { + case I40E_SD_TYPE_PAGED: + pd_idx1 = max(pd_idx, + ((j - 1) * I40E_HMC_MAX_BP_COUNT)); + pd_lmt1 = min(pd_lmt, (j * I40E_HMC_MAX_BP_COUNT)); + for (i = pd_idx1; i < pd_lmt1; i++) + i40e_remove_pd_bp(hw, 
info->hmc_info, i); + i40e_remove_pd_page(hw, info->hmc_info, (j - 1)); + break; + case I40E_SD_TYPE_DIRECT: + i40e_remove_sd_bp(hw, info->hmc_info, (j - 1)); + break; + default: + ret_code = I40E_ERR_INVALID_SD_TYPE; + break; + } + j--; + } +exit: + return ret_code; +} + +/** + * i40e_configure_lan_hmc - prepare the HMC backing store + * @hw: pointer to the hw structure + * @model: the model for the layout of the SD/PD tables + * + * - This function will be called once per physical function initialization. + * - This function will be called after i40e_init_lan_hmc() and before + * any LAN/FCoE HMC objects can be created. + **/ +enum i40e_status_code i40e_configure_lan_hmc(struct i40e_hw *hw, + enum i40e_hmc_model model) +{ + struct i40e_hmc_lan_create_obj_info info; + u8 hmc_fn_id = hw->hmc.hmc_fn_id; + struct i40e_hmc_obj_info *obj; + enum i40e_status_code ret_code = I40E_SUCCESS; + + /* Initialize part of the create object info struct */ + info.hmc_info = &hw->hmc; + info.rsrc_type = I40E_HMC_LAN_FULL; + info.start_idx = 0; + info.direct_mode_sz = hw->hmc.hmc_obj[I40E_HMC_LAN_FULL].size; + + /* Build the SD entry for the LAN objects */ + switch (model) { + case I40E_HMC_MODEL_DIRECT_PREFERRED: + case I40E_HMC_MODEL_DIRECT_ONLY: + info.entry_type = I40E_SD_TYPE_DIRECT; + /* Make one big object, a single SD */ + info.count = 1; + ret_code = i40e_create_lan_hmc_object(hw, &info); + if ((ret_code != I40E_SUCCESS) && (model == I40E_HMC_MODEL_DIRECT_PREFERRED)) + goto try_type_paged; + else if (ret_code != I40E_SUCCESS) + goto configure_lan_hmc_out; + /* else clause falls through the break */ + break; + case I40E_HMC_MODEL_PAGED_ONLY: +try_type_paged: + info.entry_type = I40E_SD_TYPE_PAGED; + /* Make one big object in the PD table */ + info.count = 1; + ret_code = i40e_create_lan_hmc_object(hw, &info); + if (ret_code != I40E_SUCCESS) + goto configure_lan_hmc_out; + break; + default: + /* unsupported type */ + ret_code = I40E_ERR_INVALID_SD_TYPE; + 
DEBUGOUT1("i40e_configure_lan_hmc: Unknown SD type: %d\n", + ret_code); + goto configure_lan_hmc_out; + } + + /* Configure and program the FPM registers so objects can be created */ + + /* Tx contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX]; + wr32(hw, I40E_GLHMC_LANTXBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_LANTXCNT(hmc_fn_id), obj->cnt); + + /* Rx contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX]; + wr32(hw, I40E_GLHMC_LANRXBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_LANRXCNT(hmc_fn_id), obj->cnt); + + /* FCoE contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX]; + wr32(hw, I40E_GLHMC_FCOEDDPBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_FCOEDDPCNT(hmc_fn_id), obj->cnt); + + /* FCoE filters */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT]; + wr32(hw, I40E_GLHMC_FCOEFBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_FCOEFCNT(hmc_fn_id), obj->cnt); + +configure_lan_hmc_out: + return ret_code; +} + +/** + * i40e_delete_hmc_object - remove hmc objects + * @hw: pointer to the HW structure + * @info: pointer to i40e_hmc_delete_obj_info struct + * + * This will de-populate the SDs and PDs. It frees + * the memory for PDS and backing storage. After this function is returned, + * caller should deallocate memory allocated previously for + * book-keeping information about PDs and backing storage. 
+ **/ +enum i40e_status_code i40e_delete_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_delete_obj_info *info) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + struct i40e_hmc_pd_table *pd_table; + u32 pd_idx, pd_lmt, rel_pd_idx; + u32 sd_idx, sd_lmt; + u32 i, j; + + if (NULL == info) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_delete_hmc_object: bad info ptr\n"); + goto exit; + } + if (NULL == info->hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_delete_hmc_object: bad info->hmc_info ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_delete_hmc_object: bad hmc_info->signature\n"); + goto exit; + } + + if (NULL == info->hmc_info->sd_table.sd_entry) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_delete_hmc_object: bad sd_entry\n"); + goto exit; + } + + if (NULL == info->hmc_info->hmc_obj) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_delete_hmc_object: bad hmc_info->hmc_obj\n"); + goto exit; + } + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + DEBUGOUT1("i40e_delete_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + DEBUGOUT1("i40e_delete_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, + &pd_lmt); + + for (j = pd_idx; j < pd_lmt; j++) { + sd_idx = j / I40E_HMC_PD_CNT_IN_SD; + + if (I40E_SD_TYPE_PAGED != + info->hmc_info->sd_table.sd_entry[sd_idx].entry_type) + continue; + + rel_pd_idx = j % I40E_HMC_PD_CNT_IN_SD; + + pd_table = + &info->hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + if (pd_table->pd_entry[rel_pd_idx].valid) { + ret_code = i40e_remove_pd_bp(hw, info->hmc_info, j); + if (I40E_SUCCESS != 
ret_code) + goto exit; + } + } + + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, + &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > info->hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + goto exit; + } + + for (i = sd_idx; i < sd_lmt; i++) { + if (!info->hmc_info->sd_table.sd_entry[i].valid) + continue; + switch (info->hmc_info->sd_table.sd_entry[i].entry_type) { + case I40E_SD_TYPE_DIRECT: + ret_code = i40e_remove_sd_bp(hw, info->hmc_info, i); + if (I40E_SUCCESS != ret_code) + goto exit; + break; + case I40E_SD_TYPE_PAGED: + ret_code = i40e_remove_pd_page(hw, info->hmc_info, i); + if (I40E_SUCCESS != ret_code) + goto exit; + break; + default: + break; + } + } +exit: + return ret_code; +} + +/** + * i40e_shutdown_lan_hmc - Remove HMC backing store, free allocated memory + * @hw: pointer to the hw structure + * + * This must be called by drivers as they are shutting down and being + * removed from the OS. 
+ **/ +enum i40e_status_code i40e_shutdown_lan_hmc(struct i40e_hw *hw) +{ + struct i40e_hmc_lan_delete_obj_info info; + enum i40e_status_code ret_code; + + info.hmc_info = &hw->hmc; + info.rsrc_type = I40E_HMC_LAN_FULL; + info.start_idx = 0; + info.count = 1; + + /* delete the object */ + ret_code = i40e_delete_lan_hmc_object(hw, &info); + + /* free the SD table entry for LAN */ + i40e_free_virt_mem(hw, &hw->hmc.sd_table.addr); + hw->hmc.sd_table.sd_cnt = 0; + hw->hmc.sd_table.sd_entry = NULL; + + /* free memory used for hmc_obj */ + i40e_free_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem); + hw->hmc.hmc_obj = NULL; + + return ret_code; +} + +#define I40E_HMC_STORE(_struct, _ele) \ + offsetof(struct _struct, _ele), \ + FIELD_SIZEOF(struct _struct, _ele) + +struct i40e_context_ele { + u16 offset; + u16 size_of; + u16 width; + u16 lsb; +}; + +/* LAN Tx Queue Context */ +static struct i40e_context_ele i40e_hmc_txq_ce_info[] = { + /* Field Width LSB */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, head), 13, 0 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, new_context), 1, 30 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, base), 57, 32 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, fc_ena), 1, 89 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, timesync_ena), 1, 90 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, fd_ena), 1, 91 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, alt_vlan_ena), 1, 92 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, cpuid), 8, 96 }, +/* line 1 */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, thead_wb), 13, 0 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_ena), 1, 32 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, qlen), 13, 33 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphrdesc_ena), 1, 46 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphrpacket_ena), 1, 47 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphwdesc_ena), 1, 48 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_addr), 64, 64 + 128 }, +/* line 7 */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, crc), 32, 0 + (7 * 128) }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist), 10, 84 + 
(7 * 128) }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist_act), 1, 94 + (7 * 128) }, + { 0 } +}; + +/* LAN Rx Queue Context */ +static struct i40e_context_ele i40e_hmc_rxq_ce_info[] = { + /* Field Width LSB */ + { I40E_HMC_STORE(i40e_hmc_obj_rxq, head), 13, 0 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, cpuid), 8, 13 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, base), 57, 32 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, qlen), 13, 89 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dbuff), 7, 102 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hbuff), 5, 109 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dtype), 2, 114 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dsize), 1, 116 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, crcstrip), 1, 117 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, fc_ena), 1, 118 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, l2tsel), 1, 119 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_0), 4, 120 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_1), 2, 124 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, showiv), 1, 127 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, rxmax), 14, 174 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphrdesc_ena), 1, 193 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphwdesc_ena), 1, 194 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphdata_ena), 1, 195 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphhead_ena), 1, 196 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, lrxqthresh), 3, 198 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, prefena), 1, 201 }, + { 0 } +}; + +/** + * i40e_write_byte - replace HMC context byte + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_byte(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u8 src_byte, dest_byte, mask; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + src_byte = *from; + src_byte &= mask; + + 
/* shift to correct alignment */ + mask <<= shift_width; + src_byte <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&dest_byte, dest, sizeof(dest_byte), I40E_DMA_TO_NONDMA); + + dest_byte &= ~mask; /* get the bits not changing */ + dest_byte |= src_byte; /* add in the new bits */ + + /* put it all back */ + i40e_memcpy(dest, &dest_byte, sizeof(dest_byte), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_write_word - replace HMC context word + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_word(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u16 src_word, mask; + u8 *from, *dest; + u16 shift_width; + __le16 dest_word; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_word = *(u16 *)from; + src_word &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_word <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&dest_word, dest, sizeof(dest_word), I40E_DMA_TO_NONDMA); + + dest_word &= ~(CPU_TO_LE16(mask)); /* get the bits not changing */ + dest_word |= CPU_TO_LE16(src_word); /* add in the new bits */ + + /* put it all back */ + i40e_memcpy(dest, &dest_word, sizeof(dest_word), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_write_dword - replace HMC context dword + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_dword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u32 
src_dword, mask; + u8 *from, *dest; + u16 shift_width; + __le32 dest_dword; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 32 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 5 bits so the shift will do nothing + */ + if (ce_info->width < 32) + mask = BIT(ce_info->width) - 1; + else + mask = ~(u32)0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_dword = *(u32 *)from; + src_dword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_dword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&dest_dword, dest, sizeof(dest_dword), I40E_DMA_TO_NONDMA); + + dest_dword &= ~(CPU_TO_LE32(mask)); /* get the bits not changing */ + dest_dword |= CPU_TO_LE32(src_dword); /* add in the new bits */ + + /* put it all back */ + i40e_memcpy(dest, &dest_dword, sizeof(dest_dword), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_write_qword - replace HMC context qword + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_qword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u64 src_qword, mask; + u8 *from, *dest; + u16 shift_width; + __le64 dest_qword; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 64 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 6 bits so the shift will do nothing + */ + if (ce_info->width < 64) + mask = BIT_ULL(ce_info->width) - 1; + else + mask = ~(u64)0; 
+ + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_qword = *(u64 *)from; + src_qword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_qword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&dest_qword, dest, sizeof(dest_qword), I40E_DMA_TO_NONDMA); + + dest_qword &= ~(CPU_TO_LE64(mask)); /* get the bits not changing */ + dest_qword |= CPU_TO_LE64(src_qword); /* add in the new bits */ + + /* put it all back */ + i40e_memcpy(dest, &dest_qword, sizeof(dest_qword), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_read_byte - read HMC context byte into struct + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static void i40e_read_byte(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + u8 dest_byte, mask; + u8 *src, *target; + u16 shift_width; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* shift to correct alignment */ + mask <<= shift_width; + + /* get the current bits from the src bit string */ + src = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&dest_byte, src, sizeof(dest_byte), I40E_DMA_TO_NONDMA); + + dest_byte &= ~(mask); + + dest_byte >>= shift_width; + + /* get the address from the struct field */ + target = dest + ce_info->offset; + + /* put it back in the struct */ + i40e_memcpy(target, &dest_byte, sizeof(dest_byte), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_read_word - read HMC context word into struct + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static void i40e_read_word(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + u16 dest_word, mask; + u8 *src, *target; + u16 shift_width; + __le16 
src_word; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* shift to correct alignment */ + mask <<= shift_width; + + /* get the current bits from the src bit string */ + src = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&src_word, src, sizeof(src_word), I40E_DMA_TO_NONDMA); + + /* the data in the memory is stored as little endian so mask it + * correctly + */ + src_word &= ~(CPU_TO_LE16(mask)); + + /* get the data back into host order before shifting */ + dest_word = LE16_TO_CPU(src_word); + + dest_word >>= shift_width; + + /* get the address from the struct field */ + target = dest + ce_info->offset; + + /* put it back in the struct */ + i40e_memcpy(target, &dest_word, sizeof(dest_word), I40E_NONDMA_TO_DMA); +} + +/** + * i40e_read_dword - read HMC context dword into struct + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static void i40e_read_dword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + u32 dest_dword, mask; + u8 *src, *target; + u16 shift_width; + __le32 src_dword; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 32 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 5 bits so the shift will do nothing + */ + if (ce_info->width < 32) + mask = BIT(ce_info->width) - 1; + else + mask = ~(u32)0; + + /* shift to correct alignment */ + mask <<= shift_width; + + /* get the current bits from the src bit string */ + src = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&src_dword, src, sizeof(src_dword), I40E_DMA_TO_NONDMA); + + /* the data in the memory is stored as little endian so mask it + * correctly + */ + src_dword &= ~(CPU_TO_LE32(mask)); + + /* get the data back into host order before shifting */ + dest_dword = LE32_TO_CPU(src_dword); + + dest_dword >>= shift_width; 
+ + /* get the address from the struct field */ + target = dest + ce_info->offset; + + /* put it back in the struct */ + i40e_memcpy(target, &dest_dword, sizeof(dest_dword), + I40E_NONDMA_TO_DMA); +} + +/** + * i40e_read_qword - read HMC context qword into struct + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static void i40e_read_qword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + u64 dest_qword, mask; + u8 *src, *target; + u16 shift_width; + __le64 src_qword; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 64 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 6 bits so the shift will do nothing + */ + if (ce_info->width < 64) + mask = BIT_ULL(ce_info->width) - 1; + else + mask = ~(u64)0; + + /* shift to correct alignment */ + mask <<= shift_width; + + /* get the current bits from the src bit string */ + src = hmc_bits + (ce_info->lsb / 8); + + i40e_memcpy(&src_qword, src, sizeof(src_qword), I40E_DMA_TO_NONDMA); + + /* the data in the memory is stored as little endian so mask it + * correctly + */ + src_qword &= ~(CPU_TO_LE64(mask)); + + /* get the data back into host order before shifting */ + dest_qword = LE64_TO_CPU(src_qword); + + dest_qword >>= shift_width; + + /* get the address from the struct field */ + target = dest + ce_info->offset; + + /* put it back in the struct */ + i40e_memcpy(target, &dest_qword, sizeof(dest_qword), + I40E_NONDMA_TO_DMA); +} + +/** + * i40e_get_hmc_context - extract HMC context bits + * @context_bytes: pointer to the context bit array + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static enum i40e_status_code i40e_get_hmc_context(u8 *context_bytes, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + int f; + + for (f = 0; ce_info[f].width != 0; f++) { 
+ switch (ce_info[f].size_of) { + case 1: + i40e_read_byte(context_bytes, &ce_info[f], dest); + break; + case 2: + i40e_read_word(context_bytes, &ce_info[f], dest); + break; + case 4: + i40e_read_dword(context_bytes, &ce_info[f], dest); + break; + case 8: + i40e_read_qword(context_bytes, &ce_info[f], dest); + break; + default: + /* nothing to do, just keep going */ + break; + } + } + + return I40E_SUCCESS; +} + +/** + * i40e_clear_hmc_context - zero out the HMC context bits + * @hw: the hardware struct + * @context_bytes: pointer to the context bit array (DMA memory) + * @hmc_type: the type of HMC resource + **/ +static enum i40e_status_code i40e_clear_hmc_context(struct i40e_hw *hw, + u8 *context_bytes, + enum i40e_hmc_lan_rsrc_type hmc_type) +{ + /* clean the bit array */ + i40e_memset(context_bytes, 0, (u32)hw->hmc.hmc_obj[hmc_type].size, + I40E_DMA_MEM); + + return I40E_SUCCESS; +} + +/** + * i40e_set_hmc_context - replace HMC context bits + * @context_bytes: pointer to the context bit array + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static enum i40e_status_code i40e_set_hmc_context(u8 *context_bytes, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + int f; + + for (f = 0; ce_info[f].width != 0; f++) { + + /* we have to deal with each element of the HMC using the + * correct size so that we are correct regardless of the + * endianness of the machine + */ + switch (ce_info[f].size_of) { + case 1: + i40e_write_byte(context_bytes, &ce_info[f], dest); + break; + case 2: + i40e_write_word(context_bytes, &ce_info[f], dest); + break; + case 4: + i40e_write_dword(context_bytes, &ce_info[f], dest); + break; + case 8: + i40e_write_qword(context_bytes, &ce_info[f], dest); + break; + } + } + + return I40E_SUCCESS; +} + +/** + * i40e_hmc_get_object_va - retrieves an object's virtual address + * @hw: pointer to the hw structure + * @object_base: pointer to u64 to get the va + * @rsrc_type: the hmc resource type + * 
@obj_idx: hmc object index + * + * This function retrieves the object's virtual address from the object + * base pointer. This function is used for LAN Queue contexts. + **/ +static +enum i40e_status_code i40e_hmc_get_object_va(struct i40e_hw *hw, + u8 **object_base, + enum i40e_hmc_lan_rsrc_type rsrc_type, + u32 obj_idx) +{ + u32 obj_offset_in_sd, obj_offset_in_pd; + struct i40e_hmc_info *hmc_info = &hw->hmc; + struct i40e_hmc_sd_entry *sd_entry; + struct i40e_hmc_pd_entry *pd_entry; + u32 pd_idx, pd_lmt, rel_pd_idx; + enum i40e_status_code ret_code = I40E_SUCCESS; + u64 obj_offset_in_fpm; + u32 sd_idx, sd_lmt; + + if (NULL == hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info ptr\n"); + goto exit; + } + if (NULL == hmc_info->hmc_obj) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info->hmc_obj ptr\n"); + goto exit; + } + if (NULL == object_base) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_hmc_get_object_va: bad object_base ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + DEBUGOUT("i40e_hmc_get_object_va: bad hmc_info->signature\n"); + goto exit; + } + if (obj_idx >= hmc_info->hmc_obj[rsrc_type].cnt) { + DEBUGOUT1("i40e_hmc_get_object_va: returns error %d\n", + ret_code); + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + goto exit; + } + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1, + &sd_idx, &sd_lmt); + + sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; + obj_offset_in_fpm = hmc_info->hmc_obj[rsrc_type].base + + hmc_info->hmc_obj[rsrc_type].size * obj_idx; + + if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) { + I40E_FIND_PD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1, + &pd_idx, &pd_lmt); + rel_pd_idx = pd_idx % I40E_HMC_PD_CNT_IN_SD; + pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx]; + obj_offset_in_pd = (u32)(obj_offset_in_fpm % + I40E_HMC_PAGED_BP_SIZE); + *object_base = 
(u8 *)pd_entry->bp.addr.va + obj_offset_in_pd; + } else { + obj_offset_in_sd = (u32)(obj_offset_in_fpm % + I40E_HMC_DIRECT_BP_SIZE); + *object_base = (u8 *)sd_entry->u.bp.addr.va + obj_offset_in_sd; + } +exit: + return ret_code; +} + +/** + * i40e_get_lan_tx_queue_context - return the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +enum i40e_status_code i40e_get_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_TX, queue); + if (err < 0) + return err; + + return i40e_get_hmc_context(context_bytes, + i40e_hmc_txq_ce_info, (u8 *)s); +} + +/** + * i40e_clear_lan_tx_queue_context - clear the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + **/ +enum i40e_status_code i40e_clear_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_TX, queue); + if (err < 0) + return err; + + return i40e_clear_hmc_context(hw, context_bytes, I40E_HMC_LAN_TX); +} + +/** + * i40e_set_lan_tx_queue_context - set the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +enum i40e_status_code i40e_set_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_TX, queue); + if (err < 0) + return err; + + return i40e_set_hmc_context(context_bytes, + i40e_hmc_txq_ce_info, (u8 *)s); +} + +/** + * i40e_get_lan_rx_queue_context - return the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +enum i40e_status_code 
i40e_get_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_RX, queue); + if (err < 0) + return err; + + return i40e_get_hmc_context(context_bytes, + i40e_hmc_rxq_ce_info, (u8 *)s); +} + +/** + * i40e_clear_lan_rx_queue_context - clear the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + **/ +enum i40e_status_code i40e_clear_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_RX, queue); + if (err < 0) + return err; + + return i40e_clear_hmc_context(hw, context_bytes, I40E_HMC_LAN_RX); +} + +/** + * i40e_set_lan_rx_queue_context - set the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +enum i40e_status_code i40e_set_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s) +{ + enum i40e_status_code err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(hw, &context_bytes, I40E_HMC_LAN_RX, queue); + if (err < 0) + return err; + + return i40e_set_hmc_context(context_bytes, + i40e_hmc_rxq_ce_info, (u8 *)s); +} diff --git a/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h new file mode 100644 index 0000000000..2a575264ab --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_lan_hmc.h @@ -0,0 +1,201 @@ +/****************************************************************************** + + Copyright (c) 2013-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_lan_hmc.h 283119 2015-05-19 18:35:18Z jhb $*/ + +#ifndef _I40E_LAN_HMC_H_ +#define _I40E_LAN_HMC_H_ + +/* forward-declare the HW struct for the compiler */ +struct i40e_hw; + +/* HMC element context information */ + +/* Rx queue context data + * + * The sizes of the variables may be larger than needed due to crossing byte + * boundaries. If we do not have the width of the variable set to the correct + * size then we could end up shifting bits off the top of the variable when the + * variable is at the top of a byte and crosses over into the next byte. 
+ */ +struct i40e_hmc_obj_rxq { + u16 head; + u16 cpuid; /* bigger than needed, see above for reason */ + u64 base; + u16 qlen; +#define I40E_RXQ_CTX_DBUFF_SHIFT 7 + u16 dbuff; /* bigger than needed, see above for reason */ +#define I40E_RXQ_CTX_HBUFF_SHIFT 6 + u16 hbuff; /* bigger than needed, see above for reason */ + u8 dtype; + u8 dsize; + u8 crcstrip; + u8 fc_ena; + u8 l2tsel; + u8 hsplit_0; + u8 hsplit_1; + u8 showiv; + u32 rxmax; /* bigger than needed, see above for reason */ + u8 tphrdesc_ena; + u8 tphwdesc_ena; + u8 tphdata_ena; + u8 tphhead_ena; + u16 lrxqthresh; /* bigger than needed, see above for reason */ + u8 prefena; /* NOTE: normally must be set to 1 at init */ +}; + +/* Tx queue context data +* +* The sizes of the variables may be larger than needed due to crossing byte +* boundaries. If we do not have the width of the variable set to the correct +* size then we could end up shifting bits off the top of the variable when the +* variable is at the top of a byte and crosses over into the next byte. 
+*/ +struct i40e_hmc_obj_txq { + u16 head; + u8 new_context; + u64 base; + u8 fc_ena; + u8 timesync_ena; + u8 fd_ena; + u8 alt_vlan_ena; + u16 thead_wb; + u8 cpuid; + u8 head_wb_ena; + u16 qlen; + u8 tphrdesc_ena; + u8 tphrpacket_ena; + u8 tphwdesc_ena; + u64 head_wb_addr; + u32 crc; + u16 rdylist; + u8 rdylist_act; +}; + +/* for hsplit_0 field of Rx HMC context */ +enum i40e_hmc_obj_rx_hsplit_0 { + I40E_HMC_OBJ_RX_HSPLIT_0_NO_SPLIT = 0, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_L2 = 1, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_IP = 2, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_TCP_UDP = 4, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_SCTP = 8, +}; + +/* fcoe_cntx and fcoe_filt are for debugging purpose only */ +struct i40e_hmc_obj_fcoe_cntx { + u32 rsv[32]; +}; + +struct i40e_hmc_obj_fcoe_filt { + u32 rsv[8]; +}; + +/* Context sizes for LAN objects */ +enum i40e_hmc_lan_object_size { + I40E_HMC_LAN_OBJ_SZ_8 = 0x3, + I40E_HMC_LAN_OBJ_SZ_16 = 0x4, + I40E_HMC_LAN_OBJ_SZ_32 = 0x5, + I40E_HMC_LAN_OBJ_SZ_64 = 0x6, + I40E_HMC_LAN_OBJ_SZ_128 = 0x7, + I40E_HMC_LAN_OBJ_SZ_256 = 0x8, + I40E_HMC_LAN_OBJ_SZ_512 = 0x9, +}; + +#define I40E_HMC_L2OBJ_BASE_ALIGNMENT 512 +#define I40E_HMC_OBJ_SIZE_TXQ 128 +#define I40E_HMC_OBJ_SIZE_RXQ 32 +#define I40E_HMC_OBJ_SIZE_FCOE_CNTX 64 +#define I40E_HMC_OBJ_SIZE_FCOE_FILT 64 + +enum i40e_hmc_lan_rsrc_type { + I40E_HMC_LAN_FULL = 0, + I40E_HMC_LAN_TX = 1, + I40E_HMC_LAN_RX = 2, + I40E_HMC_FCOE_CTX = 3, + I40E_HMC_FCOE_FILT = 4, + I40E_HMC_LAN_MAX = 5 +}; + +enum i40e_hmc_model { + I40E_HMC_MODEL_DIRECT_PREFERRED = 0, + I40E_HMC_MODEL_DIRECT_ONLY = 1, + I40E_HMC_MODEL_PAGED_ONLY = 2, + I40E_HMC_MODEL_UNKNOWN, +}; + +struct i40e_hmc_lan_create_obj_info { + struct i40e_hmc_info *hmc_info; + u32 rsrc_type; + u32 start_idx; + u32 count; + enum i40e_sd_entry_type entry_type; + u64 direct_mode_sz; +}; + +struct i40e_hmc_lan_delete_obj_info { + struct i40e_hmc_info *hmc_info; + u32 rsrc_type; + u32 start_idx; + u32 count; +}; + +enum i40e_status_code i40e_init_lan_hmc(struct i40e_hw *hw, 
u32 txq_num, + u32 rxq_num, u32 fcoe_cntx_num, + u32 fcoe_filt_num); +enum i40e_status_code i40e_configure_lan_hmc(struct i40e_hw *hw, + enum i40e_hmc_model model); +enum i40e_status_code i40e_shutdown_lan_hmc(struct i40e_hw *hw); + +u64 i40e_calculate_l2fpm_size(u32 txq_num, u32 rxq_num, + u32 fcoe_cntx_num, u32 fcoe_filt_num); +enum i40e_status_code i40e_get_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s); +enum i40e_status_code i40e_clear_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue); +enum i40e_status_code i40e_set_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s); +enum i40e_status_code i40e_get_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s); +enum i40e_status_code i40e_clear_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue); +enum i40e_status_code i40e_set_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s); +enum i40e_status_code i40e_create_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_create_obj_info *info); +enum i40e_status_code i40e_delete_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_delete_obj_info *info); + +#endif /* _I40E_LAN_HMC_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_nvm.c b/usr/src/uts/common/io/i40e/core/i40e_nvm.c new file mode 100644 index 0000000000..04d61bb969 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_nvm.c @@ -0,0 +1,712 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_nvm.c 284049 2015-06-05 22:52:42Z jfv $*/ + +#include "i40e_prototype.h" + +enum i40e_status_code i40e_read_nvm_word_srctl(struct i40e_hw *hw, u16 offset, + u16 *data); +enum i40e_status_code i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset, + u16 *data); +enum i40e_status_code i40e_read_nvm_buffer_srctl(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data); +enum i40e_status_code i40e_read_nvm_buffer_aq(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data); +enum i40e_status_code i40e_read_nvm_aq(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 words, void *data, + bool last_command); + +/** + * i40e_init_nvm_ops - Initialize NVM function pointers + * @hw: pointer to the HW structure + * + * Setup the function pointers and the NVM info structure. Should be called + * once per NVM initialization, e.g. inside the i40e_init_shared_code(). + * Please notice that the NVM term is used here (& in all methods covered + * in this file) as an equivalent of the FLASH part mapped into the SR. + * We are accessing FLASH always thru the Shadow RAM. + **/ +enum i40e_status_code i40e_init_nvm(struct i40e_hw *hw) +{ + struct i40e_nvm_info *nvm = &hw->nvm; + enum i40e_status_code ret_code = I40E_SUCCESS; + u32 fla, gens; + u8 sr_size; + + DEBUGFUNC("i40e_init_nvm"); + + /* The SR size is stored regardless of the nvm programming mode + * as the blank mode may be used in the factory line. 
+ */ + gens = rd32(hw, I40E_GLNVM_GENS); + sr_size = ((gens & I40E_GLNVM_GENS_SR_SIZE_MASK) >> + I40E_GLNVM_GENS_SR_SIZE_SHIFT); + /* Switching to words (sr_size contains power of 2KB) */ + nvm->sr_size = BIT(sr_size) * I40E_SR_WORDS_IN_1KB; + + /* Check if we are in the normal or blank NVM programming mode */ + fla = rd32(hw, I40E_GLNVM_FLA); + if (fla & I40E_GLNVM_FLA_LOCKED_MASK) { /* Normal programming mode */ + /* Max NVM timeout */ + nvm->timeout = I40E_MAX_NVM_TIMEOUT; + nvm->blank_nvm_mode = FALSE; + } else { /* Blank programming mode */ + nvm->blank_nvm_mode = TRUE; + ret_code = I40E_ERR_NVM_BLANK_MODE; + i40e_debug(hw, I40E_DEBUG_NVM, "NVM init error: unsupported blank mode.\n"); + } + + return ret_code; +} + +/** + * i40e_acquire_nvm - Generic request for acquiring the NVM ownership + * @hw: pointer to the HW structure + * @access: NVM access type (read or write) + * + * This function will request NVM ownership for reading + * via the proper Admin Command. + **/ +enum i40e_status_code i40e_acquire_nvm(struct i40e_hw *hw, + enum i40e_aq_resource_access_type access) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u64 gtime, timeout; + u64 time_left = 0; + + DEBUGFUNC("i40e_acquire_nvm"); + + if (hw->nvm.blank_nvm_mode) + goto i40e_i40e_acquire_nvm_exit; + + ret_code = i40e_aq_request_resource(hw, I40E_NVM_RESOURCE_ID, access, + 0, &time_left, NULL); + /* Reading the Global Device Timer */ + gtime = rd32(hw, I40E_GLVFGEN_TIMER); + + /* Store the timeout */ + hw->nvm.hw_semaphore_timeout = I40E_MS_TO_GTIME(time_left) + gtime; + + if (ret_code) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM acquire type %d failed time_left=%llu ret=%d aq_err=%d\n", + access, time_left, ret_code, hw->aq.asq_last_status); + + if (ret_code && time_left) { + /* Poll until the current NVM owner timeouts */ + timeout = I40E_MS_TO_GTIME(I40E_MAX_NVM_TIMEOUT) + gtime; + while ((gtime < timeout) && time_left) { + i40e_msec_delay(10); + gtime = rd32(hw, I40E_GLVFGEN_TIMER); + ret_code = 
i40e_aq_request_resource(hw, + I40E_NVM_RESOURCE_ID, + access, 0, &time_left, + NULL); + if (ret_code == I40E_SUCCESS) { + hw->nvm.hw_semaphore_timeout = + I40E_MS_TO_GTIME(time_left) + gtime; + break; + } + } + if (ret_code != I40E_SUCCESS) { + hw->nvm.hw_semaphore_timeout = 0; + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM acquire timed out, wait %llu ms before trying again. status=%d aq_err=%d\n", + time_left, ret_code, hw->aq.asq_last_status); + } + } + +i40e_i40e_acquire_nvm_exit: + return ret_code; +} + +/** + * i40e_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure + * + * This function will release NVM resource via the proper Admin Command. + **/ +void i40e_release_nvm(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u32 total_delay = 0; + + DEBUGFUNC("i40e_release_nvm"); + + if (hw->nvm.blank_nvm_mode) + return; + + ret_code = i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL); + + /* there are some rare cases when trying to release the resource + * results in an admin Q timeout, so handle them correctly + */ + while ((ret_code == I40E_ERR_ADMIN_QUEUE_TIMEOUT) && + (total_delay < hw->aq.asq_cmd_timeout)) { + i40e_msec_delay(1); + ret_code = i40e_aq_release_resource(hw, + I40E_NVM_RESOURCE_ID, 0, NULL); + total_delay++; + } +} + +/** + * i40e_poll_sr_srctl_done_bit - Polls the GLNVM_SRCTL done bit + * @hw: pointer to the HW structure + * + * Polls the SRCTL Shadow RAM register done bit. 
+ **/ +static enum i40e_status_code i40e_poll_sr_srctl_done_bit(struct i40e_hw *hw) +{ + enum i40e_status_code ret_code = I40E_ERR_TIMEOUT; + u32 srctl, wait_cnt; + + DEBUGFUNC("i40e_poll_sr_srctl_done_bit"); + + /* Poll the I40E_GLNVM_SRCTL until the done bit is set */ + for (wait_cnt = 0; wait_cnt < I40E_SRRD_SRCTL_ATTEMPTS; wait_cnt++) { + srctl = rd32(hw, I40E_GLNVM_SRCTL); + if (srctl & I40E_GLNVM_SRCTL_DONE_MASK) { + ret_code = I40E_SUCCESS; + break; + } + i40e_usec_delay(5); + } + if (ret_code == I40E_ERR_TIMEOUT) + i40e_debug(hw, I40E_DEBUG_NVM, "Done bit in GLNVM_SRCTL not set"); + return ret_code; +} + +/** + * i40e_read_nvm_word - Reads Shadow RAM + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using the GLNVM_SRCTL register. + **/ +enum i40e_status_code i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, + u16 *data) +{ +#ifdef X722_SUPPORT + if (hw->mac.type == I40E_MAC_X722) + return i40e_read_nvm_word_aq(hw, offset, data); +#endif + return i40e_read_nvm_word_srctl(hw, offset, data); +} + +/** + * i40e_read_nvm_word_srctl - Reads Shadow RAM via SRCTL register + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using the GLNVM_SRCTL register. 
+ **/ +enum i40e_status_code i40e_read_nvm_word_srctl(struct i40e_hw *hw, u16 offset, + u16 *data) +{ + enum i40e_status_code ret_code = I40E_ERR_TIMEOUT; + u32 sr_reg; + + DEBUGFUNC("i40e_read_nvm_word_srctl"); + + if (offset >= hw->nvm.sr_size) { + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM read error: Offset %d beyond Shadow RAM limit %d\n", + offset, hw->nvm.sr_size); + ret_code = I40E_ERR_PARAM; + goto read_nvm_exit; + } + + /* Poll the done bit first */ + ret_code = i40e_poll_sr_srctl_done_bit(hw); + if (ret_code == I40E_SUCCESS) { + /* Write the address and start reading */ + sr_reg = ((u32)offset << I40E_GLNVM_SRCTL_ADDR_SHIFT) | + BIT(I40E_GLNVM_SRCTL_START_SHIFT); + wr32(hw, I40E_GLNVM_SRCTL, sr_reg); + + /* Poll I40E_GLNVM_SRCTL until the done bit is set */ + ret_code = i40e_poll_sr_srctl_done_bit(hw); + if (ret_code == I40E_SUCCESS) { + sr_reg = rd32(hw, I40E_GLNVM_SRDATA); + *data = (u16)((sr_reg & + I40E_GLNVM_SRDATA_RDDATA_MASK) + >> I40E_GLNVM_SRDATA_RDDATA_SHIFT); + } + } + if (ret_code != I40E_SUCCESS) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM read error: Couldn't access Shadow RAM address: 0x%x\n", + offset); + +read_nvm_exit: + return ret_code; +} + +/** + * i40e_read_nvm_word_aq - Reads Shadow RAM via AQ + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM through i40e_read_nvm_aq() + * (not the GLNVM_SRCTL register) and converts it from little endian to + * host byte order. Returns the status of i40e_read_nvm_aq(). + **/ +enum i40e_status_code i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset, + u16 *data) +{ + enum i40e_status_code ret_code = I40E_ERR_TIMEOUT; /* overwritten below */ + + DEBUGFUNC("i40e_read_nvm_word_aq"); + + ret_code = i40e_read_nvm_aq(hw, 0x0, offset, 1, data, TRUE); /* module pointer 0x0, one word, last command */ + *data = LE16_TO_CPU(*(__le16 *)data); /* done whether or not the read succeeded */ + + return ret_code; +} + +/** + * i40e_read_nvm_buffer - Reads Shadow RAM buffer + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). 
+ * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_srrd() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. + **/ +enum i40e_status_code i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data) +{ +#ifdef X722_SUPPORT + if (hw->mac.type == I40E_MAC_X722) + return i40e_read_nvm_buffer_aq(hw, offset, words, data); +#endif + return i40e_read_nvm_buffer_srctl(hw, offset, words, data); +} + +/** + * i40e_read_nvm_buffer_srctl - Reads Shadow RAM buffer via SRCTL register + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_srrd() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. + **/ +enum i40e_status_code i40e_read_nvm_buffer_srctl(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data) +{ + enum i40e_status_code ret_code = I40E_SUCCESS; + u16 index, word; + + DEBUGFUNC("i40e_read_nvm_buffer_srctl"); + + /* Loop thru the selected region */ + for (word = 0; word < *words; word++) { + index = offset + word; + ret_code = i40e_read_nvm_word_srctl(hw, index, &data[word]); + if (ret_code != I40E_SUCCESS) + break; + } + + /* Update the number of words read from the Shadow RAM */ + *words = word; + + return ret_code; +} + +/** + * i40e_read_nvm_buffer_aq - Reads Shadow RAM buffer via AQ + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). 
+ * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_aq() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. + **/ +enum i40e_status_code i40e_read_nvm_buffer_aq(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data) +{ + enum i40e_status_code ret_code; + u16 read_size = *words; + bool last_cmd = FALSE; + u16 words_read = 0; + u16 i = 0; + + DEBUGFUNC("i40e_read_nvm_buffer_aq"); + + do { + /* Calculate number of bytes we should read in this step. + * FVL AQ do not allow to read more than one page at a time or + * to cross page boundaries. + */ + if (offset % I40E_SR_SECTOR_SIZE_IN_WORDS) + read_size = min(*words, + (u16)(I40E_SR_SECTOR_SIZE_IN_WORDS - + (offset % I40E_SR_SECTOR_SIZE_IN_WORDS))); + else + read_size = min((*words - words_read), + I40E_SR_SECTOR_SIZE_IN_WORDS); + + /* Check if this is last command, if so set proper flag */ + if ((words_read + read_size) >= *words) + last_cmd = TRUE; + + ret_code = i40e_read_nvm_aq(hw, 0x0, offset, read_size, + data + words_read, last_cmd); + if (ret_code != I40E_SUCCESS) + goto read_nvm_buffer_aq_exit; + + /* Increment counter for words already read and move offset to + * new read location + */ + words_read += read_size; + offset += read_size; + } while (words_read < *words); + + for (i = 0; i < *words; i++) + data[i] = LE16_TO_CPU(((__le16 *)data)[i]); + +read_nvm_buffer_aq_exit: + *words = words_read; + return ret_code; +} + +/** + * i40e_read_nvm_aq - Read Shadow RAM. + * @hw: pointer to the HW structure. 
+ * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in words from module start + * @words: number of words to write + * @data: buffer with words to write to the Shadow RAM + * @last_command: tells the AdminQ that this is the last command + * + * Writes a 16 bit words buffer to the Shadow RAM using the admin command. + **/ +enum i40e_status_code i40e_read_nvm_aq(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 words, void *data, + bool last_command) +{ + enum i40e_status_code ret_code = I40E_ERR_NVM; + struct i40e_asq_cmd_details cmd_details; + + DEBUGFUNC("i40e_read_nvm_aq"); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + /* Here we are checking the SR limit only for the flat memory model. + * We cannot do it for the module-based model, as we did not acquire + * the NVM resource yet (we cannot get the module pointer value). + * Firmware will check the module-based model. + */ + if ((offset + words) > hw->nvm.sr_size) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: offset %d beyond Shadow RAM limit %d\n", + (offset + words), hw->nvm.sr_size); + else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) + /* We can write only up to 4KB (one sector), in one AQ write */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write fail error: tried to write %d words, limit is %d.\n", + words, I40E_SR_SECTOR_SIZE_IN_WORDS); + else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS) + != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) + /* A single write cannot spread over two sectors */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: cannot spread over two sectors in a single write offset=%d words=%d\n", + offset, words); + else + ret_code = i40e_aq_read_nvm(hw, module_pointer, + 2 * offset, /*bytes*/ + 2 * words, /*bytes*/ + data, last_command, &cmd_details); + + return ret_code; +} + +/** + * i40e_write_nvm_aq - Writes Shadow RAM. + * @hw: pointer to the HW structure. 
+ * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in words from module start + * @words: number of words to write + * @data: buffer with words to write to the Shadow RAM + * @last_command: tells the AdminQ that this is the last command + * + * Writes a 16 bit words buffer to the Shadow RAM using the admin command. + **/ +enum i40e_status_code i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 words, void *data, + bool last_command) +{ + enum i40e_status_code ret_code = I40E_ERR_NVM; + struct i40e_asq_cmd_details cmd_details; + + DEBUGFUNC("i40e_write_nvm_aq"); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + /* Here we are checking the SR limit only for the flat memory model. + * We cannot do it for the module-based model, as we did not acquire + * the NVM resource yet (we cannot get the module pointer value). + * Firmware will check the module-based model. + */ + if ((offset + words) > hw->nvm.sr_size) + DEBUGOUT("NVM write error: offset beyond Shadow RAM limit.\n"); + else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) + /* We can write only up to 4KB (one sector), in one AQ write */ + DEBUGOUT("NVM write fail error: cannot write more than 4KB in a single write.\n"); + else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS) + != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) + /* A single write cannot spread over two sectors */ + DEBUGOUT("NVM write error: cannot spread over two sectors in a single write.\n"); + else + ret_code = i40e_aq_update_nvm(hw, module_pointer, + 2 * offset, /*bytes*/ + 2 * words, /*bytes*/ + data, last_command, &cmd_details); + + return ret_code; +} + +/** + * i40e_write_nvm_word - Writes Shadow RAM word + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to write + * @data: word to write to the Shadow RAM + * + * Writes a 16 bit word to the SR using the i40e_write_nvm_aq() method. 
+ * NVM ownership have to be acquired and released (on ARQ completion event
+ * reception) by caller. To commit SR to NVM update checksum function
+ * should be called.
+ **/
+enum i40e_status_code i40e_write_nvm_word(struct i40e_hw *hw, u32 offset,
+					  void *data)
+{
+	DEBUGFUNC("i40e_write_nvm_word");
+
+	/* The word behind @data is converted to little-endian in place, so
+	 * the caller's buffer holds the LE value after this call.
+	 */
+	*((__le16 *)data) = CPU_TO_LE16(*((u16 *)data));
+
+	/* Value 0x00 below means that we treat SR as a flat mem */
+	return i40e_write_nvm_aq(hw, 0x00, offset, 1, data, FALSE);
+}
+
+/**
+ * i40e_write_nvm_buffer - Writes Shadow RAM buffer
+ * @hw: pointer to the HW structure
+ * @module_pointer: module pointer location in words from the NVM beginning
+ * @offset: offset of the Shadow RAM buffer to write
+ * @words: number of words to write
+ * @data: words to write to the Shadow RAM
+ *
+ * Writes a 16 bit words buffer to the Shadow RAM using the admin command.
+ * NVM ownership must be acquired before calling this function and released
+ * on ARQ completion event reception by caller. To commit SR to NVM update
+ * checksum function should be called.
+ *
+ * The @words entries behind @data are converted to little-endian in place
+ * before being handed to i40e_write_nvm_aq(), whose status is returned.
+ **/
+enum i40e_status_code i40e_write_nvm_buffer(struct i40e_hw *hw,
+					    u8 module_pointer, u32 offset,
+					    u16 words, void *data)
+{
+	__le16 *le_word_ptr = (__le16 *)data;
+	u16 *word_ptr = (u16 *)data;
+	u32 i = 0;
+
+	DEBUGFUNC("i40e_write_nvm_buffer");
+
+	/* Both pointers alias @data: each word is rewritten in LE form. */
+	for (i = 0; i < words; i++)
+		le_word_ptr[i] = CPU_TO_LE16(word_ptr[i]);
+
+	/* Here we will only write one buffer as the size of the modules
+	 * mirrored in the Shadow RAM is always less than 4K.
+	 */
+	return i40e_write_nvm_aq(hw, module_pointer, offset, words,
+				 data, FALSE);
+}
+
+/**
+ * i40e_calc_nvm_checksum - Calculates and returns the checksum
+ * @hw: pointer to hardware structure
+ * @checksum: pointer to the checksum
+ *
+ * This function calculates SW Checksum that covers the whole 64kB shadow RAM
+ * except the VPD and PCIe ALT Auto-load modules. The structure and size of VPD
+ * is customer specific and unknown. Therefore, this function skips all maximum
+ * possible size of VPD (1kB).
+ *
+ * Returns I40E_SUCCESS with *@checksum written, the status of
+ * i40e_allocate_virt_mem() if the sector buffer cannot be allocated, or
+ * I40E_ERR_NVM_CHECKSUM if any Shadow RAM read fails. *@checksum is only
+ * written on success.
+ **/
+enum i40e_status_code i40e_calc_nvm_checksum(struct i40e_hw *hw, u16 *checksum)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	struct i40e_virt_mem vmem;
+	u16 pcie_alt_module = 0;
+	u16 checksum_local = 0;
+	u16 vpd_module = 0;
+	u16 *data;
+	u16 i = 0;
+
+	DEBUGFUNC("i40e_calc_nvm_checksum");
+
+	/* NOTE(review): on allocation failure the exit path still calls
+	 * i40e_free_virt_mem() on vmem, which has no initializer here.
+	 * Confirm that the OS-dependent allocator either cannot fail or
+	 * leaves vmem in a state that is safe to free.
+	 */
+	ret_code = i40e_allocate_virt_mem(hw, &vmem,
+				    I40E_SR_SECTOR_SIZE_IN_WORDS * sizeof(u16));
+	if (ret_code)
+		goto i40e_calc_nvm_checksum_exit;
+	data = (u16 *)vmem.va;
+
+	/* read pointer to VPD area */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_VPD_PTR, &vpd_module);
+	if (ret_code != I40E_SUCCESS) {
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+		goto i40e_calc_nvm_checksum_exit;
+	}
+
+	/* read pointer to PCIe Alt Auto-load module */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_PCIE_ALT_AUTO_LOAD_PTR,
+				      &pcie_alt_module);
+	if (ret_code != I40E_SUCCESS) {
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+		goto i40e_calc_nvm_checksum_exit;
+	}
+
+	/* Calculate SW checksum that covers the whole 64kB shadow RAM
+	 * except the VPD and PCIe ALT Auto-load modules
+	 */
+	for (i = 0; i < hw->nvm.sr_size; i++) {
+		/* Read SR page */
+		if ((i % I40E_SR_SECTOR_SIZE_IN_WORDS) == 0) {
+			u16 words = I40E_SR_SECTOR_SIZE_IN_WORDS;
+
+			ret_code = i40e_read_nvm_buffer(hw, i, &words, data);
+			if (ret_code != I40E_SUCCESS) {
+				ret_code = I40E_ERR_NVM_CHECKSUM;
+				goto i40e_calc_nvm_checksum_exit;
+			}
+		}
+
+		/* Skip Checksum word */
+		if (i == I40E_SR_SW_CHECKSUM_WORD)
+			continue;
+		/* Skip VPD module (convert byte size to word count) */
+		if ((i >= (u32)vpd_module) &&
+		    (i < ((u32)vpd_module +
+		     (I40E_SR_VPD_MODULE_MAX_SIZE / 2)))) {
+			continue;
+		}
+		/* Skip PCIe ALT module (convert byte size to word count) */
+		if ((i >= (u32)pcie_alt_module) &&
+		    (i < ((u32)pcie_alt_module +
+		     (I40E_SR_PCIE_ALT_MODULE_MAX_SIZE / 2)))) {
+			continue;
+		}
+
+		/* data[] holds only the current sector, hence the modulo. */
+		checksum_local += data[i % I40E_SR_SECTOR_SIZE_IN_WORDS];
+	}
+
+	*checksum = (u16)I40E_SR_SW_CHECKSUM_BASE - checksum_local;
+
+i40e_calc_nvm_checksum_exit:
+	i40e_free_virt_mem(hw, &vmem);
+	return ret_code;
+}
+
+/**
+ * i40e_update_nvm_checksum - Updates the NVM checksum
+ * @hw: pointer to hardware structure
+ *
+ * NVM ownership must be acquired before calling this function and released
+ * on ARQ completion event reception by caller.
+ * This function will commit SR to NVM.
+ *
+ * Returns the status of i40e_calc_nvm_checksum() if the calculation fails
+ * (nothing is written in that case), otherwise the status of the
+ * i40e_write_nvm_aq() call that stores the checksum word.
+ **/
+enum i40e_status_code i40e_update_nvm_checksum(struct i40e_hw *hw)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u16 checksum;
+	__le16 le_sum;
+
+	DEBUGFUNC("i40e_update_nvm_checksum");
+
+	ret_code = i40e_calc_nvm_checksum(hw, &checksum);
+	if (ret_code == I40E_SUCCESS) {
+		/* checksum is only written by a successful calculation, so
+		 * it must not be read (not even for the endian conversion)
+		 * on the failure path.
+		 */
+		le_sum = CPU_TO_LE16(checksum);
+		ret_code = i40e_write_nvm_aq(hw, 0x00, I40E_SR_SW_CHECKSUM_WORD,
+					     1, &le_sum, TRUE);
+	}
+
+	return ret_code;
+}
+
+/**
+ * i40e_validate_nvm_checksum - Validate EEPROM checksum
+ * @hw: pointer to hardware structure
+ * @checksum: calculated checksum
+ *
+ * Performs checksum calculation and validates the NVM SW checksum. If the
+ * caller does not need checksum, the value can be NULL.
+ *
+ * Returns I40E_SUCCESS when the calculated checksum equals the stored
+ * checksum word, the status of i40e_calc_nvm_checksum() if the calculation
+ * fails (*@checksum is left untouched in that case), and
+ * I40E_ERR_NVM_CHECKSUM when the stored word cannot be read or differs.
+ **/
+enum i40e_status_code i40e_validate_nvm_checksum(struct i40e_hw *hw,
+						 u16 *checksum)
+{
+	enum i40e_status_code ret_code = I40E_SUCCESS;
+	u16 checksum_sr = 0;
+	u16 checksum_local = 0;
+
+	DEBUGFUNC("i40e_validate_nvm_checksum");
+
+	ret_code = i40e_calc_nvm_checksum(hw, &checksum_local);
+	if (ret_code != I40E_SUCCESS)
+		goto i40e_validate_nvm_checksum_exit;
+
+	/* Read the stored checksum word from the Shadow RAM.
+	 * NOTE(review): this spot used to carry a warning not to use
+	 * i40e_read_nvm_word() so that the synchronization semaphores are
+	 * not taken twice, yet that is the routine called. Confirm whether
+	 * it acquires the NVM semaphore in this version of the shared code.
+	 */
+	ret_code = i40e_read_nvm_word(hw, I40E_SR_SW_CHECKSUM_WORD, &checksum_sr);
+
+	/* Verify read checksum from EEPROM is the same as
+	 * calculated checksum. A failed read is reported as a checksum
+	 * error too, rather than being compared as if it had returned 0.
+	 */
+	if (ret_code != I40E_SUCCESS || checksum_local != checksum_sr)
+		ret_code = I40E_ERR_NVM_CHECKSUM;
+
+	/* If the user cares, return the calculated checksum */
+	if (checksum)
+		*checksum = checksum_local;
+
+i40e_validate_nvm_checksum_exit:
+	return ret_code;
+}
diff --git a/usr/src/uts/common/io/i40e/core/i40e_prototype.h b/usr/src/uts/common/io/i40e/core/i40e_prototype.h
new file mode 100644
index 0000000000..6f1cfc3afe
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_prototype.h
@@ -0,0 +1,478 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of the Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+/*$FreeBSD: head/sys/dev/ixl/i40e_prototype.h 284049 2015-06-05 22:52:42Z jfv $*/
+
+#ifndef _I40E_PROTOTYPE_H_
+#define _I40E_PROTOTYPE_H_
+
+#include "i40e_type.h"
+#include "i40e_alloc.h"
+#include "i40e_virtchnl.h"
+
+/* Prototypes for shared code functions that are not in
+ * the standard function pointer structures.  These are
+ * mostly because they are needed even before the init
+ * has happened and will assist in the early SW and FW
+ * setup.
+ */
+
+/* adminq functions */
+enum i40e_status_code i40e_init_adminq(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_adminq(struct i40e_hw *hw);
+enum i40e_status_code i40e_init_asq(struct i40e_hw *hw);
+enum i40e_status_code i40e_init_arq(struct i40e_hw *hw);
+enum i40e_status_code i40e_alloc_adminq_asq_ring(struct i40e_hw *hw);
+enum i40e_status_code i40e_alloc_adminq_arq_ring(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_asq(struct i40e_hw *hw);
+enum i40e_status_code i40e_shutdown_arq(struct i40e_hw *hw);
+u16 i40e_clean_asq(struct i40e_hw *hw);
+void i40e_free_adminq_asq(struct i40e_hw *hw);
+void i40e_free_adminq_arq(struct i40e_hw *hw);
+enum i40e_status_code i40e_validate_mac_addr(u8 *mac_addr);
+void i40e_adminq_init_ring_data(struct i40e_hw *hw);
+enum i40e_status_code i40e_clean_arq_element(struct i40e_hw *hw,
+					struct i40e_arq_event_info *e,
+					u16 *events_pending);
+enum i40e_status_code i40e_asq_send_command(struct i40e_hw *hw,
+				struct i40e_aq_desc *desc,
+				void *buff, /* can be NULL */
+				u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+bool i40e_asq_done(struct i40e_hw *hw);
+
+/* debug function for adminq */
+void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask,
+		   void *desc, void *buffer, u16 buf_len);
+
+void i40e_idle_aq(struct i40e_hw *hw);
+void i40e_resume_aq(struct i40e_hw *hw);
+bool i40e_check_asq_alive(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_queue_shutdown(struct i40e_hw *hw, bool unloading);
+/* The rss_lut / rss_key get and set prototypes below are only declared
+ * when X722_SUPPORT is defined.
+ */
+#ifdef X722_SUPPORT
+
+enum i40e_status_code i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 seid,
+					  bool pf_lut, u8 *lut, u16 lut_size);
+enum i40e_status_code i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 seid,
+					  bool pf_lut, u8 *lut, u16 lut_size);
+enum i40e_status_code i40e_aq_get_rss_key(struct i40e_hw *hw,
+				     u16 seid,
+				     struct i40e_aqc_get_set_rss_key_data *key);
+enum i40e_status_code i40e_aq_set_rss_key(struct i40e_hw *hw,
+				     u16 seid,
+				     struct i40e_aqc_get_set_rss_key_data *key);
+#endif
+/* Each takes an admin queue error or status code and returns a char *
+ * (presumably a printable name for the code -- confirm at the definitions).
+ */
+char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err);
+char *i40e_stat_str(struct i40e_hw *hw, enum i40e_status_code stat_err);
+
+
+u32 i40e_led_get(struct i40e_hw *hw);
+void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink);
+
+/* admin send queue commands */
+
+enum i40e_status_code i40e_aq_get_firmware_version(struct i40e_hw *hw,
+				u16 *fw_major_version, u16 *fw_minor_version,
+				u32 *fw_build,
+				u16 *api_major_version, u16 *api_minor_version,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_write_register(struct i40e_hw *hw,
+				u32 reg_addr, u64 reg_val,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_read_register(struct i40e_hw *hw,
+				u32 reg_addr, u64 *reg_val,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_default_vsi(struct i40e_hw *hw, u16 vsi_id,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
+			bool qualified_modules, bool report_init,
+			struct i40e_aq_get_phy_abilities_resp *abilities,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
+				struct i40e_aq_set_phy_config *config,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
+				  bool atomic_reset);
+enum i40e_status_code i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_mac_config(struct i40e_hw *hw,
+				u16 max_frame_size, bool crc_en, u16 pacing,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_local_advt_reg(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_partner_advt(struct i40e_hw *hw,
+				u64 *advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_lb_modes(struct i40e_hw *hw, u16 lb_modes,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_link_restart_an(struct i40e_hw *hw,
+		bool enable_link, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_link_info(struct i40e_hw *hw,
+				bool enable_lse, struct i40e_link_status *link,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_local_advt_reg(struct i40e_hw *hw,
+				u64 advt_reg,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_send_driver_version(struct i40e_hw *hw,
+				struct i40e_driver_version *dv,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_vsi(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
+				u16 vsi_id, bool set_filter,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
+		u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw,
+		u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
+				u16 seid, bool enable, u16 vid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_vsi_params(struct i40e_hw *hw,
+				struct i40e_vsi_context *vsi_ctx,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
+				u16 downlink_seid, u8 enabled_tc,
+				bool default_port, bool enable_l2_filtering,
+				u16 *pveb_seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_veb_parameters(struct i40e_hw *hw,
+				u16 veb_seid, u16 *switch_id, bool *floating,
+				u16 *statistic_index, u16 *vebs_used,
+				u16 *vebs_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_macvlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_remove_macvlan_element_data *mv_list,
+			u16 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_vlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_vlan(struct i40e_hw *hw, u16 vsi_id,
+			struct i40e_aqc_add_remove_vlan_element_data *v_list,
+			u8 count, struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
+				u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_switch_config(struct i40e_hw *hw,
+				struct i40e_aqc_get_switch_config_resp *buf,
+				u16 buf_size, u16 *start_seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_request_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				enum i40e_aq_resource_access_type access,
+				u8 sdp_number, u64 *timeout,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_release_resource(struct i40e_hw *hw,
+				enum i40e_aq_resources_ids resource,
+				u8 sdp_number,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_read_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, u32 field_id, void *data,
+				u16 buf_size, u16 *element_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_write_nvm_config(struct i40e_hw *hw,
+				u8 cmd_flags, void *data, u16 buf_size,
+				u16 element_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_oem_post_update(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_discover_capabilities(struct i40e_hw *hw,
+				void *buff, u16 buff_size, u16 *data_size,
+				enum i40e_admin_queue_opc list_type_opc,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
+				u32 offset, u16 length, void *data,
+				bool last_command,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
+				u8 mib_type, void *buff, u16 buff_size,
+				u16 *local_len, u16 *remote_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_lldp_mib(struct i40e_hw *hw,
+				u8 mib_type, void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
+				bool enable_update,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_lldp_tlv(struct i40e_hw *hw, u8 bridge_type,
+				void *buff, u16 buff_size, u16 tlv_len,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 old_len, u16 new_len, u16 offset,
+				u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_delete_lldp_tlv(struct i40e_hw *hw,
+				u8 bridge_type, void *buff, u16 buff_size,
+				u16 tlv_len, u16 *mib_len,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_start_lldp(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
+				void *buff, u16 buff_size,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_start_stop_dcbx(struct i40e_hw *hw,
+				bool start_agent,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
+				u16 udp_port, u8 protocol_index,
+				u8 *filter_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_switch_resource_alloc(struct i40e_hw *hw,
+			u8 *num_entries,
+			struct i40e_aqc_switch_resource_alloc_element_resp *buf,
+			u16 count,
+			struct i40e_asq_cmd_details *cmd_details);
+/* Unlike most i40e_aq_* prototypes in this list, this one has no
+ * cmd_details parameter.
+ */
+enum i40e_status_code i40e_aq_add_pvirt(struct i40e_hw *hw, u16 flags,
+				       u16 mac_seid, u16 vsi_seid,
+				       u16 *ret_seid);
+enum i40e_status_code i40e_aq_add_tag(struct i40e_hw *hw, bool direct_to_queue,
+				u16 vsi_seid, u16 tag, u16 queue_num,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 tag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_mcast_etag(struct i40e_hw *hw, u16 pe_seid,
+				u16 etag, u8 num_tags_in_buf, void *buf,
+				u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_mcast_etag(struct i40e_hw *hw, u16 pe_seid,
+				u16 etag, u16 *tags_used, u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_update_tag(struct i40e_hw *hw, u16 vsi_seid,
+				u16 old_tag, u16 new_tag, u16 *tags_used,
+				u16 *tags_free,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_add_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 *stat_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_remove_statistics(struct i40e_hw *hw, u16 seid,
+				u16 vlan_id, u16 stat_index,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_port_parameters(struct i40e_hw *hw,
+				u16 bad_frame_vsi, bool save_bad_pac,
+				bool pad_short_pac, bool double_vlan,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_mac_address_write(struct i40e_hw *hw,
+				    u16 flags, u8 *mac_addr,
+				    struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_credit,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_dcb_ignore_pfc(struct i40e_hw *hw,
+				u8 tcmap, bool request, u8 *tcmap_ret,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_get_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile *profile,
+				u8 *pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_switch_comp_ets_bw_limit(
+	struct i40e_hw *hw, u16 seid,
+	struct i40e_aqc_configure_switching_comp_ets_bw_limit_data *bw_data,
+	struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_ets_sla_bw_limit(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_configure_vsi_ets_sla_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_dcb_updated(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_set_hmc_resource_profile(struct i40e_hw *hw,
+				enum i40e_aq_hmc_profile profile,
+				u8 pe_vf_enabled_count,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_switch_comp_bw_limit(struct i40e_hw *hw,
+				u16 seid, u16 credit, u8 max_bw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw, u16 seid,
+			struct i40e_aqc_configure_vsi_tc_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_vsi_bw_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_bw_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_port_ets_config(struct i40e_hw *hw,
+			u16 seid,
+			struct i40e_aqc_query_port_ets_config_resp *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw,
+		u16 seid,
+		struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data,
+		struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_resume_port_tx(struct i40e_hw *hw,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_read_lldp_cfg(struct i40e_hw *hw,
+					struct i40e_lldp_variables *lldp_cfg);
+/* The cloud filter and alternate-* prototypes that follow take no
+ * cmd_details parameter.
+ */
+enum i40e_status_code i40e_aq_add_cloud_filters(struct i40e_hw *hw,
+		u16 vsi,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count);
+
+enum i40e_status_code i40e_aq_remove_cloud_filters(struct i40e_hw *hw,
+		u16 vsi,
+		struct i40e_aqc_add_remove_cloud_filters_element_data *filters,
+		u8 filter_count);
+
+enum i40e_status_code i40e_aq_alternate_read(struct i40e_hw *hw,
+				u32 reg_addr0, u32 *reg_val0,
+				u32 reg_addr1, u32 *reg_val1);
+enum i40e_status_code i40e_aq_alternate_read_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer);
+enum i40e_status_code i40e_aq_alternate_write(struct i40e_hw *hw,
+				u32 reg_addr0, u32 reg_val0,
+				u32 reg_addr1, u32 reg_val1);
+enum i40e_status_code i40e_aq_alternate_write_indirect(struct i40e_hw *hw,
+				u32 addr, u32 dw_count, void *buffer);
+enum i40e_status_code i40e_aq_alternate_clear(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_alternate_write_done(struct i40e_hw *hw,
+				u8 bios_mode, bool *reset_needed);
+enum i40e_status_code i40e_aq_set_oem_mode(struct i40e_hw *hw,
+				u8 oem_mode);
+
+/* i40e_common */
+enum i40e_status_code i40e_init_shared_code(struct i40e_hw *hw);
+enum i40e_status_code i40e_pf_reset(struct i40e_hw *hw);
+void i40e_clear_hw(struct i40e_hw *hw);
+void i40e_clear_pxe_mode(struct i40e_hw *hw);
+enum i40e_status_code i40e_get_link_status(struct i40e_hw *hw, bool *link_up);
+enum i40e_status_code i40e_update_link_info(struct i40e_hw *hw);
+enum i40e_status_code i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
+enum i40e_status_code i40e_read_bw_from_alt_ram(struct i40e_hw *hw,
+		u32 *max_bw, u32 *min_bw, bool *min_valid, bool *max_valid);
+enum i40e_status_code i40e_aq_configure_partition_bw(struct i40e_hw *hw,
+			struct i40e_aqc_configure_partition_bw_data *bw_data,
+			struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr);
+enum i40e_status_code i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num,
+					    u32 pba_num_size);
+void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable);
+enum i40e_aq_link_speed i40e_get_link_speed(struct i40e_hw *hw);
+/* prototype for functions used for NVM access */
+/* Note the offset types: the read helpers take a u16 offset while the
+ * write helpers take a u32 offset.
+ */
+enum i40e_status_code i40e_init_nvm(struct i40e_hw *hw);
+enum i40e_status_code i40e_acquire_nvm(struct i40e_hw *hw,
+				      enum i40e_aq_resource_access_type access);
+void i40e_release_nvm(struct i40e_hw *hw);
+enum i40e_status_code i40e_read_nvm_word(struct i40e_hw *hw, u16 offset,
+					 u16 *data);
+enum i40e_status_code i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
+					   u16 *words, u16 *data);
+enum i40e_status_code i40e_write_nvm_aq(struct i40e_hw *hw, u8 module,
+					u32 offset, u16 words, void *data,
+					bool last_command);
+enum i40e_status_code i40e_write_nvm_word(struct i40e_hw *hw, u32 offset,
+					  void *data);
+enum i40e_status_code i40e_write_nvm_buffer(struct i40e_hw *hw, u8 module,
+					    u32 offset, u16 words, void *data);
+enum i40e_status_code i40e_calc_nvm_checksum(struct i40e_hw *hw, u16 *checksum);
+enum i40e_status_code i40e_update_nvm_checksum(struct i40e_hw *hw);
+enum i40e_status_code i40e_validate_nvm_checksum(struct i40e_hw *hw,
+						 u16 *checksum);
+/* The trailing int * parameter is unnamed in this prototype; see the
+ * definition for its meaning.
+ */
+enum i40e_status_code i40e_nvmupd_command(struct i40e_hw *hw,
+					  struct i40e_nvm_access *cmd,
+					  u8 *bytes, int *);
+void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status);
+
+enum i40e_status_code i40e_set_mac_type(struct i40e_hw *hw);
+
+/* Table indexed by decode_rx_desc_ptype() below; it is defined elsewhere. */
+extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[];
+
+/* Returns, by value, the i40e_ptype_lookup[] entry at index ptype. No
+ * bounds check is performed. NOTE(review): ptype is a u8, so this relies
+ * on the table having an entry for every value 0..255 -- confirm where
+ * the table is defined.
+ */
+static INLINE struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype)
+{
+	return i40e_ptype_lookup[ptype];
+}
+
+/* prototype for functions used for SW spinlocks */
+void i40e_init_spinlock(struct i40e_spinlock *sp);
+void i40e_acquire_spinlock(struct i40e_spinlock *sp);
+void i40e_release_spinlock(struct i40e_spinlock *sp);
+void i40e_destroy_spinlock(struct i40e_spinlock *sp);
+
+/* i40e_common for VF drivers*/
+void i40e_vf_parse_hw_config(struct i40e_hw *hw,
+			     struct i40e_virtchnl_vf_resource *msg);
+enum i40e_status_code i40e_vf_reset(struct i40e_hw *hw);
+enum i40e_status_code i40e_aq_send_msg_to_pf(struct i40e_hw *hw,
+				enum i40e_virtchnl_ops v_opcode,
+				enum i40e_status_code v_retval,
+				u8 *msg, u16 msglen,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_set_filter_control(struct i40e_hw *hw,
+				struct i40e_filter_control_settings *settings);
+enum i40e_status_code i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
+				u8 *mac_addr, u16 ethtype, u16 flags,
+				u16 vsi_seid, u16 queue, bool is_add,
+				struct i40e_control_filter_stats *stats,
+				struct i40e_asq_cmd_details *cmd_details);
+enum i40e_status_code i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id,
+				u8 table_id, u32 start_index, u16 buff_size,
+				void *buff, u16 *ret_buff_size,
+				u8 *ret_next_table, u32 *ret_next_index,
+				struct i40e_asq_cmd_details *cmd_details);
+#endif /* _I40E_PROTOTYPE_H_ */
diff --git a/usr/src/uts/common/io/i40e/core/i40e_register.h b/usr/src/uts/common/io/i40e/core/i40e_register.h
new file mode 100644
index 0000000000..ff4b8a54f2
--- /dev/null
+++ b/usr/src/uts/common/io/i40e/core/i40e_register.h
@@ -0,0 +1,5317 @@
+/******************************************************************************
+
+  Copyright (c) 2013-2015, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of the Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_register.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_REGISTER_H_ +#define _I40E_REGISTER_H_ + + +#define I40E_GL_ARQBAH 0x000801C0 /* Reset: EMPR */ +#define I40E_GL_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_GL_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAH_ARQBAH_SHIFT) +#define I40E_GL_ARQBAL 0x000800C0 /* Reset: EMPR */ +#define I40E_GL_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_GL_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAL_ARQBAL_SHIFT) +#define I40E_GL_ARQH 0x000803C0 /* Reset: EMPR */ +#define I40E_GL_ARQH_ARQH_SHIFT 0 +#define I40E_GL_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_GL_ARQH_ARQH_SHIFT) +#define I40E_GL_ARQT 0x000804C0 /* Reset: EMPR */ +#define I40E_GL_ARQT_ARQT_SHIFT 0 +#define I40E_GL_ARQT_ARQT_MASK I40E_MASK(0x3FF, I40E_GL_ARQT_ARQT_SHIFT) +#define I40E_GL_ATQBAH 0x00080140 /* Reset: EMPR */ +#define I40E_GL_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_GL_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAH_ATQBAH_SHIFT) +#define I40E_GL_ATQBAL 0x00080040 /* Reset: EMPR */ +#define I40E_GL_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_GL_ATQBAL_ATQBAL_MASK 
I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAL_ATQBAL_SHIFT) +#define I40E_GL_ATQH 0x00080340 /* Reset: EMPR */ +#define I40E_GL_ATQH_ATQH_SHIFT 0 +#define I40E_GL_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_GL_ATQH_ATQH_SHIFT) +#define I40E_GL_ATQLEN 0x00080240 /* Reset: EMPR */ +#define I40E_GL_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_GL_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_GL_ATQLEN_ATQLEN_SHIFT) +#define I40E_GL_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_GL_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQVFE_SHIFT) +#define I40E_GL_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_GL_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQOVFL_SHIFT) +#define I40E_GL_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_GL_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQCRIT_SHIFT) +#define I40E_GL_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_GL_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQENABLE_SHIFT) +#define I40E_GL_ATQT 0x00080440 /* Reset: EMPR */ +#define I40E_GL_ATQT_ATQT_SHIFT 0 +#define I40E_GL_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_GL_ATQT_ATQT_SHIFT) +#define I40E_PF_ARQBAH 0x00080180 /* Reset: EMPR */ +#define I40E_PF_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_PF_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAH_ARQBAH_SHIFT) +#define I40E_PF_ARQBAL 0x00080080 /* Reset: EMPR */ +#define I40E_PF_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_PF_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAL_ARQBAL_SHIFT) +#define I40E_PF_ARQH 0x00080380 /* Reset: EMPR */ +#define I40E_PF_ARQH_ARQH_SHIFT 0 +#define I40E_PF_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_PF_ARQH_ARQH_SHIFT) +#define I40E_PF_ARQLEN 0x00080280 /* Reset: EMPR */ +#define I40E_PF_ARQLEN_ARQLEN_SHIFT 0 +#define I40E_PF_ARQLEN_ARQLEN_MASK I40E_MASK(0x3FF, I40E_PF_ARQLEN_ARQLEN_SHIFT) +#define I40E_PF_ARQLEN_ARQVFE_SHIFT 28 +#define I40E_PF_ARQLEN_ARQVFE_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQVFE_SHIFT) +#define I40E_PF_ARQLEN_ARQOVFL_SHIFT 29 +#define I40E_PF_ARQLEN_ARQOVFL_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQOVFL_SHIFT) +#define 
I40E_PF_ARQLEN_ARQCRIT_SHIFT 30 +#define I40E_PF_ARQLEN_ARQCRIT_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQCRIT_SHIFT) +#define I40E_PF_ARQLEN_ARQENABLE_SHIFT 31 +#define I40E_PF_ARQLEN_ARQENABLE_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQENABLE_SHIFT) +#define I40E_PF_ARQT 0x00080480 /* Reset: EMPR */ +#define I40E_PF_ARQT_ARQT_SHIFT 0 +#define I40E_PF_ARQT_ARQT_MASK I40E_MASK(0x3FF, I40E_PF_ARQT_ARQT_SHIFT) +#define I40E_PF_ATQBAH 0x00080100 /* Reset: EMPR */ +#define I40E_PF_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_PF_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAH_ATQBAH_SHIFT) +#define I40E_PF_ATQBAL 0x00080000 /* Reset: EMPR */ +#define I40E_PF_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_PF_ATQBAL_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAL_ATQBAL_SHIFT) +#define I40E_PF_ATQH 0x00080300 /* Reset: EMPR */ +#define I40E_PF_ATQH_ATQH_SHIFT 0 +#define I40E_PF_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_PF_ATQH_ATQH_SHIFT) +#define I40E_PF_ATQLEN 0x00080200 /* Reset: EMPR */ +#define I40E_PF_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_PF_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_PF_ATQLEN_ATQLEN_SHIFT) +#define I40E_PF_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_PF_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQVFE_SHIFT) +#define I40E_PF_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_PF_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQOVFL_SHIFT) +#define I40E_PF_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_PF_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQCRIT_SHIFT) +#define I40E_PF_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_PF_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQENABLE_SHIFT) +#define I40E_PF_ATQT 0x00080400 /* Reset: EMPR */ +#define I40E_PF_ATQT_ATQT_SHIFT 0 +#define I40E_PF_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_PF_ATQT_ATQT_SHIFT) +#define I40E_VF_ARQBAH(_VF) (0x00081400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQBAH_MAX_INDEX 127 +#define I40E_VF_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_VF_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, 
I40E_VF_ARQBAH_ARQBAH_SHIFT) +#define I40E_VF_ARQBAL(_VF) (0x00080C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQBAL_MAX_INDEX 127 +#define I40E_VF_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_VF_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL_ARQBAL_SHIFT) +#define I40E_VF_ARQH(_VF) (0x00082400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQH_MAX_INDEX 127 +#define I40E_VF_ARQH_ARQH_SHIFT 0 +#define I40E_VF_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_VF_ARQH_ARQH_SHIFT) +#define I40E_VF_ARQLEN(_VF) (0x00081C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQLEN_MAX_INDEX 127 +#define I40E_VF_ARQLEN_ARQLEN_SHIFT 0 +#define I40E_VF_ARQLEN_ARQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ARQLEN_ARQLEN_SHIFT) +#define I40E_VF_ARQLEN_ARQVFE_SHIFT 28 +#define I40E_VF_ARQLEN_ARQVFE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQVFE_SHIFT) +#define I40E_VF_ARQLEN_ARQOVFL_SHIFT 29 +#define I40E_VF_ARQLEN_ARQOVFL_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQOVFL_SHIFT) +#define I40E_VF_ARQLEN_ARQCRIT_SHIFT 30 +#define I40E_VF_ARQLEN_ARQCRIT_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQCRIT_SHIFT) +#define I40E_VF_ARQLEN_ARQENABLE_SHIFT 31 +#define I40E_VF_ARQLEN_ARQENABLE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQENABLE_SHIFT) +#define I40E_VF_ARQT(_VF) (0x00082C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQT_MAX_INDEX 127 +#define I40E_VF_ARQT_ARQT_SHIFT 0 +#define I40E_VF_ARQT_ARQT_MASK I40E_MASK(0x3FF, I40E_VF_ARQT_ARQT_SHIFT) +#define I40E_VF_ATQBAH(_VF) (0x00081000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQBAH_MAX_INDEX 127 +#define I40E_VF_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_VF_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH_ATQBAH_SHIFT) +#define I40E_VF_ATQBAL(_VF) (0x00080800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQBAL_MAX_INDEX 127 +#define I40E_VF_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_VF_ATQBAL_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, 
I40E_VF_ATQBAL_ATQBAL_SHIFT) +#define I40E_VF_ATQH(_VF) (0x00082000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQH_MAX_INDEX 127 +#define I40E_VF_ATQH_ATQH_SHIFT 0 +#define I40E_VF_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_VF_ATQH_ATQH_SHIFT) +#define I40E_VF_ATQLEN(_VF) (0x00081800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQLEN_MAX_INDEX 127 +#define I40E_VF_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_VF_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ATQLEN_ATQLEN_SHIFT) +#define I40E_VF_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_VF_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQVFE_SHIFT) +#define I40E_VF_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_VF_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQOVFL_SHIFT) +#define I40E_VF_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_VF_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQCRIT_SHIFT) +#define I40E_VF_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_VF_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQENABLE_SHIFT) +#define I40E_VF_ATQT(_VF) (0x00082800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQT_MAX_INDEX 127 +#define I40E_VF_ATQT_ATQT_SHIFT 0 +#define I40E_VF_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_VF_ATQT_ATQT_SHIFT) +#define I40E_PRT_L2TAGSEN 0x001C0B20 /* Reset: CORER */ +#define I40E_PRT_L2TAGSEN_ENABLE_SHIFT 0 +#define I40E_PRT_L2TAGSEN_ENABLE_MASK I40E_MASK(0xFF, I40E_PRT_L2TAGSEN_ENABLE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA 0x0010C080 /* Reset: PFR */ +#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_PFCM_LAN_ERRDATA_Q_NUM_MASK I40E_MASK(0xFFF, I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT) +#define I40E_PFCM_LAN_ERRINFO 0x0010C000 /* Reset: PFR */ +#define 
I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_PFCM_LAN_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT) +#define I40E_PFCM_LANCTXCTL 0x0010C300 /* Reset: CORER */ +#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT 0 +#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_MASK I40E_MASK(0xFFF, I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT) +#define I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT 12 +#define I40E_PFCM_LANCTXCTL_SUB_LINE_MASK I40E_MASK(0x7, I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT) +#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT 15 +#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_MASK I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT) +#define I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT 17 +#define I40E_PFCM_LANCTXCTL_OP_CODE_MASK I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT) +#define I40E_PFCM_LANCTXDATA(_i) (0x0010C100 + ((_i) * 128)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_PFCM_LANCTXDATA_MAX_INDEX 3 +#define I40E_PFCM_LANCTXDATA_DATA_SHIFT 0 +#define I40E_PFCM_LANCTXDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_PFCM_LANCTXDATA_DATA_SHIFT) +#define I40E_PFCM_LANCTXSTAT 0x0010C380 /* Reset: CORER */ +#define I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT 0 +#define I40E_PFCM_LANCTXSTAT_CTX_DONE_MASK I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT) +#define I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT 1 +#define I40E_PFCM_LANCTXSTAT_CTX_MISS_MASK 
I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT) +#define I40E_VFCM_PE_ERRDATA1(_VF) (0x00138800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFCM_PE_ERRDATA1_MAX_INDEX 127 +#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT 0 +#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_MASK I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT) +#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT 4 +#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT) +#define I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT 8 +#define I40E_VFCM_PE_ERRDATA1_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT) +#define I40E_VFCM_PE_ERRINFO1(_VF) (0x00138400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFCM_PE_ERRINFO1_MAX_INDEX 127 +#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT 0 +#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_MASK I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT 4 +#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT 8 +#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT 16 +#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT 24 +#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT) +#define I40E_GLDCB_GENC 0x00083044 /* Reset: CORER */ +#define I40E_GLDCB_GENC_PCIRTT_SHIFT 0 +#define I40E_GLDCB_GENC_PCIRTT_MASK I40E_MASK(0xFFFF, I40E_GLDCB_GENC_PCIRTT_SHIFT) +#define I40E_GLDCB_RUPTI 0x00122618 /* Reset: CORER */ +#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT 0 +#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT) +#define I40E_PRTDCB_FCCFG 0x001E4640 /* 
Reset: GLOBR */ +#define I40E_PRTDCB_FCCFG_TFCE_SHIFT 3 +#define I40E_PRTDCB_FCCFG_TFCE_MASK I40E_MASK(0x3, I40E_PRTDCB_FCCFG_TFCE_SHIFT) +#define I40E_PRTDCB_FCRTV 0x001E4600 /* Reset: GLOBR */ +#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT 0 +#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT) +#define I40E_PRTDCB_FCTTVN(_i) (0x001E4580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: GLOBR */ +#define I40E_PRTDCB_FCTTVN_MAX_INDEX 3 +#define I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT 0 +#define I40E_PRTDCB_FCTTVN_TTV_2N_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT) +#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT 16 +#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT) +#define I40E_PRTDCB_GENC 0x00083000 /* Reset: CORER */ +#define I40E_PRTDCB_GENC_RESERVED_1_SHIFT 0 +#define I40E_PRTDCB_GENC_RESERVED_1_MASK I40E_MASK(0x3, I40E_PRTDCB_GENC_RESERVED_1_SHIFT) +#define I40E_PRTDCB_GENC_NUMTC_SHIFT 2 +#define I40E_PRTDCB_GENC_NUMTC_MASK I40E_MASK(0xF, I40E_PRTDCB_GENC_NUMTC_SHIFT) +#define I40E_PRTDCB_GENC_FCOEUP_SHIFT 6 +#define I40E_PRTDCB_GENC_FCOEUP_MASK I40E_MASK(0x7, I40E_PRTDCB_GENC_FCOEUP_SHIFT) +#define I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT 9 +#define I40E_PRTDCB_GENC_FCOEUP_VALID_MASK I40E_MASK(0x1, I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT) +#define I40E_PRTDCB_GENC_PFCLDA_SHIFT 16 +#define I40E_PRTDCB_GENC_PFCLDA_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_GENC_PFCLDA_SHIFT) +#define I40E_PRTDCB_GENS 0x00083020 /* Reset: CORER */ +#define I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT 0 +#define I40E_PRTDCB_GENS_DCBX_STATUS_MASK I40E_MASK(0x7, I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT) +#define I40E_PRTDCB_MFLCN 0x001E2400 /* Reset: GLOBR */ +#define I40E_PRTDCB_MFLCN_PMCF_SHIFT 0 +#define I40E_PRTDCB_MFLCN_PMCF_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_PMCF_SHIFT) +#define I40E_PRTDCB_MFLCN_DPF_SHIFT 1 +#define I40E_PRTDCB_MFLCN_DPF_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_DPF_SHIFT) +#define 
I40E_PRTDCB_MFLCN_RPFCM_SHIFT 2 +#define I40E_PRTDCB_MFLCN_RPFCM_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RPFCM_SHIFT) +#define I40E_PRTDCB_MFLCN_RFCE_SHIFT 3 +#define I40E_PRTDCB_MFLCN_RFCE_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RFCE_SHIFT) +#define I40E_PRTDCB_MFLCN_RPFCE_SHIFT 4 +#define I40E_PRTDCB_MFLCN_RPFCE_MASK I40E_MASK(0xFF, I40E_PRTDCB_MFLCN_RPFCE_SHIFT) +#define I40E_PRTDCB_RETSC 0x001223E0 /* Reset: CORER */ +#define I40E_PRTDCB_RETSC_ETS_MODE_SHIFT 0 +#define I40E_PRTDCB_RETSC_ETS_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSC_ETS_MODE_SHIFT) +#define I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT 1 +#define I40E_PRTDCB_RETSC_NON_ETS_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT) +#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT 2 +#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_MASK I40E_MASK(0xF, I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT) +#define I40E_PRTDCB_RETSC_LLTC_SHIFT 8 +#define I40E_PRTDCB_RETSC_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_RETSC_LLTC_SHIFT) +#define I40E_PRTDCB_RETSTCC(_i) (0x00122180 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_RETSTCC_MAX_INDEX 7 +#define I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT 0 +#define I40E_PRTDCB_RETSTCC_BWSHARE_MASK I40E_MASK(0x7F, I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT) +#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT 30 +#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT) +#define I40E_PRTDCB_RETSTCC_ETSTC_SHIFT 31 +#define I40E_PRTDCB_RETSTCC_ETSTC_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_ETSTC_SHIFT) +#define I40E_PRTDCB_RPPMC 0x001223A0 /* Reset: CORER */ +#define I40E_PRTDCB_RPPMC_LANRPPM_SHIFT 0 +#define I40E_PRTDCB_RPPMC_LANRPPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_LANRPPM_SHIFT) +#define I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT 8 +#define I40E_PRTDCB_RPPMC_RDMARPPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT) +#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT 16 +#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT) 
+#define I40E_PRTDCB_RUP 0x001C0B00 /* Reset: CORER */ +#define I40E_PRTDCB_RUP_NOVLANUP_SHIFT 0 +#define I40E_PRTDCB_RUP_NOVLANUP_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP_NOVLANUP_SHIFT) +#define I40E_PRTDCB_RUP2TC 0x001C09A0 /* Reset: CORER */ +#define I40E_PRTDCB_RUP2TC_UP0TC_SHIFT 0 +#define I40E_PRTDCB_RUP2TC_UP0TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP0TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP1TC_SHIFT 3 +#define I40E_PRTDCB_RUP2TC_UP1TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP1TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP2TC_SHIFT 6 +#define I40E_PRTDCB_RUP2TC_UP2TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP2TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP3TC_SHIFT 9 +#define I40E_PRTDCB_RUP2TC_UP3TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP3TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP4TC_SHIFT 12 +#define I40E_PRTDCB_RUP2TC_UP4TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP4TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP5TC_SHIFT 15 +#define I40E_PRTDCB_RUP2TC_UP5TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP5TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP6TC_SHIFT 18 +#define I40E_PRTDCB_RUP2TC_UP6TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP6TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP7TC_SHIFT 21 +#define I40E_PRTDCB_RUP2TC_UP7TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP7TC_SHIFT) +#define I40E_PRTDCB_RUPTQ(_i) (0x00122400 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_RUPTQ_MAX_INDEX 7 +#define I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT 0 +#define I40E_PRTDCB_RUPTQ_RXQNUM_MASK I40E_MASK(0x3FFF, I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT) +#define I40E_PRTDCB_TC2PFC 0x001C0980 /* Reset: CORER */ +#define I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT 0 +#define I40E_PRTDCB_TC2PFC_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT) +#define I40E_PRTDCB_TCMSTC(_i) (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TCMSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TCMSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCMSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_MSTC_SHIFT) +#define 
I40E_PRTDCB_TCPMC 0x000A21A0 /* Reset: CORER */ +#define I40E_PRTDCB_TCPMC_CPM_SHIFT 0 +#define I40E_PRTDCB_TCPMC_CPM_MASK I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_CPM_SHIFT) +#define I40E_PRTDCB_TCPMC_LLTC_SHIFT 13 +#define I40E_PRTDCB_TCPMC_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_LLTC_SHIFT) +#define I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TCPMC_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT) +#define I40E_PRTDCB_TCWSTC(_i) (0x000A2040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TCWSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TCWSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCWSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCWSTC_MSTC_SHIFT) +#define I40E_PRTDCB_TDPMC 0x000A0180 /* Reset: CORER */ +#define I40E_PRTDCB_TDPMC_DPM_SHIFT 0 +#define I40E_PRTDCB_TDPMC_DPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_TDPMC_DPM_SHIFT) +#define I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TDPMC_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT) +#define I40E_PRTDCB_TETSC_TCB 0x000AE060 /* Reset: CORER */ +#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT 0 +#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_MASK I40E_MASK(0x1, I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT) +#define I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT 8 +#define I40E_PRTDCB_TETSC_TCB_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT) +#define I40E_PRTDCB_TETSC_TPB 0x00098060 /* Reset: CORER */ +#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT 0 +#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_MASK I40E_MASK(0x1, I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT) +#define I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT 8 +#define I40E_PRTDCB_TETSC_TPB_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT) +#define I40E_PRTDCB_TFCS 0x001E4560 /* Reset: GLOBR */ +#define I40E_PRTDCB_TFCS_TXOFF_SHIFT 0 +#define I40E_PRTDCB_TFCS_TXOFF_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF0_SHIFT 8 +#define 
I40E_PRTDCB_TFCS_TXOFF0_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF0_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF1_SHIFT 9 +#define I40E_PRTDCB_TFCS_TXOFF1_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF1_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF2_SHIFT 10 +#define I40E_PRTDCB_TFCS_TXOFF2_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF2_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF3_SHIFT 11 +#define I40E_PRTDCB_TFCS_TXOFF3_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF3_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF4_SHIFT 12 +#define I40E_PRTDCB_TFCS_TXOFF4_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF4_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF5_SHIFT 13 +#define I40E_PRTDCB_TFCS_TXOFF5_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF5_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF6_SHIFT 14 +#define I40E_PRTDCB_TFCS_TXOFF6_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF6_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF7_SHIFT 15 +#define I40E_PRTDCB_TFCS_TXOFF7_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF7_SHIFT) +#define I40E_PRTDCB_TPFCTS(_i) (0x001E4660 + ((_i) * 32)) /* _i=0...7 */ /* Reset: GLOBR */ +#define I40E_PRTDCB_TPFCTS_MAX_INDEX 7 +#define I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT 0 +#define I40E_PRTDCB_TPFCTS_PFCTIMER_MASK I40E_MASK(0x3FFF, I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT) +#define I40E_GLFCOE_RCTL 0x00269B94 /* Reset: CORER */ +#define I40E_GLFCOE_RCTL_FCOEVER_SHIFT 0 +#define I40E_GLFCOE_RCTL_FCOEVER_MASK I40E_MASK(0xF, I40E_GLFCOE_RCTL_FCOEVER_SHIFT) +#define I40E_GLFCOE_RCTL_SAVBAD_SHIFT 4 +#define I40E_GLFCOE_RCTL_SAVBAD_MASK I40E_MASK(0x1, I40E_GLFCOE_RCTL_SAVBAD_SHIFT) +#define I40E_GLFCOE_RCTL_ICRC_SHIFT 5 +#define I40E_GLFCOE_RCTL_ICRC_MASK I40E_MASK(0x1, I40E_GLFCOE_RCTL_ICRC_SHIFT) +#define I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT 16 +#define I40E_GLFCOE_RCTL_MAX_SIZE_MASK I40E_MASK(0x3FFF, I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT) +#define I40E_GL_FWSTS 0x00083048 /* Reset: POR */ +#define I40E_GL_FWSTS_FWS0B_SHIFT 0 +#define I40E_GL_FWSTS_FWS0B_MASK I40E_MASK(0xFF, I40E_GL_FWSTS_FWS0B_SHIFT) +#define I40E_GL_FWSTS_FWRI_SHIFT 9 
+#define I40E_GL_FWSTS_FWRI_MASK I40E_MASK(0x1, I40E_GL_FWSTS_FWRI_SHIFT) +#define I40E_GL_FWSTS_FWS1B_SHIFT 16 +#define I40E_GL_FWSTS_FWS1B_MASK I40E_MASK(0xFF, I40E_GL_FWSTS_FWS1B_SHIFT) +#define I40E_GLGEN_CLKSTAT 0x000B8184 /* Reset: POR */ +#define I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT 0 +#define I40E_GLGEN_CLKSTAT_CLKMODE_MASK I40E_MASK(0x1, I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT) +#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT 4 +#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_MASK I40E_MASK(0x3, I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT 8 +#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT 12 +#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT 16 +#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT 20 +#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT) +#define I40E_GLGEN_GPIO_CTL(_i) (0x00088100 + ((_i) * 4)) /* _i=0...29 */ /* Reset: POR */ +#define I40E_GLGEN_GPIO_CTL_MAX_INDEX 29 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT 0 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT 3 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT 4 +#define I40E_GLGEN_GPIO_CTL_PIN_DIR_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT) +#define I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT 5 +#define I40E_GLGEN_GPIO_CTL_TRI_CTL_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT) +#define I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT 6 +#define I40E_GLGEN_GPIO_CTL_OUT_CTL_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT 7 
+#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_MASK I40E_MASK(0x7, I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT 10 +#define I40E_GLGEN_GPIO_CTL_LED_INVRT_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT 11 +#define I40E_GLGEN_GPIO_CTL_LED_BLINK_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT 12 +#define I40E_GLGEN_GPIO_CTL_LED_MODE_MASK I40E_MASK(0x1F, I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT) +#define I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT 17 +#define I40E_GLGEN_GPIO_CTL_INT_MODE_MASK I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT) +#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT 19 +#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT 20 +#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_MASK I40E_MASK(0x3F, I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT 26 +#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_MASK I40E_MASK(0xF, I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT) +#define I40E_GLGEN_GPIO_SET 0x00088184 /* Reset: POR */ +#define I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT 0 +#define I40E_GLGEN_GPIO_SET_GPIO_INDX_MASK I40E_MASK(0x1F, I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT) +#define I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT 5 +#define I40E_GLGEN_GPIO_SET_SDP_DATA_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT) +#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT 6 +#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT) +#define I40E_GLGEN_GPIO_STAT 0x0008817C /* Reset: POR */ +#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT 0 +#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_MASK I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT) +#define I40E_GLGEN_GPIO_TRANSIT 0x00088180 /* Reset: POR */ +#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT 0 +#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_MASK 
I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT) +#define I40E_GLGEN_I2CCMD(_i) (0x000881E0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_I2CCMD_MAX_INDEX 3 +#define I40E_GLGEN_I2CCMD_DATA_SHIFT 0 +#define I40E_GLGEN_I2CCMD_DATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_I2CCMD_DATA_SHIFT) +#define I40E_GLGEN_I2CCMD_REGADD_SHIFT 16 +#define I40E_GLGEN_I2CCMD_REGADD_MASK I40E_MASK(0xFF, I40E_GLGEN_I2CCMD_REGADD_SHIFT) +#define I40E_GLGEN_I2CCMD_PHYADD_SHIFT 24 +#define I40E_GLGEN_I2CCMD_PHYADD_MASK I40E_MASK(0x7, I40E_GLGEN_I2CCMD_PHYADD_SHIFT) +#define I40E_GLGEN_I2CCMD_OP_SHIFT 27 +#define I40E_GLGEN_I2CCMD_OP_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_OP_SHIFT) +#define I40E_GLGEN_I2CCMD_RESET_SHIFT 28 +#define I40E_GLGEN_I2CCMD_RESET_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_RESET_SHIFT) +#define I40E_GLGEN_I2CCMD_R_SHIFT 29 +#define I40E_GLGEN_I2CCMD_R_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_R_SHIFT) +#define I40E_GLGEN_I2CCMD_E_SHIFT 31 +#define I40E_GLGEN_I2CCMD_E_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_E_SHIFT) +#define I40E_GLGEN_I2CPARAMS(_i) (0x000881AC + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_I2CPARAMS_MAX_INDEX 3 +#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT 0 +#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_MASK I40E_MASK(0x1F, I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT) +#define I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT 5 +#define I40E_GLGEN_I2CPARAMS_READ_TIME_MASK I40E_MASK(0x7, I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT) +#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT 8 +#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_SHIFT 9 +#define I40E_GLGEN_I2CPARAMS_CLK_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT 10 +#define I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT 11 +#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK 
I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT 12 +#define I40E_GLGEN_I2CPARAMS_DATA_IN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT 13 +#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT 14 +#define I40E_GLGEN_I2CPARAMS_CLK_IN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT 15 +#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT) +#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT 31 +#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT) +#define I40E_GLGEN_LED_CTL 0x00088178 /* Reset: POR */ +#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT 0 +#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_MASK I40E_MASK(0x1, I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT) +#define I40E_GLGEN_MDIO_CTRL(_i) (0x000881D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MDIO_CTRL_MAX_INDEX 3 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT 0 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_MASK I40E_MASK(0x1FFFF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT 17 +#define I40E_GLGEN_MDIO_CTRL_CONTMDC_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT 18 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_MASK I40E_MASK(0x7FF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT 29 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_MASK I40E_MASK(0x7, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL(_i) (0x000881C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MDIO_I2C_SEL_MAX_INDEX 3 +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT 0 +#define 
I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT 1 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_MASK I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT 5 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT 10 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT 15 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT 20 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT 25 +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_MASK I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT 31 +#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT) +#define I40E_GLGEN_MSCA(_i) (0x0008818C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MSCA_MAX_INDEX 3 +#define I40E_GLGEN_MSCA_MDIADD_SHIFT 0 +#define I40E_GLGEN_MSCA_MDIADD_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSCA_MDIADD_SHIFT) +#define I40E_GLGEN_MSCA_DEVADD_SHIFT 16 +#define I40E_GLGEN_MSCA_DEVADD_MASK I40E_MASK(0x1F, I40E_GLGEN_MSCA_DEVADD_SHIFT) +#define I40E_GLGEN_MSCA_PHYADD_SHIFT 21 +#define I40E_GLGEN_MSCA_PHYADD_MASK I40E_MASK(0x1F, I40E_GLGEN_MSCA_PHYADD_SHIFT) +#define I40E_GLGEN_MSCA_OPCODE_SHIFT 26 +#define I40E_GLGEN_MSCA_OPCODE_MASK I40E_MASK(0x3, I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_GLGEN_MSCA_STCODE_SHIFT 28 +#define I40E_GLGEN_MSCA_STCODE_MASK I40E_MASK(0x3, 
I40E_GLGEN_MSCA_STCODE_SHIFT) +#define I40E_GLGEN_MSCA_MDICMD_SHIFT 30 +#define I40E_GLGEN_MSCA_MDICMD_MASK I40E_MASK(0x1, I40E_GLGEN_MSCA_MDICMD_SHIFT) +#define I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT 31 +#define I40E_GLGEN_MSCA_MDIINPROGEN_MASK I40E_MASK(0x1, I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT) +#define I40E_GLGEN_MSRWD(_i) (0x0008819C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MSRWD_MAX_INDEX 3 +#define I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT 0 +#define I40E_GLGEN_MSRWD_MDIWRDATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT) +#define I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT 16 +#define I40E_GLGEN_MSRWD_MDIRDDATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT) +#define I40E_GLGEN_PCIFCNCNT 0x001C0AB4 /* Reset: PCIR */ +#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT 0 +#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_MASK I40E_MASK(0x1F, I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT) +#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT 16 +#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_MASK I40E_MASK(0xFF, I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT) +#define I40E_GLGEN_RSTAT 0x000B8188 /* Reset: POR */ +#define I40E_GLGEN_RSTAT_DEVSTATE_SHIFT 0 +#define I40E_GLGEN_RSTAT_DEVSTATE_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_DEVSTATE_SHIFT) +#define I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT 2 +#define I40E_GLGEN_RSTAT_RESET_TYPE_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT) +#define I40E_GLGEN_RSTAT_CORERCNT_SHIFT 4 +#define I40E_GLGEN_RSTAT_CORERCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_CORERCNT_SHIFT) +#define I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT 6 +#define I40E_GLGEN_RSTAT_GLOBRCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT) +#define I40E_GLGEN_RSTAT_EMPRCNT_SHIFT 8 +#define I40E_GLGEN_RSTAT_EMPRCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_EMPRCNT_SHIFT) +#define I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT 10 +#define I40E_GLGEN_RSTAT_TIME_TO_RST_MASK I40E_MASK(0x3F, I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT) +#define I40E_GLGEN_RSTCTL 0x000B8180 /* Reset: POR */ +#define I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT 0 
+#define I40E_GLGEN_RSTCTL_GRSTDEL_MASK I40E_MASK(0x3F, I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT) +#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT 8 +#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_MASK I40E_MASK(0x1, I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT) +#define I40E_GLGEN_RTRIG 0x000B8190 /* Reset: CORER */ +#define I40E_GLGEN_RTRIG_CORER_SHIFT 0 +#define I40E_GLGEN_RTRIG_CORER_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_CORER_SHIFT) +#define I40E_GLGEN_RTRIG_GLOBR_SHIFT 1 +#define I40E_GLGEN_RTRIG_GLOBR_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_GLOBR_SHIFT) +#define I40E_GLGEN_RTRIG_EMPFWR_SHIFT 2 +#define I40E_GLGEN_RTRIG_EMPFWR_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_EMPFWR_SHIFT) +#define I40E_GLGEN_STAT 0x000B612C /* Reset: POR */ +#define I40E_GLGEN_STAT_HWRSVD0_SHIFT 0 +#define I40E_GLGEN_STAT_HWRSVD0_MASK I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD0_SHIFT) +#define I40E_GLGEN_STAT_DCBEN_SHIFT 2 +#define I40E_GLGEN_STAT_DCBEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_DCBEN_SHIFT) +#define I40E_GLGEN_STAT_VTEN_SHIFT 3 +#define I40E_GLGEN_STAT_VTEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_VTEN_SHIFT) +#define I40E_GLGEN_STAT_FCOEN_SHIFT 4 +#define I40E_GLGEN_STAT_FCOEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_FCOEN_SHIFT) +#define I40E_GLGEN_STAT_EVBEN_SHIFT 5 +#define I40E_GLGEN_STAT_EVBEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_EVBEN_SHIFT) +#define I40E_GLGEN_STAT_HWRSVD1_SHIFT 6 +#define I40E_GLGEN_STAT_HWRSVD1_MASK I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD1_SHIFT) +#define I40E_GLGEN_VFLRSTAT(_i) (0x00092600 + ((_i) * 4)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLGEN_VFLRSTAT_MAX_INDEX 3 +#define I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT 0 +#define I40E_GLGEN_VFLRSTAT_VFLRE_MASK I40E_MASK(0xFFFFFFFF, I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT) +#define I40E_GLVFGEN_TIMER 0x000881BC /* Reset: CORER */ +#define I40E_GLVFGEN_TIMER_GTIME_SHIFT 0 +#define I40E_GLVFGEN_TIMER_GTIME_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVFGEN_TIMER_GTIME_SHIFT) +#define I40E_PFGEN_CTRL 0x00092400 /* Reset: PFR */ +#define I40E_PFGEN_CTRL_PFSWR_SHIFT 0 +#define 
I40E_PFGEN_CTRL_PFSWR_MASK I40E_MASK(0x1, I40E_PFGEN_CTRL_PFSWR_SHIFT) +#define I40E_PFGEN_DRUN 0x00092500 /* Reset: CORER */ +#define I40E_PFGEN_DRUN_DRVUNLD_SHIFT 0 +#define I40E_PFGEN_DRUN_DRVUNLD_MASK I40E_MASK(0x1, I40E_PFGEN_DRUN_DRVUNLD_SHIFT) +#define I40E_PFGEN_PORTNUM 0x001C0480 /* Reset: CORER */ +#define I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT 0 +#define I40E_PFGEN_PORTNUM_PORT_NUM_MASK I40E_MASK(0x3, I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT) +#define I40E_PFGEN_STATE 0x00088000 /* Reset: CORER */ +#define I40E_PFGEN_STATE_RESERVED_0_SHIFT 0 +#define I40E_PFGEN_STATE_RESERVED_0_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_RESERVED_0_SHIFT) +#define I40E_PFGEN_STATE_PFFCEN_SHIFT 1 +#define I40E_PFGEN_STATE_PFFCEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFFCEN_SHIFT) +#define I40E_PFGEN_STATE_PFLINKEN_SHIFT 2 +#define I40E_PFGEN_STATE_PFLINKEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFLINKEN_SHIFT) +#define I40E_PFGEN_STATE_PFSCEN_SHIFT 3 +#define I40E_PFGEN_STATE_PFSCEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFSCEN_SHIFT) +#define I40E_PRTGEN_CNF 0x000B8120 /* Reset: POR */ +#define I40E_PRTGEN_CNF_PORT_DIS_SHIFT 0 +#define I40E_PRTGEN_CNF_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT 1 +#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT 2 +#define I40E_PRTGEN_CNF_EMP_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF2 0x000B8160 /* Reset: POR */ +#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT 0 +#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT) +#define I40E_PRTGEN_STATUS 0x000B8100 /* Reset: POR */ +#define I40E_PRTGEN_STATUS_PORT_VALID_SHIFT 0 +#define I40E_PRTGEN_STATUS_PORT_VALID_MASK I40E_MASK(0x1, I40E_PRTGEN_STATUS_PORT_VALID_SHIFT) +#define I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT 1 +#define I40E_PRTGEN_STATUS_PORT_ACTIVE_MASK 
I40E_MASK(0x1, I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT) +#define I40E_VFGEN_RSTAT1(_VF) (0x00074400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFGEN_RSTAT1_MAX_INDEX 127 +#define I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT 0 +#define I40E_VFGEN_RSTAT1_VFR_STATE_MASK I40E_MASK(0x3, I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT) +#define I40E_VPGEN_VFRSTAT(_VF) (0x00091C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPGEN_VFRSTAT_MAX_INDEX 127 +#define I40E_VPGEN_VFRSTAT_VFRD_SHIFT 0 +#define I40E_VPGEN_VFRSTAT_VFRD_MASK I40E_MASK(0x1, I40E_VPGEN_VFRSTAT_VFRD_SHIFT) +#define I40E_VPGEN_VFRTRIG(_VF) (0x00091800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPGEN_VFRTRIG_MAX_INDEX 127 +#define I40E_VPGEN_VFRTRIG_VFSWR_SHIFT 0 +#define I40E_VPGEN_VFRTRIG_VFSWR_MASK I40E_MASK(0x1, I40E_VPGEN_VFRTRIG_VFSWR_SHIFT) +#define I40E_VSIGEN_RSTAT(_VSI) (0x00090800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_VSIGEN_RSTAT_MAX_INDEX 383 +#define I40E_VSIGEN_RSTAT_VMRD_SHIFT 0 +#define I40E_VSIGEN_RSTAT_VMRD_MASK I40E_MASK(0x1, I40E_VSIGEN_RSTAT_VMRD_SHIFT) +#define I40E_VSIGEN_RTRIG(_VSI) (0x00090000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_VSIGEN_RTRIG_MAX_INDEX 383 +#define I40E_VSIGEN_RTRIG_VMSWR_SHIFT 0 +#define I40E_VSIGEN_RTRIG_VMSWR_MASK I40E_MASK(0x1, I40E_VSIGEN_RTRIG_VMSWR_SHIFT) +#define I40E_GLHMC_FCOEDDPBASE(_i) (0x000C6600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPBASE_MAX_INDEX 15 +#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT 0 +#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT) +#define I40E_GLHMC_FCOEDDPCNT(_i) (0x000C6700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPCNT_MAX_INDEX 15 +#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT 0 +#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_MASK I40E_MASK(0xFFFFF, I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT) 
+#define I40E_GLHMC_FCOEDDPOBJSZ 0x000C2010 /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT 0 +#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT) +#define I40E_GLHMC_FCOEFBASE(_i) (0x000C6800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEFBASE_MAX_INDEX 15 +#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT 0 +#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT) +#define I40E_GLHMC_FCOEFCNT(_i) (0x000C6900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEFCNT_MAX_INDEX 15 +#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT 0 +#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_MASK I40E_MASK(0x7FFFFF, I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT) +#define I40E_GLHMC_FCOEFMAX 0x000C20D0 /* Reset: CORER */ +#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT 0 +#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK I40E_MASK(0xFFFF, I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT) +#define I40E_GLHMC_FCOEFOBJSZ 0x000C2018 /* Reset: CORER */ +#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT 0 +#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT) +#define I40E_GLHMC_FCOEMAX 0x000C2014 /* Reset: CORER */ +#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT 0 +#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_MASK I40E_MASK(0x1FFF, I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT) +#define I40E_GLHMC_FSIAVBASE(_i) (0x000C5600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIAVBASE_MAX_INDEX 15 +#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT 0 +#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT) +#define I40E_GLHMC_FSIAVCNT(_i) (0x000C5700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIAVCNT_MAX_INDEX 15 +#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT 0 +#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_MASK I40E_MASK(0x1FFFFFFF, 
I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT) +#define I40E_GLHMC_FSIAVCNT_RSVD_SHIFT 29 +#define I40E_GLHMC_FSIAVCNT_RSVD_MASK I40E_MASK(0x7, I40E_GLHMC_FSIAVCNT_RSVD_SHIFT) +#define I40E_GLHMC_FSIAVMAX 0x000C2068 /* Reset: CORER */ +#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT 0 +#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_MASK I40E_MASK(0x1FFFF, I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT) +#define I40E_GLHMC_FSIAVOBJSZ 0x000C2064 /* Reset: CORER */ +#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT 0 +#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT) +#define I40E_GLHMC_FSIMCBASE(_i) (0x000C6000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIMCBASE_MAX_INDEX 15 +#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT 0 +#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT) +#define I40E_GLHMC_FSIMCCNT(_i) (0x000C6100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIMCCNT_MAX_INDEX 15 +#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT 0 +#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT) +#define I40E_GLHMC_FSIMCMAX 0x000C2060 /* Reset: CORER */ +#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT 0 +#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_MASK I40E_MASK(0x3FFF, I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT) +#define I40E_GLHMC_FSIMCOBJSZ 0x000C205c /* Reset: CORER */ +#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT 0 +#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT) +#define I40E_GLHMC_LANQMAX 0x000C2008 /* Reset: CORER */ +#define I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT 0 +#define I40E_GLHMC_LANQMAX_PMLANQMAX_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT) +#define I40E_GLHMC_LANRXBASE(_i) (0x000C6400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANRXBASE_MAX_INDEX 15 +#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT 0 
+#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT) +#define I40E_GLHMC_LANRXCNT(_i) (0x000C6500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANRXCNT_MAX_INDEX 15 +#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT 0 +#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT) +#define I40E_GLHMC_LANRXOBJSZ 0x000C200c /* Reset: CORER */ +#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT 0 +#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT) +#define I40E_GLHMC_LANTXBASE(_i) (0x000C6200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANTXBASE_MAX_INDEX 15 +#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT 0 +#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT) +#define I40E_GLHMC_LANTXBASE_RSVD_SHIFT 24 +#define I40E_GLHMC_LANTXBASE_RSVD_MASK I40E_MASK(0xFF, I40E_GLHMC_LANTXBASE_RSVD_SHIFT) +#define I40E_GLHMC_LANTXCNT(_i) (0x000C6300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANTXCNT_MAX_INDEX 15 +#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT 0 +#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT) +#define I40E_GLHMC_LANTXOBJSZ 0x000C2004 /* Reset: CORER */ +#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT 0 +#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT) +#define I40E_GLHMC_PFASSIGN(_i) (0x000C0c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PFASSIGN_MAX_INDEX 15 +#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT 0 +#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_MASK I40E_MASK(0xF, I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT) +#define I40E_GLHMC_SDPART(_i) (0x000C0800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_SDPART_MAX_INDEX 15 +#define 
I40E_GLHMC_SDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_SDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_SDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_SDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_SDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_SDPART_PMSDSIZE_SHIFT) +#define I40E_PFHMC_ERRORDATA 0x000C0500 /* Reset: PFR */ +#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT 0 +#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_MASK I40E_MASK(0x3FFFFFFF, I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT) +#define I40E_PFHMC_ERRORINFO 0x000C0400 /* Reset: PFR */ +#define I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT 0 +#define I40E_PFHMC_ERRORINFO_PMF_INDEX_MASK I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT) +#define I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT 7 +#define I40E_PFHMC_ERRORINFO_PMF_ISVF_MASK I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT) +#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT 8 +#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_MASK I40E_MASK(0xF, I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT) +#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT 16 +#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_MASK I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT) +#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT 31 +#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_MASK I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT) +#define I40E_PFHMC_PDINV 0x000C0300 /* Reset: PFR */ +#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_PDINV_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_PFHMC_PDINV_PMSDIDX_SHIFT) +#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16 +#define I40E_PFHMC_PDINV_PMPDIDX_MASK I40E_MASK(0x1FF, I40E_PFHMC_PDINV_PMPDIDX_SHIFT) +#define I40E_PFHMC_SDCMD 0x000C0000 /* Reset: PFR */ +#define I40E_PFHMC_SDCMD_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_SDCMD_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_PFHMC_SDCMD_PMSDIDX_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT 31 +#define I40E_PFHMC_SDCMD_PMSDWR_MASK I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDWR_SHIFT) +#define I40E_PFHMC_SDDATAHIGH 0x000C0200 /* 
Reset: PFR */ +#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT 0 +#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT) +#define I40E_PFHMC_SDDATALOW 0x000C0100 /* Reset: PFR */ +#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT 0 +#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT 1 +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2 +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK I40E_MASK(0x3FF, I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT 12 +#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_MASK I40E_MASK(0xFFFFF, I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT) +#define I40E_GL_GP_FUSE(_i) (0x0009400C + ((_i) * 4)) /* _i=0...28 */ /* Reset: POR */ +#define I40E_GL_GP_FUSE_MAX_INDEX 28 +#define I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT 0 +#define I40E_GL_GP_FUSE_GL_GP_FUSE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT) +#define I40E_GL_UFUSE 0x00094008 /* Reset: POR */ +#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT 1 +#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_MASK I40E_MASK(0x1, I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT) +#define I40E_GL_UFUSE_NIC_ID_SHIFT 2 +#define I40E_GL_UFUSE_NIC_ID_MASK I40E_MASK(0x1, I40E_GL_UFUSE_NIC_ID_SHIFT) +#define I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT 10 +#define I40E_GL_UFUSE_ULT_LOCKOUT_MASK I40E_MASK(0x1, I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT) +#define I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT 11 +#define I40E_GL_UFUSE_CLS_LOCKOUT_MASK I40E_MASK(0x1, I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT) +#define I40E_EMPINT_GPIO_ENA 0x00088188 /* Reset: POR */ +#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT 0 +#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT 1 +#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_MASK I40E_MASK(0x1, 
I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT 2 +#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT 3 +#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT 4 +#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT 5 +#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT 6 +#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT 7 +#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT 8 +#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT 9 +#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT 10 +#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT 11 +#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT 12 +#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT 13 +#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT 14 +#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT) +#define 
I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT 15 +#define I40E_EMPINT_GPIO_ENA_GPIO15_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT 16 +#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT 17 +#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT 18 +#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT 19 +#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT 20 +#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT 21 +#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT 22 +#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT 23 +#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT 24 +#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT 25 +#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT 26 +#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT 27 +#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT 28 +#define 
I40E_EMPINT_GPIO_ENA_GPIO28_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT 29 +#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT) +#define I40E_PFGEN_PORTMDIO_NUM 0x0003F100 /* Reset: CORER */ +#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT 0 +#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_MASK I40E_MASK(0x3, I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT) +#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT 4 +#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_MASK I40E_MASK(0x1, I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT) +#define I40E_PFINT_AEQCTL 0x00038700 /* Reset: CORER */ +#define I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_PFINT_AEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_ITR_INDX_SHIFT 11 +#define I40E_PFINT_AEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_AEQCTL_ITR_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_PFINT_AEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_PFINT_AEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT) +#define I40E_PFINT_AEQCTL_INTEVENT_SHIFT 31 +#define I40E_PFINT_AEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_AEQCTL_INTEVENT_SHIFT) +#define I40E_PFINT_CEQCTL(_INTPF) (0x00036800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_PFINT_CEQCTL_MAX_INDEX 511 +#define I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_PFINT_CEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_ITR_INDX_SHIFT 11 +#define I40E_PFINT_CEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_CEQCTL_ITR_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_PFINT_CEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT 16 +#define 
I40E_PFINT_CEQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_PFINT_CEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT) +#define I40E_PFINT_CEQCTL_INTEVENT_SHIFT 31 +#define I40E_PFINT_CEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_CEQCTL_INTEVENT_SHIFT) +#define I40E_GLINT_CTL 0x0003F800 /* Reset: CORER */ +#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT 0 +#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT) +#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT 1 +#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT) +#define I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT 2 +#define I40E_GLINT_CTL_DIS_AUTOMASK_N_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT) +#define I40E_PFINT_DYN_CTL0 0x00038480 /* Reset: PFR */ +#define I40E_PFINT_DYN_CTL0_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTL0_INTENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTL0_CLEARPBA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT 2 +#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT) +#define I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTL0_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTL0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT) +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT 25 +#define 
I40E_PFINT_DYN_CTL0_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT 31 +#define I40E_PFINT_DYN_CTL0_INTENA_MSK_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT) +#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_DYN_CTLN_MAX_INDEX 511 +#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTLN_INTENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT 2 +#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT) +#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT 31 +#define I40E_PFINT_DYN_CTLN_INTENA_MSK_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT) +#define I40E_PFINT_GPIO_ENA 0x00088080 /* Reset: CORER */ +#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT 0 +#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT 1 +#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT 2 +#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_MASK I40E_MASK(0x1, 
I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT 3 +#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT 4 +#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT 5 +#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT 6 +#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT 7 +#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT 8 +#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT 9 +#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT 10 +#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT 11 +#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT 12 +#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT 13 +#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT 14 +#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT 15 +#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT 16 +#define 
I40E_PFINT_GPIO_ENA_GPIO16_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT 17 +#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT 18 +#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT 19 +#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT 20 +#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT 21 +#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT 22 +#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT 23 +#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT 24 +#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT 25 +#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT 26 +#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT 27 +#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT 28 +#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT 29 +#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT) 
+#define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ +#define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 +#define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_0_SHIFT 1 +#define I40E_PFINT_ICR0_QUEUE_0_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_0_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_1_SHIFT 2 +#define I40E_PFINT_ICR0_QUEUE_1_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_1_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_2_SHIFT 3 +#define I40E_PFINT_ICR0_QUEUE_2_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_2_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_3_SHIFT 4 +#define I40E_PFINT_ICR0_QUEUE_3_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_3_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_4_SHIFT 5 +#define I40E_PFINT_ICR0_QUEUE_4_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_4_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_5_SHIFT 6 +#define I40E_PFINT_ICR0_QUEUE_5_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_5_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_6_SHIFT 7 +#define I40E_PFINT_ICR0_QUEUE_6_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_6_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_7_SHIFT 8 +#define I40E_PFINT_ICR0_QUEUE_7_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_7_SHIFT) +#define I40E_PFINT_ICR0_ECC_ERR_SHIFT 16 +#define I40E_PFINT_ICR0_ECC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ECC_ERR_SHIFT) +#define I40E_PFINT_ICR0_MAL_DETECT_SHIFT 19 +#define I40E_PFINT_ICR0_MAL_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_MAL_DETECT_SHIFT) +#define I40E_PFINT_ICR0_GRST_SHIFT 20 +#define I40E_PFINT_ICR0_GRST_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_GRST_SHIFT) +#define I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT 21 +#define I40E_PFINT_ICR0_PCI_EXCEPTION_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT) +#define I40E_PFINT_ICR0_GPIO_SHIFT 22 +#define I40E_PFINT_ICR0_GPIO_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_GPIO_SHIFT) +#define I40E_PFINT_ICR0_TIMESYNC_SHIFT 23 +#define I40E_PFINT_ICR0_TIMESYNC_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_TIMESYNC_SHIFT) +#define I40E_PFINT_ICR0_STORM_DETECT_SHIFT 24 
+#define I40E_PFINT_ICR0_STORM_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_STORM_DETECT_SHIFT) +#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT) +#define I40E_PFINT_ICR0_HMC_ERR_SHIFT 26 +#define I40E_PFINT_ICR0_HMC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_HMC_ERR_SHIFT) +#define I40E_PFINT_ICR0_PE_CRITERR_SHIFT 28 +#define I40E_PFINT_ICR0_PE_CRITERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_PE_CRITERR_SHIFT) +#define I40E_PFINT_ICR0_VFLR_SHIFT 29 +#define I40E_PFINT_ICR0_VFLR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_VFLR_SHIFT) +#define I40E_PFINT_ICR0_ADMINQ_SHIFT 30 +#define I40E_PFINT_ICR0_ADMINQ_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ADMINQ_SHIFT) +#define I40E_PFINT_ICR0_SWINT_SHIFT 31 +#define I40E_PFINT_ICR0_SWINT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_SWINT_SHIFT) +#define I40E_PFINT_ICR0_ENA 0x00038800 /* Reset: CORER */ +#define I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT 16 +#define I40E_PFINT_ICR0_ENA_ECC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT 19 +#define I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT) +#define I40E_PFINT_ICR0_ENA_GRST_SHIFT 20 +#define I40E_PFINT_ICR0_ENA_GRST_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GRST_SHIFT) +#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT 21 +#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT) +#define I40E_PFINT_ICR0_ENA_GPIO_SHIFT 22 +#define I40E_PFINT_ICR0_ENA_GPIO_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GPIO_SHIFT) +#define I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT 23 +#define I40E_PFINT_ICR0_ENA_TIMESYNC_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT) +#define I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT 24 +#define I40E_PFINT_ICR0_ENA_STORM_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT) +#define I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25 +#define 
I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT) +#define I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT 26 +#define I40E_PFINT_ICR0_ENA_HMC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT 28 +#define I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_VFLR_SHIFT 29 +#define I40E_PFINT_ICR0_ENA_VFLR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_VFLR_SHIFT) +#define I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT 30 +#define I40E_PFINT_ICR0_ENA_ADMINQ_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT) +#define I40E_PFINT_ICR0_ENA_RSVD_SHIFT 31 +#define I40E_PFINT_ICR0_ENA_RSVD_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_RSVD_SHIFT) +#define I40E_PFINT_ITR0(_i) (0x00038000 + ((_i) * 128)) /* _i=0...2 */ /* Reset: PFR */ +#define I40E_PFINT_ITR0_MAX_INDEX 2 +#define I40E_PFINT_ITR0_INTERVAL_SHIFT 0 +#define I40E_PFINT_ITR0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_ITR0_INTERVAL_SHIFT) +#define I40E_PFINT_ITRN(_i, _INTPF) (0x00030000 + ((_i) * 2048 + (_INTPF) * 4)) /* _i=0...2, _INTPF=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_ITRN_MAX_INDEX 2 +#define I40E_PFINT_ITRN_INTERVAL_SHIFT 0 +#define I40E_PFINT_ITRN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_ITRN_INTERVAL_SHIFT) +#define I40E_PFINT_LNKLST0 0x00038500 /* Reset: PFR */ +#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT 0 +#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT) +#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11 +#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT) +#define I40E_PFINT_LNKLSTN(_INTPF) (0x00035000 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_LNKLSTN_MAX_INDEX 511 +#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0 +#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) +#define 
I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11 +#define I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT) +#define I40E_PFINT_RATE0 0x00038580 /* Reset: PFR */ +#define I40E_PFINT_RATE0_INTERVAL_SHIFT 0 +#define I40E_PFINT_RATE0_INTERVAL_MASK I40E_MASK(0x3F, I40E_PFINT_RATE0_INTERVAL_SHIFT) +#define I40E_PFINT_RATE0_INTRL_ENA_SHIFT 6 +#define I40E_PFINT_RATE0_INTRL_ENA_MASK I40E_MASK(0x1, I40E_PFINT_RATE0_INTRL_ENA_SHIFT) +#define I40E_PFINT_RATEN(_INTPF) (0x00035800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_RATEN_MAX_INDEX 511 +#define I40E_PFINT_RATEN_INTERVAL_SHIFT 0 +#define I40E_PFINT_RATEN_INTERVAL_MASK I40E_MASK(0x3F, I40E_PFINT_RATEN_INTERVAL_SHIFT) +#define I40E_PFINT_RATEN_INTRL_ENA_SHIFT 6 +#define I40E_PFINT_RATEN_INTRL_ENA_MASK I40E_MASK(0x1, I40E_PFINT_RATEN_INTRL_ENA_SHIFT) +#define I40E_PFINT_STAT_CTL0 0x00038400 /* Reset: CORER */ +#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2 +#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT) +#define I40E_QINT_RQCTL(_Q) (0x0003A000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QINT_RQCTL_MAX_INDEX 1535 +#define I40E_QINT_RQCTL_MSIX_INDX_SHIFT 0 +#define I40E_QINT_RQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_QINT_RQCTL_MSIX_INDX_SHIFT) +#define I40E_QINT_RQCTL_ITR_INDX_SHIFT 11 +#define I40E_QINT_RQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_QINT_RQCTL_ITR_INDX_SHIFT) +#define I40E_QINT_RQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_QINT_RQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_QINT_RQCTL_MSIX0_INDX_SHIFT) +#define I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_QINT_RQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) +#define I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_QINT_RQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_QINT_RQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_QINT_RQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, 
I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) +#define I40E_QINT_RQCTL_INTEVENT_SHIFT 31 +#define I40E_QINT_RQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_QINT_RQCTL_INTEVENT_SHIFT) +#define I40E_QINT_TQCTL(_Q) (0x0003C000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QINT_TQCTL_MAX_INDEX 1535 +#define I40E_QINT_TQCTL_MSIX_INDX_SHIFT 0 +#define I40E_QINT_TQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_QINT_TQCTL_MSIX_INDX_SHIFT) +#define I40E_QINT_TQCTL_ITR_INDX_SHIFT 11 +#define I40E_QINT_TQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_QINT_TQCTL_ITR_INDX_SHIFT) +#define I40E_QINT_TQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_QINT_TQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_QINT_TQCTL_MSIX0_INDX_SHIFT) +#define I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_QINT_TQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) +#define I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_QINT_TQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_QINT_TQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_QINT_TQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_QINT_TQCTL_CAUSE_ENA_SHIFT) +#define I40E_QINT_TQCTL_INTEVENT_SHIFT 31 +#define I40E_QINT_TQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_QINT_TQCTL_INTEVENT_SHIFT) +#define I40E_VFINT_DYN_CTL0(_VF) (0x0002A400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTL0_MAX_INDEX 127 +#define I40E_VFINT_DYN_CTL0_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTL0_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTL0_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTL0_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT 5 +#define 
I40E_VFINT_DYN_CTL0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTL0_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT) +#define I40E_VFINT_DYN_CTLN(_INTVF) (0x00024800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTLN_MAX_INDEX 511 +#define I40E_VFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTLN_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTLN_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTLN_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT) +#define I40E_VFINT_ICR0(_VF) (0x0002BC00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_ICR0_MAX_INDEX 127 +#define 
I40E_VFINT_ICR0_INTEVENT_SHIFT 0 +#define I40E_VFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_INTEVENT_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_0_SHIFT 1 +#define I40E_VFINT_ICR0_QUEUE_0_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_0_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_1_SHIFT 2 +#define I40E_VFINT_ICR0_QUEUE_1_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_1_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_2_SHIFT 3 +#define I40E_VFINT_ICR0_QUEUE_2_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_2_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_3_SHIFT 4 +#define I40E_VFINT_ICR0_QUEUE_3_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_3_SHIFT) +#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_SWINT_SHIFT 31 +#define I40E_VFINT_ICR0_SWINT_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_SWINT_SHIFT) +#define I40E_VFINT_ICR0_ENA(_VF) (0x0002C000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_ICR0_ENA_MAX_INDEX 127 +#define I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ENA_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_ENA_RSVD_SHIFT 31 +#define I40E_VFINT_ICR0_ENA_RSVD_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_RSVD_SHIFT) +#define I40E_VFINT_ITR0(_i, _VF) (0x00028000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...2, _VF=0...127 */ /* Reset: VFR */ +#define I40E_VFINT_ITR0_MAX_INDEX 2 +#define I40E_VFINT_ITR0_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITR0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITR0_INTERVAL_SHIFT) +#define I40E_VFINT_ITRN(_i, _INTVF) (0x00020000 + ((_i) * 2048 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...511 */ /* Reset: VFR */ 
+#define I40E_VFINT_ITRN_MAX_INDEX 2 +#define I40E_VFINT_ITRN_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITRN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITRN_INTERVAL_SHIFT) +#define I40E_VFINT_STAT_CTL0(_VF) (0x0002A000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_STAT_CTL0_MAX_INDEX 127 +#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2 +#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT) +#define I40E_VPINT_AEQCTL(_VF) (0x0002B800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPINT_AEQCTL_MAX_INDEX 127 +#define I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_VPINT_AEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_ITR_INDX_SHIFT 11 +#define I40E_VPINT_AEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_VPINT_AEQCTL_ITR_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_VPINT_AEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_VPINT_AEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT) +#define I40E_VPINT_AEQCTL_INTEVENT_SHIFT 31 +#define I40E_VPINT_AEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_VPINT_AEQCTL_INTEVENT_SHIFT) +#define I40E_VPINT_CEQCTL(_INTVF) (0x00026800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_VPINT_CEQCTL_MAX_INDEX 511 +#define I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_VPINT_CEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_ITR_INDX_SHIFT 11 +#define I40E_VPINT_CEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_VPINT_CEQCTL_ITR_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_VPINT_CEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_VPINT_CEQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT) 
+#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_VPINT_CEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT) +#define I40E_VPINT_CEQCTL_INTEVENT_SHIFT 31 +#define I40E_VPINT_CEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_VPINT_CEQCTL_INTEVENT_SHIFT) +#define I40E_VPINT_LNKLST0(_VF) (0x0002A800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPINT_LNKLST0_MAX_INDEX 127 +#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT 0 +#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT) +#define I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11 +#define I40E_VPINT_LNKLST0_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT) +#define I40E_VPINT_LNKLSTN(_INTVF) (0x00025000 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VPINT_LNKLSTN_MAX_INDEX 511 +#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0 +#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT) +#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11 +#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT) +#define I40E_VPINT_RATE0(_VF) (0x0002AC00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPINT_RATE0_MAX_INDEX 127 +#define I40E_VPINT_RATE0_INTERVAL_SHIFT 0 +#define I40E_VPINT_RATE0_INTERVAL_MASK I40E_MASK(0x3F, I40E_VPINT_RATE0_INTERVAL_SHIFT) +#define I40E_VPINT_RATE0_INTRL_ENA_SHIFT 6 +#define I40E_VPINT_RATE0_INTRL_ENA_MASK I40E_MASK(0x1, I40E_VPINT_RATE0_INTRL_ENA_SHIFT) +#define I40E_VPINT_RATEN(_INTVF) (0x00025800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VPINT_RATEN_MAX_INDEX 511 +#define I40E_VPINT_RATEN_INTERVAL_SHIFT 0 +#define I40E_VPINT_RATEN_INTERVAL_MASK I40E_MASK(0x3F, I40E_VPINT_RATEN_INTERVAL_SHIFT) +#define I40E_VPINT_RATEN_INTRL_ENA_SHIFT 6 
+#define I40E_VPINT_RATEN_INTRL_ENA_MASK I40E_MASK(0x1, I40E_VPINT_RATEN_INTRL_ENA_SHIFT) +#define I40E_GL_RDPU_CNTRL 0x00051060 /* Reset: CORER */ +#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT 0 +#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_MASK I40E_MASK(0x1, I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT) +#define I40E_GL_RDPU_CNTRL_ECO_SHIFT 1 +#define I40E_GL_RDPU_CNTRL_ECO_MASK I40E_MASK(0x7FFFFFFF, I40E_GL_RDPU_CNTRL_ECO_SHIFT) +#define I40E_GLLAN_RCTL_0 0x0012A500 /* Reset: CORER */ +#define I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT 0 +#define I40E_GLLAN_RCTL_0_PXE_MODE_MASK I40E_MASK(0x1, I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT) +#define I40E_GLLAN_TSOMSK_F 0x000442D8 /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT 0 +#define I40E_GLLAN_TSOMSK_F_TCPMSKF_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT) +#define I40E_GLLAN_TSOMSK_L 0x000442E0 /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT 0 +#define I40E_GLLAN_TSOMSK_L_TCPMSKL_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT) +#define I40E_GLLAN_TSOMSK_M 0x000442DC /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT 0 +#define I40E_GLLAN_TSOMSK_M_TCPMSKM_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS(_i) (0x000e6500 + ((_i) * 4)) /* _i=0...11 */ /* Reset: CORER */ +#define I40E_GLLAN_TXPRE_QDIS_MAX_INDEX 11 +#define I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT 0 +#define I40E_GLLAN_TXPRE_QDIS_QINDX_MASK I40E_MASK(0x7FF, I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT 16 +#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT 30 +#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT 31 +#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT) +#define I40E_PFLAN_QALLOC 0x001C0400 /* Reset: CORER */ +#define 
I40E_PFLAN_QALLOC_FIRSTQ_SHIFT 0 +#define I40E_PFLAN_QALLOC_FIRSTQ_MASK I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_FIRSTQ_SHIFT) +#define I40E_PFLAN_QALLOC_LASTQ_SHIFT 16 +#define I40E_PFLAN_QALLOC_LASTQ_MASK I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_LASTQ_SHIFT) +#define I40E_PFLAN_QALLOC_VALID_SHIFT 31 +#define I40E_PFLAN_QALLOC_VALID_MASK I40E_MASK(0x1, I40E_PFLAN_QALLOC_VALID_SHIFT) +#define I40E_QRX_ENA(_Q) (0x00120000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QRX_ENA_MAX_INDEX 1535 +#define I40E_QRX_ENA_QENA_REQ_SHIFT 0 +#define I40E_QRX_ENA_QENA_REQ_MASK I40E_MASK(0x1, I40E_QRX_ENA_QENA_REQ_SHIFT) +#define I40E_QRX_ENA_FAST_QDIS_SHIFT 1 +#define I40E_QRX_ENA_FAST_QDIS_MASK I40E_MASK(0x1, I40E_QRX_ENA_FAST_QDIS_SHIFT) +#define I40E_QRX_ENA_QENA_STAT_SHIFT 2 +#define I40E_QRX_ENA_QENA_STAT_MASK I40E_MASK(0x1, I40E_QRX_ENA_QENA_STAT_SHIFT) +#define I40E_QRX_TAIL(_Q) (0x00128000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QRX_TAIL_MAX_INDEX 1535 +#define I40E_QRX_TAIL_TAIL_SHIFT 0 +#define I40E_QRX_TAIL_TAIL_MASK I40E_MASK(0x1FFF, I40E_QRX_TAIL_TAIL_SHIFT) +#define I40E_QTX_CTL(_Q) (0x00104000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QTX_CTL_MAX_INDEX 1535 +#define I40E_QTX_CTL_PFVF_Q_SHIFT 0 +#define I40E_QTX_CTL_PFVF_Q_MASK I40E_MASK(0x3, I40E_QTX_CTL_PFVF_Q_SHIFT) +#define I40E_QTX_CTL_PF_INDX_SHIFT 2 +#define I40E_QTX_CTL_PF_INDX_MASK I40E_MASK(0xF, I40E_QTX_CTL_PF_INDX_SHIFT) +#define I40E_QTX_CTL_VFVM_INDX_SHIFT 7 +#define I40E_QTX_CTL_VFVM_INDX_MASK I40E_MASK(0x1FF, I40E_QTX_CTL_VFVM_INDX_SHIFT) +#define I40E_QTX_ENA(_Q) (0x00100000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QTX_ENA_MAX_INDEX 1535 +#define I40E_QTX_ENA_QENA_REQ_SHIFT 0 +#define I40E_QTX_ENA_QENA_REQ_MASK I40E_MASK(0x1, I40E_QTX_ENA_QENA_REQ_SHIFT) +#define I40E_QTX_ENA_FAST_QDIS_SHIFT 1 +#define I40E_QTX_ENA_FAST_QDIS_MASK I40E_MASK(0x1, I40E_QTX_ENA_FAST_QDIS_SHIFT) +#define I40E_QTX_ENA_QENA_STAT_SHIFT 2 +#define 
I40E_QTX_ENA_QENA_STAT_MASK I40E_MASK(0x1, I40E_QTX_ENA_QENA_STAT_SHIFT) +#define I40E_QTX_HEAD(_Q) (0x000E4000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QTX_HEAD_MAX_INDEX 1535 +#define I40E_QTX_HEAD_HEAD_SHIFT 0 +#define I40E_QTX_HEAD_HEAD_MASK I40E_MASK(0x1FFF, I40E_QTX_HEAD_HEAD_SHIFT) +#define I40E_QTX_HEAD_RS_PENDING_SHIFT 16 +#define I40E_QTX_HEAD_RS_PENDING_MASK I40E_MASK(0x1, I40E_QTX_HEAD_RS_PENDING_SHIFT) +#define I40E_QTX_TAIL(_Q) (0x00108000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QTX_TAIL_MAX_INDEX 1535 +#define I40E_QTX_TAIL_TAIL_SHIFT 0 +#define I40E_QTX_TAIL_TAIL_MASK I40E_MASK(0x1FFF, I40E_QTX_TAIL_TAIL_SHIFT) +#define I40E_VPLAN_MAPENA(_VF) (0x00074000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_MAPENA_MAX_INDEX 127 +#define I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT 0 +#define I40E_VPLAN_MAPENA_TXRX_ENA_MASK I40E_MASK(0x1, I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT) +#define I40E_VPLAN_QTABLE(_i, _VF) (0x00070000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_QTABLE_MAX_INDEX 15 +#define I40E_VPLAN_QTABLE_QINDEX_SHIFT 0 +#define I40E_VPLAN_QTABLE_QINDEX_MASK I40E_MASK(0x7FF, I40E_VPLAN_QTABLE_QINDEX_SHIFT) +#define I40E_VSILAN_QBASE(_VSI) (0x0020C800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: PFR */ +#define I40E_VSILAN_QBASE_MAX_INDEX 383 +#define I40E_VSILAN_QBASE_VSIBASE_SHIFT 0 +#define I40E_VSILAN_QBASE_VSIBASE_MASK I40E_MASK(0x7FF, I40E_VSILAN_QBASE_VSIBASE_SHIFT) +#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT 11 +#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK I40E_MASK(0x1, I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT) +#define I40E_VSILAN_QTABLE(_i, _VSI) (0x00200000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...7, _VSI=0...383 */ /* Reset: PFR */ +#define I40E_VSILAN_QTABLE_MAX_INDEX 7 +#define I40E_VSILAN_QTABLE_QINDEX_0_SHIFT 0 +#define I40E_VSILAN_QTABLE_QINDEX_0_MASK I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_0_SHIFT) +#define 
I40E_VSILAN_QTABLE_QINDEX_1_SHIFT 16 +#define I40E_VSILAN_QTABLE_QINDEX_1_MASK I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_1_SHIFT) +#define I40E_PRTGL_SAH 0x001E2140 /* Reset: GLOBR */ +#define I40E_PRTGL_SAH_FC_SAH_SHIFT 0 +#define I40E_PRTGL_SAH_FC_SAH_MASK I40E_MASK(0xFFFF, I40E_PRTGL_SAH_FC_SAH_SHIFT) +#define I40E_PRTGL_SAH_MFS_SHIFT 16 +#define I40E_PRTGL_SAH_MFS_MASK I40E_MASK(0xFFFF, I40E_PRTGL_SAH_MFS_SHIFT) +#define I40E_PRTGL_SAL 0x001E2120 /* Reset: GLOBR */ +#define I40E_PRTGL_SAL_FC_SAL_SHIFT 0 +#define I40E_PRTGL_SAL_FC_SAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTGL_SAL_FC_SAL_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP 0x001E30E0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP 0x001E3260 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP 0x001E32E0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL 0x001E3360 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1 0x001E3110 /* Reset: GLOBR */ +#define 
I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2 0x001E3120 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE 0x001E30C0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_MASK I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1 0x001E3140 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2 0x001E3150 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE 0x001E30D0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_MASK I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT) +#define 
I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(_i) (0x001E3370 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX 8 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(_i) (0x001E3400 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MAX_INDEX 8 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1 0x001E34B0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2 0x001E34C0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A 0x0008C480 /* Reset: GLOBR */ +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT 0 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT 2 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT 4 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_MASK 
I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT 6 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT 8 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT 10 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT 12 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT 14 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B 0x0008C484 /* Reset: GLOBR */ +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT 0 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT 2 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT 4 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT 6 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT 8 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT 10 +#define 
I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT 12 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT 14 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT) +#define I40E_GL_FWRESETCNT 0x00083100 /* Reset: POR */ +#define I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT 0 +#define I40E_GL_FWRESETCNT_FWRESETCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT) +#define I40E_GL_MNG_FWSM 0x000B6134 /* Reset: POR */ +#define I40E_GL_MNG_FWSM_FW_MODES_SHIFT 0 +#define I40E_GL_MNG_FWSM_FW_MODES_MASK I40E_MASK(0x3, I40E_GL_MNG_FWSM_FW_MODES_SHIFT) +#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT 10 +#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT) +#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT 11 +#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_MASK I40E_MASK(0xF, I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT) +#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT 15 +#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT) +#define I40E_GL_MNG_FWSM_RESET_CNT_SHIFT 16 +#define I40E_GL_MNG_FWSM_RESET_CNT_MASK I40E_MASK(0x7, I40E_GL_MNG_FWSM_RESET_CNT_SHIFT) +#define I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT 19 +#define I40E_GL_MNG_FWSM_EXT_ERR_IND_MASK I40E_MASK(0x3F, I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT 26 +#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT 27 +#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT) +#define 
I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT 28 +#define I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT 29 +#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_HWARB_CTRL 0x000B6130 /* Reset: POR */ +#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT 0 +#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_MASK I40E_MASK(0x1, I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT) +#define I40E_PRT_MNG_FTFT_DATA(_i) (0x000852A0 + ((_i) * 32)) /* _i=0...31 */ /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_DATA_MAX_INDEX 31 +#define I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT 0 +#define I40E_PRT_MNG_FTFT_DATA_DWORD_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT) +#define I40E_PRT_MNG_FTFT_LENGTH 0x00085260 /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT 0 +#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_MASK I40E_MASK(0xFF, I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT) +#define I40E_PRT_MNG_FTFT_MASK(_i) (0x00085160 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_MASK_MAX_INDEX 7 +#define I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT 0 +#define I40E_PRT_MNG_FTFT_MASK_MASK_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT) +#define I40E_PRT_MNG_MANC 0x00256A20 /* Reset: POR */ +#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT 0 +#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT) +#define I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT 1 +#define I40E_PRT_MNG_MANC_NCSI_DISCARD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT) +#define I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT 17 +#define I40E_PRT_MNG_MANC_RCV_TCO_EN_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT) +#define I40E_PRT_MNG_MANC_RCV_ALL_SHIFT 19 +#define I40E_PRT_MNG_MANC_RCV_ALL_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_ALL_SHIFT) +#define 
I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT 25 +#define I40E_PRT_MNG_MANC_FIXED_NET_TYPE_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT) +#define I40E_PRT_MNG_MANC_NET_TYPE_SHIFT 26 +#define I40E_PRT_MNG_MANC_NET_TYPE_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_NET_TYPE_SHIFT) +#define I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT 28 +#define I40E_PRT_MNG_MANC_EN_BMC2OS_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT) +#define I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT 29 +#define I40E_PRT_MNG_MANC_EN_BMC2NET_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT) +#define I40E_PRT_MNG_MAVTV(_i) (0x00255900 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MAVTV_MAX_INDEX 7 +#define I40E_PRT_MNG_MAVTV_VID_SHIFT 0 +#define I40E_PRT_MNG_MAVTV_VID_MASK I40E_MASK(0xFFF, I40E_PRT_MNG_MAVTV_VID_SHIFT) +#define I40E_PRT_MNG_MDEF(_i) (0x00255D00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEF_MAX_INDEX 7 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT 0 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT 4 +#define I40E_PRT_MNG_MDEF_BROADCAST_AND_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT 5 +#define I40E_PRT_MNG_MDEF_VLAN_AND_MASK I40E_MASK(0xFF, I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT 13 +#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT 17 +#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT 21 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT 25 +#define I40E_PRT_MNG_MDEF_BROADCAST_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT) +#define 
I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT 26 +#define I40E_PRT_MNG_MDEF_MULTICAST_AND_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT 27 +#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT 28 +#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT 29 +#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT 30 +#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT 31 +#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT(_i) (0x00255F00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEF_EXT_MAX_INDEX 7 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT 0 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT 4 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT 8 +#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT 24 +#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT 25 +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT 26 +#define 
I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT 27 +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT 28 +#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT 29 +#define I40E_PRT_MNG_MDEF_EXT_MLD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT 30 +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT 31 +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT) +#define I40E_PRT_MNG_MDEFVSI(_i) (0x00256580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEFVSI_MAX_INDEX 3 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT 0 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT) +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT 16 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT) +#define I40E_PRT_MNG_METF(_i) (0x00256780 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_METF_MAX_INDEX 3 +#define I40E_PRT_MNG_METF_ETYPE_SHIFT 0 +#define I40E_PRT_MNG_METF_ETYPE_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_METF_ETYPE_SHIFT) +#define I40E_PRT_MNG_METF_POLARITY_SHIFT 30 +#define I40E_PRT_MNG_METF_POLARITY_MASK I40E_MASK(0x1, I40E_PRT_MNG_METF_POLARITY_SHIFT) +#define I40E_PRT_MNG_MFUTP(_i) (0x00254E00 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */ +#define I40E_PRT_MNG_MFUTP_MAX_INDEX 15 +#define I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT 0 
+#define I40E_PRT_MNG_MFUTP_MFUTP_N_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT) +#define I40E_PRT_MNG_MFUTP_UDP_SHIFT 16 +#define I40E_PRT_MNG_MFUTP_UDP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_UDP_SHIFT) +#define I40E_PRT_MNG_MFUTP_TCP_SHIFT 17 +#define I40E_PRT_MNG_MFUTP_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_TCP_SHIFT) +#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT 18 +#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT) +#define I40E_PRT_MNG_MIPAF4(_i) (0x00256280 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MIPAF4_MAX_INDEX 3 +#define I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT 0 +#define I40E_PRT_MNG_MIPAF4_MIPAF_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT) +#define I40E_PRT_MNG_MIPAF6(_i) (0x00254200 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */ +#define I40E_PRT_MNG_MIPAF6_MAX_INDEX 15 +#define I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT 0 +#define I40E_PRT_MNG_MIPAF6_MIPAF_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT) +#define I40E_PRT_MNG_MMAH(_i) (0x00256380 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MMAH_MAX_INDEX 3 +#define I40E_PRT_MNG_MMAH_MMAH_SHIFT 0 +#define I40E_PRT_MNG_MMAH_MMAH_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MMAH_MMAH_SHIFT) +#define I40E_PRT_MNG_MMAL(_i) (0x00256480 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MMAL_MAX_INDEX 3 +#define I40E_PRT_MNG_MMAL_MMAL_SHIFT 0 +#define I40E_PRT_MNG_MMAL_MMAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MMAL_MMAL_SHIFT) +#define I40E_PRT_MNG_MNGONLY 0x00256A60 /* Reset: POR */ +#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT 0 +#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_MASK I40E_MASK(0xFF, I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT) +#define I40E_PRT_MNG_MSFM 0x00256AA0 /* Reset: POR */ +#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT 0 +#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_MASK I40E_MASK(0x1, 
I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT 1 +#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT 2 +#define I40E_PRT_MNG_MSFM_PORT_298_UDP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT 3 +#define I40E_PRT_MNG_MSFM_PORT_298_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT 4 +#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT 5 +#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT 6 +#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT 7 +#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT) +#define I40E_MSIX_PBA(_i) (0x00001000 + ((_i) * 4)) /* _i=0...5 */ /* Reset: FLR */ +#define I40E_MSIX_PBA_MAX_INDEX 5 +#define I40E_MSIX_PBA_PENBIT_SHIFT 0 +#define I40E_MSIX_PBA_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_PBA_PENBIT_SHIFT) +#define I40E_MSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TADD_MAX_INDEX 128 +#define I40E_MSIX_TADD_MSIXTADD10_SHIFT 0 +#define I40E_MSIX_TADD_MSIXTADD10_MASK I40E_MASK(0x3, I40E_MSIX_TADD_MSIXTADD10_SHIFT) +#define I40E_MSIX_TADD_MSIXTADD_SHIFT 2 +#define I40E_MSIX_TADD_MSIXTADD_MASK I40E_MASK(0x3FFFFFFF, I40E_MSIX_TADD_MSIXTADD_SHIFT) +#define I40E_MSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TMSG_MAX_INDEX 128 +#define I40E_MSIX_TMSG_MSIXTMSG_SHIFT 0 +#define I40E_MSIX_TMSG_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_TMSG_MSIXTMSG_SHIFT) +#define I40E_MSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* 
_i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TUADD_MAX_INDEX 128 +#define I40E_MSIX_TUADD_MSIXTUADD_SHIFT 0 +#define I40E_MSIX_TUADD_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_TUADD_MSIXTUADD_SHIFT) +#define I40E_MSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TVCTRL_MAX_INDEX 128 +#define I40E_MSIX_TVCTRL_MASK_SHIFT 0 +#define I40E_MSIX_TVCTRL_MASK_MASK I40E_MASK(0x1, I40E_MSIX_TVCTRL_MASK_SHIFT) +#define I40E_VFMSIX_PBA1(_i) (0x00002000 + ((_i) * 4)) /* _i=0...19 */ /* Reset: VFLR */ +#define I40E_VFMSIX_PBA1_MAX_INDEX 19 +#define I40E_VFMSIX_PBA1_PENBIT_SHIFT 0 +#define I40E_VFMSIX_PBA1_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA1_PENBIT_SHIFT) +#define I40E_VFMSIX_TADD1(_i) (0x00002100 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TADD1_MAX_INDEX 639 +#define I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT 0 +#define I40E_VFMSIX_TADD1_MSIXTADD10_MASK I40E_MASK(0x3, I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT) +#define I40E_VFMSIX_TADD1_MSIXTADD_SHIFT 2 +#define I40E_VFMSIX_TADD1_MSIXTADD_MASK I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD1_MSIXTADD_SHIFT) +#define I40E_VFMSIX_TMSG1(_i) (0x00002108 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TMSG1_MAX_INDEX 639 +#define I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT 0 +#define I40E_VFMSIX_TMSG1_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT) +#define I40E_VFMSIX_TUADD1(_i) (0x00002104 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TUADD1_MAX_INDEX 639 +#define I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT 0 +#define I40E_VFMSIX_TUADD1_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT) +#define I40E_VFMSIX_TVCTRL1(_i) (0x0000210C + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TVCTRL1_MAX_INDEX 639 +#define I40E_VFMSIX_TVCTRL1_MASK_SHIFT 0 +#define I40E_VFMSIX_TVCTRL1_MASK_MASK I40E_MASK(0x1, I40E_VFMSIX_TVCTRL1_MASK_SHIFT) +#define I40E_GLNVM_FLA 0x000B6108 /* 
Reset: POR */ +#define I40E_GLNVM_FLA_FL_SCK_SHIFT 0 +#define I40E_GLNVM_FLA_FL_SCK_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SCK_SHIFT) +#define I40E_GLNVM_FLA_FL_CE_SHIFT 1 +#define I40E_GLNVM_FLA_FL_CE_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_CE_SHIFT) +#define I40E_GLNVM_FLA_FL_SI_SHIFT 2 +#define I40E_GLNVM_FLA_FL_SI_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SI_SHIFT) +#define I40E_GLNVM_FLA_FL_SO_SHIFT 3 +#define I40E_GLNVM_FLA_FL_SO_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SO_SHIFT) +#define I40E_GLNVM_FLA_FL_REQ_SHIFT 4 +#define I40E_GLNVM_FLA_FL_REQ_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_REQ_SHIFT) +#define I40E_GLNVM_FLA_FL_GNT_SHIFT 5 +#define I40E_GLNVM_FLA_FL_GNT_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_GNT_SHIFT) +#define I40E_GLNVM_FLA_LOCKED_SHIFT 6 +#define I40E_GLNVM_FLA_LOCKED_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT) +#define I40E_GLNVM_FLA_FL_SADDR_SHIFT 18 +#define I40E_GLNVM_FLA_FL_SADDR_MASK I40E_MASK(0x7FF, I40E_GLNVM_FLA_FL_SADDR_SHIFT) +#define I40E_GLNVM_FLA_FL_BUSY_SHIFT 30 +#define I40E_GLNVM_FLA_FL_BUSY_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_BUSY_SHIFT) +#define I40E_GLNVM_FLA_FL_DER_SHIFT 31 +#define I40E_GLNVM_FLA_FL_DER_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_DER_SHIFT) +#define I40E_GLNVM_FLASHID 0x000B6104 /* Reset: POR */ +#define I40E_GLNVM_FLASHID_FLASHID_SHIFT 0 +#define I40E_GLNVM_FLASHID_FLASHID_MASK I40E_MASK(0xFFFFFF, I40E_GLNVM_FLASHID_FLASHID_SHIFT) +#define I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT 31 +#define I40E_GLNVM_FLASHID_FLEEP_PERF_MASK I40E_MASK(0x1, I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT) +#define I40E_GLNVM_GENS 0x000B6100 /* Reset: POR */ +#define I40E_GLNVM_GENS_NVM_PRES_SHIFT 0 +#define I40E_GLNVM_GENS_NVM_PRES_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_NVM_PRES_SHIFT) +#define I40E_GLNVM_GENS_SR_SIZE_SHIFT 5 +#define I40E_GLNVM_GENS_SR_SIZE_MASK I40E_MASK(0x7, I40E_GLNVM_GENS_SR_SIZE_SHIFT) +#define I40E_GLNVM_GENS_BANK1VAL_SHIFT 8 +#define I40E_GLNVM_GENS_BANK1VAL_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_BANK1VAL_SHIFT) +#define 
I40E_GLNVM_GENS_ALT_PRST_SHIFT 23 +#define I40E_GLNVM_GENS_ALT_PRST_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_ALT_PRST_SHIFT) +#define I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT 25 +#define I40E_GLNVM_GENS_FL_AUTO_RD_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT) +#define I40E_GLNVM_PROTCSR(_i) (0x000B6010 + ((_i) * 4)) /* _i=0...59 */ /* Reset: POR */ +#define I40E_GLNVM_PROTCSR_MAX_INDEX 59 +#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT 0 +#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_MASK I40E_MASK(0xFFFFFF, I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT) +#define I40E_GLNVM_SRCTL 0x000B6110 /* Reset: POR */ +#define I40E_GLNVM_SRCTL_SRBUSY_SHIFT 0 +#define I40E_GLNVM_SRCTL_SRBUSY_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_SRBUSY_SHIFT) +#define I40E_GLNVM_SRCTL_ADDR_SHIFT 14 +#define I40E_GLNVM_SRCTL_ADDR_MASK I40E_MASK(0x7FFF, I40E_GLNVM_SRCTL_ADDR_SHIFT) +#define I40E_GLNVM_SRCTL_WRITE_SHIFT 29 +#define I40E_GLNVM_SRCTL_WRITE_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_WRITE_SHIFT) +#define I40E_GLNVM_SRCTL_START_SHIFT 30 +#define I40E_GLNVM_SRCTL_START_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_START_SHIFT) +#define I40E_GLNVM_SRCTL_DONE_SHIFT 31 +#define I40E_GLNVM_SRCTL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_DONE_SHIFT) +#define I40E_GLNVM_SRDATA 0x000B6114 /* Reset: POR */ +#define I40E_GLNVM_SRDATA_WRDATA_SHIFT 0 +#define I40E_GLNVM_SRDATA_WRDATA_MASK I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_WRDATA_SHIFT) +#define I40E_GLNVM_SRDATA_RDDATA_SHIFT 16 +#define I40E_GLNVM_SRDATA_RDDATA_MASK I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_RDDATA_SHIFT) +#define I40E_GLNVM_ULD 0x000B6008 /* Reset: POR */ +#define I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT 0 +#define I40E_GLNVM_ULD_CONF_PCIR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT 1 +#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT 2 +#define I40E_GLNVM_ULD_CONF_LCB_DONE_MASK I40E_MASK(0x1, 
I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT 3 +#define I40E_GLNVM_ULD_CONF_CORE_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT 4 +#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT 5 +#define I40E_GLNVM_ULD_CONF_POR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT 6 +#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT 7 +#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT 8 +#define I40E_GLNVM_ULD_CONF_EMP_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT 9 +#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT) +#define I40E_GLPCI_BYTCTH 0x0009C484 /* Reset: PCIR */ +#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT 0 +#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT) +#define I40E_GLPCI_BYTCTL 0x0009C488 /* Reset: PCIR */ +#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT 0 +#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT) +#define I40E_GLPCI_CAPCTRL 0x000BE4A4 /* Reset: PCIR */ +#define I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT 0 +#define I40E_GLPCI_CAPCTRL_VPD_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT) +#define I40E_GLPCI_CAPSUP 0x000BE4A8 /* Reset: PCIR */ +#define I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT 0 +#define I40E_GLPCI_CAPSUP_PCIE_VER_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT) +#define I40E_GLPCI_CAPSUP_LTR_EN_SHIFT 2 +#define I40E_GLPCI_CAPSUP_LTR_EN_MASK I40E_MASK(0x1, 
I40E_GLPCI_CAPSUP_LTR_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_TPH_EN_SHIFT 3 +#define I40E_GLPCI_CAPSUP_TPH_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_TPH_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ARI_EN_SHIFT 4 +#define I40E_GLPCI_CAPSUP_ARI_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ARI_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_IOV_EN_SHIFT 5 +#define I40E_GLPCI_CAPSUP_IOV_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IOV_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ACS_EN_SHIFT 6 +#define I40E_GLPCI_CAPSUP_ACS_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ACS_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_SEC_EN_SHIFT 7 +#define I40E_GLPCI_CAPSUP_SEC_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_SEC_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT 16 +#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT 17 +#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_IDO_EN_SHIFT 18 +#define I40E_GLPCI_CAPSUP_IDO_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IDO_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT 19 +#define I40E_GLPCI_CAPSUP_MSI_MASK_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT) +#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT 20 +#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT 30 +#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT) +#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT 31 +#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT) +#define I40E_GLPCI_CNF 0x000BE4C0 /* Reset: POR */ +#define I40E_GLPCI_CNF_FLEX10_SHIFT 1 +#define I40E_GLPCI_CNF_FLEX10_MASK I40E_MASK(0x1, I40E_GLPCI_CNF_FLEX10_SHIFT) +#define I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT 2 +#define I40E_GLPCI_CNF_WAKE_PIN_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT) +#define I40E_GLPCI_CNF2 
0x000BE494 /* Reset: PCIR */ +#define I40E_GLPCI_CNF2_RO_DIS_SHIFT 0 +#define I40E_GLPCI_CNF2_RO_DIS_MASK I40E_MASK(0x1, I40E_GLPCI_CNF2_RO_DIS_SHIFT) +#define I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT 1 +#define I40E_GLPCI_CNF2_CACHELINE_SIZE_MASK I40E_MASK(0x1, I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT) +#define I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT 2 +#define I40E_GLPCI_CNF2_MSI_X_PF_N_MASK I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT) +#define I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT 13 +#define I40E_GLPCI_CNF2_MSI_X_VF_N_MASK I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT) +#define I40E_GLPCI_DREVID 0x0009C480 /* Reset: PCIR */ +#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0 +#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK I40E_MASK(0xFF, I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT) +#define I40E_GLPCI_GSCL_1 0x0009C48C /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT 0 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT 1 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT 2 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT 3 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT 4 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT 5 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT 6 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT 7 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_MASK I40E_MASK(0x1, 
I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT 8 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT 9 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_MASK I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT 14 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT 15 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_MASK I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT 28 +#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT 29 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT 30 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT 31 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT) +#define I40E_GLPCI_GSCL_2 0x0009C490 /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT 0 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT 8 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT 16 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT 24 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT) +#define I40E_GLPCI_GSCL_5_8(_i) 
(0x0009C494 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_5_8_MAX_INDEX 3 +#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT 0 +#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_MASK I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT) +#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT 16 +#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_MASK I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT) +#define I40E_GLPCI_GSCN_0_3(_i) (0x0009C4A4 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */ +#define I40E_GLPCI_GSCN_0_3_MAX_INDEX 3 +#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT 0 +#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT) +#define I40E_GLPCI_LBARCTRL 0x000BE484 /* Reset: POR */ +#define I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT 0 +#define I40E_GLPCI_LBARCTRL_PREFBAR_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT) +#define I40E_GLPCI_LBARCTRL_BAR32_SHIFT 1 +#define I40E_GLPCI_LBARCTRL_BAR32_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_BAR32_SHIFT) +#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT 3 +#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT) +#define I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_RSVD_4_MASK I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT) +#define I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT 6 +#define I40E_GLPCI_LBARCTRL_FL_SIZE_MASK I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT) +#define I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT 10 +#define I40E_GLPCI_LBARCTRL_RSVD_10_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT) +#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT 11 +#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_MASK I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT) +#define I40E_GLPCI_LINKCAP 0x000BE4AC /* Reset: PCIR */ +#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT 0 +#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_MASK I40E_MASK(0x3F, I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT) +#define 
I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT 6 +#define I40E_GLPCI_LINKCAP_MAX_PAYLOAD_MASK I40E_MASK(0x7, I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT) +#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT 9 +#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_MASK I40E_MASK(0xF, I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT) +#define I40E_GLPCI_PCIERR 0x000BE4FC /* Reset: PCIR */ +#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT 0 +#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT) +#define I40E_GLPCI_PKTCT 0x0009C4BC /* Reset: PCIR */ +#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT 0 +#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT) +#define I40E_GLPCI_PM_MUX_NPQ 0x0009C4F4 /* Reset: PCIR */ +#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT 0 +#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_MASK I40E_MASK(0x7, I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT 16 +#define I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_MASK I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_PFB 0x0009C4F0 /* Reset: PCIR */ +#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT 0 +#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_MASK I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT 16 +#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_MASK I40E_MASK(0x7, I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT) +#define I40E_GLPCI_PMSUP 0x000BE4B0 /* Reset: PCIR */ +#define I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT 0 +#define I40E_GLPCI_PMSUP_ASPM_SUP_MASK I40E_MASK(0x3, I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT) +#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT 2 +#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT 5 +#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT) +#define 
I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT 8 +#define I40E_GLPCI_PMSUP_L0S_ACC_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT 11 +#define I40E_GLPCI_PMSUP_L1_ACC_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT 14 +#define I40E_GLPCI_PMSUP_SLOT_CLK_MASK I40E_MASK(0x1, I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT) +#define I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT 15 +#define I40E_GLPCI_PMSUP_OBFF_SUP_MASK I40E_MASK(0x3, I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT) +#define I40E_GLPCI_PQ_MAX_USED_SPC 0x0009C4EC /* Reset: PCIR */ +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT 0 +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_MASK I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT) +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT 8 +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_MASK I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT) +#define I40E_GLPCI_PWRDATA 0x000BE490 /* Reset: PCIR */ +#define I40E_GLPCI_PWRDATA_D0_POWER_SHIFT 0 +#define I40E_GLPCI_PWRDATA_D0_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D0_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT 8 +#define I40E_GLPCI_PWRDATA_COMM_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_D3_POWER_SHIFT 16 +#define I40E_GLPCI_PWRDATA_D3_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D3_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT 24 +#define I40E_GLPCI_PWRDATA_DATA_SCALE_MASK I40E_MASK(0x3, I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT) +#define I40E_GLPCI_REVID 0x000BE4B4 /* Reset: PCIR */ +#define I40E_GLPCI_REVID_NVM_REVID_SHIFT 0 +#define I40E_GLPCI_REVID_NVM_REVID_MASK I40E_MASK(0xFF, I40E_GLPCI_REVID_NVM_REVID_SHIFT) +#define I40E_GLPCI_SERH 0x000BE49C /* Reset: PCIR */ +#define I40E_GLPCI_SERH_SER_NUM_H_SHIFT 0 +#define I40E_GLPCI_SERH_SER_NUM_H_MASK I40E_MASK(0xFFFF, 
I40E_GLPCI_SERH_SER_NUM_H_SHIFT) +#define I40E_GLPCI_SERL 0x000BE498 /* Reset: PCIR */ +#define I40E_GLPCI_SERL_SER_NUM_L_SHIFT 0 +#define I40E_GLPCI_SERL_SER_NUM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SERL_SER_NUM_L_SHIFT) +#define I40E_GLPCI_SPARE_BITS_0 0x0009C4F8 /* Reset: PCIR */ +#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT 0 +#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT) +#define I40E_GLPCI_SPARE_BITS_1 0x0009C4FC /* Reset: PCIR */ +#define I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT 0 +#define I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT) +#define I40E_GLPCI_SUBVENID 0x000BE48C /* Reset: PCIR */ +#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT 0 +#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_MASK I40E_MASK(0xFFFF, I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT) +#define I40E_GLPCI_UPADD 0x000BE4F8 /* Reset: PCIR */ +#define I40E_GLPCI_UPADD_ADDRESS_SHIFT 1 +#define I40E_GLPCI_UPADD_ADDRESS_MASK I40E_MASK(0x7FFFFFFF, I40E_GLPCI_UPADD_ADDRESS_SHIFT) +#define I40E_GLPCI_VENDORID 0x000BE518 /* Reset: PCIR */ +#define I40E_GLPCI_VENDORID_VENDORID_SHIFT 0 +#define I40E_GLPCI_VENDORID_VENDORID_MASK I40E_MASK(0xFFFF, I40E_GLPCI_VENDORID_VENDORID_SHIFT) +#define I40E_GLPCI_VFSUP 0x000BE4B8 /* Reset: PCIR */ +#define I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT 0 +#define I40E_GLPCI_VFSUP_VF_PREFETCH_MASK I40E_MASK(0x1, I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT) +#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT 1 +#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_MASK I40E_MASK(0x1, I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT) +#define I40E_GLTPH_CTRL 0x000BE480 /* Reset: PCIR */ +#define I40E_GLTPH_CTRL_DESC_PH_SHIFT 9 +#define I40E_GLTPH_CTRL_DESC_PH_MASK I40E_MASK(0x3, I40E_GLTPH_CTRL_DESC_PH_SHIFT) +#define I40E_GLTPH_CTRL_DATA_PH_SHIFT 11 +#define I40E_GLTPH_CTRL_DATA_PH_MASK I40E_MASK(0x3, I40E_GLTPH_CTRL_DATA_PH_SHIFT) +#define I40E_PF_FUNC_RID 0x0009C000 /* Reset: PCIR */ +#define 
I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT 0 +#define I40E_PF_FUNC_RID_FUNCTION_NUMBER_MASK I40E_MASK(0x7, I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT) +#define I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT 3 +#define I40E_PF_FUNC_RID_DEVICE_NUMBER_MASK I40E_MASK(0x1F, I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT) +#define I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT 8 +#define I40E_PF_FUNC_RID_BUS_NUMBER_MASK I40E_MASK(0xFF, I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT) +#define I40E_PF_PCI_CIAA 0x0009C080 /* Reset: FLR */ +#define I40E_PF_PCI_CIAA_ADDRESS_SHIFT 0 +#define I40E_PF_PCI_CIAA_ADDRESS_MASK I40E_MASK(0xFFF, I40E_PF_PCI_CIAA_ADDRESS_SHIFT) +#define I40E_PF_PCI_CIAA_VF_NUM_SHIFT 12 +#define I40E_PF_PCI_CIAA_VF_NUM_MASK I40E_MASK(0x7F, I40E_PF_PCI_CIAA_VF_NUM_SHIFT) +#define I40E_PF_PCI_CIAD 0x0009C100 /* Reset: FLR */ +#define I40E_PF_PCI_CIAD_DATA_SHIFT 0 +#define I40E_PF_PCI_CIAD_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_PCI_CIAD_DATA_SHIFT) +#define I40E_PFPCI_CLASS 0x000BE400 /* Reset: PCIR */ +#define I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT 0 +#define I40E_PFPCI_CLASS_STORAGE_CLASS_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT) +#define I40E_PFPCI_CLASS_RESERVED_1_SHIFT 1 +#define I40E_PFPCI_CLASS_RESERVED_1_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_RESERVED_1_SHIFT) +#define I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT 2 +#define I40E_PFPCI_CLASS_PF_IS_LAN_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT) +#define I40E_PFPCI_CNF 0x000BE000 /* Reset: PCIR */ +#define I40E_PFPCI_CNF_MSI_EN_SHIFT 2 +#define I40E_PFPCI_CNF_MSI_EN_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_MSI_EN_SHIFT) +#define I40E_PFPCI_CNF_EXROM_DIS_SHIFT 3 +#define I40E_PFPCI_CNF_EXROM_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_EXROM_DIS_SHIFT) +#define I40E_PFPCI_CNF_IO_BAR_SHIFT 4 +#define I40E_PFPCI_CNF_IO_BAR_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_IO_BAR_SHIFT) +#define I40E_PFPCI_CNF_INT_PIN_SHIFT 5 +#define I40E_PFPCI_CNF_INT_PIN_MASK I40E_MASK(0x3, I40E_PFPCI_CNF_INT_PIN_SHIFT) +#define I40E_PFPCI_DEVID 0x000BE080 /* Reset: PCIR */ +#define 
I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT 0 +#define I40E_PFPCI_DEVID_PF_DEV_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT) +#define I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT 16 +#define I40E_PFPCI_DEVID_VF_DEV_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT) +#define I40E_PFPCI_FACTPS 0x0009C180 /* Reset: FLR */ +#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT 0 +#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_MASK I40E_MASK(0x3, I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT) +#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT 3 +#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_MASK I40E_MASK(0x1, I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT) +#define I40E_PFPCI_FUNC 0x000BE200 /* Reset: POR */ +#define I40E_PFPCI_FUNC_FUNC_DIS_SHIFT 0 +#define I40E_PFPCI_FUNC_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_FUNC_DIS_SHIFT) +#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT 1 +#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT) +#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT 2 +#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT) +#define I40E_PFPCI_FUNC2 0x000BE180 /* Reset: PCIR */ +#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT 0 +#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT) +#define I40E_PFPCI_ICAUSE 0x0009C200 /* Reset: PFR */ +#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT 0 +#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT) +#define I40E_PFPCI_IENA 0x0009C280 /* Reset: PFR */ +#define I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT 0 +#define I40E_PFPCI_IENA_PCIE_ERR_EN_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT) +#define I40E_PFPCI_PF_FLUSH_DONE 0x0009C800 /* Reset: PCIR */ +#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_PM 0x000BE300 /* Reset: POR */ 
+#define I40E_PFPCI_PM_PME_EN_SHIFT 0 +#define I40E_PFPCI_PM_PME_EN_MASK I40E_MASK(0x1, I40E_PFPCI_PM_PME_EN_SHIFT) +#define I40E_PFPCI_STATUS1 0x000BE280 /* Reset: POR */ +#define I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT 0 +#define I40E_PFPCI_STATUS1_FUNC_VALID_MASK I40E_MASK(0x1, I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT) +#define I40E_PFPCI_SUBSYSID 0x000BE100 /* Reset: PCIR */ +#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT 0 +#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT) +#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT 16 +#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT) +#define I40E_PFPCI_VF_FLUSH_DONE 0x0000E400 /* Reset: PCIR */ +#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VF_FLUSH_DONE1(_VF) (0x0009C600 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: PCIR */ +#define I40E_PFPCI_VF_FLUSH_DONE1_MAX_INDEX 127 +#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VM_FLUSH_DONE 0x0009C880 /* Reset: PCIR */ +#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VMINDEX 0x0009C300 /* Reset: PCIR */ +#define I40E_PFPCI_VMINDEX_VMINDEX_SHIFT 0 +#define I40E_PFPCI_VMINDEX_VMINDEX_MASK I40E_MASK(0x1FF, I40E_PFPCI_VMINDEX_VMINDEX_SHIFT) +#define I40E_PFPCI_VMPEND 0x0009C380 /* Reset: PCIR */ +#define I40E_PFPCI_VMPEND_PENDING_SHIFT 0 +#define I40E_PFPCI_VMPEND_PENDING_MASK I40E_MASK(0x1, I40E_PFPCI_VMPEND_PENDING_SHIFT) +#define I40E_PRTPM_EEE_STAT 0x001E4320 /* Reset: GLOBR */ +#define I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT 29 +#define I40E_PRTPM_EEE_STAT_EEE_NEG_MASK I40E_MASK(0x1, 
I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT) +#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT 30 +#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_MASK I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT) +#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT 31 +#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_MASK I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT) +#define I40E_PRTPM_EEEC 0x001E4380 /* Reset: GLOBR */ +#define I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT 16 +#define I40E_PRTPM_EEEC_TW_WAKE_MIN_MASK I40E_MASK(0x3F, I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT) +#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT 24 +#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_MASK I40E_MASK(0x3, I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT) +#define I40E_PRTPM_EEEC_TEEE_DLY_SHIFT 26 +#define I40E_PRTPM_EEEC_TEEE_DLY_MASK I40E_MASK(0x3F, I40E_PRTPM_EEEC_TEEE_DLY_SHIFT) +#define I40E_PRTPM_EEEFWD 0x001E4400 /* Reset: GLOBR */ +#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT 31 +#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_MASK I40E_MASK(0x1, I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT) +#define I40E_PRTPM_EEER 0x001E4360 /* Reset: GLOBR */ +#define I40E_PRTPM_EEER_TW_SYSTEM_SHIFT 0 +#define I40E_PRTPM_EEER_TW_SYSTEM_MASK I40E_MASK(0xFFFF, I40E_PRTPM_EEER_TW_SYSTEM_SHIFT) +#define I40E_PRTPM_EEER_TX_LPI_EN_SHIFT 16 +#define I40E_PRTPM_EEER_TX_LPI_EN_MASK I40E_MASK(0x1, I40E_PRTPM_EEER_TX_LPI_EN_SHIFT) +#define I40E_PRTPM_EEETXC 0x001E43E0 /* Reset: GLOBR */ +#define I40E_PRTPM_EEETXC_TW_PHY_SHIFT 0 +#define I40E_PRTPM_EEETXC_TW_PHY_MASK I40E_MASK(0xFFFF, I40E_PRTPM_EEETXC_TW_PHY_SHIFT) +#define I40E_PRTPM_GC 0x000B8140 /* Reset: POR */ +#define I40E_PRTPM_GC_EMP_LINK_ON_SHIFT 0 +#define I40E_PRTPM_GC_EMP_LINK_ON_MASK I40E_MASK(0x1, I40E_PRTPM_GC_EMP_LINK_ON_SHIFT) +#define I40E_PRTPM_GC_MNG_VETO_SHIFT 1 +#define I40E_PRTPM_GC_MNG_VETO_MASK I40E_MASK(0x1, I40E_PRTPM_GC_MNG_VETO_SHIFT) +#define I40E_PRTPM_GC_RATD_SHIFT 2 +#define I40E_PRTPM_GC_RATD_MASK I40E_MASK(0x1, I40E_PRTPM_GC_RATD_SHIFT) +#define I40E_PRTPM_GC_LCDMP_SHIFT 3 
+#define I40E_PRTPM_GC_LCDMP_MASK I40E_MASK(0x1, I40E_PRTPM_GC_LCDMP_SHIFT) +#define I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT 31 +#define I40E_PRTPM_GC_LPLU_ASSERTED_MASK I40E_MASK(0x1, I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT) +#define I40E_PRTPM_RLPIC 0x001E43A0 /* Reset: GLOBR */ +#define I40E_PRTPM_RLPIC_ERLPIC_SHIFT 0 +#define I40E_PRTPM_RLPIC_ERLPIC_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_RLPIC_ERLPIC_SHIFT) +#define I40E_PRTPM_TLPIC 0x001E43C0 /* Reset: GLOBR */ +#define I40E_PRTPM_TLPIC_ETLPIC_SHIFT 0 +#define I40E_PRTPM_TLPIC_ETLPIC_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_TLPIC_ETLPIC_SHIFT) +#define I40E_GLRPB_DPSS 0x000AC828 /* Reset: CORER */ +#define I40E_GLRPB_DPSS_DPS_TCN_SHIFT 0 +#define I40E_GLRPB_DPSS_DPS_TCN_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_DPSS_DPS_TCN_SHIFT) +#define I40E_GLRPB_GHW 0x000AC830 /* Reset: CORER */ +#define I40E_GLRPB_GHW_GHW_SHIFT 0 +#define I40E_GLRPB_GHW_GHW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_GHW_GHW_SHIFT) +#define I40E_GLRPB_GLW 0x000AC834 /* Reset: CORER */ +#define I40E_GLRPB_GLW_GLW_SHIFT 0 +#define I40E_GLRPB_GLW_GLW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_GLW_GLW_SHIFT) +#define I40E_GLRPB_PHW 0x000AC844 /* Reset: CORER */ +#define I40E_GLRPB_PHW_PHW_SHIFT 0 +#define I40E_GLRPB_PHW_PHW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_PHW_PHW_SHIFT) +#define I40E_GLRPB_PLW 0x000AC848 /* Reset: CORER */ +#define I40E_GLRPB_PLW_PLW_SHIFT 0 +#define I40E_GLRPB_PLW_PLW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_PLW_PLW_SHIFT) +#define I40E_PRTRPB_DHW(_i) (0x000AC100 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DHW_MAX_INDEX 7 +#define I40E_PRTRPB_DHW_DHW_TCN_SHIFT 0 +#define I40E_PRTRPB_DHW_DHW_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DHW_DHW_TCN_SHIFT) +#define I40E_PRTRPB_DLW(_i) (0x000AC220 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DLW_MAX_INDEX 7 +#define I40E_PRTRPB_DLW_DLW_TCN_SHIFT 0 +#define I40E_PRTRPB_DLW_DLW_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DLW_DLW_TCN_SHIFT) +#define I40E_PRTRPB_DPS(_i) (0x000AC320 
+ ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DPS_MAX_INDEX 7 +#define I40E_PRTRPB_DPS_DPS_TCN_SHIFT 0 +#define I40E_PRTRPB_DPS_DPS_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DPS_DPS_TCN_SHIFT) +#define I40E_PRTRPB_SHT(_i) (0x000AC480 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_SHT_MAX_INDEX 7 +#define I40E_PRTRPB_SHT_SHT_TCN_SHIFT 0 +#define I40E_PRTRPB_SHT_SHT_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SHT_SHT_TCN_SHIFT) +#define I40E_PRTRPB_SHW 0x000AC580 /* Reset: CORER */ +#define I40E_PRTRPB_SHW_SHW_SHIFT 0 +#define I40E_PRTRPB_SHW_SHW_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SHW_SHW_SHIFT) +#define I40E_PRTRPB_SLT(_i) (0x000AC5A0 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_SLT_MAX_INDEX 7 +#define I40E_PRTRPB_SLT_SLT_TCN_SHIFT 0 +#define I40E_PRTRPB_SLT_SLT_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SLT_SLT_TCN_SHIFT) +#define I40E_PRTRPB_SLW 0x000AC6A0 /* Reset: CORER */ +#define I40E_PRTRPB_SLW_SLW_SHIFT 0 +#define I40E_PRTRPB_SLW_SLW_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SLW_SLW_SHIFT) +#define I40E_PRTRPB_SPS 0x000AC7C0 /* Reset: CORER */ +#define I40E_PRTRPB_SPS_SPS_SHIFT 0 +#define I40E_PRTRPB_SPS_SPS_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SPS_SPS_SHIFT) +#define I40E_GLQF_CTL 0x00269BA4 /* Reset: CORER */ +#define I40E_GLQF_CTL_HTOEP_SHIFT 1 +#define I40E_GLQF_CTL_HTOEP_MASK I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_SHIFT) +#define I40E_GLQF_CTL_HTOEP_FCOE_SHIFT 2 +#define I40E_GLQF_CTL_HTOEP_FCOE_MASK I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_FCOE_SHIFT) +#define I40E_GLQF_CTL_PCNT_ALLOC_SHIFT 3 +#define I40E_GLQF_CTL_PCNT_ALLOC_MASK I40E_MASK(0x7, I40E_GLQF_CTL_PCNT_ALLOC_SHIFT) +#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT 6 +#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_MASK I40E_MASK(0x1, I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT) +#define I40E_GLQF_CTL_RSVD_SHIFT 7 +#define I40E_GLQF_CTL_RSVD_MASK I40E_MASK(0x1, I40E_GLQF_CTL_RSVD_SHIFT) +#define I40E_GLQF_CTL_MAXPEBLEN_SHIFT 8 +#define I40E_GLQF_CTL_MAXPEBLEN_MASK 
I40E_MASK(0x7, I40E_GLQF_CTL_MAXPEBLEN_SHIFT) +#define I40E_GLQF_CTL_MAXFCBLEN_SHIFT 11 +#define I40E_GLQF_CTL_MAXFCBLEN_MASK I40E_MASK(0x7, I40E_GLQF_CTL_MAXFCBLEN_SHIFT) +#define I40E_GLQF_CTL_MAXFDBLEN_SHIFT 14 +#define I40E_GLQF_CTL_MAXFDBLEN_MASK I40E_MASK(0x7, I40E_GLQF_CTL_MAXFDBLEN_SHIFT) +#define I40E_GLQF_CTL_FDBEST_SHIFT 17 +#define I40E_GLQF_CTL_FDBEST_MASK I40E_MASK(0xFF, I40E_GLQF_CTL_FDBEST_SHIFT) +#define I40E_GLQF_CTL_PROGPRIO_SHIFT 25 +#define I40E_GLQF_CTL_PROGPRIO_MASK I40E_MASK(0x1, I40E_GLQF_CTL_PROGPRIO_SHIFT) +#define I40E_GLQF_CTL_INVALPRIO_SHIFT 26 +#define I40E_GLQF_CTL_INVALPRIO_MASK I40E_MASK(0x1, I40E_GLQF_CTL_INVALPRIO_SHIFT) +#define I40E_GLQF_CTL_IGNORE_IP_SHIFT 27 +#define I40E_GLQF_CTL_IGNORE_IP_MASK I40E_MASK(0x1, I40E_GLQF_CTL_IGNORE_IP_SHIFT) +#define I40E_GLQF_FDCNT_0 0x00269BAC /* Reset: CORER */ +#define I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT 0 +#define I40E_GLQF_FDCNT_0_GUARANT_CNT_MASK I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT) +#define I40E_GLQF_FDCNT_0_BESTCNT_SHIFT 13 +#define I40E_GLQF_FDCNT_0_BESTCNT_MASK I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_BESTCNT_SHIFT) +#define I40E_GLQF_HKEY(_i) (0x00270140 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_GLQF_HKEY_MAX_INDEX 12 +#define I40E_GLQF_HKEY_KEY_0_SHIFT 0 +#define I40E_GLQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_0_SHIFT) +#define I40E_GLQF_HKEY_KEY_1_SHIFT 8 +#define I40E_GLQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_1_SHIFT) +#define I40E_GLQF_HKEY_KEY_2_SHIFT 16 +#define I40E_GLQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_2_SHIFT) +#define I40E_GLQF_HKEY_KEY_3_SHIFT 24 +#define I40E_GLQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_3_SHIFT) +#define I40E_GLQF_HSYM(_i) (0x00269D00 + ((_i) * 4)) /* _i=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_HSYM_MAX_INDEX 63 +#define I40E_GLQF_HSYM_SYMH_ENA_SHIFT 0 +#define I40E_GLQF_HSYM_SYMH_ENA_MASK I40E_MASK(0x1, I40E_GLQF_HSYM_SYMH_ENA_SHIFT) +#define 
I40E_GLQF_PCNT(_i) (0x00266800 + ((_i) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_GLQF_PCNT_MAX_INDEX 511 +#define I40E_GLQF_PCNT_PCNT_SHIFT 0 +#define I40E_GLQF_PCNT_PCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_PCNT_PCNT_SHIFT) +#define I40E_GLQF_SWAP(_i, _j) (0x00267E00 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_SWAP_MAX_INDEX 1 +#define I40E_GLQF_SWAP_OFF0_SRC0_SHIFT 0 +#define I40E_GLQF_SWAP_OFF0_SRC0_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC0_SHIFT) +#define I40E_GLQF_SWAP_OFF0_SRC1_SHIFT 6 +#define I40E_GLQF_SWAP_OFF0_SRC1_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC1_SHIFT) +#define I40E_GLQF_SWAP_FLEN0_SHIFT 12 +#define I40E_GLQF_SWAP_FLEN0_MASK I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN0_SHIFT) +#define I40E_GLQF_SWAP_OFF1_SRC0_SHIFT 16 +#define I40E_GLQF_SWAP_OFF1_SRC0_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC0_SHIFT) +#define I40E_GLQF_SWAP_OFF1_SRC1_SHIFT 22 +#define I40E_GLQF_SWAP_OFF1_SRC1_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC1_SHIFT) +#define I40E_GLQF_SWAP_FLEN1_SHIFT 28 +#define I40E_GLQF_SWAP_FLEN1_MASK I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN1_SHIFT) +#define I40E_PFQF_CTL_0 0x001C0AC0 /* Reset: CORER */ +#define I40E_PFQF_CTL_0_PEHSIZE_SHIFT 0 +#define I40E_PFQF_CTL_0_PEHSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PEDSIZE_SHIFT 5 +#define I40E_PFQF_CTL_0_PEDSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEDSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT 10 +#define I40E_PFQF_CTL_0_PFFCHSIZE_MASK I40E_MASK(0xF, I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT 14 +#define I40E_PFQF_CTL_0_PFFCDSIZE_MASK I40E_MASK(0x3, I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT) +#define I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT 16 +#define I40E_PFQF_CTL_0_HASHLUTSIZE_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT) +#define I40E_PFQF_CTL_0_FD_ENA_SHIFT 17 +#define I40E_PFQF_CTL_0_FD_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_FD_ENA_SHIFT) +#define 
I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT 18 +#define I40E_PFQF_CTL_0_ETYPE_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT) +#define I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT 19 +#define I40E_PFQF_CTL_0_MACVLAN_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT) +#define I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT 20 +#define I40E_PFQF_CTL_0_VFFCHSIZE_MASK I40E_MASK(0xF, I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT 24 +#define I40E_PFQF_CTL_0_VFFCDSIZE_MASK I40E_MASK(0x3, I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT) +#define I40E_PFQF_CTL_1 0x00245D80 /* Reset: CORER */ +#define I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT 0 +#define I40E_PFQF_CTL_1_CLEARFDTABLE_MASK I40E_MASK(0x1, I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT) +#define I40E_PFQF_FDALLOC 0x00246280 /* Reset: CORER */ +#define I40E_PFQF_FDALLOC_FDALLOC_SHIFT 0 +#define I40E_PFQF_FDALLOC_FDALLOC_MASK I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDALLOC_SHIFT) +#define I40E_PFQF_FDALLOC_FDBEST_SHIFT 8 +#define I40E_PFQF_FDALLOC_FDBEST_MASK I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDBEST_SHIFT) +#define I40E_PFQF_FDSTAT 0x00246380 /* Reset: CORER */ +#define I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT 0 +#define I40E_PFQF_FDSTAT_GUARANT_CNT_MASK I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT) +#define I40E_PFQF_FDSTAT_BEST_CNT_SHIFT 16 +#define I40E_PFQF_FDSTAT_BEST_CNT_MASK I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_BEST_CNT_SHIFT) +#define I40E_PFQF_HENA(_i) (0x00245900 + ((_i) * 128)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_PFQF_HENA_MAX_INDEX 1 +#define I40E_PFQF_HENA_PTYPE_ENA_SHIFT 0 +#define I40E_PFQF_HENA_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_PFQF_HENA_PTYPE_ENA_SHIFT) +#define I40E_PFQF_HKEY(_i) (0x00244800 + ((_i) * 128)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_PFQF_HKEY_MAX_INDEX 12 +#define I40E_PFQF_HKEY_KEY_0_SHIFT 0 +#define I40E_PFQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_0_SHIFT) +#define I40E_PFQF_HKEY_KEY_1_SHIFT 8 +#define I40E_PFQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_1_SHIFT) 
+#define I40E_PFQF_HKEY_KEY_2_SHIFT 16 +#define I40E_PFQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_2_SHIFT) +#define I40E_PFQF_HKEY_KEY_3_SHIFT 24 +#define I40E_PFQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_3_SHIFT) +#define I40E_PFQF_HLUT(_i) (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_PFQF_HLUT_MAX_INDEX 127 +#define I40E_PFQF_HLUT_LUT0_SHIFT 0 +#define I40E_PFQF_HLUT_LUT0_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT0_SHIFT) +#define I40E_PFQF_HLUT_LUT1_SHIFT 8 +#define I40E_PFQF_HLUT_LUT1_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT1_SHIFT) +#define I40E_PFQF_HLUT_LUT2_SHIFT 16 +#define I40E_PFQF_HLUT_LUT2_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT2_SHIFT) +#define I40E_PFQF_HLUT_LUT3_SHIFT 24 +#define I40E_PFQF_HLUT_LUT3_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT3_SHIFT) +#define I40E_PRTQF_CTL_0 0x00256E60 /* Reset: CORER */ +#define I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT 0 +#define I40E_PRTQF_CTL_0_HSYM_ENA_MASK I40E_MASK(0x1, I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT) +#define I40E_PRTQF_FD_FLXINSET(_i) (0x00253800 + ((_i) * 32)) /* _i=0...63 */ /* Reset: CORER */ +#define I40E_PRTQF_FD_FLXINSET_MAX_INDEX 63 +#define I40E_PRTQF_FD_FLXINSET_INSET_SHIFT 0 +#define I40E_PRTQF_FD_FLXINSET_INSET_MASK I40E_MASK(0xFF, I40E_PRTQF_FD_FLXINSET_INSET_SHIFT) +#define I40E_PRTQF_FD_MSK(_i, _j) (0x00252000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, _j=0...1 */ /* Reset: CORER */ +#define I40E_PRTQF_FD_MSK_MAX_INDEX 63 +#define I40E_PRTQF_FD_MSK_MASK_SHIFT 0 +#define I40E_PRTQF_FD_MSK_MASK_MASK I40E_MASK(0xFFFF, I40E_PRTQF_FD_MSK_MASK_SHIFT) +#define I40E_PRTQF_FD_MSK_OFFSET_SHIFT 16 +#define I40E_PRTQF_FD_MSK_OFFSET_MASK I40E_MASK(0x3F, I40E_PRTQF_FD_MSK_OFFSET_SHIFT) +#define I40E_PRTQF_FLX_PIT(_i) (0x00255200 + ((_i) * 32)) /* _i=0...8 */ /* Reset: CORER */ +#define I40E_PRTQF_FLX_PIT_MAX_INDEX 8 +#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT 0 +#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT) +#define 
I40E_PRTQF_FLX_PIT_FSIZE_SHIFT 5 +#define I40E_PRTQF_FLX_PIT_FSIZE_MASK I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_FSIZE_SHIFT) +#define I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT 10 +#define I40E_PRTQF_FLX_PIT_DEST_OFF_MASK I40E_MASK(0x3F, I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT) +#define I40E_VFQF_HENA1(_i, _VF) (0x00230800 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...1, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HENA1_MAX_INDEX 1 +#define I40E_VFQF_HENA1_PTYPE_ENA_SHIFT 0 +#define I40E_VFQF_HENA1_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA1_PTYPE_ENA_SHIFT) +#define I40E_VFQF_HKEY1(_i, _VF) (0x00228000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...12, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HKEY1_MAX_INDEX 12 +#define I40E_VFQF_HKEY1_KEY_0_SHIFT 0 +#define I40E_VFQF_HKEY1_KEY_0_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_0_SHIFT) +#define I40E_VFQF_HKEY1_KEY_1_SHIFT 8 +#define I40E_VFQF_HKEY1_KEY_1_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_1_SHIFT) +#define I40E_VFQF_HKEY1_KEY_2_SHIFT 16 +#define I40E_VFQF_HKEY1_KEY_2_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_2_SHIFT) +#define I40E_VFQF_HKEY1_KEY_3_SHIFT 24 +#define I40E_VFQF_HKEY1_KEY_3_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_3_SHIFT) +#define I40E_VFQF_HLUT1(_i, _VF) (0x00220000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HLUT1_MAX_INDEX 15 +#define I40E_VFQF_HLUT1_LUT0_SHIFT 0 +#define I40E_VFQF_HLUT1_LUT0_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT0_SHIFT) +#define I40E_VFQF_HLUT1_LUT1_SHIFT 8 +#define I40E_VFQF_HLUT1_LUT1_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT1_SHIFT) +#define I40E_VFQF_HLUT1_LUT2_SHIFT 16 +#define I40E_VFQF_HLUT1_LUT2_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT2_SHIFT) +#define I40E_VFQF_HLUT1_LUT3_SHIFT 24 +#define I40E_VFQF_HLUT1_LUT3_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT3_SHIFT) +#define I40E_VFQF_HREGION1(_i, _VF) (0x0022E000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...7, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HREGION1_MAX_INDEX 7 +#define 
I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT 0 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT) +#define I40E_VFQF_HREGION1_REGION_0_SHIFT 1 +#define I40E_VFQF_HREGION1_REGION_0_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_0_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT 4 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT) +#define I40E_VFQF_HREGION1_REGION_1_SHIFT 5 +#define I40E_VFQF_HREGION1_REGION_1_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_1_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT) +#define I40E_VFQF_HREGION1_REGION_2_SHIFT 9 +#define I40E_VFQF_HREGION1_REGION_2_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_2_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT) +#define I40E_VFQF_HREGION1_REGION_3_SHIFT 13 +#define I40E_VFQF_HREGION1_REGION_3_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_3_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT) +#define I40E_VFQF_HREGION1_REGION_4_SHIFT 17 +#define I40E_VFQF_HREGION1_REGION_4_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_4_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT) +#define I40E_VFQF_HREGION1_REGION_5_SHIFT 21 +#define I40E_VFQF_HREGION1_REGION_5_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_5_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT) +#define I40E_VFQF_HREGION1_REGION_6_SHIFT 25 +#define I40E_VFQF_HREGION1_REGION_6_MASK 
I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_6_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT) +#define I40E_VFQF_HREGION1_REGION_7_SHIFT 29 +#define I40E_VFQF_HREGION1_REGION_7_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_7_SHIFT) +#define I40E_VPQF_CTL(_VF) (0x001C0000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPQF_CTL_MAX_INDEX 127 +#define I40E_VPQF_CTL_PEHSIZE_SHIFT 0 +#define I40E_VPQF_CTL_PEHSIZE_MASK I40E_MASK(0x1F, I40E_VPQF_CTL_PEHSIZE_SHIFT) +#define I40E_VPQF_CTL_PEDSIZE_SHIFT 5 +#define I40E_VPQF_CTL_PEDSIZE_MASK I40E_MASK(0x1F, I40E_VPQF_CTL_PEDSIZE_SHIFT) +#define I40E_VPQF_CTL_FCHSIZE_SHIFT 10 +#define I40E_VPQF_CTL_FCHSIZE_MASK I40E_MASK(0xF, I40E_VPQF_CTL_FCHSIZE_SHIFT) +#define I40E_VPQF_CTL_FCDSIZE_SHIFT 14 +#define I40E_VPQF_CTL_FCDSIZE_MASK I40E_MASK(0x3, I40E_VPQF_CTL_FCDSIZE_SHIFT) +#define I40E_VSIQF_CTL(_VSI) (0x0020D800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: PFR */ +#define I40E_VSIQF_CTL_MAX_INDEX 383 +#define I40E_VSIQF_CTL_FCOE_ENA_SHIFT 0 +#define I40E_VSIQF_CTL_FCOE_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_FCOE_ENA_SHIFT) +#define I40E_VSIQF_CTL_PETCP_ENA_SHIFT 1 +#define I40E_VSIQF_CTL_PETCP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PETCP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT 2 +#define I40E_VSIQF_CTL_PEUUDP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT 3 +#define I40E_VSIQF_CTL_PEMUDP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT 4 +#define I40E_VSIQF_CTL_PEUFRAG_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT 5 +#define I40E_VSIQF_CTL_PEMFRAG_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT) +#define I40E_VSIQF_TCREGION(_i, _VSI) (0x00206000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...3, _VSI=0...383 */ /* Reset: PFR */ +#define 
I40E_VSIQF_TCREGION_MAX_INDEX 3 +#define I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT 0 +#define I40E_VSIQF_TCREGION_TC_OFFSET_MASK I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT) +#define I40E_VSIQF_TCREGION_TC_SIZE_SHIFT 9 +#define I40E_VSIQF_TCREGION_TC_SIZE_MASK I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE_SHIFT) +#define I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT 16 +#define I40E_VSIQF_TCREGION_TC_OFFSET2_MASK I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT) +#define I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT 25 +#define I40E_VSIQF_TCREGION_TC_SIZE2_MASK I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT) +#define I40E_GL_FCOECRC(_i) (0x00314d80 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOECRC_MAX_INDEX 143 +#define I40E_GL_FCOECRC_FCOECRC_SHIFT 0 +#define I40E_GL_FCOECRC_FCOECRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOECRC_FCOECRC_SHIFT) +#define I40E_GL_FCOEDDPC(_i) (0x00314480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDDPC_MAX_INDEX 143 +#define I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT 0 +#define I40E_GL_FCOEDDPC_FCOEDDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT) +#define I40E_GL_FCOEDIFEC(_i) (0x00318480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIFEC_MAX_INDEX 143 +#define I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT 0 +#define I40E_GL_FCOEDIFEC_FCOEDIFRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT) +#define I40E_GL_FCOEDIFTCL(_i) (0x00354000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIFTCL_MAX_INDEX 143 +#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT 0 +#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT) +#define I40E_GL_FCOEDIXEC(_i) (0x0034c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIXEC_MAX_INDEX 143 +#define I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT 0 +#define I40E_GL_FCOEDIXEC_FCOEDIXEC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT) +#define 
I40E_GL_FCOEDIXVC(_i) (0x00350000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIXVC_MAX_INDEX 143 +#define I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT 0 +#define I40E_GL_FCOEDIXVC_FCOEDIXVC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT) +#define I40E_GL_FCOEDWRCH(_i) (0x00320004 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWRCH_MAX_INDEX 143 +#define I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT 0 +#define I40E_GL_FCOEDWRCH_FCOEDWRCH_MASK I40E_MASK(0xFFFF, I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT) +#define I40E_GL_FCOEDWRCL(_i) (0x00320000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWRCL_MAX_INDEX 143 +#define I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT 0 +#define I40E_GL_FCOEDWRCL_FCOEDWRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT) +#define I40E_GL_FCOEDWTCH(_i) (0x00348084 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWTCH_MAX_INDEX 143 +#define I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT 0 +#define I40E_GL_FCOEDWTCH_FCOEDWTCH_MASK I40E_MASK(0xFFFF, I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT) +#define I40E_GL_FCOEDWTCL(_i) (0x00348080 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWTCL_MAX_INDEX 143 +#define I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT 0 +#define I40E_GL_FCOEDWTCL_FCOEDWTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT) +#define I40E_GL_FCOELAST(_i) (0x00314000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOELAST_MAX_INDEX 143 +#define I40E_GL_FCOELAST_FCOELAST_SHIFT 0 +#define I40E_GL_FCOELAST_FCOELAST_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOELAST_FCOELAST_SHIFT) +#define I40E_GL_FCOEPRC(_i) (0x00315200 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEPRC_MAX_INDEX 143 +#define I40E_GL_FCOEPRC_FCOEPRC_SHIFT 0 +#define I40E_GL_FCOEPRC_FCOEPRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPRC_FCOEPRC_SHIFT) +#define I40E_GL_FCOEPTC(_i) (0x00344C00 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ 
+#define I40E_GL_FCOEPTC_MAX_INDEX 143 +#define I40E_GL_FCOEPTC_FCOEPTC_SHIFT 0 +#define I40E_GL_FCOEPTC_FCOEPTC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPTC_FCOEPTC_SHIFT) +#define I40E_GL_FCOERPDC(_i) (0x00324000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOERPDC_MAX_INDEX 143 +#define I40E_GL_FCOERPDC_FCOERPDC_SHIFT 0 +#define I40E_GL_FCOERPDC_FCOERPDC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOERPDC_FCOERPDC_SHIFT) +#define I40E_GL_RXERR1_L(_i) (0x00318000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_RXERR1_L_MAX_INDEX 143 +#define I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT 0 +#define I40E_GL_RXERR1_L_FCOEDIFRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT) +#define I40E_GL_RXERR2_L(_i) (0x0031c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_RXERR2_L_MAX_INDEX 143 +#define I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT 0 +#define I40E_GL_RXERR2_L_FCOEDIXAC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT) +#define I40E_GLPRT_BPRCH(_i) (0x003005E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPRCH_MAX_INDEX 3 +#define I40E_GLPRT_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLPRT_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_BPRCH_BPRCH_SHIFT) +#define I40E_GLPRT_BPRCL(_i) (0x003005E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPRCL_MAX_INDEX 3 +#define I40E_GLPRT_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLPRT_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_BPRCL_BPRCL_SHIFT) +#define I40E_GLPRT_BPTCH(_i) (0x00300A04 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPTCH_MAX_INDEX 3 +#define I40E_GLPRT_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLPRT_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_BPTCH_BPTCH_SHIFT) +#define I40E_GLPRT_BPTCL(_i) (0x00300A00 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPTCL_MAX_INDEX 3 +#define I40E_GLPRT_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLPRT_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, 
I40E_GLPRT_BPTCL_BPTCL_SHIFT) +#define I40E_GLPRT_CRCERRS(_i) (0x00300080 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_CRCERRS_MAX_INDEX 3 +#define I40E_GLPRT_CRCERRS_CRCERRS_SHIFT 0 +#define I40E_GLPRT_CRCERRS_CRCERRS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_CRCERRS_CRCERRS_SHIFT) +#define I40E_GLPRT_GORCH(_i) (0x00300004 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GORCH_MAX_INDEX 3 +#define I40E_GLPRT_GORCH_GORCH_SHIFT 0 +#define I40E_GLPRT_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_GORCH_GORCH_SHIFT) +#define I40E_GLPRT_GORCL(_i) (0x00300000 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GORCL_MAX_INDEX 3 +#define I40E_GLPRT_GORCL_GORCL_SHIFT 0 +#define I40E_GLPRT_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GORCL_GORCL_SHIFT) +#define I40E_GLPRT_GOTCH(_i) (0x00300684 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GOTCH_MAX_INDEX 3 +#define I40E_GLPRT_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLPRT_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_GOTCH_GOTCH_SHIFT) +#define I40E_GLPRT_GOTCL(_i) (0x00300680 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GOTCL_MAX_INDEX 3 +#define I40E_GLPRT_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLPRT_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GOTCL_GOTCL_SHIFT) +#define I40E_GLPRT_ILLERRC(_i) (0x003000E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_ILLERRC_MAX_INDEX 3 +#define I40E_GLPRT_ILLERRC_ILLERRC_SHIFT 0 +#define I40E_GLPRT_ILLERRC_ILLERRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ILLERRC_ILLERRC_SHIFT) +#define I40E_GLPRT_LDPC(_i) (0x00300620 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LDPC_MAX_INDEX 3 +#define I40E_GLPRT_LDPC_LDPC_SHIFT 0 +#define I40E_GLPRT_LDPC_LDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LDPC_LDPC_SHIFT) +#define I40E_GLPRT_LXOFFRXC(_i) (0x00300160 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXOFFRXC_MAX_INDEX 3 +#define 
I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT 0 +#define I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT) +#define I40E_GLPRT_LXOFFTXC(_i) (0x003009A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXOFFTXC_MAX_INDEX 3 +#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT 0 +#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT) +#define I40E_GLPRT_LXONRXC(_i) (0x00300140 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXONRXC_MAX_INDEX 3 +#define I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT 0 +#define I40E_GLPRT_LXONRXC_LXONRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT) +#define I40E_GLPRT_LXONTXC(_i) (0x00300980 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXONTXC_MAX_INDEX 3 +#define I40E_GLPRT_LXONTXC_LXONTXC_SHIFT 0 +#define I40E_GLPRT_LXONTXC_LXONTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONTXC_LXONTXC_SHIFT) +#define I40E_GLPRT_MLFC(_i) (0x00300020 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MLFC_MAX_INDEX 3 +#define I40E_GLPRT_MLFC_MLFC_SHIFT 0 +#define I40E_GLPRT_MLFC_MLFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MLFC_MLFC_SHIFT) +#define I40E_GLPRT_MPRCH(_i) (0x003005C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPRCH_MAX_INDEX 3 +#define I40E_GLPRT_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLPRT_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_MPRCH_MPRCH_SHIFT) +#define I40E_GLPRT_MPRCL(_i) (0x003005C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPRCL_MAX_INDEX 3 +#define I40E_GLPRT_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLPRT_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPRCL_MPRCL_SHIFT) +#define I40E_GLPRT_MPTCH(_i) (0x003009E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPTCH_MAX_INDEX 3 +#define I40E_GLPRT_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLPRT_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_MPTCH_MPTCH_SHIFT) +#define 
I40E_GLPRT_MPTCL(_i) (0x003009E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPTCL_MAX_INDEX 3 +#define I40E_GLPRT_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLPRT_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPTCL_MPTCL_SHIFT) +#define I40E_GLPRT_MRFC(_i) (0x00300040 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MRFC_MAX_INDEX 3 +#define I40E_GLPRT_MRFC_MRFC_SHIFT 0 +#define I40E_GLPRT_MRFC_MRFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MRFC_MRFC_SHIFT) +#define I40E_GLPRT_PRC1023H(_i) (0x00300504 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1023H_MAX_INDEX 3 +#define I40E_GLPRT_PRC1023H_PRC1023H_SHIFT 0 +#define I40E_GLPRT_PRC1023H_PRC1023H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC1023H_PRC1023H_SHIFT) +#define I40E_GLPRT_PRC1023L(_i) (0x00300500 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1023L_MAX_INDEX 3 +#define I40E_GLPRT_PRC1023L_PRC1023L_SHIFT 0 +#define I40E_GLPRT_PRC1023L_PRC1023L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1023L_PRC1023L_SHIFT) +#define I40E_GLPRT_PRC127H(_i) (0x003004A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC127H_MAX_INDEX 3 +#define I40E_GLPRT_PRC127H_PRC127H_SHIFT 0 +#define I40E_GLPRT_PRC127H_PRC127H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC127H_PRC127H_SHIFT) +#define I40E_GLPRT_PRC127L(_i) (0x003004A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC127L_MAX_INDEX 3 +#define I40E_GLPRT_PRC127L_PRC127L_SHIFT 0 +#define I40E_GLPRT_PRC127L_PRC127L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC127L_PRC127L_SHIFT) +#define I40E_GLPRT_PRC1522H(_i) (0x00300524 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1522H_MAX_INDEX 3 +#define I40E_GLPRT_PRC1522H_PRC1522H_SHIFT 0 +#define I40E_GLPRT_PRC1522H_PRC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC1522H_PRC1522H_SHIFT) +#define I40E_GLPRT_PRC1522L(_i) (0x00300520 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define 
I40E_GLPRT_PRC1522L_MAX_INDEX 3 +#define I40E_GLPRT_PRC1522L_PRC1522L_SHIFT 0 +#define I40E_GLPRT_PRC1522L_PRC1522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1522L_PRC1522L_SHIFT) +#define I40E_GLPRT_PRC255H(_i) (0x003004C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC255H_MAX_INDEX 3 +#define I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT 0 +#define I40E_GLPRT_PRC255H_PRTPRC255H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT) +#define I40E_GLPRT_PRC255L(_i) (0x003004C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC255L_MAX_INDEX 3 +#define I40E_GLPRT_PRC255L_PRC255L_SHIFT 0 +#define I40E_GLPRT_PRC255L_PRC255L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC255L_PRC255L_SHIFT) +#define I40E_GLPRT_PRC511H(_i) (0x003004E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC511H_MAX_INDEX 3 +#define I40E_GLPRT_PRC511H_PRC511H_SHIFT 0 +#define I40E_GLPRT_PRC511H_PRC511H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC511H_PRC511H_SHIFT) +#define I40E_GLPRT_PRC511L(_i) (0x003004E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC511L_MAX_INDEX 3 +#define I40E_GLPRT_PRC511L_PRC511L_SHIFT 0 +#define I40E_GLPRT_PRC511L_PRC511L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC511L_PRC511L_SHIFT) +#define I40E_GLPRT_PRC64H(_i) (0x00300484 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC64H_MAX_INDEX 3 +#define I40E_GLPRT_PRC64H_PRC64H_SHIFT 0 +#define I40E_GLPRT_PRC64H_PRC64H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC64H_PRC64H_SHIFT) +#define I40E_GLPRT_PRC64L(_i) (0x00300480 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC64L_MAX_INDEX 3 +#define I40E_GLPRT_PRC64L_PRC64L_SHIFT 0 +#define I40E_GLPRT_PRC64L_PRC64L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC64L_PRC64L_SHIFT) +#define I40E_GLPRT_PRC9522H(_i) (0x00300544 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC9522H_MAX_INDEX 3 +#define I40E_GLPRT_PRC9522H_PRC1522H_SHIFT 0 +#define 
I40E_GLPRT_PRC9522H_PRC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC9522H_PRC1522H_SHIFT) +#define I40E_GLPRT_PRC9522L(_i) (0x00300540 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC9522L_MAX_INDEX 3 +#define I40E_GLPRT_PRC9522L_PRC1522L_SHIFT 0 +#define I40E_GLPRT_PRC9522L_PRC1522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC9522L_PRC1522L_SHIFT) +#define I40E_GLPRT_PTC1023H(_i) (0x00300724 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1023H_MAX_INDEX 3 +#define I40E_GLPRT_PTC1023H_PTC1023H_SHIFT 0 +#define I40E_GLPRT_PTC1023H_PTC1023H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC1023H_PTC1023H_SHIFT) +#define I40E_GLPRT_PTC1023L(_i) (0x00300720 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1023L_MAX_INDEX 3 +#define I40E_GLPRT_PTC1023L_PTC1023L_SHIFT 0 +#define I40E_GLPRT_PTC1023L_PTC1023L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC1023L_PTC1023L_SHIFT) +#define I40E_GLPRT_PTC127H(_i) (0x003006C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC127H_MAX_INDEX 3 +#define I40E_GLPRT_PTC127H_PTC127H_SHIFT 0 +#define I40E_GLPRT_PTC127H_PTC127H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC127H_PTC127H_SHIFT) +#define I40E_GLPRT_PTC127L(_i) (0x003006C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC127L_MAX_INDEX 3 +#define I40E_GLPRT_PTC127L_PTC127L_SHIFT 0 +#define I40E_GLPRT_PTC127L_PTC127L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC127L_PTC127L_SHIFT) +#define I40E_GLPRT_PTC1522H(_i) (0x00300744 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1522H_MAX_INDEX 3 +#define I40E_GLPRT_PTC1522H_PTC1522H_SHIFT 0 +#define I40E_GLPRT_PTC1522H_PTC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC1522H_PTC1522H_SHIFT) +#define I40E_GLPRT_PTC1522L(_i) (0x00300740 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1522L_MAX_INDEX 3 +#define I40E_GLPRT_PTC1522L_PTC1522L_SHIFT 0 +#define I40E_GLPRT_PTC1522L_PTC1522L_MASK I40E_MASK(0xFFFFFFFF, 
I40E_GLPRT_PTC1522L_PTC1522L_SHIFT) +#define I40E_GLPRT_PTC255H(_i) (0x003006E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC255H_MAX_INDEX 3 +#define I40E_GLPRT_PTC255H_PTC255H_SHIFT 0 +#define I40E_GLPRT_PTC255H_PTC255H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC255H_PTC255H_SHIFT) +#define I40E_GLPRT_PTC255L(_i) (0x003006E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC255L_MAX_INDEX 3 +#define I40E_GLPRT_PTC255L_PTC255L_SHIFT 0 +#define I40E_GLPRT_PTC255L_PTC255L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC255L_PTC255L_SHIFT) +#define I40E_GLPRT_PTC511H(_i) (0x00300704 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC511H_MAX_INDEX 3 +#define I40E_GLPRT_PTC511H_PTC511H_SHIFT 0 +#define I40E_GLPRT_PTC511H_PTC511H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC511H_PTC511H_SHIFT) +#define I40E_GLPRT_PTC511L(_i) (0x00300700 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC511L_MAX_INDEX 3 +#define I40E_GLPRT_PTC511L_PTC511L_SHIFT 0 +#define I40E_GLPRT_PTC511L_PTC511L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC511L_PTC511L_SHIFT) +#define I40E_GLPRT_PTC64H(_i) (0x003006A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC64H_MAX_INDEX 3 +#define I40E_GLPRT_PTC64H_PTC64H_SHIFT 0 +#define I40E_GLPRT_PTC64H_PTC64H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC64H_PTC64H_SHIFT) +#define I40E_GLPRT_PTC64L(_i) (0x003006A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC64L_MAX_INDEX 3 +#define I40E_GLPRT_PTC64L_PTC64L_SHIFT 0 +#define I40E_GLPRT_PTC64L_PTC64L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC64L_PTC64L_SHIFT) +#define I40E_GLPRT_PTC9522H(_i) (0x00300764 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC9522H_MAX_INDEX 3 +#define I40E_GLPRT_PTC9522H_PTC9522H_SHIFT 0 +#define I40E_GLPRT_PTC9522H_PTC9522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC9522H_PTC9522H_SHIFT) +#define I40E_GLPRT_PTC9522L(_i) (0x00300760 + ((_i) * 8)) /* _i=0...3 */ /* 
Reset: CORER */ +#define I40E_GLPRT_PTC9522L_MAX_INDEX 3 +#define I40E_GLPRT_PTC9522L_PTC9522L_SHIFT 0 +#define I40E_GLPRT_PTC9522L_PTC9522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC9522L_PTC9522L_SHIFT) +#define I40E_GLPRT_PXOFFRXC(_i, _j) (0x00300280 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXOFFRXC_MAX_INDEX 3 +#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT 0 +#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT) +#define I40E_GLPRT_PXOFFTXC(_i, _j) (0x00300880 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXOFFTXC_MAX_INDEX 3 +#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT 0 +#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT) +#define I40E_GLPRT_PXONRXC(_i, _j) (0x00300180 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXONRXC_MAX_INDEX 3 +#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT 0 +#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT) +#define I40E_GLPRT_PXONTXC(_i, _j) (0x00300780 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXONTXC_MAX_INDEX 3 +#define I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT 0 +#define I40E_GLPRT_PXONTXC_PRPXONTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT) +#define I40E_GLPRT_RDPC(_i) (0x00300600 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RDPC_MAX_INDEX 3 +#define I40E_GLPRT_RDPC_RDPC_SHIFT 0 +#define I40E_GLPRT_RDPC_RDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RDPC_RDPC_SHIFT) +#define I40E_GLPRT_RFC(_i) (0x00300560 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RFC_MAX_INDEX 3 +#define I40E_GLPRT_RFC_RFC_SHIFT 0 +#define I40E_GLPRT_RFC_RFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RFC_RFC_SHIFT) +#define I40E_GLPRT_RJC(_i) (0x00300580 + ((_i) * 
8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RJC_MAX_INDEX 3 +#define I40E_GLPRT_RJC_RJC_SHIFT 0 +#define I40E_GLPRT_RJC_RJC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RJC_RJC_SHIFT) +#define I40E_GLPRT_RLEC(_i) (0x003000A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RLEC_MAX_INDEX 3 +#define I40E_GLPRT_RLEC_RLEC_SHIFT 0 +#define I40E_GLPRT_RLEC_RLEC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RLEC_RLEC_SHIFT) +#define I40E_GLPRT_ROC(_i) (0x00300120 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_ROC_MAX_INDEX 3 +#define I40E_GLPRT_ROC_ROC_SHIFT 0 +#define I40E_GLPRT_ROC_ROC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ROC_ROC_SHIFT) +#define I40E_GLPRT_RUC(_i) (0x00300100 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RUC_MAX_INDEX 3 +#define I40E_GLPRT_RUC_RUC_SHIFT 0 +#define I40E_GLPRT_RUC_RUC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUC_RUC_SHIFT) +#define I40E_GLPRT_RUPP(_i) (0x00300660 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RUPP_MAX_INDEX 3 +#define I40E_GLPRT_RUPP_RUPP_SHIFT 0 +#define I40E_GLPRT_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUPP_RUPP_SHIFT) +#define I40E_GLPRT_RXON2OFFCNT(_i, _j) (0x00300380 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_RXON2OFFCNT_MAX_INDEX 3 +#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT 0 +#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT) +#define I40E_GLPRT_TDOLD(_i) (0x00300A20 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_TDOLD_MAX_INDEX 3 +#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT 0 +#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT) +#define I40E_GLPRT_UPRCH(_i) (0x003005A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPRCH_MAX_INDEX 3 +#define I40E_GLPRT_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLPRT_UPRCH_UPRCH_MASK 
I40E_MASK(0xFFFF, I40E_GLPRT_UPRCH_UPRCH_SHIFT) +#define I40E_GLPRT_UPRCL(_i) (0x003005A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPRCL_MAX_INDEX 3 +#define I40E_GLPRT_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLPRT_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPRCL_UPRCL_SHIFT) +#define I40E_GLPRT_UPTCH(_i) (0x003009C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPTCH_MAX_INDEX 3 +#define I40E_GLPRT_UPTCH_UPTCH_SHIFT 0 +#define I40E_GLPRT_UPTCH_UPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_UPTCH_UPTCH_SHIFT) +#define I40E_GLPRT_UPTCL(_i) (0x003009C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPTCL_MAX_INDEX 3 +#define I40E_GLPRT_UPTCL_VUPTCH_SHIFT 0 +#define I40E_GLPRT_UPTCL_VUPTCH_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPTCL_VUPTCH_SHIFT) +#define I40E_GLSW_BPRCH(_i) (0x00370104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPRCH_MAX_INDEX 15 +#define I40E_GLSW_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLSW_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_BPRCH_BPRCH_SHIFT) +#define I40E_GLSW_BPRCL(_i) (0x00370100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPRCL_MAX_INDEX 15 +#define I40E_GLSW_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLSW_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPRCL_BPRCL_SHIFT) +#define I40E_GLSW_BPTCH(_i) (0x00340104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPTCH_MAX_INDEX 15 +#define I40E_GLSW_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLSW_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_BPTCH_BPTCH_SHIFT) +#define I40E_GLSW_BPTCL(_i) (0x00340100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPTCL_MAX_INDEX 15 +#define I40E_GLSW_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLSW_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPTCL_BPTCL_SHIFT) +#define I40E_GLSW_GORCH(_i) (0x0035C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GORCH_MAX_INDEX 15 +#define I40E_GLSW_GORCH_GORCH_SHIFT 0 
+#define I40E_GLSW_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_GORCH_GORCH_SHIFT) +#define I40E_GLSW_GORCL(_i) (0x0035c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GORCL_MAX_INDEX 15 +#define I40E_GLSW_GORCL_GORCL_SHIFT 0 +#define I40E_GLSW_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_GORCL_GORCL_SHIFT) +#define I40E_GLSW_GOTCH(_i) (0x0032C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GOTCH_MAX_INDEX 15 +#define I40E_GLSW_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLSW_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_GOTCH_GOTCH_SHIFT) +#define I40E_GLSW_GOTCL(_i) (0x0032c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GOTCL_MAX_INDEX 15 +#define I40E_GLSW_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLSW_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_GOTCL_GOTCL_SHIFT) +#define I40E_GLSW_MPRCH(_i) (0x00370084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPRCH_MAX_INDEX 15 +#define I40E_GLSW_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLSW_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_MPRCH_MPRCH_SHIFT) +#define I40E_GLSW_MPRCL(_i) (0x00370080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPRCL_MAX_INDEX 15 +#define I40E_GLSW_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLSW_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPRCL_MPRCL_SHIFT) +#define I40E_GLSW_MPTCH(_i) (0x00340084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPTCH_MAX_INDEX 15 +#define I40E_GLSW_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLSW_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_MPTCH_MPTCH_SHIFT) +#define I40E_GLSW_MPTCL(_i) (0x00340080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPTCL_MAX_INDEX 15 +#define I40E_GLSW_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLSW_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPTCL_MPTCL_SHIFT) +#define I40E_GLSW_RUPP(_i) (0x00370180 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_RUPP_MAX_INDEX 15 +#define 
I40E_GLSW_RUPP_RUPP_SHIFT 0 +#define I40E_GLSW_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_RUPP_RUPP_SHIFT) +#define I40E_GLSW_TDPC(_i) (0x00348000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_TDPC_MAX_INDEX 15 +#define I40E_GLSW_TDPC_TDPC_SHIFT 0 +#define I40E_GLSW_TDPC_TDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_TDPC_TDPC_SHIFT) +#define I40E_GLSW_UPRCH(_i) (0x00370004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPRCH_MAX_INDEX 15 +#define I40E_GLSW_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLSW_UPRCH_UPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_UPRCH_UPRCH_SHIFT) +#define I40E_GLSW_UPRCL(_i) (0x00370000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPRCL_MAX_INDEX 15 +#define I40E_GLSW_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLSW_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPRCL_UPRCL_SHIFT) +#define I40E_GLSW_UPTCH(_i) (0x00340004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPTCH_MAX_INDEX 15 +#define I40E_GLSW_UPTCH_UPTCH_SHIFT 0 +#define I40E_GLSW_UPTCH_UPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_UPTCH_UPTCH_SHIFT) +#define I40E_GLSW_UPTCL(_i) (0x00340000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPTCL_MAX_INDEX 15 +#define I40E_GLSW_UPTCL_UPTCL_SHIFT 0 +#define I40E_GLSW_UPTCL_UPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPTCL_UPTCL_SHIFT) +#define I40E_GLV_BPRCH(_i) (0x0036D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPRCH_MAX_INDEX 383 +#define I40E_GLV_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLV_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_BPRCH_BPRCH_SHIFT) +#define I40E_GLV_BPRCL(_i) (0x0036d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPRCL_MAX_INDEX 383 +#define I40E_GLV_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLV_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_BPRCL_BPRCL_SHIFT) +#define I40E_GLV_BPTCH(_i) (0x0033D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPTCH_MAX_INDEX 383 +#define 
I40E_GLV_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLV_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_BPTCH_BPTCH_SHIFT) +#define I40E_GLV_BPTCL(_i) (0x0033d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPTCL_MAX_INDEX 383 +#define I40E_GLV_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLV_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_BPTCL_BPTCL_SHIFT) +#define I40E_GLV_GORCH(_i) (0x00358004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GORCH_MAX_INDEX 383 +#define I40E_GLV_GORCH_GORCH_SHIFT 0 +#define I40E_GLV_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLV_GORCH_GORCH_SHIFT) +#define I40E_GLV_GORCL(_i) (0x00358000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GORCL_MAX_INDEX 383 +#define I40E_GLV_GORCL_GORCL_SHIFT 0 +#define I40E_GLV_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_GORCL_GORCL_SHIFT) +#define I40E_GLV_GOTCH(_i) (0x00328004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GOTCH_MAX_INDEX 383 +#define I40E_GLV_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLV_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_GOTCH_GOTCH_SHIFT) +#define I40E_GLV_GOTCL(_i) (0x00328000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GOTCL_MAX_INDEX 383 +#define I40E_GLV_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLV_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_GOTCL_GOTCL_SHIFT) +#define I40E_GLV_MPRCH(_i) (0x0036CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPRCH_MAX_INDEX 383 +#define I40E_GLV_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLV_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_MPRCH_MPRCH_SHIFT) +#define I40E_GLV_MPRCL(_i) (0x0036cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPRCL_MAX_INDEX 383 +#define I40E_GLV_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLV_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_MPRCL_MPRCL_SHIFT) +#define I40E_GLV_MPTCH(_i) (0x0033CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPTCH_MAX_INDEX 383 +#define 
I40E_GLV_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLV_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_MPTCH_MPTCH_SHIFT) +#define I40E_GLV_MPTCL(_i) (0x0033cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPTCL_MAX_INDEX 383 +#define I40E_GLV_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLV_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_MPTCL_MPTCL_SHIFT) +#define I40E_GLV_RDPC(_i) (0x00310000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_RDPC_MAX_INDEX 383 +#define I40E_GLV_RDPC_RDPC_SHIFT 0 +#define I40E_GLV_RDPC_RDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RDPC_RDPC_SHIFT) +#define I40E_GLV_RUPP(_i) (0x0036E400 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_RUPP_MAX_INDEX 383 +#define I40E_GLV_RUPP_RUPP_SHIFT 0 +#define I40E_GLV_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT) +#define I40E_GLV_TEPC(_VSI) (0x00344000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_TEPC_MAX_INDEX 383 +#define I40E_GLV_TEPC_TEPC_SHIFT 0 +#define I40E_GLV_TEPC_TEPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT) +#define I40E_GLV_UPRCH(_i) (0x0036C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPRCH_MAX_INDEX 383 +#define I40E_GLV_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLV_UPRCH_UPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_UPRCH_UPRCH_SHIFT) +#define I40E_GLV_UPRCL(_i) (0x0036c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPRCL_MAX_INDEX 383 +#define I40E_GLV_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLV_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_UPRCL_UPRCL_SHIFT) +#define I40E_GLV_UPTCH(_i) (0x0033C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPTCH_MAX_INDEX 383 +#define I40E_GLV_UPTCH_GLVUPTCH_SHIFT 0 +#define I40E_GLV_UPTCH_GLVUPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_UPTCH_GLVUPTCH_SHIFT) +#define I40E_GLV_UPTCL(_i) (0x0033c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPTCL_MAX_INDEX 383 +#define 
I40E_GLV_UPTCL_UPTCL_SHIFT 0 +#define I40E_GLV_UPTCL_UPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_UPTCL_UPTCL_SHIFT) +#define I40E_GLVEBTC_RBCH(_i, _j) (0x00364004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RBCH_MAX_INDEX 7 +#define I40E_GLVEBTC_RBCH_TCBCH_SHIFT 0 +#define I40E_GLVEBTC_RBCH_TCBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_RBCH_TCBCH_SHIFT) +#define I40E_GLVEBTC_RBCL(_i, _j) (0x00364000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RBCL_MAX_INDEX 7 +#define I40E_GLVEBTC_RBCL_TCBCL_SHIFT 0 +#define I40E_GLVEBTC_RBCL_TCBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RBCL_TCBCL_SHIFT) +#define I40E_GLVEBTC_RPCH(_i, _j) (0x00368004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RPCH_MAX_INDEX 7 +#define I40E_GLVEBTC_RPCH_TCPCH_SHIFT 0 +#define I40E_GLVEBTC_RPCH_TCPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_RPCH_TCPCH_SHIFT) +#define I40E_GLVEBTC_RPCL(_i, _j) (0x00368000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RPCL_MAX_INDEX 7 +#define I40E_GLVEBTC_RPCL_TCPCL_SHIFT 0 +#define I40E_GLVEBTC_RPCL_TCPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RPCL_TCPCL_SHIFT) +#define I40E_GLVEBTC_TBCH(_i, _j) (0x00334004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TBCH_MAX_INDEX 7 +#define I40E_GLVEBTC_TBCH_TCBCH_SHIFT 0 +#define I40E_GLVEBTC_TBCH_TCBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_TBCH_TCBCH_SHIFT) +#define I40E_GLVEBTC_TBCL(_i, _j) (0x00334000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TBCL_MAX_INDEX 7 +#define I40E_GLVEBTC_TBCL_TCBCL_SHIFT 0 +#define I40E_GLVEBTC_TBCL_TCBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TBCL_TCBCL_SHIFT) +#define I40E_GLVEBTC_TPCH(_i, _j) (0x00338004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TPCH_MAX_INDEX 7 
+#define I40E_GLVEBTC_TPCH_TCPCH_SHIFT 0 +#define I40E_GLVEBTC_TPCH_TCPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_TPCH_TCPCH_SHIFT) +#define I40E_GLVEBTC_TPCL(_i, _j) (0x00338000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TPCL_MAX_INDEX 7 +#define I40E_GLVEBTC_TPCL_TCPCL_SHIFT 0 +#define I40E_GLVEBTC_TPCL_TCPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TPCL_TCPCL_SHIFT) +#define I40E_GLVEBVL_BPCH(_i) (0x00374804 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_BPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_BPCH_VLBPCH_SHIFT 0 +#define I40E_GLVEBVL_BPCH_VLBPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_BPCH_VLBPCH_SHIFT) +#define I40E_GLVEBVL_BPCL(_i) (0x00374800 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_BPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_BPCL_VLBPCL_SHIFT 0 +#define I40E_GLVEBVL_BPCL_VLBPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_BPCL_VLBPCL_SHIFT) +#define I40E_GLVEBVL_GORCH(_i) (0x00360004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GORCH_MAX_INDEX 127 +#define I40E_GLVEBVL_GORCH_VLBCH_SHIFT 0 +#define I40E_GLVEBVL_GORCH_VLBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_GORCH_VLBCH_SHIFT) +#define I40E_GLVEBVL_GORCL(_i) (0x00360000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GORCL_MAX_INDEX 127 +#define I40E_GLVEBVL_GORCL_VLBCL_SHIFT 0 +#define I40E_GLVEBVL_GORCL_VLBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_GORCL_VLBCL_SHIFT) +#define I40E_GLVEBVL_GOTCH(_i) (0x00330004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GOTCH_MAX_INDEX 127 +#define I40E_GLVEBVL_GOTCH_VLBCH_SHIFT 0 +#define I40E_GLVEBVL_GOTCH_VLBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_GOTCH_VLBCH_SHIFT) +#define I40E_GLVEBVL_GOTCL(_i) (0x00330000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GOTCL_MAX_INDEX 127 +#define I40E_GLVEBVL_GOTCL_VLBCL_SHIFT 0 +#define I40E_GLVEBVL_GOTCL_VLBCL_MASK I40E_MASK(0xFFFFFFFF, 
I40E_GLVEBVL_GOTCL_VLBCL_SHIFT) +#define I40E_GLVEBVL_MPCH(_i) (0x00374404 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_MPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_MPCH_VLMPCH_SHIFT 0 +#define I40E_GLVEBVL_MPCH_VLMPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_MPCH_VLMPCH_SHIFT) +#define I40E_GLVEBVL_MPCL(_i) (0x00374400 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_MPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_MPCL_VLMPCL_SHIFT 0 +#define I40E_GLVEBVL_MPCL_VLMPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_MPCL_VLMPCL_SHIFT) +#define I40E_GLVEBVL_UPCH(_i) (0x00374004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_UPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_UPCH_VLUPCH_SHIFT 0 +#define I40E_GLVEBVL_UPCH_VLUPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_UPCH_VLUPCH_SHIFT) +#define I40E_GLVEBVL_UPCL(_i) (0x00374000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_UPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_UPCL_VLUPCL_SHIFT 0 +#define I40E_GLVEBVL_UPCL_VLUPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_UPCL_VLUPCL_SHIFT) +#define I40E_GL_MTG_FLU_MSK_H 0x00269F4C /* Reset: CORER */ +#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT 0 +#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_MASK I40E_MASK(0xFFFF, I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT) +#define I40E_GL_SWR_DEF_ACT(_i) (0x00270200 + ((_i) * 4)) /* _i=0...35 */ /* Reset: CORER */ +#define I40E_GL_SWR_DEF_ACT_MAX_INDEX 35 +#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT 0 +#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT) +#define I40E_GL_SWR_DEF_ACT_EN(_i) (0x0026CFB8 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_GL_SWR_DEF_ACT_EN_MAX_INDEX 1 +#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT 0 +#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT) +#define I40E_PRTTSYN_ADJ 0x001E4280 /* Reset: GLOBR */ +#define 
I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT 0 +#define I40E_PRTTSYN_ADJ_TSYNADJ_MASK I40E_MASK(0x7FFFFFFF, I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT) +#define I40E_PRTTSYN_ADJ_SIGN_SHIFT 31 +#define I40E_PRTTSYN_ADJ_SIGN_MASK I40E_MASK(0x1, I40E_PRTTSYN_ADJ_SIGN_SHIFT) +#define I40E_PRTTSYN_AUX_0(_i) (0x001E42A0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_AUX_0_MAX_INDEX 1 +#define I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT 0 +#define I40E_PRTTSYN_AUX_0_OUT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT) +#define I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT 1 +#define I40E_PRTTSYN_AUX_0_OUTMOD_MASK I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT) +#define I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT 3 +#define I40E_PRTTSYN_AUX_0_OUTLVL_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT) +#define I40E_PRTTSYN_AUX_0_PULSEW_SHIFT 8 +#define I40E_PRTTSYN_AUX_0_PULSEW_MASK I40E_MASK(0xF, I40E_PRTTSYN_AUX_0_PULSEW_SHIFT) +#define I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT 16 +#define I40E_PRTTSYN_AUX_0_EVNTLVL_MASK I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT) +#define I40E_PRTTSYN_AUX_1(_i) (0x001E42E0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_AUX_1_MAX_INDEX 1 +#define I40E_PRTTSYN_AUX_1_INSTNT_SHIFT 0 +#define I40E_PRTTSYN_AUX_1_INSTNT_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_INSTNT_SHIFT) +#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT 1 +#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT) +#define I40E_PRTTSYN_CLKO(_i) (0x001E4240 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_CLKO_MAX_INDEX 1 +#define I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT 0 +#define I40E_PRTTSYN_CLKO_TSYNCLKO_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT) +#define I40E_PRTTSYN_CTL0 0x001E4200 /* Reset: GLOBR */ +#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT 0 +#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT) +#define I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT 1 +#define 
I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT 2 +#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT 3 +#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_PF_ID_SHIFT 8 +#define I40E_PRTTSYN_CTL0_PF_ID_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL0_PF_ID_SHIFT) +#define I40E_PRTTSYN_CTL0_TSYNACT_SHIFT 12 +#define I40E_PRTTSYN_CTL0_TSYNACT_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL0_TSYNACT_SHIFT) +#define I40E_PRTTSYN_CTL0_TSYNENA_SHIFT 31 +#define I40E_PRTTSYN_CTL0_TSYNENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TSYNENA_SHIFT) +#define I40E_PRTTSYN_CTL1 0x00085020 /* Reset: CORER */ +#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT 0 +#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT) +#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT 8 +#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_MASK I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT) +#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT 16 +#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT) +#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT 20 +#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT) +#define I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT 24 +#define I40E_PRTTSYN_CTL1_TSYNTYPE_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT) +#define I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT 26 +#define I40E_PRTTSYN_CTL1_UDP_ENA_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT) +#define I40E_PRTTSYN_CTL1_TSYNENA_SHIFT 31 +#define I40E_PRTTSYN_CTL1_TSYNENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL1_TSYNENA_SHIFT) +#define I40E_PRTTSYN_EVNT_H(_i) (0x001E40C0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_EVNT_H_MAX_INDEX 1 +#define I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT 0 +#define 
I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT) +#define I40E_PRTTSYN_EVNT_L(_i) (0x001E4080 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_EVNT_L_MAX_INDEX 1 +#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT 0 +#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT) +#define I40E_PRTTSYN_INC_H 0x001E4060 /* Reset: GLOBR */ +#define I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT 0 +#define I40E_PRTTSYN_INC_H_TSYNINC_H_MASK I40E_MASK(0x3F, I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT) +#define I40E_PRTTSYN_INC_L 0x001E4040 /* Reset: GLOBR */ +#define I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT 0 +#define I40E_PRTTSYN_INC_L_TSYNINC_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT) +#define I40E_PRTTSYN_RXTIME_H(_i) (0x00085040 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_PRTTSYN_RXTIME_H_MAX_INDEX 3 +#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT 0 +#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT) +#define I40E_PRTTSYN_RXTIME_L(_i) (0x000850C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_PRTTSYN_RXTIME_L_MAX_INDEX 3 +#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT 0 +#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT) +#define I40E_PRTTSYN_STAT_0 0x001E4220 /* Reset: GLOBR */ +#define I40E_PRTTSYN_STAT_0_EVENT0_SHIFT 0 +#define I40E_PRTTSYN_STAT_0_EVENT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT0_SHIFT) +#define I40E_PRTTSYN_STAT_0_EVENT1_SHIFT 1 +#define I40E_PRTTSYN_STAT_0_EVENT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT1_SHIFT) +#define I40E_PRTTSYN_STAT_0_TGT0_SHIFT 2 +#define I40E_PRTTSYN_STAT_0_TGT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT0_SHIFT) +#define I40E_PRTTSYN_STAT_0_TGT1_SHIFT 3 +#define I40E_PRTTSYN_STAT_0_TGT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT1_SHIFT) +#define I40E_PRTTSYN_STAT_0_TXTIME_SHIFT 4 
+#define I40E_PRTTSYN_STAT_0_TXTIME_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TXTIME_SHIFT) +#define I40E_PRTTSYN_STAT_1 0x00085140 /* Reset: CORER */ +#define I40E_PRTTSYN_STAT_1_RXT0_SHIFT 0 +#define I40E_PRTTSYN_STAT_1_RXT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT0_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT1_SHIFT 1 +#define I40E_PRTTSYN_STAT_1_RXT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT1_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT2_SHIFT 2 +#define I40E_PRTTSYN_STAT_1_RXT2_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT2_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT3_SHIFT 3 +#define I40E_PRTTSYN_STAT_1_RXT3_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT3_SHIFT) +#define I40E_PRTTSYN_TGT_H(_i) (0x001E4180 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_TGT_H_MAX_INDEX 1 +#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT 0 +#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT) +#define I40E_PRTTSYN_TGT_L(_i) (0x001E4140 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_TGT_L_MAX_INDEX 1 +#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT 0 +#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT) +#define I40E_PRTTSYN_TIME_H 0x001E4120 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT 0 +#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT) +#define I40E_PRTTSYN_TIME_L 0x001E4100 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT 0 +#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT) +#define I40E_PRTTSYN_TXTIME_H 0x001E41E0 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT 0 +#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT) +#define I40E_PRTTSYN_TXTIME_L 0x001E41C0 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT 0 +#define 
I40E_PRTTSYN_TXTIME_L_TXTIEM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT) +#define I40E_GL_MDET_RX 0x0012A510 /* Reset: CORER */ +#define I40E_GL_MDET_RX_FUNCTION_SHIFT 0 +#define I40E_GL_MDET_RX_FUNCTION_MASK I40E_MASK(0xFF, I40E_GL_MDET_RX_FUNCTION_SHIFT) +#define I40E_GL_MDET_RX_EVENT_SHIFT 8 +#define I40E_GL_MDET_RX_EVENT_MASK I40E_MASK(0x1FF, I40E_GL_MDET_RX_EVENT_SHIFT) +#define I40E_GL_MDET_RX_QUEUE_SHIFT 17 +#define I40E_GL_MDET_RX_QUEUE_MASK I40E_MASK(0x3FFF, I40E_GL_MDET_RX_QUEUE_SHIFT) +#define I40E_GL_MDET_RX_VALID_SHIFT 31 +#define I40E_GL_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_GL_MDET_RX_VALID_SHIFT) +#define I40E_GL_MDET_TX 0x000E6480 /* Reset: CORER */ +#define I40E_GL_MDET_TX_QUEUE_SHIFT 0 +#define I40E_GL_MDET_TX_QUEUE_MASK I40E_MASK(0xFFF, I40E_GL_MDET_TX_QUEUE_SHIFT) +#define I40E_GL_MDET_TX_VF_NUM_SHIFT 12 +#define I40E_GL_MDET_TX_VF_NUM_MASK I40E_MASK(0x1FF, I40E_GL_MDET_TX_VF_NUM_SHIFT) +#define I40E_GL_MDET_TX_PF_NUM_SHIFT 21 +#define I40E_GL_MDET_TX_PF_NUM_MASK I40E_MASK(0xF, I40E_GL_MDET_TX_PF_NUM_SHIFT) +#define I40E_GL_MDET_TX_EVENT_SHIFT 25 +#define I40E_GL_MDET_TX_EVENT_MASK I40E_MASK(0x1F, I40E_GL_MDET_TX_EVENT_SHIFT) +#define I40E_GL_MDET_TX_VALID_SHIFT 31 +#define I40E_GL_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_GL_MDET_TX_VALID_SHIFT) +#define I40E_PF_MDET_RX 0x0012A400 /* Reset: CORER */ +#define I40E_PF_MDET_RX_VALID_SHIFT 0 +#define I40E_PF_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_PF_MDET_RX_VALID_SHIFT) +#define I40E_PF_MDET_TX 0x000E6400 /* Reset: CORER */ +#define I40E_PF_MDET_TX_VALID_SHIFT 0 +#define I40E_PF_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_PF_MDET_TX_VALID_SHIFT) +#define I40E_PF_VT_PFALLOC 0x001C0500 /* Reset: CORER */ +#define I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT 0 +#define I40E_PF_VT_PFALLOC_FIRSTVF_MASK I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT) +#define I40E_PF_VT_PFALLOC_LASTVF_SHIFT 8 +#define I40E_PF_VT_PFALLOC_LASTVF_MASK I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_LASTVF_SHIFT) +#define 
I40E_PF_VT_PFALLOC_VALID_SHIFT 31 +#define I40E_PF_VT_PFALLOC_VALID_MASK I40E_MASK(0x1, I40E_PF_VT_PFALLOC_VALID_SHIFT) +#define I40E_VP_MDET_RX(_VF) (0x0012A000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VP_MDET_RX_MAX_INDEX 127 +#define I40E_VP_MDET_RX_VALID_SHIFT 0 +#define I40E_VP_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_VP_MDET_RX_VALID_SHIFT) +#define I40E_VP_MDET_TX(_VF) (0x000E6000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VP_MDET_TX_MAX_INDEX 127 +#define I40E_VP_MDET_TX_VALID_SHIFT 0 +#define I40E_VP_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_VP_MDET_TX_VALID_SHIFT) +#define I40E_GLPM_WUMC 0x0006C800 /* Reset: POR */ +#define I40E_GLPM_WUMC_NOTCO_SHIFT 0 +#define I40E_GLPM_WUMC_NOTCO_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_NOTCO_SHIFT) +#define I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT 1 +#define I40E_GLPM_WUMC_SRST_PIN_VAL_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT) +#define I40E_GLPM_WUMC_ROL_MODE_SHIFT 2 +#define I40E_GLPM_WUMC_ROL_MODE_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_ROL_MODE_SHIFT) +#define I40E_GLPM_WUMC_RESERVED_4_SHIFT 3 +#define I40E_GLPM_WUMC_RESERVED_4_MASK I40E_MASK(0x1FFF, I40E_GLPM_WUMC_RESERVED_4_SHIFT) +#define I40E_GLPM_WUMC_MNG_WU_PF_SHIFT 16 +#define I40E_GLPM_WUMC_MNG_WU_PF_MASK I40E_MASK(0xFFFF, I40E_GLPM_WUMC_MNG_WU_PF_SHIFT) +#define I40E_PFPM_APM 0x000B8080 /* Reset: POR */ +#define I40E_PFPM_APM_APME_SHIFT 0 +#define I40E_PFPM_APM_APME_MASK I40E_MASK(0x1, I40E_PFPM_APM_APME_SHIFT) +#define I40E_PFPM_FHFT_LENGTH(_i) (0x0006A000 + ((_i) * 128)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PFPM_FHFT_LENGTH_MAX_INDEX 7 +#define I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT 0 +#define I40E_PFPM_FHFT_LENGTH_LENGTH_MASK I40E_MASK(0xFF, I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT) +#define I40E_PFPM_WUC 0x0006B200 /* Reset: POR */ +#define I40E_PFPM_WUC_EN_APM_D0_SHIFT 5 +#define I40E_PFPM_WUC_EN_APM_D0_MASK I40E_MASK(0x1, I40E_PFPM_WUC_EN_APM_D0_SHIFT) +#define I40E_PFPM_WUFC 0x0006B400 /* Reset: POR */ +#define 
I40E_PFPM_WUFC_LNKC_SHIFT 0 +#define I40E_PFPM_WUFC_LNKC_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_LNKC_SHIFT) +#define I40E_PFPM_WUFC_MAG_SHIFT 1 +#define I40E_PFPM_WUFC_MAG_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_MAG_SHIFT) +#define I40E_PFPM_WUFC_MNG_SHIFT 3 +#define I40E_PFPM_WUFC_MNG_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_MNG_SHIFT) +#define I40E_PFPM_WUFC_FLX0_ACT_SHIFT 4 +#define I40E_PFPM_WUFC_FLX0_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX1_ACT_SHIFT 5 +#define I40E_PFPM_WUFC_FLX1_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX2_ACT_SHIFT 6 +#define I40E_PFPM_WUFC_FLX2_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX3_ACT_SHIFT 7 +#define I40E_PFPM_WUFC_FLX3_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX4_ACT_SHIFT 8 +#define I40E_PFPM_WUFC_FLX4_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX5_ACT_SHIFT 9 +#define I40E_PFPM_WUFC_FLX5_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX5_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX6_ACT_SHIFT 10 +#define I40E_PFPM_WUFC_FLX6_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX7_ACT_SHIFT 11 +#define I40E_PFPM_WUFC_FLX7_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX0_SHIFT 16 +#define I40E_PFPM_WUFC_FLX0_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_SHIFT) +#define I40E_PFPM_WUFC_FLX1_SHIFT 17 +#define I40E_PFPM_WUFC_FLX1_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_SHIFT) +#define I40E_PFPM_WUFC_FLX2_SHIFT 18 +#define I40E_PFPM_WUFC_FLX2_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_SHIFT) +#define I40E_PFPM_WUFC_FLX3_SHIFT 19 +#define I40E_PFPM_WUFC_FLX3_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_SHIFT) +#define I40E_PFPM_WUFC_FLX4_SHIFT 20 +#define I40E_PFPM_WUFC_FLX4_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_SHIFT) +#define I40E_PFPM_WUFC_FLX5_SHIFT 21 +#define I40E_PFPM_WUFC_FLX5_MASK I40E_MASK(0x1, 
I40E_PFPM_WUFC_FLX5_SHIFT) +#define I40E_PFPM_WUFC_FLX6_SHIFT 22 +#define I40E_PFPM_WUFC_FLX6_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_SHIFT) +#define I40E_PFPM_WUFC_FLX7_SHIFT 23 +#define I40E_PFPM_WUFC_FLX7_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_SHIFT) +#define I40E_PFPM_WUFC_FW_RST_WK_SHIFT 31 +#define I40E_PFPM_WUFC_FW_RST_WK_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FW_RST_WK_SHIFT) +#define I40E_PFPM_WUS 0x0006B600 /* Reset: POR */ +#define I40E_PFPM_WUS_LNKC_SHIFT 0 +#define I40E_PFPM_WUS_LNKC_MASK I40E_MASK(0x1, I40E_PFPM_WUS_LNKC_SHIFT) +#define I40E_PFPM_WUS_MAG_SHIFT 1 +#define I40E_PFPM_WUS_MAG_MASK I40E_MASK(0x1, I40E_PFPM_WUS_MAG_SHIFT) +#define I40E_PFPM_WUS_PME_STATUS_SHIFT 2 +#define I40E_PFPM_WUS_PME_STATUS_MASK I40E_MASK(0x1, I40E_PFPM_WUS_PME_STATUS_SHIFT) +#define I40E_PFPM_WUS_MNG_SHIFT 3 +#define I40E_PFPM_WUS_MNG_MASK I40E_MASK(0x1, I40E_PFPM_WUS_MNG_SHIFT) +#define I40E_PFPM_WUS_FLX0_SHIFT 16 +#define I40E_PFPM_WUS_FLX0_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX0_SHIFT) +#define I40E_PFPM_WUS_FLX1_SHIFT 17 +#define I40E_PFPM_WUS_FLX1_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX1_SHIFT) +#define I40E_PFPM_WUS_FLX2_SHIFT 18 +#define I40E_PFPM_WUS_FLX2_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX2_SHIFT) +#define I40E_PFPM_WUS_FLX3_SHIFT 19 +#define I40E_PFPM_WUS_FLX3_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX3_SHIFT) +#define I40E_PFPM_WUS_FLX4_SHIFT 20 +#define I40E_PFPM_WUS_FLX4_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX4_SHIFT) +#define I40E_PFPM_WUS_FLX5_SHIFT 21 +#define I40E_PFPM_WUS_FLX5_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX5_SHIFT) +#define I40E_PFPM_WUS_FLX6_SHIFT 22 +#define I40E_PFPM_WUS_FLX6_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX6_SHIFT) +#define I40E_PFPM_WUS_FLX7_SHIFT 23 +#define I40E_PFPM_WUS_FLX7_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX7_SHIFT) +#define I40E_PFPM_WUS_FW_RST_WK_SHIFT 31 +#define I40E_PFPM_WUS_FW_RST_WK_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FW_RST_WK_SHIFT) +#define I40E_PRTPM_FHFHR 0x0006C000 /* Reset: POR */ +#define I40E_PRTPM_FHFHR_UNICAST_SHIFT 0 
+#define I40E_PRTPM_FHFHR_UNICAST_MASK I40E_MASK(0x1, I40E_PRTPM_FHFHR_UNICAST_SHIFT) +#define I40E_PRTPM_FHFHR_MULTICAST_SHIFT 1 +#define I40E_PRTPM_FHFHR_MULTICAST_MASK I40E_MASK(0x1, I40E_PRTPM_FHFHR_MULTICAST_SHIFT) +#define I40E_PRTPM_SAH(_i) (0x001E44C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */ +#define I40E_PRTPM_SAH_MAX_INDEX 3 +#define I40E_PRTPM_SAH_PFPM_SAH_SHIFT 0 +#define I40E_PRTPM_SAH_PFPM_SAH_MASK I40E_MASK(0xFFFF, I40E_PRTPM_SAH_PFPM_SAH_SHIFT) +#define I40E_PRTPM_SAH_PF_NUM_SHIFT 26 +#define I40E_PRTPM_SAH_PF_NUM_MASK I40E_MASK(0xF, I40E_PRTPM_SAH_PF_NUM_SHIFT) +#define I40E_PRTPM_SAH_MC_MAG_EN_SHIFT 30 +#define I40E_PRTPM_SAH_MC_MAG_EN_MASK I40E_MASK(0x1, I40E_PRTPM_SAH_MC_MAG_EN_SHIFT) +#define I40E_PRTPM_SAH_AV_SHIFT 31 +#define I40E_PRTPM_SAH_AV_MASK I40E_MASK(0x1, I40E_PRTPM_SAH_AV_SHIFT) +#define I40E_PRTPM_SAL(_i) (0x001E4440 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */ +#define I40E_PRTPM_SAL_MAX_INDEX 3 +#define I40E_PRTPM_SAL_PFPM_SAL_SHIFT 0 +#define I40E_PRTPM_SAL_PFPM_SAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_SAL_PFPM_SAL_SHIFT) +#define I40E_VF_ARQBAH1 0x00006000 /* Reset: EMPR */ +#define I40E_VF_ARQBAH1_ARQBAH_SHIFT 0 +#define I40E_VF_ARQBAH1_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAH1_ARQBAH_SHIFT) +#define I40E_VF_ARQBAL1 0x00006C00 /* Reset: EMPR */ +#define I40E_VF_ARQBAL1_ARQBAL_SHIFT 0 +#define I40E_VF_ARQBAL1_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL1_ARQBAL_SHIFT) +#define I40E_VF_ARQH1 0x00007400 /* Reset: EMPR */ +#define I40E_VF_ARQH1_ARQH_SHIFT 0 +#define I40E_VF_ARQH1_ARQH_MASK I40E_MASK(0x3FF, I40E_VF_ARQH1_ARQH_SHIFT) +#define I40E_VF_ARQLEN1 0x00008000 /* Reset: EMPR */ +#define I40E_VF_ARQLEN1_ARQLEN_SHIFT 0 +#define I40E_VF_ARQLEN1_ARQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ARQLEN1_ARQLEN_SHIFT) +#define I40E_VF_ARQLEN1_ARQVFE_SHIFT 28 +#define I40E_VF_ARQLEN1_ARQVFE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQVFE_SHIFT) +#define I40E_VF_ARQLEN1_ARQOVFL_SHIFT 29 +#define I40E_VF_ARQLEN1_ARQOVFL_MASK 
I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQOVFL_SHIFT) +#define I40E_VF_ARQLEN1_ARQCRIT_SHIFT 30 +#define I40E_VF_ARQLEN1_ARQCRIT_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQCRIT_SHIFT) +#define I40E_VF_ARQLEN1_ARQENABLE_SHIFT 31 +#define I40E_VF_ARQLEN1_ARQENABLE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQENABLE_SHIFT) +#define I40E_VF_ARQT1 0x00007000 /* Reset: EMPR */ +#define I40E_VF_ARQT1_ARQT_SHIFT 0 +#define I40E_VF_ARQT1_ARQT_MASK I40E_MASK(0x3FF, I40E_VF_ARQT1_ARQT_SHIFT) +#define I40E_VF_ATQBAH1 0x00007800 /* Reset: EMPR */ +#define I40E_VF_ATQBAH1_ATQBAH_SHIFT 0 +#define I40E_VF_ATQBAH1_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH1_ATQBAH_SHIFT) +#define I40E_VF_ATQBAL1 0x00007C00 /* Reset: EMPR */ +#define I40E_VF_ATQBAL1_ATQBAL_SHIFT 0 +#define I40E_VF_ATQBAL1_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAL1_ATQBAL_SHIFT) +#define I40E_VF_ATQH1 0x00006400 /* Reset: EMPR */ +#define I40E_VF_ATQH1_ATQH_SHIFT 0 +#define I40E_VF_ATQH1_ATQH_MASK I40E_MASK(0x3FF, I40E_VF_ATQH1_ATQH_SHIFT) +#define I40E_VF_ATQLEN1 0x00006800 /* Reset: EMPR */ +#define I40E_VF_ATQLEN1_ATQLEN_SHIFT 0 +#define I40E_VF_ATQLEN1_ATQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ATQLEN1_ATQLEN_SHIFT) +#define I40E_VF_ATQLEN1_ATQVFE_SHIFT 28 +#define I40E_VF_ATQLEN1_ATQVFE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQVFE_SHIFT) +#define I40E_VF_ATQLEN1_ATQOVFL_SHIFT 29 +#define I40E_VF_ATQLEN1_ATQOVFL_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQOVFL_SHIFT) +#define I40E_VF_ATQLEN1_ATQCRIT_SHIFT 30 +#define I40E_VF_ATQLEN1_ATQCRIT_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQCRIT_SHIFT) +#define I40E_VF_ATQLEN1_ATQENABLE_SHIFT 31 +#define I40E_VF_ATQLEN1_ATQENABLE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQENABLE_SHIFT) +#define I40E_VF_ATQT1 0x00008400 /* Reset: EMPR */ +#define I40E_VF_ATQT1_ATQT_SHIFT 0 +#define I40E_VF_ATQT1_ATQT_MASK I40E_MASK(0x3FF, I40E_VF_ATQT1_ATQT_SHIFT) +#define I40E_VFGEN_RSTAT 0x00008800 /* Reset: VFR */ +#define I40E_VFGEN_RSTAT_VFR_STATE_SHIFT 0 +#define I40E_VFGEN_RSTAT_VFR_STATE_MASK 
I40E_MASK(0x3, I40E_VFGEN_RSTAT_VFR_STATE_SHIFT) +#define I40E_VFINT_DYN_CTL01 0x00005C00 /* Reset: VFR */ +#define I40E_VFINT_DYN_CTL01_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTL01_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTL01_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTL01_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTL01_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTL01_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT) +#define I40E_VFINT_DYN_CTLN1(_INTVF) (0x00003800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTLN1_MAX_INDEX 15 +#define I40E_VFINT_DYN_CTLN1_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTLN1_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK I40E_MASK(0x3, 
I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTLN1_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTLN1_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT) +#define I40E_VFINT_ICR0_ENA1 0x00005000 /* Reset: CORER */ +#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ENA1_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_ENA1_RSVD_SHIFT 31 +#define I40E_VFINT_ICR0_ENA1_RSVD_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_RSVD_SHIFT) +#define I40E_VFINT_ICR01 0x00004800 /* Reset: CORER */ +#define I40E_VFINT_ICR01_INTEVENT_SHIFT 0 +#define I40E_VFINT_ICR01_INTEVENT_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_INTEVENT_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_0_SHIFT 1 +#define I40E_VFINT_ICR01_QUEUE_0_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_0_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_1_SHIFT 2 +#define I40E_VFINT_ICR01_QUEUE_1_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_1_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_2_SHIFT 3 +#define I40E_VFINT_ICR01_QUEUE_2_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_2_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_3_SHIFT 4 +#define I40E_VFINT_ICR01_QUEUE_3_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_3_SHIFT) +#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, 
I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR01_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR01_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_ADMINQ_SHIFT) +#define I40E_VFINT_ICR01_SWINT_SHIFT 31 +#define I40E_VFINT_ICR01_SWINT_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_SWINT_SHIFT) +#define I40E_VFINT_ITR01(_i) (0x00004C00 + ((_i) * 4)) /* _i=0...2 */ /* Reset: VFR */ +#define I40E_VFINT_ITR01_MAX_INDEX 2 +#define I40E_VFINT_ITR01_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITR01_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITR01_INTERVAL_SHIFT) +#define I40E_VFINT_ITRN1(_i, _INTVF) (0x00002800 + ((_i) * 64 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...15 */ /* Reset: VFR */ +#define I40E_VFINT_ITRN1_MAX_INDEX 2 +#define I40E_VFINT_ITRN1_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITRN1_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITRN1_INTERVAL_SHIFT) +#define I40E_VFINT_STAT_CTL01 0x00005400 /* Reset: CORER */ +#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT 2 +#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT) +#define I40E_QRX_TAIL1(_Q) (0x00002000 + ((_Q) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_QRX_TAIL1_MAX_INDEX 15 +#define I40E_QRX_TAIL1_TAIL_SHIFT 0 +#define I40E_QRX_TAIL1_TAIL_MASK I40E_MASK(0x1FFF, I40E_QRX_TAIL1_TAIL_SHIFT) +#define I40E_QTX_TAIL1(_Q) (0x00000000 + ((_Q) * 4)) /* _i=0...15 */ /* Reset: PFR */ +#define I40E_QTX_TAIL1_MAX_INDEX 15 +#define I40E_QTX_TAIL1_TAIL_SHIFT 0 +#define I40E_QTX_TAIL1_TAIL_MASK I40E_MASK(0x1FFF, I40E_QTX_TAIL1_TAIL_SHIFT) +#define I40E_VFMSIX_PBA 0x00002000 /* Reset: VFLR */ +#define I40E_VFMSIX_PBA_PENBIT_SHIFT 0 +#define I40E_VFMSIX_PBA_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA_PENBIT_SHIFT) +#define I40E_VFMSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TADD_MAX_INDEX 16 +#define I40E_VFMSIX_TADD_MSIXTADD10_SHIFT 0 +#define I40E_VFMSIX_TADD_MSIXTADD10_MASK I40E_MASK(0x3, 
I40E_VFMSIX_TADD_MSIXTADD10_SHIFT) +#define I40E_VFMSIX_TADD_MSIXTADD_SHIFT 2 +#define I40E_VFMSIX_TADD_MSIXTADD_MASK I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD_MSIXTADD_SHIFT) +#define I40E_VFMSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TMSG_MAX_INDEX 16 +#define I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT 0 +#define I40E_VFMSIX_TMSG_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT) +#define I40E_VFMSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TUADD_MAX_INDEX 16 +#define I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT 0 +#define I40E_VFMSIX_TUADD_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT) +#define I40E_VFMSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TVCTRL_MAX_INDEX 16 +#define I40E_VFMSIX_TVCTRL_MASK_SHIFT 0 +#define I40E_VFMSIX_TVCTRL_MASK_MASK I40E_MASK(0x1, I40E_VFMSIX_TVCTRL_MASK_SHIFT) +#define I40E_VFCM_PE_ERRDATA 0x0000DC00 /* Reset: VFR */ +#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_VFCM_PE_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT) +#define I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_VFCM_PE_ERRDATA_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT) +#define I40E_VFCM_PE_ERRINFO 0x0000D800 /* Reset: VFR */ +#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_VFCM_PE_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT) +#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define 
I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT) +#define I40E_VFQF_HENA(_i) (0x0000C400 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_VFQF_HENA_MAX_INDEX 1 +#define I40E_VFQF_HENA_PTYPE_ENA_SHIFT 0 +#define I40E_VFQF_HENA_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA_PTYPE_ENA_SHIFT) +#define I40E_VFQF_HKEY(_i) (0x0000CC00 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_VFQF_HKEY_MAX_INDEX 12 +#define I40E_VFQF_HKEY_KEY_0_SHIFT 0 +#define I40E_VFQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_0_SHIFT) +#define I40E_VFQF_HKEY_KEY_1_SHIFT 8 +#define I40E_VFQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_1_SHIFT) +#define I40E_VFQF_HKEY_KEY_2_SHIFT 16 +#define I40E_VFQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_2_SHIFT) +#define I40E_VFQF_HKEY_KEY_3_SHIFT 24 +#define I40E_VFQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_3_SHIFT) +#define I40E_VFQF_HLUT(_i) (0x0000D000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_VFQF_HLUT_MAX_INDEX 15 +#define I40E_VFQF_HLUT_LUT0_SHIFT 0 +#define I40E_VFQF_HLUT_LUT0_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT0_SHIFT) +#define I40E_VFQF_HLUT_LUT1_SHIFT 8 +#define I40E_VFQF_HLUT_LUT1_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT1_SHIFT) +#define I40E_VFQF_HLUT_LUT2_SHIFT 16 +#define I40E_VFQF_HLUT_LUT2_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT2_SHIFT) +#define I40E_VFQF_HLUT_LUT3_SHIFT 24 +#define I40E_VFQF_HLUT_LUT3_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT3_SHIFT) +#define I40E_VFQF_HREGION(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_VFQF_HREGION_MAX_INDEX 7 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, 
I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT) +#define I40E_VFQF_HREGION_REGION_0_SHIFT 1 +#define I40E_VFQF_HREGION_REGION_0_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_0_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT) +#define I40E_VFQF_HREGION_REGION_1_SHIFT 5 +#define I40E_VFQF_HREGION_REGION_1_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_1_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT) +#define I40E_VFQF_HREGION_REGION_2_SHIFT 9 +#define I40E_VFQF_HREGION_REGION_2_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_2_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT) +#define I40E_VFQF_HREGION_REGION_3_SHIFT 13 +#define I40E_VFQF_HREGION_REGION_3_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_3_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT) +#define I40E_VFQF_HREGION_REGION_4_SHIFT 17 +#define I40E_VFQF_HREGION_REGION_4_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_4_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT) +#define I40E_VFQF_HREGION_REGION_5_SHIFT 21 +#define I40E_VFQF_HREGION_REGION_5_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_5_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT) +#define I40E_VFQF_HREGION_REGION_6_SHIFT 25 +#define I40E_VFQF_HREGION_REGION_6_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_6_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_MASK 
I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT) +#define I40E_VFQF_HREGION_REGION_7_SHIFT 29 +#define I40E_VFQF_HREGION_REGION_7_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_7_SHIFT) +#ifdef X722_SUPPORT + +#define I40E_MNGSB_FDCRC 0x000B7050 /* Reset: POR */ +#define I40E_MNGSB_FDCRC_CRC_RES_SHIFT 0 +#define I40E_MNGSB_FDCRC_CRC_RES_MASK I40E_MASK(0xFF, I40E_MNGSB_FDCRC_CRC_RES_SHIFT) +#define I40E_MNGSB_FDCS 0x000B7040 /* Reset: POR */ +#define I40E_MNGSB_FDCS_CRC_CONT_SHIFT 2 +#define I40E_MNGSB_FDCS_CRC_CONT_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_CONT_SHIFT) +#define I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT 3 +#define I40E_MNGSB_FDCS_CRC_SEED_EN_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT) +#define I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT 4 +#define I40E_MNGSB_FDCS_CRC_WR_INH_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT) +#define I40E_MNGSB_FDCS_CRC_SEED_SHIFT 8 +#define I40E_MNGSB_FDCS_CRC_SEED_MASK I40E_MASK(0xFF, I40E_MNGSB_FDCS_CRC_SEED_SHIFT) +#define I40E_MNGSB_FDS 0x000B7048 /* Reset: POR */ +#define I40E_MNGSB_FDS_START_BC_SHIFT 0 +#define I40E_MNGSB_FDS_START_BC_MASK I40E_MASK(0xFFF, I40E_MNGSB_FDS_START_BC_SHIFT) +#define I40E_MNGSB_FDS_LAST_BC_SHIFT 16 +#define I40E_MNGSB_FDS_LAST_BC_MASK I40E_MASK(0xFFF, I40E_MNGSB_FDS_LAST_BC_SHIFT) + +#define I40E_GL_VF_CTRL_RX(_VF) (0x00083600 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_GL_VF_CTRL_RX_MAX_INDEX 127 +#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT 0 +#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_MASK I40E_MASK(0x1, I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT) +#define I40E_GL_VF_CTRL_TX(_VF) (0x00083400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_GL_VF_CTRL_TX_MAX_INDEX 127 +#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT 0 +#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_MASK I40E_MASK(0x1, I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT) + +#define I40E_GLCM_LAN_CACHESIZE 0x0010C4D8 /* Reset: CORER */ +#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_MASK 
I40E_MASK(0xFFF, I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT 12 +#define I40E_GLCM_LAN_CACHESIZE_SETS_MASK I40E_MASK(0xF, I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT) +#define I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT 16 +#define I40E_GLCM_LAN_CACHESIZE_WAYS_MASK I40E_MASK(0x3FF, I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT) +#define I40E_GLCM_PE_CACHESIZE 0x00138FE4 /* Reset: CORER */ +#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFFF, I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLCM_PE_CACHESIZE_SETS_SHIFT 12 +#define I40E_GLCM_PE_CACHESIZE_SETS_MASK I40E_MASK(0xF, I40E_GLCM_PE_CACHESIZE_SETS_SHIFT) +#define I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT 16 +#define I40E_GLCM_PE_CACHESIZE_WAYS_MASK I40E_MASK(0x1FF, I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT) +#define I40E_PFCM_PE_ERRDATA 0x00138D00 /* Reset: PFR */ +#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_PFCM_PE_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT) +#define I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_PFCM_PE_ERRDATA_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT) +#define I40E_PFCM_PE_ERRINFO 0x00138C80 /* Reset: PFR */ +#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_PFCM_PE_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT) +#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT) 
+#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT) + +#define I40E_PRTDCB_TFMSTC(_i) (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TFMSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TFMSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TFMSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TFMSTC_MSTC_SHIFT) +#define I40E_GL_FWSTS_FWROWD_SHIFT 8 +#define I40E_GL_FWSTS_FWROWD_MASK I40E_MASK(0x1, I40E_GL_FWSTS_FWROWD_SHIFT) +#define I40E_GLFOC_CACHESIZE 0x000AA0DC /* Reset: CORER */ +#define I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLFOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLFOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLFOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLFOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLFOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLFOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLFOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLHMC_APBVTINUSEBASE(_i) (0x000C4a00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_APBVTINUSEBASE_MAX_INDEX 15 +#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0 +#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT) +#define I40E_GLHMC_CEQPART(_i) (0x001312C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_CEQPART_MAX_INDEX 15 +#define I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT 0 +#define I40E_GLHMC_CEQPART_PMCEQBASE_MASK I40E_MASK(0xFF, I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT) +#define I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT 16 +#define I40E_GLHMC_CEQPART_PMCEQSIZE_MASK I40E_MASK(0x1FF, I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT) +#define I40E_GLHMC_DBCQMAX 0x000C20F0 /* Reset: CORER */ +#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT 0 +#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_MASK I40E_MASK(0x3FFFF, I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT) +#define 
I40E_GLHMC_DBCQPART(_i) (0x00131240 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_DBCQPART_MAX_INDEX 15 +#define I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT 0 +#define I40E_GLHMC_DBCQPART_PMDBCQBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT) +#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT 16 +#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT) +#define I40E_GLHMC_DBQPMAX 0x000C20EC /* Reset: CORER */ +#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT 0 +#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_MASK I40E_MASK(0x7FFFF, I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT) +#define I40E_GLHMC_DBQPPART(_i) (0x00138D80 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_DBQPPART_MAX_INDEX 15 +#define I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT 0 +#define I40E_GLHMC_DBQPPART_PMDBQPBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT) +#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT 16 +#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT) +#define I40E_GLHMC_PEARPBASE(_i) (0x000C4800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEARPBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT 0 +#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT) +#define I40E_GLHMC_PEARPCNT(_i) (0x000C4900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEARPCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT 0 +#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT) +#define I40E_GLHMC_PEARPMAX 0x000C2038 /* Reset: CORER */ +#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT 0 +#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_MASK I40E_MASK(0x1FFFF, I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT) +#define I40E_GLHMC_PEARPOBJSZ 0x000C2034 /* Reset: CORER */ +#define I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT 0 +#define 
I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_MASK I40E_MASK(0x7, I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT) +#define I40E_GLHMC_PECQBASE(_i) (0x000C4200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PECQBASE_MAX_INDEX 15 +#define I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT 0 +#define I40E_GLHMC_PECQBASE_FPMPECQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT) +#define I40E_GLHMC_PECQCNT(_i) (0x000C4300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PECQCNT_MAX_INDEX 15 +#define I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT 0 +#define I40E_GLHMC_PECQCNT_FPMPECQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT) +#define I40E_GLHMC_PECQOBJSZ 0x000C2020 /* Reset: CORER */ +#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT 0 +#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT) +#define I40E_GLHMC_PEHTCNT(_i) (0x000C4700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEHTCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT 0 +#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT) +#define I40E_GLHMC_PEHTEBASE(_i) (0x000C4600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEHTEBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT 0 +#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT) +#define I40E_GLHMC_PEHTEOBJSZ 0x000C202c /* Reset: CORER */ +#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT) +#define I40E_GLHMC_PEHTMAX 0x000C2030 /* Reset: CORER */ +#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT 0 +#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_MASK I40E_MASK(0x1FFFFF, I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT) +#define I40E_GLHMC_PEMRBASE(_i) (0x000C4c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define 
I40E_GLHMC_PEMRBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT 0 +#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT) +#define I40E_GLHMC_PEMRCNT(_i) (0x000C4d00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEMRCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT 0 +#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT) +#define I40E_GLHMC_PEMRMAX 0x000C2040 /* Reset: CORER */ +#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT 0 +#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_MASK I40E_MASK(0x7FFFFF, I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT) +#define I40E_GLHMC_PEMROBJSZ 0x000C203c /* Reset: CORER */ +#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT 0 +#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT) +#define I40E_GLHMC_PEPBLBASE(_i) (0x000C5800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEPBLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT 0 +#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT) +#define I40E_GLHMC_PEPBLCNT(_i) (0x000C5900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEPBLCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT 0 +#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT) +#define I40E_GLHMC_PEPBLMAX 0x000C206c /* Reset: CORER */ +#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT 0 +#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT) +#define I40E_GLHMC_PEPFFIRSTSD 0x000C20E4 /* Reset: CORER */ +#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT 0 +#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_MASK I40E_MASK(0xFFF, I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT) +#define I40E_GLHMC_PEQ1BASE(_i) (0x000C5200 + ((_i) * 4)) /* _i=0...15 */ 
/* Reset: CORER */ +#define I40E_GLHMC_PEQ1BASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT 0 +#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT) +#define I40E_GLHMC_PEQ1CNT(_i) (0x000C5300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQ1CNT_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT 0 +#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT) +#define I40E_GLHMC_PEQ1FLBASE(_i) (0x000C5400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQ1FLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0 +#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT) +#define I40E_GLHMC_PEQ1FLMAX 0x000C2058 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT 0 +#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT) +#define I40E_GLHMC_PEQ1MAX 0x000C2054 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT 0 +#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT) +#define I40E_GLHMC_PEQ1OBJSZ 0x000C2050 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT 0 +#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT) +#define I40E_GLHMC_PEQPBASE(_i) (0x000C4000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQPBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT 0 +#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT) +#define I40E_GLHMC_PEQPCNT(_i) (0x000C4100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQPCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT 0 +#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_MASK I40E_MASK(0x1FFFFFFF, 
I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT) +#define I40E_GLHMC_PEQPOBJSZ 0x000C201c /* Reset: CORER */ +#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT) +#define I40E_GLHMC_PESRQBASE(_i) (0x000C4400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PESRQBASE_MAX_INDEX 15 +#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT 0 +#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT) +#define I40E_GLHMC_PESRQCNT(_i) (0x000C4500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PESRQCNT_MAX_INDEX 15 +#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT 0 +#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT) +#define I40E_GLHMC_PESRQMAX 0x000C2028 /* Reset: CORER */ +#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT 0 +#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_MASK I40E_MASK(0xFFFF, I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT) +#define I40E_GLHMC_PESRQOBJSZ 0x000C2024 /* Reset: CORER */ +#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT 0 +#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT) +#define I40E_GLHMC_PETIMERBASE(_i) (0x000C5A00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PETIMERBASE_MAX_INDEX 15 +#define I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT 0 +#define I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT) +#define I40E_GLHMC_PETIMERCNT(_i) (0x000C5B00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PETIMERCNT_MAX_INDEX 15 +#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT 0 +#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT) +#define I40E_GLHMC_PETIMERMAX 0x000C2084 /* Reset: CORER */ +#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT 
0 +#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT) +#define I40E_GLHMC_PETIMEROBJSZ 0x000C2080 /* Reset: CORER */ +#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT 0 +#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT) +#define I40E_GLHMC_PEXFBASE(_i) (0x000C4e00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT 0 +#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT) +#define I40E_GLHMC_PEXFCNT(_i) (0x000C4f00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT 0 +#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT) +#define I40E_GLHMC_PEXFFLBASE(_i) (0x000C5000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFFLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT 0 +#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT) +#define I40E_GLHMC_PEXFFLMAX 0x000C204c /* Reset: CORER */ +#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT 0 +#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_MASK I40E_MASK(0x1FFFFFF, I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT) +#define I40E_GLHMC_PEXFMAX 0x000C2048 /* Reset: CORER */ +#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT 0 +#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT) +#define I40E_GLHMC_PEXFOBJSZ 0x000C2044 /* Reset: CORER */ +#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT) +#define I40E_GLHMC_PFPESDPART(_i) (0x000C0880 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PFPESDPART_MAX_INDEX 
15 +#define I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_PFPESDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_PFPESDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT) +#define I40E_GLHMC_VFAPBVTINUSEBASE(_i) (0x000Cca00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFAPBVTINUSEBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0 +#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT) +#define I40E_GLHMC_VFCEQPART(_i) (0x00132240 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFCEQPART_MAX_INDEX 31 +#define I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT 0 +#define I40E_GLHMC_VFCEQPART_PMCEQBASE_MASK I40E_MASK(0xFF, I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT) +#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT 16 +#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_MASK I40E_MASK(0x1FF, I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT) +#define I40E_GLHMC_VFDBCQPART(_i) (0x00132140 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFDBCQPART_MAX_INDEX 31 +#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT 0 +#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT) +#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT 16 +#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT) +#define I40E_GLHMC_VFDBQPPART(_i) (0x00138E00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFDBQPPART_MAX_INDEX 31 +#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT 0 +#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT) +#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT 16 +#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT) +#define 
I40E_GLHMC_VFFSIAVBASE(_i) (0x000Cd600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFFSIAVBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT 0 +#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT) +#define I40E_GLHMC_VFFSIAVCNT(_i) (0x000Cd700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFFSIAVCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT 0 +#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT) +#define I40E_GLHMC_VFPDINV(_i) (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPDINV_MAX_INDEX 31 +#define I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT 0 +#define I40E_GLHMC_VFPDINV_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT) +#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT) +#define I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT 16 +#define I40E_GLHMC_VFPDINV_PMPDIDX_MASK I40E_MASK(0x1FF, I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT) +#define I40E_GLHMC_VFPEARPBASE(_i) (0x000Cc800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEARPBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT 0 +#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT) +#define I40E_GLHMC_VFPEARPCNT(_i) (0x000Cc900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEARPCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT 0 +#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT) +#define I40E_GLHMC_VFPECQBASE(_i) (0x000Cc200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPECQBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT 0 +#define 
I40E_GLHMC_VFPECQBASE_FPMPECQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT) +#define I40E_GLHMC_VFPECQCNT(_i) (0x000Cc300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPECQCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT 0 +#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT) +#define I40E_GLHMC_VFPEHTCNT(_i) (0x000Cc700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEHTCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT 0 +#define I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT) +#define I40E_GLHMC_VFPEHTEBASE(_i) (0x000Cc600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEHTEBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT 0 +#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT) +#define I40E_GLHMC_VFPEMRBASE(_i) (0x000Ccc00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEMRBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT 0 +#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT) +#define I40E_GLHMC_VFPEMRCNT(_i) (0x000Ccd00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEMRCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT 0 +#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT) +#define I40E_GLHMC_VFPEPBLBASE(_i) (0x000Cd800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEPBLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT) +#define I40E_GLHMC_VFPEPBLCNT(_i) (0x000Cd900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define 
I40E_GLHMC_VFPEPBLCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT 0 +#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT) +#define I40E_GLHMC_VFPEQ1BASE(_i) (0x000Cd200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1BASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT 0 +#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT) +#define I40E_GLHMC_VFPEQ1CNT(_i) (0x000Cd300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1CNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT 0 +#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT) +#define I40E_GLHMC_VFPEQ1FLBASE(_i) (0x000Cd400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1FLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT) +#define I40E_GLHMC_VFPEQPBASE(_i) (0x000Cc000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQPBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT 0 +#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT) +#define I40E_GLHMC_VFPEQPCNT(_i) (0x000Cc100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQPCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT 0 +#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT) +#define I40E_GLHMC_VFPESRQBASE(_i) (0x000Cc400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPESRQBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT 0 +#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_MASK I40E_MASK(0xFFFFFF, 
I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT) +#define I40E_GLHMC_VFPESRQCNT(_i) (0x000Cc500 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPESRQCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT 0 +#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT) +#define I40E_GLHMC_VFPETIMERBASE(_i) (0x000CDA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPETIMERBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT 0 +#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT) +#define I40E_GLHMC_VFPETIMERCNT(_i) (0x000CDB00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPETIMERCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT 0 +#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT) +#define I40E_GLHMC_VFPEXFBASE(_i) (0x000Cce00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT 0 +#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT) +#define I40E_GLHMC_VFPEXFCNT(_i) (0x000Ccf00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT 0 +#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT) +#define I40E_GLHMC_VFPEXFFLBASE(_i) (0x000Cd000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFFLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT) +#define I40E_GLHMC_VFSDPART(_i) (0x000C8800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define 
I40E_GLHMC_VFSDPART_MAX_INDEX 31 +#define I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_VFSDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_VFSDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT) +#define I40E_GLPBLOC_CACHESIZE 0x000A80BC /* Reset: CORER */ +#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPBLOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPBLOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPBLOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLPBLOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLPDOC_CACHESIZE 0x000D0088 /* Reset: CORER */ +#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPDOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPDOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPDOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPDOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLPDOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPDOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLPEOC_CACHESIZE 0x000A60E8 /* Reset: CORER */ +#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPEOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPEOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPEOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPEOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLPEOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPEOC_CACHESIZE_WAYS_SHIFT) +#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT 15 +#define 
I40E_PFHMC_SDCMD_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT) +#define I40E_GL_PPRS_SPARE 0x000856E0 /* Reset: CORER */ +#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT 0 +#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT) +#define I40E_GL_TLAN_SPARE 0x000E64E0 /* Reset: CORER */ +#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT 0 +#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT) +#define I40E_GL_TUPM_SPARE 0x000a2230 /* Reset: CORER */ +#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT 0 +#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG 0x000B81C0 /* Reset: POR */ +#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT 0 +#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT 1 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT 2 +#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT 3 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT 4 +#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT 5 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT 6 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_MASK I40E_MASK(0x1, 
I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT 7 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT 8 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT 9 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT 10 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT 11 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT 12 +#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT 13 +#define I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT 14 +#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT) +#define I40E_GLGEN_MISC_SPARE 0x000880E0 /* Reset: POR */ +#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT 0 +#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT) +#define I40E_GL_UFUSE_SOC 0x000BE550 /* Reset: POR */ +#define I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT 0 +#define I40E_GL_UFUSE_SOC_PORT_MODE_MASK I40E_MASK(0x3, I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT) +#define I40E_GL_UFUSE_SOC_NIC_ID_SHIFT 2 +#define 
I40E_GL_UFUSE_SOC_NIC_ID_MASK I40E_MASK(0x1, I40E_GL_UFUSE_SOC_NIC_ID_SHIFT) +#define I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT 3 +#define I40E_GL_UFUSE_SOC_SPARE_FUSES_MASK I40E_MASK(0x1FFF, I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT) +#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT 30 +#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT) +#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT 30 +#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT) +#define I40E_VPLAN_QBASE(_VF) (0x00074800 + ((_VF) * 4)) /* _VF=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_QBASE_MAX_INDEX 127 +#define I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT 0 +#define I40E_VPLAN_QBASE_VFFIRSTQ_MASK I40E_MASK(0x7FF, I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT) +#define I40E_VPLAN_QBASE_VFNUMQ_SHIFT 11 +#define I40E_VPLAN_QBASE_VFNUMQ_MASK I40E_MASK(0xFF, I40E_VPLAN_QBASE_VFNUMQ_SHIFT) +#define I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT 31 +#define I40E_VPLAN_QBASE_VFQTABLE_ENA_MASK I40E_MASK(0x1, I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT) +#define I40E_PRTMAC_LINK_DOWN_COUNTER 0x001E2440 /* Reset: GLOBR */ +#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT 0 +#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT) +#define I40E_GLNVM_AL_REQ 0x000B6164 /* Reset: POR */ +#define I40E_GLNVM_AL_REQ_POR_SHIFT 0 +#define I40E_GLNVM_AL_REQ_POR_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_POR_SHIFT) +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT 1 +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT) +#define I40E_GLNVM_AL_REQ_GLOBR_SHIFT 2 +#define I40E_GLNVM_AL_REQ_GLOBR_MASK 
I40E_MASK(0x1, I40E_GLNVM_AL_REQ_GLOBR_SHIFT) +#define I40E_GLNVM_AL_REQ_CORER_SHIFT 3 +#define I40E_GLNVM_AL_REQ_CORER_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_CORER_SHIFT) +#define I40E_GLNVM_AL_REQ_PE_SHIFT 4 +#define I40E_GLNVM_AL_REQ_PE_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PE_SHIFT) +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT 5 +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT) +#define I40E_GLNVM_ALTIMERS 0x000B6140 /* Reset: POR */ +#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT 0 +#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_MASK I40E_MASK(0xFFF, I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT) +#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT 12 +#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_MASK I40E_MASK(0xFFFFF, I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT) +#define I40E_GLNVM_FLA 0x000B6108 /* Reset: POR */ +#define I40E_GLNVM_FLA_LOCKED_SHIFT 6 +#define I40E_GLNVM_FLA_LOCKED_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT) + +#define I40E_GLNVM_ULD 0x000B6008 /* Reset: POR */ +#define I40E_GLNVM_ULD_PCIER_DONE_SHIFT 0 +#define I40E_GLNVM_ULD_PCIER_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_SHIFT) +#define I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT 1 +#define I40E_GLNVM_ULD_PCIER_DONE_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT) +#define I40E_GLNVM_ULD_CORER_DONE_SHIFT 3 +#define I40E_GLNVM_ULD_CORER_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CORER_DONE_SHIFT) +#define I40E_GLNVM_ULD_GLOBR_DONE_SHIFT 4 +#define I40E_GLNVM_ULD_GLOBR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_GLOBR_DONE_SHIFT) +#define I40E_GLNVM_ULD_POR_DONE_SHIFT 5 +#define I40E_GLNVM_ULD_POR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_SHIFT) +#define I40E_GLNVM_ULD_POR_DONE_1_SHIFT 8 +#define I40E_GLNVM_ULD_POR_DONE_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_1_SHIFT) +#define I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT 9 +#define I40E_GLNVM_ULD_PCIER_DONE_2_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT) +#define I40E_GLNVM_ULD_PE_DONE_SHIFT 10 
+#define I40E_GLNVM_ULD_PE_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PE_DONE_SHIFT) +#define I40E_GLNVM_ULT 0x000B6154 /* Reset: POR */ +#define I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT 0 +#define I40E_GLNVM_ULT_CONF_PCIR_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT 1 +#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_1_SHIFT 2 +#define I40E_GLNVM_ULT_RESERVED_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_1_SHIFT) +#define I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT 3 +#define I40E_GLNVM_ULT_CONF_CORE_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT 4 +#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_POR_AE_SHIFT 5 +#define I40E_GLNVM_ULT_CONF_POR_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_POR_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_2_SHIFT 6 +#define I40E_GLNVM_ULT_RESERVED_2_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_2_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_3_SHIFT 7 +#define I40E_GLNVM_ULT_RESERVED_3_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_3_SHIFT) +#define I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT 8 +#define I40E_GLNVM_ULT_CONF_EMP_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT 9 +#define I40E_GLNVM_ULT_CONF_PCIALT_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_4_SHIFT 10 +#define I40E_GLNVM_ULT_RESERVED_4_MASK I40E_MASK(0x3FFFFF, I40E_GLNVM_ULT_RESERVED_4_SHIFT) +#define I40E_MEM_INIT_DONE_STAT 0x000B615C /* Reset: POR */ +#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT 0 +#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT 1 +#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_MASK 
I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT 2 +#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT 3 +#define I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT 4 +#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT 5 +#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT 6 +#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT 7 +#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT 8 +#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT 9 +#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT 10 +#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT 11 +#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT 12 +#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_MASK 
I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT 13 +#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT 14 +#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT 15 +#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT 16 +#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT) +#define I40E_MNGSB_DADD 0x000B7030 /* Reset: POR */ +#define I40E_MNGSB_DADD_ADDR_SHIFT 0 +#define I40E_MNGSB_DADD_ADDR_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DADD_ADDR_SHIFT) +#define I40E_MNGSB_DCNT 0x000B7034 /* Reset: POR */ +#define I40E_MNGSB_DCNT_BYTE_CNT_SHIFT 0 +#define I40E_MNGSB_DCNT_BYTE_CNT_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DCNT_BYTE_CNT_SHIFT) +#define I40E_MNGSB_MSGCTL 0x000B7020 /* Reset: POR */ +#define I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT 0 +#define I40E_MNGSB_MSGCTL_HDR_DWS_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT) +#define I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT 8 +#define I40E_MNGSB_MSGCTL_EXP_RDW_MASK I40E_MASK(0x1FF, I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT) +#define I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT 26 +#define I40E_MNGSB_MSGCTL_MSG_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT) +#define I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT 28 +#define I40E_MNGSB_MSGCTL_TOKEN_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT) +#define I40E_MNGSB_MSGCTL_BARCLR_SHIFT 30 +#define I40E_MNGSB_MSGCTL_BARCLR_MASK I40E_MASK(0x1, I40E_MNGSB_MSGCTL_BARCLR_SHIFT) +#define I40E_MNGSB_MSGCTL_CMDV_SHIFT 31 +#define I40E_MNGSB_MSGCTL_CMDV_MASK I40E_MASK(0x1, I40E_MNGSB_MSGCTL_CMDV_SHIFT) 
+#define I40E_MNGSB_RDATA 0x000B7300 /* Reset: POR */ +#define I40E_MNGSB_RDATA_DATA_SHIFT 0 +#define I40E_MNGSB_RDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_RDATA_DATA_SHIFT) +#define I40E_MNGSB_RHDR0 0x000B72FC /* Reset: POR */ +#define I40E_MNGSB_RHDR0_DESTINATION_SHIFT 0 +#define I40E_MNGSB_RHDR0_DESTINATION_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_DESTINATION_SHIFT) +#define I40E_MNGSB_RHDR0_SOURCE_SHIFT 8 +#define I40E_MNGSB_RHDR0_SOURCE_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_SOURCE_SHIFT) +#define I40E_MNGSB_RHDR0_OPCODE_SHIFT 16 +#define I40E_MNGSB_RHDR0_OPCODE_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_OPCODE_SHIFT) +#define I40E_MNGSB_RHDR0_TAG_SHIFT 24 +#define I40E_MNGSB_RHDR0_TAG_MASK I40E_MASK(0x7, I40E_MNGSB_RHDR0_TAG_SHIFT) +#define I40E_MNGSB_RHDR0_RESPONSE_SHIFT 27 +#define I40E_MNGSB_RHDR0_RESPONSE_MASK I40E_MASK(0x7, I40E_MNGSB_RHDR0_RESPONSE_SHIFT) +#define I40E_MNGSB_RHDR0_EH_SHIFT 31 +#define I40E_MNGSB_RHDR0_EH_MASK I40E_MASK(0x1, I40E_MNGSB_RHDR0_EH_SHIFT) +#define I40E_MNGSB_RSPCTL 0x000B7024 /* Reset: POR */ +#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT 0 +#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_MASK I40E_MASK(0x1FF, I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT 26 +#define I40E_MNGSB_RSPCTL_RSP_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT 30 +#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_MASK I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT 31 +#define I40E_MNGSB_RSPCTL_RSP_ERR_MASK I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT) +#define I40E_MNGSB_WDATA 0x000B7100 /* Reset: POR */ +#define I40E_MNGSB_WDATA_DATA_SHIFT 0 +#define I40E_MNGSB_WDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WDATA_DATA_SHIFT) +#define I40E_MNGSB_WHDR0 0x000B70F4 /* Reset: POR */ +#define I40E_MNGSB_WHDR0_RAW_DEST_SHIFT 0 +#define I40E_MNGSB_WHDR0_RAW_DEST_MASK I40E_MASK(0xFF, I40E_MNGSB_WHDR0_RAW_DEST_SHIFT) +#define 
I40E_MNGSB_WHDR0_DEST_SEL_SHIFT 12 +#define I40E_MNGSB_WHDR0_DEST_SEL_MASK I40E_MASK(0xF, I40E_MNGSB_WHDR0_DEST_SEL_SHIFT) +#define I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT 16 +#define I40E_MNGSB_WHDR0_OPCODE_SEL_MASK I40E_MASK(0xFF, I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT) +#define I40E_MNGSB_WHDR0_TAG_SHIFT 24 +#define I40E_MNGSB_WHDR0_TAG_MASK I40E_MASK(0x7F, I40E_MNGSB_WHDR0_TAG_SHIFT) +#define I40E_MNGSB_WHDR1 0x000B70F8 /* Reset: POR */ +#define I40E_MNGSB_WHDR1_ADDR_SHIFT 0 +#define I40E_MNGSB_WHDR1_ADDR_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR1_ADDR_SHIFT) +#define I40E_MNGSB_WHDR2 0x000B70FC /* Reset: POR */ +#define I40E_MNGSB_WHDR2_LENGTH_SHIFT 0 +#define I40E_MNGSB_WHDR2_LENGTH_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR2_LENGTH_SHIFT) + +#define I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT 21 +#define I40E_GLPCI_CAPSUP_WAKUP_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT) + +#define I40E_GLPCI_CUR_CLNT_COMMON 0x0009CA18 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT) +#define I40E_GLPCI_CUR_CLNT_PIPEMON 0x0009CA20 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_ALWD 0x0009c514 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_MNG_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_MNG_RSVD 0x0009c594 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT 0 +#define 
I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_MNG_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_PMAT_ALWD 0x0009c510 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_PMAT_RSVD 0x0009c590 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RLAN_ALWD 0x0009c500 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RLAN_RSVD 0x0009c580 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RXPE_ALWD 0x0009c508 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT 16 
+#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RXPE_RSVD 0x0009c588 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TDPU_ALWD 0x0009c518 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TDPU_RSVD 0x0009c598 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TLAN_ALWD 0x0009c504 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TLAN_RSVD 0x0009c584 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TXPE_ALWD 0x0009c50C /* 
Reset: PCIR */ +#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TXPE_RSVD 0x0009c58c /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON 0x0009CA28 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT) + +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT 10 +#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_NPQ_CFG 0x0009CA00 /* Reset: PCIR */ +#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT 0 +#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_MASK I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT) +#define I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT 1 +#define I40E_GLPCI_NPQ_CFG_SMALL_TO_MASK I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT) +#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT 2 +#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_MASK I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT) +#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT 6 +#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_MASK 
I40E_MASK(0x3FF, I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT) +#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT 16 +#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_MASK I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT) +#define I40E_GLPCI_WATMK_CLNT_PIPEMON 0x0009CA30 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_MNG_ALWD 0x0009CB14 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_PMAT_ALWD 0x0009CB10 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_RLAN_ALWD 0x0009CB00 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_RXPE_ALWD 0x0009CB08 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT) +#define 
I40E_GLPCI_WATMK_TLAN_ALWD 0x0009CB04 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_TPDU_ALWD 0x0009CB18 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_TXPE_ALWD 0x0009CB0c /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT) +#define I40E_GLPE_CPUSTATUS0 0x0000D040 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0 +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT) +#define I40E_GLPE_CPUSTATUS1 0x0000D044 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0 +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT) +#define I40E_GLPE_CPUSTATUS2 0x0000D048 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0 +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT) +#define I40E_GLPE_CPUTRIG0 0x0000D060 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT 0 +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK I40E_MASK(0xFFFF, 
I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT) +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17 +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT) +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18 +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT) +#define I40E_GLPE_DUAL40_RUPM 0x0000DA04 /* Reset: PE_CORER */ +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0 +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK I40E_MASK(0x1, I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT) +#define I40E_GLPE_PFAEQEDROPCNT(_i) (0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCEQEDROPCNT(_i) (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCQEDROPCNT(_i) (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_RUPM_CQPPOOL 0x0000DACC /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT) +#define I40E_GLPE_RUPM_FLRPOOL 0x0000DAC4 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT) +#define I40E_GLPE_RUPM_GCTL 0x0000DA00 /* Reset: PE_CORER */ +#define 
I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT 0 +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26 +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27 +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28 +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29 +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT 30 +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT 31 +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT) +#define I40E_GLPE_RUPM_PTXPOOL 0x0000DAC8 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT) +#define I40E_GLPE_RUPM_PUSHPOOL 0x0000DAC0 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT) +#define I40E_GLPE_RUPM_TXHOST_EN 0x0000DA08 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0 +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT) +#define I40E_GLPE_VFAEQEDROPCNT(_i) (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define 
I40E_GLPE_VFCEQEDROPCNT(_i) (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCQEDROPCNT(_i) (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX 31 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT 8 +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMQ1ALLOCERR(_i) (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFFLMXMITALLOCERR(_i) (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFUDACTRL(_i) (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDACTRL_MAX_INDEX 31 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK I40E_MASK(0x1, 
I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN(_i) (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX 31 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT 0 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK I40E_MASK(0x3FFFF, I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31 +#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK I40E_MASK(0x1, I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT) +#define I40E_PFPE_AEQALLOC 0x00131180 /* Reset: PFR */ +#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_PFPE_AEQALLOC_AECOUNT_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_PFPE_CCQPHIGH 0x00008200 /* Reset: PFR */ +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_PFPE_CCQPLOW 0x00008180 /* Reset: PFR */ +#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_PFPE_CCQPSTATUS 0x00008100 /* Reset: PFR */ +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define 
I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_PFPE_CQACK 0x00131100 /* Reset: PFR */ +#define I40E_PFPE_CQACK_PECQID_SHIFT 0 +#define I40E_PFPE_CQACK_PECQID_MASK I40E_MASK(0x1FFFF, I40E_PFPE_CQACK_PECQID_SHIFT) +#define I40E_PFPE_CQARM 0x00131080 /* Reset: PFR */ +#define I40E_PFPE_CQARM_PECQID_SHIFT 0 +#define I40E_PFPE_CQARM_PECQID_MASK I40E_MASK(0x1FFFF, I40E_PFPE_CQARM_PECQID_SHIFT) +#define I40E_PFPE_CQPDB 0x00008000 /* Reset: PFR */ +#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_PFPE_CQPDB_WQHEAD_MASK I40E_MASK(0x7FF, I40E_PFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_PFPE_CQPERRCODES 0x00008880 /* Reset: PFR */ +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_PFPE_CQPTAIL 0x00008080 /* Reset: PFR */ +#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_PFPE_CQPTAIL_WQTAIL_MASK I40E_MASK(0x7FF, I40E_PFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_PFPE_FLMQ1ALLOCERR 0x00008980 /* Reset: PFR */ +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_FLMXMITALLOCERR 0x00008900 /* Reset: PFR */ +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define 
I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_IPCONFIG0 0x00008280 /* Reset: PFR */ +#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_PFPE_IPCONFIG0_PEIPID_MASK I40E_MASK(0xFFFF, I40E_PFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_PFPE_MRTEIDXMASK 0x00008600 /* Reset: PFR */ +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_PFPE_RCVUNEXPECTEDERROR 0x00008680 /* Reset: PFR */ +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_PFPE_TCPNOWTIMER 0x00008580 /* Reset: PFR */ +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_PFPE_UDACTRL 0x00008700 /* Reset: PFR */ +#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT) 
+#define I40E_PFPE_UDAUCFBQPN 0x00008780 /* Reset: PFR */ +#define I40E_PFPE_UDAUCFBQPN_QPN_SHIFT 0 +#define I40E_PFPE_UDAUCFBQPN_QPN_MASK I40E_MASK(0x3FFFF, I40E_PFPE_UDAUCFBQPN_QPN_SHIFT) +#define I40E_PFPE_UDAUCFBQPN_VALID_SHIFT 31 +#define I40E_PFPE_UDAUCFBQPN_VALID_MASK I40E_MASK(0x1, I40E_PFPE_UDAUCFBQPN_VALID_SHIFT) +#define I40E_PFPE_WQEALLOC 0x00138C00 /* Reset: PFR */ +#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_PFPE_WQEALLOC_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_PFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) +#define I40E_PRTDCB_RLPMC 0x0001F140 /* Reset: PE_CORER */ +#define I40E_PRTDCB_RLPMC_TC2PFC_SHIFT 0 +#define I40E_PRTDCB_RLPMC_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTDCB_RLPMC_TC2PFC_SHIFT) +#define I40E_PRTDCB_TCMSTC_RLPM(_i) (0x0001F040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: PE_CORER */ +#define I40E_PRTDCB_TCMSTC_RLPM_MAX_INDEX 7 +#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM 0x0001F1A0 /* Reset: PE_CORER */ +#define I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT 0 +#define I40E_PRTDCB_TCPMC_RLPM_CPM_MASK I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT 13 +#define I40E_PRTDCB_TCPMC_RLPM_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03 0x0000DAE0 /* Reset: PE_CORER */ +#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT 0 +#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT 8 +#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_MASK I40E_MASK(0xFF, 
I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT 16 +#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT 24 +#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT) +#define I40E_PRTPE_RUPM_CNTR 0x0000DB20 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_CNTR_COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT) +#define I40E_PRTPE_RUPM_CTL 0x0000DA40 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_CTL_LLTC_SHIFT 13 +#define I40E_PRTPE_RUPM_CTL_LLTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_CTL_LLTC_SHIFT) +#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT 30 +#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_MASK I40E_MASK(0x1, I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT) +#define I40E_PRTPE_RUPM_PFCCTL 0x0000DA60 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT) +#define I40E_PRTPE_RUPM_PFCPC 0x0000DA80 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC 0x0000DAA0 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT 16 +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT 31 +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_MASK I40E_MASK(0x1, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47 0x0000DB60 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_MASK 
I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03 0x0000DB40 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47 0x0000DB00 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT) +#define 
I40E_PRTPE_RUPM_THRES 0x0000DA20 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT 0 +#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT) +#define I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT 8 +#define I40E_PRTPE_RUPM_THRES_MAXSPADS_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT) +#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT 16 +#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT) +#define I40E_VFPE_AEQALLOC(_VF) (0x00130C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC_MAX_INDEX 127 +#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC_AECOUNT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH(_VF) (0x00001000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH_MAX_INDEX 127 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW(_VF) (0x00000C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW_MAX_INDEX 127 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS(_VF) (0x00000800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS_MAX_INDEX 127 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define 
I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK(_VF) (0x00130800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQACK_MAX_INDEX 127 +#define I40E_VFPE_CQACK_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQACK_PECQID_SHIFT) +#define I40E_VFPE_CQARM(_VF) (0x00130400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQARM_MAX_INDEX 127 +#define I40E_VFPE_CQARM_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQARM_PECQID_SHIFT) +#define I40E_VFPE_CQPDB(_VF) (0x00000000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPDB_MAX_INDEX 127 +#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB_WQHEAD_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES(_VF) (0x00001800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES_MAX_INDEX 127 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL(_VF) (0x00000400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL_MAX_INDEX 127 +#define I40E_VFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL_WQTAIL_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG0(_VF) (0x00001400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG0_MAX_INDEX 127 +#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define 
I40E_VFPE_IPCONFIG0_PEIPID_MASK I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK(_VF) (0x00003000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX 127 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF) (0x00003400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 127 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER(_VF) (0x00002C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX 127 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC(_VF) (0x00138000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC_MAX_INDEX 127 +#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) +#define I40E_GLPES_PFIP4RXDISCARD(_i) (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSHI(_i) 
(0x00010804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSLO(_i) (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSHI(_i) (0x00010A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSLO(_i) (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSHI(_i) (0x00010C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSLO(_i) (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSHI(_i) (0x00010204 
+ ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSLO(_i) (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSHI(_i) (0x00010404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSLO(_i) (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXTRUNC(_i) (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSHI(_i) (0x00011E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSLO(_i) (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSHI(_i) (0x00012004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSLO(_i) (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSHI(_i) (0x00012204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSLO(_i) (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXNOROUTE(_i) (0x00012E00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSHI(_i) (0x00011A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSLO(_i) (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSHI(_i) (0x00011C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSLO(_i) (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXDISCARD(_i) (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSHI(_i) (0x00011404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSLO(_i) (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX 15 +#define 
I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSHI(_i) (0x00011604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSLO(_i) (0x00011600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSHI(_i) (0x00011804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSLO(_i) (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSHI(_i) (0x00010E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSLO(_i) (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX 15 +#define 
I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSHI(_i) (0x00011004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSLO(_i) (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXTRUNC(_i) (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSHI(_i) (0x00012804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSLO(_i) (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSHI(_i) (0x00012A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define 
I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSLO(_i) (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSHI(_i) (0x00012C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSLO(_i) (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXNOROUTE(_i) (0x00012F00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSHI(_i) (0x00012404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSLO(_i) (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define 
I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSHI(_i) (0x00012604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSLO(_i) (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_PFRDMARXRDSHI(_i) (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXRDSLO(_i) (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSHI(_i) (0x00014004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSLO(_i) (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, 
I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXWRSHI(_i) (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMARXWRSLO(_i) (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMATXRDSHI(_i) (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXRDSLO(_i) (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSHI(_i) (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSLO(_i) (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define 
I40E_GLPES_PFRDMATXWRSHI(_i) (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMATXWRSLO(_i) (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMAVBNDHI(_i) (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_PFRDMAVBNDLO(_i) (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_PFRDMAVINVHI(_i) (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_PFRDMAVINVLO(_i) (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_PFRXVLANERR(_i) (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFRXVLANERR_MAX_INDEX 15 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_PFTCPRTXSEG(_i) (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_PFTCPRXOPTERR(_i) (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_PFTCPRXPROTOERR(_i) (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSHI(_i) (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSLO(_i) (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_PFTCPTXSEGHI(_i) (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define 
I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_PFTCPTXSEGLO(_i) (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSHI(_i) (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSLO(_i) (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSHI(_i) (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSLO(_i) (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSHI 0x0001E014 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSLO 
0x0001E010 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT) +#define I40E_GLPES_RDMARXOOODDPHI 0x0001E01C /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT) +#define I40E_GLPES_RDMARXOOODDPLO 0x0001E018 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT) +#define I40E_GLPES_RDMARXOOONOMARK 0x0001E004 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0 +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT) +#define I40E_GLPES_RDMARXUNALIGN 0x0001E000 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0 +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLEHI 0x0001E044 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLELO 0x0001E040 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT) +#define I40E_GLPES_TCPRXONEHOLEHI 0x0001E02C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT) 
+#define I40E_GLPES_TCPRXONEHOLELO 0x0001E028 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXPUREACKHI 0x0001E024 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT) +#define I40E_GLPES_TCPRXPUREACKSLO 0x0001E020 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLEHI 0x0001E03C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLELO 0x0001E038 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLEHI 0x0001E034 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLELO 0x0001E030 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTHI 0x0001E04C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK 
I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTLO 0x0001E048 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTHI 0x0001E054 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTLO 0x0001E050 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSHI 0x0001E05C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSLO 0x0001E058 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXDISCARD(_i) (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSHI(_i) (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK I40E_MASK(0xFFFF, 
I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSLO(_i) (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSHI(_i) (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSLO(_i) (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSHI(_i) (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSLO(_i) (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSHI(_i) (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK I40E_MASK(0xFFFF, 
I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSLO(_i) (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSHI(_i) (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSLO(_i) (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXTRUNC(_i) (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSHI(_i) (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSLO(_i) (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define 
I40E_GLPES_VFIP4TXMCOCTSHI(_i) (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSLO(_i) (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSHI(_i) (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSLO(_i) (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXNOROUTE(_i) (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSHI(_i) (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define 
I40E_GLPES_VFIP4TXOCTSLO(_i) (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSHI(_i) (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSLO(_i) (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXDISCARD(_i) (0x00019200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSHI(_i) (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSLO(_i) (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSHI(_i) (0x00019604 + ((_i) * 8)) /* 
_i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSLO(_i) (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSHI(_i) (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSLO(_i) (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSHI(_i) (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSLO(_i) (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSHI(_i) (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: 
PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSLO(_i) (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXTRUNC(_i) (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSHI(_i) (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSLO(_i) (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSHI(_i) (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSLO(_i) (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX 31 
+#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSHI(_i) (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSLO(_i) (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXNOROUTE(_i) (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSHI(_i) (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSLO(_i) (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSHI(_i) (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX 31 +#define 
I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSLO(_i) (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_VFRDMARXRDSHI(_i) (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXRDSLO(_i) (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSHI(_i) (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSLO(_i) (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXWRSHI(_i) (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define 
I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMARXWRSLO(_i) (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMATXRDSHI(_i) (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXRDSLO(_i) (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSHI(_i) (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSLO(_i) (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXWRSHI(_i) (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, 
I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMATXWRSLO(_i) (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMAVBNDHI(_i) (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_VFRDMAVBNDLO(_i) (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_VFRDMAVINVHI(_i) (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_VFRDMAVINVLO(_i) (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_VFRXVLANERR(_i) (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRXVLANERR_MAX_INDEX 31 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_VFTCPRTXSEG(_i) (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* 
Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_VFTCPRXOPTERR(_i) (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_VFTCPRXPROTOERR(_i) (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSHI(_i) (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSLO(_i) (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_VFTCPTXSEGHI(_i) (0x0001B404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_VFTCPTXSEGLO(_i) (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX 31 +#define 
I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSHI(_i) (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSLO(_i) (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSHI(_i) (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSLO(_i) (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLGEN_PME_TO 0x000B81BC /* Reset: POR */ +#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT 0 +#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_MASK I40E_MASK(0x1, I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT) +#define I40E_GLQF_APBVT(_i) (0x00260000 + ((_i) * 4)) /* _i=0...2047 */ /* Reset: CORER */ +#define I40E_GLQF_APBVT_MAX_INDEX 2047 +#define I40E_GLQF_APBVT_APBVT_SHIFT 0 +#define I40E_GLQF_APBVT_APBVT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_APBVT_APBVT_SHIFT) +#define I40E_GLQF_FD_PCTYPES(_i) (0x00268000 + ((_i) * 4)) /* _i=0...63 */ /* Reset: POR */ +#define 
I40E_GLQF_FD_PCTYPES_MAX_INDEX 63 +#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT 0 +#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_MASK I40E_MASK(0x3F, I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT) +#define I40E_GLQF_FDEVICTENA(_i) (0x00270384 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_GLQF_FDEVICTENA_MAX_INDEX 1 +#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT 0 +#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT) +#define I40E_GLQF_FDEVICTFLAG 0x00270280 /* Reset: CORER */ +#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT 0 +#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_MASK I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT) +#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT 8 +#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_MASK I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT) +#define I40E_PFQF_CTL_2 0x00270300 /* Reset: CORER */ +#define I40E_PFQF_CTL_2_PEHSIZE_SHIFT 0 +#define I40E_PFQF_CTL_2_PEHSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEHSIZE_SHIFT) +#define I40E_PFQF_CTL_2_PEDSIZE_SHIFT 5 +#define I40E_PFQF_CTL_2_PEDSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEDSIZE_SHIFT) +/* Redefined for X722 family */ +#define I40E_X722_PFQF_HLUT(_i) (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_X722_PFQF_HLUT_MAX_INDEX 127 +#define I40E_X722_PFQF_HLUT_LUT0_SHIFT 0 +#define I40E_X722_PFQF_HLUT_LUT0_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT0_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT1_SHIFT 8 +#define I40E_X722_PFQF_HLUT_LUT1_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT1_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT2_SHIFT 16 +#define I40E_X722_PFQF_HLUT_LUT2_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT2_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT3_SHIFT 24 +#define I40E_X722_PFQF_HLUT_LUT3_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT3_SHIFT) +#define I40E_PFQF_HREGION(_i) (0x00245400 + ((_i) * 128)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PFQF_HREGION_MAX_INDEX 7 +#define 
I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT) +#define I40E_PFQF_HREGION_REGION_0_SHIFT 1 +#define I40E_PFQF_HREGION_REGION_0_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_0_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT) +#define I40E_PFQF_HREGION_REGION_1_SHIFT 5 +#define I40E_PFQF_HREGION_REGION_1_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_1_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT) +#define I40E_PFQF_HREGION_REGION_2_SHIFT 9 +#define I40E_PFQF_HREGION_REGION_2_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_2_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT) +#define I40E_PFQF_HREGION_REGION_3_SHIFT 13 +#define I40E_PFQF_HREGION_REGION_3_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_3_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT) +#define I40E_PFQF_HREGION_REGION_4_SHIFT 17 +#define I40E_PFQF_HREGION_REGION_4_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_4_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT) +#define I40E_PFQF_HREGION_REGION_5_SHIFT 21 +#define I40E_PFQF_HREGION_REGION_5_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_5_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT) +#define I40E_PFQF_HREGION_REGION_6_SHIFT 25 +#define I40E_PFQF_HREGION_REGION_6_MASK I40E_MASK(0x7, 
I40E_PFQF_HREGION_REGION_6_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT) +#define I40E_PFQF_HREGION_REGION_7_SHIFT 29 +#define I40E_PFQF_HREGION_REGION_7_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_7_SHIFT) +#define I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT 8 +#define I40E_VSIQF_CTL_RSS_LUT_TYPE_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT) +#define I40E_VSIQF_HKEY(_i, _VSI) (0x002A0000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...12, _VSI=0...383 */ /* Reset: CORER */ +#define I40E_VSIQF_HKEY_MAX_INDEX 12 +#define I40E_VSIQF_HKEY_KEY_0_SHIFT 0 +#define I40E_VSIQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_0_SHIFT) +#define I40E_VSIQF_HKEY_KEY_1_SHIFT 8 +#define I40E_VSIQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_1_SHIFT) +#define I40E_VSIQF_HKEY_KEY_2_SHIFT 16 +#define I40E_VSIQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_2_SHIFT) +#define I40E_VSIQF_HKEY_KEY_3_SHIFT 24 +#define I40E_VSIQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_3_SHIFT) +#define I40E_VSIQF_HLUT(_i, _VSI) (0x00220000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...15, _VSI=0...383 */ /* Reset: CORER */ +#define I40E_VSIQF_HLUT_MAX_INDEX 15 +#define I40E_VSIQF_HLUT_LUT0_SHIFT 0 +#define I40E_VSIQF_HLUT_LUT0_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT0_SHIFT) +#define I40E_VSIQF_HLUT_LUT1_SHIFT 8 +#define I40E_VSIQF_HLUT_LUT1_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT1_SHIFT) +#define I40E_VSIQF_HLUT_LUT2_SHIFT 16 +#define I40E_VSIQF_HLUT_LUT2_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT2_SHIFT) +#define I40E_VSIQF_HLUT_LUT3_SHIFT 24 +#define I40E_VSIQF_HLUT_LUT3_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT3_SHIFT) +#define I40E_GLGEN_STAT_CLEAR 0x00390004 /* Reset: CORER */ +#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT 0 +#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT) +#define I40E_GLGEN_STAT_HALT 
0x00390000 /* Reset: CORER */ +#define I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT 0 +#define I40E_GLGEN_STAT_HALT_HALT_CELLS_MASK I40E_MASK(0x3FFFFFFF, I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT) +#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT) +#define I40E_VFPE_AEQALLOC1 0x0000A400 /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH1 0x00009800 /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW1 0x0000AC00 /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS1 0x0000B800 /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK1 0x0000B000 /* Reset: VFR */ +#define I40E_VFPE_CQACK1_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK1_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQACK1_PECQID_SHIFT) +#define I40E_VFPE_CQARM1 0x0000B400 /* Reset: VFR */ 
+#define I40E_VFPE_CQARM1_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM1_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQARM1_PECQID_SHIFT) +#define I40E_VFPE_CQPDB1 0x0000BC00 /* Reset: VFR */ +#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB1_WQHEAD_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPDB1_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES1 0x00009C00 /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL1 0x0000A000 /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG01 0x00008C00 /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT 0 +#define I40E_VFPE_IPCONFIG01_PEIPID_MASK I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG01_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK1 0x00009000 /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER1 0x0000A800 /* Reset: VFR */ +#define 
I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC1 0x0000C000 /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC1_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC1_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT) + +#endif /* X722_SUPPORT */ +#endif /* _I40E_REGISTER_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_status.h b/usr/src/uts/common/io/i40e/core/i40e_status.h new file mode 100644 index 0000000000..1f27507970 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_status.h @@ -0,0 +1,108 @@ +/****************************************************************************** + + Copyright (c) 2013-2014, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_status.h 283119 2015-05-19 18:35:18Z jhb $*/ + +#ifndef _I40E_STATUS_H_ +#define _I40E_STATUS_H_ + +/* Error Codes */ +enum i40e_status_code { + I40E_SUCCESS = 0, + I40E_ERR_NVM = -1, + I40E_ERR_NVM_CHECKSUM = -2, + I40E_ERR_PHY = -3, + I40E_ERR_CONFIG = -4, + I40E_ERR_PARAM = -5, + I40E_ERR_MAC_TYPE = -6, + I40E_ERR_UNKNOWN_PHY = -7, + I40E_ERR_LINK_SETUP = -8, + I40E_ERR_ADAPTER_STOPPED = -9, + I40E_ERR_INVALID_MAC_ADDR = -10, + I40E_ERR_DEVICE_NOT_SUPPORTED = -11, + I40E_ERR_MASTER_REQUESTS_PENDING = -12, + I40E_ERR_INVALID_LINK_SETTINGS = -13, + I40E_ERR_AUTONEG_NOT_COMPLETE = -14, + I40E_ERR_RESET_FAILED = -15, + I40E_ERR_SWFW_SYNC = -16, + I40E_ERR_NO_AVAILABLE_VSI = -17, + I40E_ERR_NO_MEMORY = -18, + I40E_ERR_BAD_PTR = -19, + I40E_ERR_RING_FULL = -20, + I40E_ERR_INVALID_PD_ID = -21, + I40E_ERR_INVALID_QP_ID = -22, + I40E_ERR_INVALID_CQ_ID = -23, + I40E_ERR_INVALID_CEQ_ID = -24, + I40E_ERR_INVALID_AEQ_ID = -25, + I40E_ERR_INVALID_SIZE = -26, + I40E_ERR_INVALID_ARP_INDEX = -27, + I40E_ERR_INVALID_FPM_FUNC_ID = -28, + I40E_ERR_QP_INVALID_MSG_SIZE = -29, + I40E_ERR_QP_TOOMANY_WRS_POSTED = -30, + I40E_ERR_INVALID_FRAG_COUNT = -31, + I40E_ERR_QUEUE_EMPTY = -32, + I40E_ERR_INVALID_ALIGNMENT = -33, + I40E_ERR_FLUSHED_QUEUE = -34, + I40E_ERR_INVALID_PUSH_PAGE_INDEX = -35, + I40E_ERR_INVALID_IMM_DATA_SIZE = -36, + I40E_ERR_TIMEOUT = 
-37, + I40E_ERR_OPCODE_MISMATCH = -38, + I40E_ERR_CQP_COMPL_ERROR = -39, + I40E_ERR_INVALID_VF_ID = -40, + I40E_ERR_INVALID_HMCFN_ID = -41, + I40E_ERR_BACKING_PAGE_ERROR = -42, + I40E_ERR_NO_PBLCHUNKS_AVAILABLE = -43, + I40E_ERR_INVALID_PBLE_INDEX = -44, + I40E_ERR_INVALID_SD_INDEX = -45, + I40E_ERR_INVALID_PAGE_DESC_INDEX = -46, + I40E_ERR_INVALID_SD_TYPE = -47, + I40E_ERR_MEMCPY_FAILED = -48, + I40E_ERR_INVALID_HMC_OBJ_INDEX = -49, + I40E_ERR_INVALID_HMC_OBJ_COUNT = -50, + I40E_ERR_INVALID_SRQ_ARM_LIMIT = -51, + I40E_ERR_SRQ_ENABLED = -52, + I40E_ERR_ADMIN_QUEUE_ERROR = -53, + I40E_ERR_ADMIN_QUEUE_TIMEOUT = -54, + I40E_ERR_BUF_TOO_SHORT = -55, + I40E_ERR_ADMIN_QUEUE_FULL = -56, + I40E_ERR_ADMIN_QUEUE_NO_WORK = -57, + I40E_ERR_BAD_IWARP_CQE = -58, + I40E_ERR_NVM_BLANK_MODE = -59, + I40E_ERR_NOT_IMPLEMENTED = -60, + I40E_ERR_PE_DOORBELL_NOT_ENABLED = -61, + I40E_ERR_DIAG_TEST_FAILED = -62, + I40E_ERR_NOT_READY = -63, + I40E_NOT_SUPPORTED = -64, + I40E_ERR_FIRMWARE_API_VERSION = -65, +}; + +#endif /* _I40E_STATUS_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_type.h b/usr/src/uts/common/io/i40e/core/i40e_type.h new file mode 100644 index 0000000000..b4a84993e9 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_type.h @@ -0,0 +1,1581 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. 
Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_type.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_TYPE_H_ +#define _I40E_TYPE_H_ + +#include "i40e_status.h" +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_adminq.h" +#include "i40e_hmc.h" +#include "i40e_lan_hmc.h" +#include "i40e_devids.h" + +#define UNREFERENCED_XPARAMETER + +#define BIT(a) (1UL << (a)) +#define BIT_ULL(a) (1ULL << (a)) + +#ifndef I40E_MASK +/* I40E_MASK is a macro used on 32 bit registers */ +#define I40E_MASK(mask, shift) (((uint32_t)(mask)) << ((uint32_t)(shift))) +#endif + +#define I40E_MAX_PF 16 +#define I40E_MAX_PF_VSI 64 +#define I40E_MAX_PF_QP 128 +#define I40E_MAX_VSI_QP 16 +#define I40E_MAX_VF_VSI 3 +#define I40E_MAX_CHAINED_RX_BUFFERS 5 +#define I40E_MAX_PF_UDP_OFFLOAD_PORTS 16 + +/* something less than 1 minute */ +#define I40E_HEARTBEAT_TIMEOUT (HZ * 50) + +/* Max default timeout in ms, */ +#define I40E_MAX_NVM_TIMEOUT 18000 + +/* 
Check whether address is multicast. */ +#define I40E_IS_MULTICAST(address) (bool)(((u8 *)(address))[0] & ((u8)0x01)) + +/* Check whether an address is broadcast (only the first two octets are compared). */ +#define I40E_IS_BROADCAST(address) \ + ((((u8 *)(address))[0] == ((u8)0xff)) && \ + (((u8 *)(address))[1] == ((u8)0xff))) + +/* Switch from ms to the 1usec global time (this is the GTIME resolution) */ +#define I40E_MS_TO_GTIME(time) ((time) * 1000) + +/* forward declaration */ +struct i40e_hw; +typedef void (*I40E_ADMINQ_CALLBACK)(struct i40e_hw *, struct i40e_aq_desc *); + +#define I40E_ETH_LENGTH_OF_ADDRESS 6 +/* Data type manipulation macros. */ +#define I40E_HI_DWORD(x) ((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF)) +#define I40E_LO_DWORD(x) ((u32)((x) & 0xFFFFFFFF)) + +#define I40E_HI_WORD(x) ((u16)(((x) >> 16) & 0xFFFF)) +#define I40E_LO_WORD(x) ((u16)((x) & 0xFFFF)) + +#define I40E_HI_BYTE(x) ((u8)(((x) >> 8) & 0xFF)) +#define I40E_LO_BYTE(x) ((u8)((x) & 0xFF)) + +/* Number of Transmit Descriptors must be a multiple of 8. */ +#define I40E_REQ_TX_DESCRIPTOR_MULTIPLE 8 +/* Number of Receive Descriptors must be a multiple of 32 if + * the number of descriptors is greater than 32. + */ +#define I40E_REQ_RX_DESCRIPTOR_MULTIPLE 32 + +#define I40E_DESC_UNUSED(R) \ + ((((R)->next_to_clean > (R)->next_to_use) ? 
0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1) + +/* bitfields for Tx queue mapping in QTX_CTL */ +#define I40E_QTX_CTL_VF_QUEUE 0x0 +#define I40E_QTX_CTL_VM_QUEUE 0x1 +#define I40E_QTX_CTL_PF_QUEUE 0x2 + +/* debug masks - set these bits in hw->debug_mask to control output */ +enum i40e_debug_mask { + I40E_DEBUG_INIT = 0x00000001, + I40E_DEBUG_RELEASE = 0x00000002, + + I40E_DEBUG_LINK = 0x00000010, + I40E_DEBUG_PHY = 0x00000020, + I40E_DEBUG_HMC = 0x00000040, + I40E_DEBUG_NVM = 0x00000080, + I40E_DEBUG_LAN = 0x00000100, + I40E_DEBUG_FLOW = 0x00000200, + I40E_DEBUG_DCB = 0x00000400, + I40E_DEBUG_DIAG = 0x00000800, + I40E_DEBUG_FD = 0x00001000, + + I40E_DEBUG_AQ_MESSAGE = 0x01000000, + I40E_DEBUG_AQ_DESCRIPTOR = 0x02000000, + I40E_DEBUG_AQ_DESC_BUFFER = 0x04000000, + I40E_DEBUG_AQ_COMMAND = 0x06000000, + I40E_DEBUG_AQ = 0x0F000000, + + /* + * Enumerators have type "int" and the two values below do not fit in + * an int, so they are explicitly cast to int. + */ + I40E_DEBUG_USER = (int)0xF0000000, + + I40E_DEBUG_ALL = (int)0xFFFFFFFF +}; + +/* PCI Bus Info */ +#define I40E_PCI_LINK_STATUS 0xB2 +#define I40E_PCI_LINK_WIDTH 0x3F0 +#define I40E_PCI_LINK_WIDTH_1 0x10 +#define I40E_PCI_LINK_WIDTH_2 0x20 +#define I40E_PCI_LINK_WIDTH_4 0x40 +#define I40E_PCI_LINK_WIDTH_8 0x80 +#define I40E_PCI_LINK_SPEED 0xF +#define I40E_PCI_LINK_SPEED_2500 0x1 +#define I40E_PCI_LINK_SPEED_5000 0x2 +#define I40E_PCI_LINK_SPEED_8000 0x3 + +/* Memory types */ +enum i40e_memset_type { + I40E_NONDMA_MEM = 0, + I40E_DMA_MEM +}; + +/* Memcpy types */ +enum i40e_memcpy_type { + I40E_NONDMA_TO_NONDMA = 0, + I40E_NONDMA_TO_DMA, + I40E_DMA_TO_DMA, + I40E_DMA_TO_NONDMA +}; + +/* These are structs for managing the hardware information and the operations. + * The structures of function pointers are filled out at init time when we + * know for sure exactly which hardware we're working with. 
This gives us the + * flexibility of using the same main driver code but adapting to slightly + * different hardware needs as new parts are developed. For this architecture, + * the Firmware and AdminQ are intended to insulate the driver from most of the + * future changes, but these structures will also do part of the job. + */ +enum i40e_mac_type { + I40E_MAC_UNKNOWN = 0, + I40E_MAC_X710, + I40E_MAC_XL710, + I40E_MAC_VF, +#ifdef X722_SUPPORT + I40E_MAC_X722, + I40E_MAC_X722_VF, +#endif + I40E_MAC_GENERIC, +}; + +enum i40e_media_type { + I40E_MEDIA_TYPE_UNKNOWN = 0, + I40E_MEDIA_TYPE_FIBER, + I40E_MEDIA_TYPE_BASET, + I40E_MEDIA_TYPE_BACKPLANE, + I40E_MEDIA_TYPE_CX4, + I40E_MEDIA_TYPE_DA, + I40E_MEDIA_TYPE_VIRTUAL +}; + +enum i40e_fc_mode { + I40E_FC_NONE = 0, + I40E_FC_RX_PAUSE, + I40E_FC_TX_PAUSE, + I40E_FC_FULL, + I40E_FC_PFC, + I40E_FC_DEFAULT +}; + +enum i40e_set_fc_aq_failures { + I40E_SET_FC_AQ_FAIL_NONE = 0, + I40E_SET_FC_AQ_FAIL_GET = 1, + I40E_SET_FC_AQ_FAIL_SET = 2, + I40E_SET_FC_AQ_FAIL_UPDATE = 4, + I40E_SET_FC_AQ_FAIL_SET_UPDATE = 6 +}; + +enum i40e_vsi_type { + I40E_VSI_MAIN = 0, + I40E_VSI_VMDQ1 = 1, + I40E_VSI_VMDQ2 = 2, + I40E_VSI_CTRL = 3, + I40E_VSI_FCOE = 4, + I40E_VSI_MIRROR = 5, + I40E_VSI_SRIOV = 6, + I40E_VSI_FDIR = 7, + I40E_VSI_TYPE_UNKNOWN +}; + +enum i40e_queue_type { + I40E_QUEUE_TYPE_RX = 0, + I40E_QUEUE_TYPE_TX, + I40E_QUEUE_TYPE_PE_CEQ, + I40E_QUEUE_TYPE_UNKNOWN +}; + +struct i40e_link_status { + enum i40e_aq_phy_type phy_type; + enum i40e_aq_link_speed link_speed; + u8 link_info; + u8 an_info; + u8 ext_info; + u8 loopback; + /* is Link Status Event notification to SW enabled */ + bool lse_enable; + u16 max_frame_size; + bool crc_enable; + u8 pacing; + u8 requested_speeds; + u8 module_type[3]; + /* 1st byte: module identifier */ +#define I40E_MODULE_TYPE_SFP 0x03 +#define I40E_MODULE_TYPE_QSFP 0x0D + /* 2nd byte: ethernet compliance codes for 10/40G */ +#define I40E_MODULE_TYPE_40G_ACTIVE 0x01 +#define I40E_MODULE_TYPE_40G_LR4 0x02 
+#define I40E_MODULE_TYPE_40G_SR4 0x04 +#define I40E_MODULE_TYPE_40G_CR4 0x08 +#define I40E_MODULE_TYPE_10G_BASE_SR 0x10 +#define I40E_MODULE_TYPE_10G_BASE_LR 0x20 +#define I40E_MODULE_TYPE_10G_BASE_LRM 0x40 +#define I40E_MODULE_TYPE_10G_BASE_ER 0x80 + /* 3rd byte: ethernet compliance codes for 1G */ +#define I40E_MODULE_TYPE_1000BASE_SX 0x01 +#define I40E_MODULE_TYPE_1000BASE_LX 0x02 +#define I40E_MODULE_TYPE_1000BASE_CX 0x04 +#define I40E_MODULE_TYPE_1000BASE_T 0x08 +}; + +enum i40e_aq_capabilities_phy_type { + I40E_CAP_PHY_TYPE_SGMII = BIT(I40E_PHY_TYPE_SGMII), + I40E_CAP_PHY_TYPE_1000BASE_KX = BIT(I40E_PHY_TYPE_1000BASE_KX), + I40E_CAP_PHY_TYPE_10GBASE_KX4 = BIT(I40E_PHY_TYPE_10GBASE_KX4), + I40E_CAP_PHY_TYPE_10GBASE_KR = BIT(I40E_PHY_TYPE_10GBASE_KR), + I40E_CAP_PHY_TYPE_40GBASE_KR4 = BIT(I40E_PHY_TYPE_40GBASE_KR4), + I40E_CAP_PHY_TYPE_XAUI = BIT(I40E_PHY_TYPE_XAUI), + I40E_CAP_PHY_TYPE_XFI = BIT(I40E_PHY_TYPE_XFI), + I40E_CAP_PHY_TYPE_SFI = BIT(I40E_PHY_TYPE_SFI), + I40E_CAP_PHY_TYPE_XLAUI = BIT(I40E_PHY_TYPE_XLAUI), + I40E_CAP_PHY_TYPE_XLPPI = BIT(I40E_PHY_TYPE_XLPPI), + I40E_CAP_PHY_TYPE_40GBASE_CR4_CU = BIT(I40E_PHY_TYPE_40GBASE_CR4_CU), + I40E_CAP_PHY_TYPE_10GBASE_CR1_CU = BIT(I40E_PHY_TYPE_10GBASE_CR1_CU), + I40E_CAP_PHY_TYPE_10GBASE_AOC = BIT(I40E_PHY_TYPE_10GBASE_AOC), + I40E_CAP_PHY_TYPE_40GBASE_AOC = BIT(I40E_PHY_TYPE_40GBASE_AOC), + I40E_CAP_PHY_TYPE_100BASE_TX = BIT(I40E_PHY_TYPE_100BASE_TX), + I40E_CAP_PHY_TYPE_1000BASE_T = BIT(I40E_PHY_TYPE_1000BASE_T), + I40E_CAP_PHY_TYPE_10GBASE_T = BIT(I40E_PHY_TYPE_10GBASE_T), + I40E_CAP_PHY_TYPE_10GBASE_SR = BIT(I40E_PHY_TYPE_10GBASE_SR), + I40E_CAP_PHY_TYPE_10GBASE_LR = BIT(I40E_PHY_TYPE_10GBASE_LR), + I40E_CAP_PHY_TYPE_10GBASE_SFPP_CU = BIT(I40E_PHY_TYPE_10GBASE_SFPP_CU), + I40E_CAP_PHY_TYPE_10GBASE_CR1 = BIT(I40E_PHY_TYPE_10GBASE_CR1), + I40E_CAP_PHY_TYPE_40GBASE_CR4 = BIT(I40E_PHY_TYPE_40GBASE_CR4), + I40E_CAP_PHY_TYPE_40GBASE_SR4 = BIT(I40E_PHY_TYPE_40GBASE_SR4), + I40E_CAP_PHY_TYPE_40GBASE_LR4 = 
BIT(I40E_PHY_TYPE_40GBASE_LR4), + I40E_CAP_PHY_TYPE_1000BASE_SX = BIT(I40E_PHY_TYPE_1000BASE_SX), + I40E_CAP_PHY_TYPE_1000BASE_LX = BIT(I40E_PHY_TYPE_1000BASE_LX), + I40E_CAP_PHY_TYPE_1000BASE_T_OPTICAL = BIT(I40E_PHY_TYPE_1000BASE_T_OPTICAL), + I40E_CAP_PHY_TYPE_20GBASE_KR2 = BIT(I40E_PHY_TYPE_20GBASE_KR2) +}; + +struct i40e_phy_info { + struct i40e_link_status link_info; + struct i40e_link_status link_info_old; + bool get_link_info; + enum i40e_media_type media_type; + /* all the phy types the NVM is capable of */ + enum i40e_aq_capabilities_phy_type phy_types; +}; + +#define I40E_HW_CAP_MAX_GPIO 30 +#define I40E_HW_CAP_MDIO_PORT_MODE_MDIO 0 +#define I40E_HW_CAP_MDIO_PORT_MODE_I2C 1 + +/* Capabilities of a PF or a VF or the whole device */ +struct i40e_hw_capabilities { + u32 switch_mode; +#define I40E_NVM_IMAGE_TYPE_EVB 0x0 +#define I40E_NVM_IMAGE_TYPE_CLOUD 0x2 +#define I40E_NVM_IMAGE_TYPE_UDP_CLOUD 0x3 + + u32 management_mode; + u32 npar_enable; + u32 os2bmc; + u32 valid_functions; + bool sr_iov_1_1; + bool vmdq; + bool evb_802_1_qbg; /* Edge Virtual Bridging */ + bool evb_802_1_qbh; /* Bridge Port Extension */ + bool dcb; + bool fcoe; + bool iscsi; /* Indicates iSCSI enabled */ + bool flex10_enable; + bool flex10_capable; + u32 flex10_mode; +#define I40E_FLEX10_MODE_UNKNOWN 0x0 +#define I40E_FLEX10_MODE_DCC 0x1 +#define I40E_FLEX10_MODE_DCI 0x2 + + u32 flex10_status; +#define I40E_FLEX10_STATUS_DCC_ERROR 0x1 +#define I40E_FLEX10_STATUS_VC_MODE 0x2 + + bool mgmt_cem; + bool ieee_1588; + bool iwarp; + bool fd; + u32 fd_filters_guaranteed; + u32 fd_filters_best_effort; + bool rss; + u32 rss_table_size; + u32 rss_table_entry_width; + bool led[I40E_HW_CAP_MAX_GPIO]; + bool sdp[I40E_HW_CAP_MAX_GPIO]; + u32 nvm_image_type; + u32 num_flow_director_filters; + u32 num_vfs; + u32 vf_base_id; + u32 num_vsis; + u32 num_rx_qp; + u32 num_tx_qp; + u32 base_queue; + u32 num_msix_vectors; + u32 num_msix_vectors_vf; + u32 led_pin_num; + u32 sdp_pin_num; + u32 mdio_port_num; + 
u32 mdio_port_mode; + u8 rx_buf_chain_len; + u32 enabled_tcmap; + u32 maxtc; + u64 wr_csr_prot; +}; + +struct i40e_mac_info { + enum i40e_mac_type type; + u8 addr[I40E_ETH_LENGTH_OF_ADDRESS]; + u8 perm_addr[I40E_ETH_LENGTH_OF_ADDRESS]; + u8 san_addr[I40E_ETH_LENGTH_OF_ADDRESS]; + u8 port_addr[I40E_ETH_LENGTH_OF_ADDRESS]; + u16 max_fcoeq; +}; + +enum i40e_aq_resources_ids { + I40E_NVM_RESOURCE_ID = 1 +}; + +enum i40e_aq_resource_access_type { + I40E_RESOURCE_READ = 1, + I40E_RESOURCE_WRITE +}; + +struct i40e_nvm_info { + u64 hw_semaphore_timeout; /* usec global time (GTIME resolution) */ + u32 timeout; /* [ms] */ + u16 sr_size; /* Shadow RAM size in words */ + bool blank_nvm_mode; /* is NVM empty (no FW present)*/ + u16 version; /* NVM package version */ + u32 eetrack; /* NVM data version */ + u32 oem_ver; /* OEM version info */ +}; + +/* definitions used in NVM update support */ + +enum i40e_nvmupd_cmd { + I40E_NVMUPD_INVALID, + I40E_NVMUPD_READ_CON, + I40E_NVMUPD_READ_SNT, + I40E_NVMUPD_READ_LCB, + I40E_NVMUPD_READ_SA, + I40E_NVMUPD_WRITE_ERA, + I40E_NVMUPD_WRITE_CON, + I40E_NVMUPD_WRITE_SNT, + I40E_NVMUPD_WRITE_LCB, + I40E_NVMUPD_WRITE_SA, + I40E_NVMUPD_CSUM_CON, + I40E_NVMUPD_CSUM_SA, + I40E_NVMUPD_CSUM_LCB, + I40E_NVMUPD_STATUS, + I40E_NVMUPD_EXEC_AQ, + I40E_NVMUPD_GET_AQ_RESULT, +}; + +enum i40e_nvmupd_state { + I40E_NVMUPD_STATE_INIT, + I40E_NVMUPD_STATE_READING, + I40E_NVMUPD_STATE_WRITING, + I40E_NVMUPD_STATE_INIT_WAIT, + I40E_NVMUPD_STATE_WRITE_WAIT, +}; + +/* nvm_access definition and its masks/shifts need to be accessible to + * application, core driver, and shared code. Where is the right file? 
+ */ +#define I40E_NVM_READ 0xB +#define I40E_NVM_WRITE 0xC + +#define I40E_NVM_MOD_PNT_MASK 0xFF + +#define I40E_NVM_TRANS_SHIFT 8 +#define I40E_NVM_TRANS_MASK (0xf << I40E_NVM_TRANS_SHIFT) +#define I40E_NVM_CON 0x0 +#define I40E_NVM_SNT 0x1 +#define I40E_NVM_LCB 0x2 +#define I40E_NVM_SA (I40E_NVM_SNT | I40E_NVM_LCB) +#define I40E_NVM_ERA 0x4 +#define I40E_NVM_CSUM 0x8 +#define I40E_NVM_EXEC 0xf + +#define I40E_NVM_ADAPT_SHIFT 16 +#define I40E_NVM_ADAPT_MASK (0xffffULL << I40E_NVM_ADAPT_SHIFT) + +#define I40E_NVMUPD_MAX_DATA 4096 +#define I40E_NVMUPD_IFACE_TIMEOUT 2 /* seconds */ + +struct i40e_nvm_access { + u32 command; + u32 config; + u32 offset; /* in bytes */ + u32 data_size; /* in bytes */ + u8 data[1]; +}; + +/* PCI bus types */ +enum i40e_bus_type { + i40e_bus_type_unknown = 0, + i40e_bus_type_pci, + i40e_bus_type_pcix, + i40e_bus_type_pci_express, + i40e_bus_type_reserved +}; + +/* PCI bus speeds */ +enum i40e_bus_speed { + i40e_bus_speed_unknown = 0, + i40e_bus_speed_33 = 33, + i40e_bus_speed_66 = 66, + i40e_bus_speed_100 = 100, + i40e_bus_speed_120 = 120, + i40e_bus_speed_133 = 133, + i40e_bus_speed_2500 = 2500, + i40e_bus_speed_5000 = 5000, + i40e_bus_speed_8000 = 8000, + i40e_bus_speed_reserved +}; + +/* PCI bus widths */ +enum i40e_bus_width { + i40e_bus_width_unknown = 0, + i40e_bus_width_pcie_x1 = 1, + i40e_bus_width_pcie_x2 = 2, + i40e_bus_width_pcie_x4 = 4, + i40e_bus_width_pcie_x8 = 8, + i40e_bus_width_32 = 32, + i40e_bus_width_64 = 64, + i40e_bus_width_reserved +}; + +/* Bus parameters */ +struct i40e_bus_info { + enum i40e_bus_speed speed; + enum i40e_bus_width width; + enum i40e_bus_type type; + + u16 func; + u16 device; + u16 lan_id; +}; + +/* Flow control (FC) parameters */ +struct i40e_fc_info { + enum i40e_fc_mode current_mode; /* FC mode in effect */ + enum i40e_fc_mode requested_mode; /* FC mode requested by caller */ +}; + +#define I40E_MAX_TRAFFIC_CLASS 8 +#define I40E_MAX_USER_PRIORITY 8 +#define I40E_DCBX_MAX_APPS 32 +#define 
I40E_LLDPDU_SIZE 1500 +#define I40E_TLV_STATUS_OPER 0x1 +#define I40E_TLV_STATUS_SYNC 0x2 +#define I40E_TLV_STATUS_ERR 0x4 +#define I40E_CEE_OPER_MAX_APPS 3 +#define I40E_APP_PROTOID_FCOE 0x8906 +#define I40E_APP_PROTOID_ISCSI 0x0cbc +#define I40E_APP_PROTOID_FIP 0x8914 +#define I40E_APP_SEL_ETHTYPE 0x1 +#define I40E_APP_SEL_TCPIP 0x2 +#define I40E_CEE_APP_SEL_ETHTYPE 0x0 +#define I40E_CEE_APP_SEL_TCPIP 0x1 + +/* CEE or IEEE 802.1Qaz ETS Configuration data */ +struct i40e_dcb_ets_config { + u8 willing; + u8 cbs; + u8 maxtcs; + u8 prioritytable[I40E_MAX_TRAFFIC_CLASS]; + u8 tcbwtable[I40E_MAX_TRAFFIC_CLASS]; + u8 tsatable[I40E_MAX_TRAFFIC_CLASS]; +}; + +/* CEE or IEEE 802.1Qaz PFC Configuration data */ +struct i40e_dcb_pfc_config { + u8 willing; + u8 mbc; + u8 pfccap; + u8 pfcenable; +}; + +/* CEE or IEEE 802.1Qaz Application Priority data */ +struct i40e_dcb_app_priority_table { + u8 priority; + u8 selector; + u16 protocolid; +}; + +struct i40e_dcbx_config { + u8 dcbx_mode; +#define I40E_DCBX_MODE_CEE 0x1 +#define I40E_DCBX_MODE_IEEE 0x2 + u32 numapps; + u32 tlv_status; /* CEE mode TLV status */ + struct i40e_dcb_ets_config etscfg; + struct i40e_dcb_ets_config etsrec; + struct i40e_dcb_pfc_config pfc; + struct i40e_dcb_app_priority_table app[I40E_DCBX_MAX_APPS]; +}; + +/* Port hardware description */ +struct i40e_hw { + u8 *hw_addr; + void *back; + + /* subsystem structs */ + struct i40e_phy_info phy; + struct i40e_mac_info mac; + struct i40e_bus_info bus; + struct i40e_nvm_info nvm; + struct i40e_fc_info fc; + + /* pci info */ + u16 device_id; + u16 vendor_id; + u16 subsystem_device_id; + u16 subsystem_vendor_id; + u8 revision_id; + u8 port; + bool adapter_stopped; + + /* capabilities for entire device and PCI func */ + struct i40e_hw_capabilities dev_caps; + struct i40e_hw_capabilities func_caps; + + /* Flow Director shared filter space */ + u16 fdir_shared_filter_count; + + /* device profile info */ + u8 pf_id; + u16 main_vsi_seid; + + /* for multi-function MACs 
*/ + u16 partition_id; + u16 num_partitions; + u16 num_ports; + + /* Closest numa node to the device */ + u16 numa_node; + + /* Admin Queue info */ + struct i40e_adminq_info aq; + + /* state of nvm update process */ + enum i40e_nvmupd_state nvmupd_state; + struct i40e_aq_desc nvm_wb_desc; + struct i40e_virt_mem nvm_buff; + + /* HMC info */ + struct i40e_hmc_info hmc; /* HMC info struct */ + + /* LLDP/DCBX Status */ + u16 dcbx_status; + + /* DCBX info */ + struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */ + struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */ + struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */ + + /* debug mask */ + u32 debug_mask; + char err_str[16]; +}; + +static INLINE bool i40e_is_vf(struct i40e_hw *hw) +{ +#ifdef X722_SUPPORT + return (hw->mac.type == I40E_MAC_VF || + hw->mac.type == I40E_MAC_X722_VF); +#else + return hw->mac.type == I40E_MAC_VF; +#endif +} + +struct i40e_driver_version { + u8 major_version; + u8 minor_version; + u8 build_version; + u8 subbuild_version; + u8 driver_string[32]; +}; + +/* RX Descriptors */ +union i40e_16byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + } read; + struct { + struct { + struct { + union { + __le16 mirroring_status; + __le16 fcoe_ctx_id; + } mirr_fcoe; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow director filter id */ + __le32 fcoe_param; /* FCoE DDP Context id */ + } hi_dword; + } qword0; + struct { + /* ext status/error/pktype/length */ + __le64 status_error_len; + } qword1; + } wb; /* writeback */ +}; + +union i40e_32byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + /* bit 0 of hdr_buffer_addr is DD bit */ + __le64 rsvd1; + __le64 rsvd2; + } read; + struct { + struct { + struct { + union { + __le16 mirroring_status; + __le16 fcoe_ctx_id; + } mirr_fcoe; + __le16 l2tag1; + 
} lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fcoe_param; /* FCoE DDP Context id */ + /* Flow director filter id in case of + * Programming status desc WB + */ + __le32 fd_id; + } hi_dword; + } qword0; + struct { + /* status/error/pktype/length */ + __le64 status_error_len; + } qword1; + struct { + __le16 ext_status; /* extended status */ + __le16 rsvd; + __le16 l2tag2_1; + __le16 l2tag2_2; + } qword2; + struct { + union { + __le32 flex_bytes_lo; + __le32 pe_status; + } lo_dword; + union { + __le32 flex_bytes_hi; + __le32 fd_id; + } hi_dword; + } qword3; + } wb; /* writeback */ +}; + +#define I40E_RXD_QW0_MIRROR_STATUS_SHIFT 8 +#define I40E_RXD_QW0_MIRROR_STATUS_MASK (0x3FUL << \ + I40E_RXD_QW0_MIRROR_STATUS_SHIFT) +#define I40E_RXD_QW0_FCOEINDX_SHIFT 0 +#define I40E_RXD_QW0_FCOEINDX_MASK (0xFFFUL << \ + I40E_RXD_QW0_FCOEINDX_SHIFT) + +enum i40e_rx_desc_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_STATUS_DD_SHIFT = 0, + I40E_RX_DESC_STATUS_EOF_SHIFT = 1, + I40E_RX_DESC_STATUS_L2TAG1P_SHIFT = 2, + I40E_RX_DESC_STATUS_L3L4P_SHIFT = 3, + I40E_RX_DESC_STATUS_CRCP_SHIFT = 4, + I40E_RX_DESC_STATUS_TSYNINDX_SHIFT = 5, /* 2 BITS */ + I40E_RX_DESC_STATUS_TSYNVALID_SHIFT = 7, +#ifdef X722_SUPPORT + I40E_RX_DESC_STATUS_EXT_UDP_0_SHIFT = 8, +#else + I40E_RX_DESC_STATUS_RESERVED1_SHIFT = 8, +#endif + + I40E_RX_DESC_STATUS_UMBCAST_SHIFT = 9, /* 2 BITS */ + I40E_RX_DESC_STATUS_FLM_SHIFT = 11, + I40E_RX_DESC_STATUS_FLTSTAT_SHIFT = 12, /* 2 BITS */ + I40E_RX_DESC_STATUS_LPBK_SHIFT = 14, + I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT = 15, + I40E_RX_DESC_STATUS_RESERVED2_SHIFT = 16, /* 2 BITS */ +#ifdef X722_SUPPORT + I40E_RX_DESC_STATUS_INT_UDP_0_SHIFT = 18, +#else + I40E_RX_DESC_STATUS_UDP_0_SHIFT = 18, +#endif + I40E_RX_DESC_STATUS_LAST /* this entry must be last!!! 
*/ +}; + +#define I40E_RXD_QW1_STATUS_SHIFT 0 +#define I40E_RXD_QW1_STATUS_MASK ((BIT(I40E_RX_DESC_STATUS_LAST) - 1) << \ + I40E_RXD_QW1_STATUS_SHIFT) + +#define I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT I40E_RX_DESC_STATUS_TSYNINDX_SHIFT +#define I40E_RXD_QW1_STATUS_TSYNINDX_MASK (0x3UL << \ + I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT) + +#define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT I40E_RX_DESC_STATUS_TSYNVALID_SHIFT +#define I40E_RXD_QW1_STATUS_TSYNVALID_MASK BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT) + +#define I40E_RXD_QW1_STATUS_UMBCAST_SHIFT I40E_RX_DESC_STATUS_UMBCAST_SHIFT +#define I40E_RXD_QW1_STATUS_UMBCAST_MASK (0x3UL << \ + I40E_RXD_QW1_STATUS_UMBCAST_SHIFT) + +enum i40e_rx_desc_fltstat_values { + I40E_RX_DESC_FLTSTAT_NO_DATA = 0, + I40E_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? FD_ID : RSV */ + I40E_RX_DESC_FLTSTAT_RSV = 2, + I40E_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + +#define I40E_RXD_PACKET_TYPE_UNICAST 0 +#define I40E_RXD_PACKET_TYPE_MULTICAST 1 +#define I40E_RXD_PACKET_TYPE_BROADCAST 2 +#define I40E_RXD_PACKET_TYPE_MIRRORED 3 + +#define I40E_RXD_QW1_ERROR_SHIFT 19 +#define I40E_RXD_QW1_ERROR_MASK (0xFFUL << I40E_RXD_QW1_ERROR_SHIFT) + +enum i40e_rx_desc_error_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_ERROR_RXE_SHIFT = 0, + I40E_RX_DESC_ERROR_RECIPE_SHIFT = 1, + I40E_RX_DESC_ERROR_HBO_SHIFT = 2, + I40E_RX_DESC_ERROR_L3L4E_SHIFT = 3, /* 3 BITS */ + I40E_RX_DESC_ERROR_IPE_SHIFT = 3, + I40E_RX_DESC_ERROR_L4E_SHIFT = 4, + I40E_RX_DESC_ERROR_EIPE_SHIFT = 5, + I40E_RX_DESC_ERROR_OVERSIZE_SHIFT = 6, + I40E_RX_DESC_ERROR_PPRS_SHIFT = 7 +}; + +enum i40e_rx_desc_error_l3l4e_fcoe_masks { + I40E_RX_DESC_ERROR_L3L4E_NONE = 0, + I40E_RX_DESC_ERROR_L3L4E_PROT = 1, + I40E_RX_DESC_ERROR_L3L4E_FC = 2, + I40E_RX_DESC_ERROR_L3L4E_DMAC_ERR = 3, + I40E_RX_DESC_ERROR_L3L4E_DMAC_WARN = 4 +}; + +#define I40E_RXD_QW1_PTYPE_SHIFT 30 +#define I40E_RXD_QW1_PTYPE_MASK (0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT) + +/* Packet type non-ip values */ +enum i40e_rx_l2_ptype { + 
I40E_RX_PTYPE_L2_RESERVED = 0, + I40E_RX_PTYPE_L2_MAC_PAY2 = 1, + I40E_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, + I40E_RX_PTYPE_L2_FIP_PAY2 = 3, + I40E_RX_PTYPE_L2_OUI_PAY2 = 4, + I40E_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, + I40E_RX_PTYPE_L2_LLDP_PAY2 = 6, + I40E_RX_PTYPE_L2_ECP_PAY2 = 7, + I40E_RX_PTYPE_L2_EVB_PAY2 = 8, + I40E_RX_PTYPE_L2_QCN_PAY2 = 9, + I40E_RX_PTYPE_L2_EAPOL_PAY2 = 10, + I40E_RX_PTYPE_L2_ARP = 11, + I40E_RX_PTYPE_L2_FCOE_PAY3 = 12, + I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, + I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, + I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, + I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, + I40E_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, + I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, + I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, + I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, + I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, + I40E_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, + I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, + I40E_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, + I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 +}; + +struct i40e_rx_ptype_decoded { + u32 ptype:8; + u32 known:1; + u32 outer_ip:1; + u32 outer_ip_ver:1; + u32 outer_frag:1; + u32 tunnel_type:3; + u32 tunnel_end_prot:2; + u32 tunnel_end_frag:1; + u32 inner_prot:4; + u32 payload_layer:3; +}; + +enum i40e_rx_ptype_outer_ip { + I40E_RX_PTYPE_OUTER_L2 = 0, + I40E_RX_PTYPE_OUTER_IP = 1 +}; + +enum i40e_rx_ptype_outer_ip_ver { + I40E_RX_PTYPE_OUTER_NONE = 0, + I40E_RX_PTYPE_OUTER_IPV4 = 0, + I40E_RX_PTYPE_OUTER_IPV6 = 1 +}; + +enum i40e_rx_ptype_outer_fragmented { + I40E_RX_PTYPE_NOT_FRAG = 0, + I40E_RX_PTYPE_FRAG = 1 +}; + +enum i40e_rx_ptype_tunnel_type { + I40E_RX_PTYPE_TUNNEL_NONE = 0, + I40E_RX_PTYPE_TUNNEL_IP_IP = 1, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT = 2, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, +}; + +enum i40e_rx_ptype_tunnel_end_prot { + I40E_RX_PTYPE_TUNNEL_END_NONE = 0, + I40E_RX_PTYPE_TUNNEL_END_IPV4 = 1, + I40E_RX_PTYPE_TUNNEL_END_IPV6 = 2, +}; + +enum i40e_rx_ptype_inner_prot { + 
I40E_RX_PTYPE_INNER_PROT_NONE = 0, + I40E_RX_PTYPE_INNER_PROT_UDP = 1, + I40E_RX_PTYPE_INNER_PROT_TCP = 2, + I40E_RX_PTYPE_INNER_PROT_SCTP = 3, + I40E_RX_PTYPE_INNER_PROT_ICMP = 4, + I40E_RX_PTYPE_INNER_PROT_TIMESYNC = 5 +}; + +enum i40e_rx_ptype_payload_layer { + I40E_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, +}; + +#define I40E_RX_PTYPE_BIT_MASK 0x0FFFFFFF +#define I40E_RX_PTYPE_SHIFT 56 + +#define I40E_RXD_QW1_LENGTH_PBUF_SHIFT 38 +#define I40E_RXD_QW1_LENGTH_PBUF_MASK (0x3FFFULL << \ + I40E_RXD_QW1_LENGTH_PBUF_SHIFT) + +#define I40E_RXD_QW1_LENGTH_HBUF_SHIFT 52 +#define I40E_RXD_QW1_LENGTH_HBUF_MASK (0x7FFULL << \ + I40E_RXD_QW1_LENGTH_HBUF_SHIFT) + +#define I40E_RXD_QW1_LENGTH_SPH_SHIFT 63 +#define I40E_RXD_QW1_LENGTH_SPH_MASK BIT_ULL(I40E_RXD_QW1_LENGTH_SPH_SHIFT) + +#define I40E_RXD_QW1_NEXTP_SHIFT 38 +#define I40E_RXD_QW1_NEXTP_MASK (0x1FFFULL << I40E_RXD_QW1_NEXTP_SHIFT) + +#define I40E_RXD_QW2_EXT_STATUS_SHIFT 0 +#define I40E_RXD_QW2_EXT_STATUS_MASK (0xFFFFFUL << \ + I40E_RXD_QW2_EXT_STATUS_SHIFT) + +enum i40e_rx_desc_ext_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT = 0, + I40E_RX_DESC_EXT_STATUS_L2TAG3P_SHIFT = 1, + I40E_RX_DESC_EXT_STATUS_FLEXBL_SHIFT = 2, /* 2 BITS */ + I40E_RX_DESC_EXT_STATUS_FLEXBH_SHIFT = 4, /* 2 BITS */ + I40E_RX_DESC_EXT_STATUS_FDLONGB_SHIFT = 9, + I40E_RX_DESC_EXT_STATUS_FCOELONGB_SHIFT = 10, + I40E_RX_DESC_EXT_STATUS_PELONGB_SHIFT = 11, +}; + +#define I40E_RXD_QW2_L2TAG2_SHIFT 0 +#define I40E_RXD_QW2_L2TAG2_MASK (0xFFFFUL << I40E_RXD_QW2_L2TAG2_SHIFT) + +#define I40E_RXD_QW2_L2TAG3_SHIFT 16 +#define I40E_RXD_QW2_L2TAG3_MASK (0xFFFFUL << I40E_RXD_QW2_L2TAG3_SHIFT) + +enum i40e_rx_desc_pe_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_PE_STATUS_QPID_SHIFT = 0, /* 18 BITS */ + I40E_RX_DESC_PE_STATUS_L4PORT_SHIFT = 0, /* 16 BITS */ + 
I40E_RX_DESC_PE_STATUS_IPINDEX_SHIFT = 16, /* 8 BITS */ + I40E_RX_DESC_PE_STATUS_QPIDHIT_SHIFT = 24, + I40E_RX_DESC_PE_STATUS_APBVTHIT_SHIFT = 25, + I40E_RX_DESC_PE_STATUS_PORTV_SHIFT = 26, + I40E_RX_DESC_PE_STATUS_URG_SHIFT = 27, + I40E_RX_DESC_PE_STATUS_IPFRAG_SHIFT = 28, + I40E_RX_DESC_PE_STATUS_IPOPT_SHIFT = 29 +}; + +#define I40E_RX_PROG_STATUS_DESC_LENGTH_SHIFT 38 +#define I40E_RX_PROG_STATUS_DESC_LENGTH 0x2000000 + +#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT 2 +#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK (0x7UL << \ + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT) + +#define I40E_RX_PROG_STATUS_DESC_QW1_STATUS_SHIFT 0 +#define I40E_RX_PROG_STATUS_DESC_QW1_STATUS_MASK (0x7FFFUL << \ + I40E_RX_PROG_STATUS_DESC_QW1_STATUS_SHIFT) + +#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT 19 +#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK (0x3FUL << \ + I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT) + +enum i40e_rx_prog_status_desc_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_PROG_STATUS_DESC_DD_SHIFT = 0, + I40E_RX_PROG_STATUS_DESC_PROG_ID_SHIFT = 2 /* 3 BITS */ +}; + +enum i40e_rx_prog_status_desc_prog_id_masks { + I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS = 1, + I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS = 2, + I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS = 4, +}; + +enum i40e_rx_prog_status_desc_error_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT = 0, + I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT = 1, + I40E_RX_PROG_STATUS_DESC_FCOE_TBL_FULL_SHIFT = 2, + I40E_RX_PROG_STATUS_DESC_FCOE_CONFLICT_SHIFT = 3 +}; + +#define I40E_TWO_BIT_MASK 0x3 +#define I40E_THREE_BIT_MASK 0x7 +#define I40E_FOUR_BIT_MASK 0xF +#define I40E_EIGHTEEN_BIT_MASK 0x3FFFF + +/* TX Descriptor */ +struct i40e_tx_desc { + __le64 buffer_addr; /* Address of descriptor's data buf */ + __le64 cmd_type_offset_bsz; +}; + +#define I40E_TXD_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_QW1_DTYPE_MASK (0xFUL << 
I40E_TXD_QW1_DTYPE_SHIFT) + +enum i40e_tx_desc_dtype_value { + I40E_TX_DESC_DTYPE_DATA = 0x0, + I40E_TX_DESC_DTYPE_NOP = 0x1, /* same as Context desc */ + I40E_TX_DESC_DTYPE_CONTEXT = 0x1, + I40E_TX_DESC_DTYPE_FCOE_CTX = 0x2, + I40E_TX_DESC_DTYPE_FILTER_PROG = 0x8, + I40E_TX_DESC_DTYPE_DDP_CTX = 0x9, + I40E_TX_DESC_DTYPE_FLEX_DATA = 0xB, + I40E_TX_DESC_DTYPE_FLEX_CTX_1 = 0xC, + I40E_TX_DESC_DTYPE_FLEX_CTX_2 = 0xD, + I40E_TX_DESC_DTYPE_DESC_DONE = 0xF +}; + +#define I40E_TXD_QW1_CMD_SHIFT 4 +#define I40E_TXD_QW1_CMD_MASK (0x3FFUL << I40E_TXD_QW1_CMD_SHIFT) + +enum i40e_tx_desc_cmd_bits { + I40E_TX_DESC_CMD_EOP = 0x0001, + I40E_TX_DESC_CMD_RS = 0x0002, + I40E_TX_DESC_CMD_ICRC = 0x0004, + I40E_TX_DESC_CMD_IL2TAG1 = 0x0008, + I40E_TX_DESC_CMD_DUMMY = 0x0010, + I40E_TX_DESC_CMD_IIPT_NONIP = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_IIPT_IPV6 = 0x0020, /* 2 BITS */ + I40E_TX_DESC_CMD_IIPT_IPV4 = 0x0040, /* 2 BITS */ + I40E_TX_DESC_CMD_IIPT_IPV4_CSUM = 0x0060, /* 2 BITS */ + I40E_TX_DESC_CMD_FCOET = 0x0080, + I40E_TX_DESC_CMD_L4T_EOFT_UNK = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_TCP = 0x0100, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_SCTP = 0x0200, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_UDP = 0x0300, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_N = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_T = 0x0100, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_NI = 0x0200, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_A = 0x0300, /* 2 BITS */ +}; + +#define I40E_TXD_QW1_OFFSET_SHIFT 16 +#define I40E_TXD_QW1_OFFSET_MASK (0x3FFFFULL << \ + I40E_TXD_QW1_OFFSET_SHIFT) + +enum i40e_tx_desc_length_fields { + /* Note: These are predefined bit offsets */ + I40E_TX_DESC_LENGTH_MACLEN_SHIFT = 0, /* 7 BITS */ + I40E_TX_DESC_LENGTH_IPLEN_SHIFT = 7, /* 7 BITS */ + I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT = 14 /* 4 BITS */ +}; + +#define I40E_TXD_QW1_MACLEN_MASK (0x7FUL << I40E_TX_DESC_LENGTH_MACLEN_SHIFT) +#define I40E_TXD_QW1_IPLEN_MASK (0x7FUL << I40E_TX_DESC_LENGTH_IPLEN_SHIFT) 
+#define I40E_TXD_QW1_L4LEN_MASK (0xFUL << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT) +#define I40E_TXD_QW1_FCLEN_MASK (0xFUL << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT) + +#define I40E_TXD_QW1_TX_BUF_SZ_SHIFT 34 +#define I40E_TXD_QW1_TX_BUF_SZ_MASK (0x3FFFULL << \ + I40E_TXD_QW1_TX_BUF_SZ_SHIFT) + +#define I40E_TXD_QW1_L2TAG1_SHIFT 48 +#define I40E_TXD_QW1_L2TAG1_MASK (0xFFFFULL << I40E_TXD_QW1_L2TAG1_SHIFT) + +/* Context descriptors */ +struct i40e_tx_context_desc { + __le32 tunneling_params; + __le16 l2tag2; + __le16 rsvd; + __le64 type_cmd_tso_mss; +}; + +#define I40E_TXD_CTX_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_CTX_QW1_DTYPE_MASK (0xFUL << I40E_TXD_CTX_QW1_DTYPE_SHIFT) + +#define I40E_TXD_CTX_QW1_CMD_SHIFT 4 +#define I40E_TXD_CTX_QW1_CMD_MASK (0xFFFFUL << I40E_TXD_CTX_QW1_CMD_SHIFT) + +enum i40e_tx_ctx_desc_cmd_bits { + I40E_TX_CTX_DESC_TSO = 0x01, + I40E_TX_CTX_DESC_TSYN = 0x02, + I40E_TX_CTX_DESC_IL2TAG2 = 0x04, + I40E_TX_CTX_DESC_IL2TAG2_IL2H = 0x08, + I40E_TX_CTX_DESC_SWTCH_NOTAG = 0x00, + I40E_TX_CTX_DESC_SWTCH_UPLINK = 0x10, + I40E_TX_CTX_DESC_SWTCH_LOCAL = 0x20, + I40E_TX_CTX_DESC_SWTCH_VSI = 0x30, + I40E_TX_CTX_DESC_SWPE = 0x40 +}; + +#define I40E_TXD_CTX_QW1_TSO_LEN_SHIFT 30 +#define I40E_TXD_CTX_QW1_TSO_LEN_MASK (0x3FFFFULL << \ + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) + +#define I40E_TXD_CTX_QW1_MSS_SHIFT 50 +#define I40E_TXD_CTX_QW1_MSS_MASK (0x3FFFULL << \ + I40E_TXD_CTX_QW1_MSS_SHIFT) + +#define I40E_TXD_CTX_QW1_VSI_SHIFT 50 +#define I40E_TXD_CTX_QW1_VSI_MASK (0x1FFULL << I40E_TXD_CTX_QW1_VSI_SHIFT) + +#define I40E_TXD_CTX_QW0_EXT_IP_SHIFT 0 +#define I40E_TXD_CTX_QW0_EXT_IP_MASK (0x3ULL << \ + I40E_TXD_CTX_QW0_EXT_IP_SHIFT) + +enum i40e_tx_ctx_desc_eipt_offload { + I40E_TX_CTX_EXT_IP_NONE = 0x0, + I40E_TX_CTX_EXT_IP_IPV6 = 0x1, + I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM = 0x2, + I40E_TX_CTX_EXT_IP_IPV4 = 0x3 +}; + +#define I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT 2 +#define I40E_TXD_CTX_QW0_EXT_IPLEN_MASK (0x3FULL << \ + I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT) + +#define 
I40E_TXD_CTX_QW0_NATT_SHIFT 9 +#define I40E_TXD_CTX_QW0_NATT_MASK (0x3ULL << I40E_TXD_CTX_QW0_NATT_SHIFT) + +#define I40E_TXD_CTX_UDP_TUNNELING BIT_ULL(I40E_TXD_CTX_QW0_NATT_SHIFT) +#define I40E_TXD_CTX_GRE_TUNNELING (0x2ULL << I40E_TXD_CTX_QW0_NATT_SHIFT) + +#define I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT 11 +#define I40E_TXD_CTX_QW0_EIP_NOINC_MASK BIT_ULL(I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT) + +#define I40E_TXD_CTX_EIP_NOINC_IPID_CONST I40E_TXD_CTX_QW0_EIP_NOINC_MASK + +#define I40E_TXD_CTX_QW0_NATLEN_SHIFT 12 +#define I40E_TXD_CTX_QW0_NATLEN_MASK (0X7FULL << \ + I40E_TXD_CTX_QW0_NATLEN_SHIFT) + +#define I40E_TXD_CTX_QW0_DECTTL_SHIFT 19 +#define I40E_TXD_CTX_QW0_DECTTL_MASK (0xFULL << \ + I40E_TXD_CTX_QW0_DECTTL_SHIFT) + +#ifdef X722_SUPPORT +#define I40E_TXD_CTX_QW0_L4T_CS_SHIFT 23 +#define I40E_TXD_CTX_QW0_L4T_CS_MASK BIT_ULL(I40E_TXD_CTX_QW0_L4T_CS_SHIFT) +#endif +struct i40e_nop_desc { + __le64 rsvd; + __le64 dtype_cmd; +}; + +#define I40E_TXD_NOP_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_NOP_QW1_DTYPE_MASK (0xFUL << I40E_TXD_NOP_QW1_DTYPE_SHIFT) + +#define I40E_TXD_NOP_QW1_CMD_SHIFT 4 +#define I40E_TXD_NOP_QW1_CMD_MASK (0x7FUL << I40E_TXD_NOP_QW1_CMD_SHIFT) + +enum i40e_tx_nop_desc_cmd_bits { + /* Note: These are predefined bit offsets */ + I40E_TX_NOP_DESC_EOP_SHIFT = 0, + I40E_TX_NOP_DESC_RS_SHIFT = 1, + I40E_TX_NOP_DESC_RSV_SHIFT = 2 /* 5 bits */ +}; + +struct i40e_filter_program_desc { + __le32 qindex_flex_ptype_vsi; + __le32 rsvd; + __le32 dtype_cmd_cntindex; + __le32 fd_id; +}; +#define I40E_TXD_FLTR_QW0_QINDEX_SHIFT 0 +#define I40E_TXD_FLTR_QW0_QINDEX_MASK (0x7FFUL << \ + I40E_TXD_FLTR_QW0_QINDEX_SHIFT) +#define I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT 11 +#define I40E_TXD_FLTR_QW0_FLEXOFF_MASK (0x7UL << \ + I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT) +#define I40E_TXD_FLTR_QW0_PCTYPE_SHIFT 17 +#define I40E_TXD_FLTR_QW0_PCTYPE_MASK (0x3FUL << \ + I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) + +/* Packet Classifier Types for filters */ +enum i40e_filter_pctype { +#ifdef X722_SUPPORT + /* Note: Values 
0-28 are reserved for future use. + * Value 29, 30, 32 are not supported on XL710 and X710. + */ + I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP = 29, + I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP = 30, +#else + /* Note: Values 0-30 are reserved for future use */ +#endif + I40E_FILTER_PCTYPE_NONF_IPV4_UDP = 31, +#ifdef X722_SUPPORT + I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK = 32, +#else + /* Note: Value 32 is reserved for future use */ +#endif + I40E_FILTER_PCTYPE_NONF_IPV4_TCP = 33, + I40E_FILTER_PCTYPE_NONF_IPV4_SCTP = 34, + I40E_FILTER_PCTYPE_NONF_IPV4_OTHER = 35, + I40E_FILTER_PCTYPE_FRAG_IPV4 = 36, +#ifdef X722_SUPPORT + /* Note: Values 37-38 are reserved for future use. + * Value 39, 40, 42 are not supported on XL710 and X710. + */ + I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP = 39, + I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP = 40, +#else + /* Note: Values 37-40 are reserved for future use */ +#endif + I40E_FILTER_PCTYPE_NONF_IPV6_UDP = 41, +#ifdef X722_SUPPORT + I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK = 42, +#endif + I40E_FILTER_PCTYPE_NONF_IPV6_TCP = 43, + I40E_FILTER_PCTYPE_NONF_IPV6_SCTP = 44, + I40E_FILTER_PCTYPE_NONF_IPV6_OTHER = 45, + I40E_FILTER_PCTYPE_FRAG_IPV6 = 46, + /* Note: Value 47 is reserved for future use */ + I40E_FILTER_PCTYPE_FCOE_OX = 48, + I40E_FILTER_PCTYPE_FCOE_RX = 49, + I40E_FILTER_PCTYPE_FCOE_OTHER = 50, + /* Note: Values 51-62 are reserved for future use */ + I40E_FILTER_PCTYPE_L2_PAYLOAD = 63, +}; + +enum i40e_filter_program_desc_dest { + I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET = 0x0, + I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX = 0x1, + I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_OTHER = 0x2, +}; + +enum i40e_filter_program_desc_fd_status { + I40E_FILTER_PROGRAM_DESC_FD_STATUS_NONE = 0x0, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID = 0x1, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID_4FLEX_BYTES = 0x2, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_8FLEX_BYTES = 0x3, +}; + +#define I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT 23 +#define 
I40E_TXD_FLTR_QW0_DEST_VSI_MASK BIT_ULL(I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT) + +#define I40E_TXD_FLTR_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_FLTR_QW1_DTYPE_MASK (0xFUL << I40E_TXD_FLTR_QW1_DTYPE_SHIFT) + +#define I40E_TXD_FLTR_QW1_CMD_SHIFT 4 +#define I40E_TXD_FLTR_QW1_CMD_MASK (0xFFFFULL << \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) + +#define I40E_TXD_FLTR_QW1_PCMD_SHIFT (0x0ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_PCMD_MASK (0x7ULL << I40E_TXD_FLTR_QW1_PCMD_SHIFT) + +enum i40e_filter_program_desc_pcmd { + I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE = 0x1, + I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE = 0x2, +}; + +#define I40E_TXD_FLTR_QW1_DEST_SHIFT (0x3ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_DEST_MASK (0x3ULL << I40E_TXD_FLTR_QW1_DEST_SHIFT) + +#define I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT (0x7ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_CNT_ENA_MASK BIT_ULL(I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT) + +#define I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT (0x9ULL + \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_FD_STATUS_MASK (0x3ULL << \ + I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT) +#ifdef X722_SUPPORT + +#define I40E_TXD_FLTR_QW1_ATR_SHIFT (0xEULL + \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_ATR_MASK BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT) +#endif + +#define I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT 20 +#define I40E_TXD_FLTR_QW1_CNTINDEX_MASK (0x1FFUL << \ + I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) + +enum i40e_filter_type { + I40E_FLOW_DIRECTOR_FLTR = 0, + I40E_PE_QUAD_HASH_FLTR = 1, + I40E_ETHERTYPE_FLTR, + I40E_FCOE_CTX_FLTR, + I40E_MAC_VLAN_FLTR, + I40E_HASH_FLTR +}; + +struct i40e_vsi_context { + u16 seid; + u16 uplink_seid; + u16 vsi_number; + u16 vsis_allocated; + u16 vsis_unallocated; + u16 flags; + u8 pf_num; + u8 vf_num; + u8 connection_type; + struct i40e_aqc_vsi_properties_data info; +}; + +struct i40e_veb_context { + u16 seid; + u16 uplink_seid; + u16 veb_number; + u16 vebs_allocated; + u16 vebs_unallocated; + u16 flags; + struct 
i40e_aqc_get_veb_parameters_completion info; +}; + +/* Statistics collected by each port, VSI, VEB, and S-channel */ +struct i40e_eth_stats { + u64 rx_bytes; /* gorc */ + u64 rx_unicast; /* uprc */ + u64 rx_multicast; /* mprc */ + u64 rx_broadcast; /* bprc */ + u64 rx_discards; /* rdpc */ + u64 rx_unknown_protocol; /* rupp */ + u64 tx_bytes; /* gotc */ + u64 tx_unicast; /* uptc */ + u64 tx_multicast; /* mptc */ + u64 tx_broadcast; /* bptc */ + u64 tx_discards; /* tdpc */ + u64 tx_errors; /* tepc */ +}; + +/* Statistics collected per VEB per TC */ +struct i40e_veb_tc_stats { + u64 tc_rx_packets[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_rx_bytes[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_tx_packets[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_tx_bytes[I40E_MAX_TRAFFIC_CLASS]; +}; + +/* Statistics collected by the MAC */ +struct i40e_hw_port_stats { + /* eth stats collected by the port */ + struct i40e_eth_stats eth; + + /* additional port specific stats */ + u64 tx_dropped_link_down; /* tdold */ + u64 crc_errors; /* crcerrs */ + u64 illegal_bytes; /* illerrc */ + u64 error_bytes; /* errbc */ + u64 mac_local_faults; /* mlfc */ + u64 mac_remote_faults; /* mrfc */ + u64 rx_length_errors; /* rlec */ + u64 link_xon_rx; /* lxonrxc */ + u64 link_xoff_rx; /* lxoffrxc */ + u64 priority_xon_rx[8]; /* pxonrxc[8] */ + u64 priority_xoff_rx[8]; /* pxoffrxc[8] */ + u64 link_xon_tx; /* lxontxc */ + u64 link_xoff_tx; /* lxofftxc */ + u64 priority_xon_tx[8]; /* pxontxc[8] */ + u64 priority_xoff_tx[8]; /* pxofftxc[8] */ + u64 priority_xon_2_xoff[8]; /* rxon2offcnt[8] */ + u64 rx_size_64; /* prc64 */ + u64 rx_size_127; /* prc127 */ + u64 rx_size_255; /* prc255 */ + u64 rx_size_511; /* prc511 */ + u64 rx_size_1023; /* prc1023 */ + u64 rx_size_1522; /* prc1522 */ + u64 rx_size_big; /* prc9522 */ + u64 rx_undersize; /* ruc */ + u64 rx_fragments; /* rfc */ + u64 rx_oversize; /* roc */ + u64 rx_jabber; /* rjc */ + u64 tx_size_64; /* ptc64 */ + u64 tx_size_127; /* ptc127 */ + u64 tx_size_255; /* ptc255 */ + u64 
tx_size_511; /* ptc511 */ + u64 tx_size_1023; /* ptc1023 */ + u64 tx_size_1522; /* ptc1522 */ + u64 tx_size_big; /* ptc9522 */ + u64 mac_short_packet_dropped; /* mspdc */ + u64 checksum_error; /* xec */ + /* flow director stats */ + u64 fd_atr_match; + u64 fd_sb_match; + u64 fd_atr_tunnel_match; + u32 fd_atr_status; + u32 fd_sb_status; + /* EEE LPI */ + u32 tx_lpi_status; + u32 rx_lpi_status; + u64 tx_lpi_count; /* etlpic */ + u64 rx_lpi_count; /* erlpic */ +}; + +/* Checksum and Shadow RAM pointers */ +#define I40E_SR_NVM_CONTROL_WORD 0x00 +#define I40E_SR_PCIE_ANALOG_CONFIG_PTR 0x03 +#define I40E_SR_PHY_ANALOG_CONFIG_PTR 0x04 +#define I40E_SR_OPTION_ROM_PTR 0x05 +#define I40E_SR_RO_PCIR_REGS_AUTO_LOAD_PTR 0x06 +#define I40E_SR_AUTO_GENERATED_POINTERS_PTR 0x07 +#define I40E_SR_PCIR_REGS_AUTO_LOAD_PTR 0x08 +#define I40E_SR_EMP_GLOBAL_MODULE_PTR 0x09 +#define I40E_SR_RO_PCIE_LCB_PTR 0x0A +#define I40E_SR_EMP_IMAGE_PTR 0x0B +#define I40E_SR_PE_IMAGE_PTR 0x0C +#define I40E_SR_CSR_PROTECTED_LIST_PTR 0x0D +#define I40E_SR_MNG_CONFIG_PTR 0x0E +#define I40E_SR_EMP_MODULE_PTR 0x0F +#define I40E_SR_PBA_FLAGS 0x15 +#define I40E_SR_PBA_BLOCK_PTR 0x16 +#define I40E_SR_BOOT_CONFIG_PTR 0x17 +#define I40E_NVM_OEM_VER_OFF 0x83 +#define I40E_SR_NVM_DEV_STARTER_VERSION 0x18 +#define I40E_SR_NVM_WAKE_ON_LAN 0x19 +#define I40E_SR_ALTERNATE_SAN_MAC_ADDRESS_PTR 0x27 +#define I40E_SR_PERMANENT_SAN_MAC_ADDRESS_PTR 0x28 +#define I40E_SR_NVM_MAP_VERSION 0x29 +#define I40E_SR_NVM_IMAGE_VERSION 0x2A +#define I40E_SR_NVM_STRUCTURE_VERSION 0x2B +#define I40E_SR_NVM_EETRACK_LO 0x2D +#define I40E_SR_NVM_EETRACK_HI 0x2E +#define I40E_SR_VPD_PTR 0x2F +#define I40E_SR_PXE_SETUP_PTR 0x30 +#define I40E_SR_PXE_CONFIG_CUST_OPTIONS_PTR 0x31 +#define I40E_SR_NVM_ORIGINAL_EETRACK_LO 0x34 +#define I40E_SR_NVM_ORIGINAL_EETRACK_HI 0x35 +#define I40E_SR_SW_ETHERNET_MAC_ADDRESS_PTR 0x37 +#define I40E_SR_POR_REGS_AUTO_LOAD_PTR 0x38 +#define I40E_SR_EMPR_REGS_AUTO_LOAD_PTR 0x3A +#define 
I40E_SR_GLOBR_REGS_AUTO_LOAD_PTR 0x3B +#define I40E_SR_CORER_REGS_AUTO_LOAD_PTR 0x3C +#define I40E_SR_PCIE_ALT_AUTO_LOAD_PTR 0x3E +#define I40E_SR_SW_CHECKSUM_WORD 0x3F +#define I40E_SR_1ST_FREE_PROVISION_AREA_PTR 0x40 +#define I40E_SR_4TH_FREE_PROVISION_AREA_PTR 0x42 +#define I40E_SR_3RD_FREE_PROVISION_AREA_PTR 0x44 +#define I40E_SR_2ND_FREE_PROVISION_AREA_PTR 0x46 +#define I40E_SR_EMP_SR_SETTINGS_PTR 0x48 +#define I40E_SR_FEATURE_CONFIGURATION_PTR 0x49 +#define I40E_SR_CONFIGURATION_METADATA_PTR 0x4D +#define I40E_SR_IMMEDIATE_VALUES_PTR 0x4E + +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define I40E_SR_VPD_MODULE_MAX_SIZE 1024 +#define I40E_SR_PCIE_ALT_MODULE_MAX_SIZE 1024 +#define I40E_SR_CONTROL_WORD_1_SHIFT 0x06 +#define I40E_SR_CONTROL_WORD_1_MASK (0x03 << I40E_SR_CONTROL_WORD_1_SHIFT) + +/* Shadow RAM related */ +#define I40E_SR_SECTOR_SIZE_IN_WORDS 0x800 +#define I40E_SR_BUF_ALIGNMENT 4096 +#define I40E_SR_WORDS_IN_1KB 512 +/* Checksum should be calculated such that after adding all the words, + * including the checksum word itself, the sum should be 0xBABA. 
+ */ +#define I40E_SR_SW_CHECKSUM_BASE 0xBABA + +#define I40E_SRRD_SRCTL_ATTEMPTS 100000 + +enum i40e_switch_element_types { + I40E_SWITCH_ELEMENT_TYPE_MAC = 1, + I40E_SWITCH_ELEMENT_TYPE_PF = 2, + I40E_SWITCH_ELEMENT_TYPE_VF = 3, + I40E_SWITCH_ELEMENT_TYPE_EMP = 4, + I40E_SWITCH_ELEMENT_TYPE_BMC = 6, + I40E_SWITCH_ELEMENT_TYPE_PE = 16, + I40E_SWITCH_ELEMENT_TYPE_VEB = 17, + I40E_SWITCH_ELEMENT_TYPE_PA = 18, + I40E_SWITCH_ELEMENT_TYPE_VSI = 19, +}; + +/* Supported EtherType filters */ +enum i40e_ether_type_index { + I40E_ETHER_TYPE_1588 = 0, + I40E_ETHER_TYPE_FIP = 1, + I40E_ETHER_TYPE_OUI_EXTENDED = 2, + I40E_ETHER_TYPE_MAC_CONTROL = 3, + I40E_ETHER_TYPE_LLDP = 4, + I40E_ETHER_TYPE_EVB_PROTOCOL1 = 5, + I40E_ETHER_TYPE_EVB_PROTOCOL2 = 6, + I40E_ETHER_TYPE_QCN_CNM = 7, + I40E_ETHER_TYPE_8021X = 8, + I40E_ETHER_TYPE_ARP = 9, + I40E_ETHER_TYPE_RSV1 = 10, + I40E_ETHER_TYPE_RSV2 = 11, +}; + +/* Filter context base size is 1K */ +#define I40E_HASH_FILTER_BASE_SIZE 1024 +/* Supported Hash filter values */ +enum i40e_hash_filter_size { + I40E_HASH_FILTER_SIZE_1K = 0, + I40E_HASH_FILTER_SIZE_2K = 1, + I40E_HASH_FILTER_SIZE_4K = 2, + I40E_HASH_FILTER_SIZE_8K = 3, + I40E_HASH_FILTER_SIZE_16K = 4, + I40E_HASH_FILTER_SIZE_32K = 5, + I40E_HASH_FILTER_SIZE_64K = 6, + I40E_HASH_FILTER_SIZE_128K = 7, + I40E_HASH_FILTER_SIZE_256K = 8, + I40E_HASH_FILTER_SIZE_512K = 9, + I40E_HASH_FILTER_SIZE_1M = 10, +}; + +/* DMA context base size is 0.5K */ +#define I40E_DMA_CNTX_BASE_SIZE 512 +/* Supported DMA context values */ +enum i40e_dma_cntx_size { + I40E_DMA_CNTX_SIZE_512 = 0, + I40E_DMA_CNTX_SIZE_1K = 1, + I40E_DMA_CNTX_SIZE_2K = 2, + I40E_DMA_CNTX_SIZE_4K = 3, + I40E_DMA_CNTX_SIZE_8K = 4, + I40E_DMA_CNTX_SIZE_16K = 5, + I40E_DMA_CNTX_SIZE_32K = 6, + I40E_DMA_CNTX_SIZE_64K = 7, + I40E_DMA_CNTX_SIZE_128K = 8, + I40E_DMA_CNTX_SIZE_256K = 9, +}; + +/* Supported Hash look up table (LUT) sizes */ +enum i40e_hash_lut_size { + I40E_HASH_LUT_SIZE_128 = 0, + I40E_HASH_LUT_SIZE_512 = 1, +}; + +/* 
Structure to hold a per PF filter control settings */ +struct i40e_filter_control_settings { + /* number of PE Quad Hash filter buckets */ + enum i40e_hash_filter_size pe_filt_num; + /* number of PE Quad Hash contexts */ + enum i40e_dma_cntx_size pe_cntx_num; + /* number of FCoE filter buckets */ + enum i40e_hash_filter_size fcoe_filt_num; + /* number of FCoE DDP contexts */ + enum i40e_dma_cntx_size fcoe_cntx_num; + /* size of the Hash LUT */ + enum i40e_hash_lut_size hash_lut_size; + /* enable FDIR filters for PF and its VFs */ + bool enable_fdir; + /* enable Ethertype filters for PF and its VFs */ + bool enable_ethtype; + /* enable MAC/VLAN filters for PF and its VFs */ + bool enable_macvlan; +}; + +/* Structure to hold device level control filter counts */ +struct i40e_control_filter_stats { + u16 mac_etype_used; /* Used perfect match MAC/EtherType filters */ + u16 etype_used; /* Used perfect EtherType filters */ + u16 mac_etype_free; /* Un-used perfect match MAC/EtherType filters */ + u16 etype_free; /* Un-used perfect EtherType filters */ +}; + +enum i40e_reset_type { + I40E_RESET_POR = 0, + I40E_RESET_CORER = 1, + I40E_RESET_GLOBR = 2, + I40E_RESET_EMPR = 3, +}; + +/* IEEE 802.1AB LLDP Agent Variables from NVM */ +#define I40E_NVM_LLDP_CFG_PTR 0xD +struct i40e_lldp_variables { + u16 length; + u16 adminstatus; + u16 msgfasttx; + u16 msgtxinterval; + u16 txparams; + u16 timers; + u16 crc8; +}; + +/* Offsets into Alternate Ram */ +#define I40E_ALT_STRUCT_FIRST_PF_OFFSET 0 /* in dwords */ +#define I40E_ALT_STRUCT_DWORDS_PER_PF 64 /* in dwords */ +#define I40E_ALT_STRUCT_OUTER_VLAN_TAG_OFFSET 0xD /* in dwords */ +#define I40E_ALT_STRUCT_USER_PRIORITY_OFFSET 0xC /* in dwords */ +#define I40E_ALT_STRUCT_MIN_BW_OFFSET 0xE /* in dwords */ +#define I40E_ALT_STRUCT_MAX_BW_OFFSET 0xF /* in dwords */ + +/* Alternate Ram Bandwidth Masks */ +#define I40E_ALT_BW_VALUE_MASK 0xFF +#define I40E_ALT_BW_RELATIVE_MASK 0x40000000 +#define I40E_ALT_BW_VALID_MASK 0x80000000 + +/* 
RSS Hash Table Size */ +#define I40E_PFQF_CTL_0_HASHLUTSIZE_512 0x00010000 + +/* PBA length (and one with additional zero-padding byte), see Table 6-2. */ +#define I40E_PBANUM_LENGTH 12 +#define I40E_PBANUM_STRLEN 13 + +#endif /* _I40E_TYPE_H_ */ diff --git a/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h b/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h new file mode 100644 index 0000000000..17b090f454 --- /dev/null +++ b/usr/src/uts/common/io/i40e/core/i40e_virtchnl.h @@ -0,0 +1,378 @@ +/****************************************************************************** + + Copyright (c) 2013-2015, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ +/*$FreeBSD: head/sys/dev/ixl/i40e_virtchnl.h 284049 2015-06-05 22:52:42Z jfv $*/ + +#ifndef _I40E_VIRTCHNL_H_ +#define _I40E_VIRTCHNL_H_ + +#include "i40e_type.h" + +/* Description: + * This header file describes the VF-PF communication protocol used + * by the various i40e drivers. + * + * Admin queue buffer usage: + * desc->opcode is always i40e_aqc_opc_send_msg_to_pf + * flags, retval, datalen, and data addr are all used normally. + * Firmware copies the cookie fields when sending messages between the PF and + * VF, but uses all other fields internally. Due to this limitation, we + * must send all messages as "indirect", i.e. using an external buffer. + * + * All the vsi indexes are relative to the VF. Each VF can have maximum of + * three VSIs. All the queue indexes are relative to the VSI. Each VF can + * have a maximum of sixteen queues for all of its VSIs. + * + * The PF is required to return a status code in v_retval for all messages + * except RESET_VF, which does not require any response. The return value is of + * i40e_status_code type, defined in the i40e_type.h. + * + * In general, VF driver initialization should roughly follow the order of these + * opcodes. The VF driver must first validate the API version of the PF driver, + * then request a reset, then get resources, then configure queues and + * interrupts. 
After these operations are complete, the VF driver may start + * its queues, optionally add MAC and VLAN filters, and process traffic. + */ + +/* Opcodes for VF-PF communication. These are placed in the v_opcode field + * of the virtchnl_msg structure. + */ +enum i40e_virtchnl_ops { +/* The PF sends status change events to VFs using + * the I40E_VIRTCHNL_OP_EVENT opcode. + * VFs send requests to the PF using the other ops. + */ + I40E_VIRTCHNL_OP_UNKNOWN = 0, + I40E_VIRTCHNL_OP_VERSION = 1, /* must ALWAYS be 1 */ + I40E_VIRTCHNL_OP_RESET_VF = 2, + I40E_VIRTCHNL_OP_GET_VF_RESOURCES = 3, + I40E_VIRTCHNL_OP_CONFIG_TX_QUEUE = 4, + I40E_VIRTCHNL_OP_CONFIG_RX_QUEUE = 5, + I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES = 6, + I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, + I40E_VIRTCHNL_OP_ENABLE_QUEUES = 8, + I40E_VIRTCHNL_OP_DISABLE_QUEUES = 9, + I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS = 10, + I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS = 11, + I40E_VIRTCHNL_OP_ADD_VLAN = 12, + I40E_VIRTCHNL_OP_DEL_VLAN = 13, + I40E_VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, + I40E_VIRTCHNL_OP_GET_STATS = 15, + I40E_VIRTCHNL_OP_FCOE = 16, + I40E_VIRTCHNL_OP_EVENT = 17, +}; + +/* Virtual channel message descriptor. This overlays the admin queue + * descriptor. All other data is passed in external buffers. + */ + +struct i40e_virtchnl_msg { + u8 pad[8]; /* AQ flags/opcode/len/retval fields */ + enum i40e_virtchnl_ops v_opcode; /* avoid confusion with desc->opcode */ + enum i40e_status_code v_retval; /* ditto for desc->retval */ + u32 vfid; /* used by PF when sending to VF */ +}; + +/* Message descriptions and data structures.*/ + +/* I40E_VIRTCHNL_OP_VERSION + * VF posts its version number to the PF. PF responds with its version number + * in the same format, along with a return code. + * Reply from PF has its major/minor versions also in param0 and param1. + * If there is a major version mismatch, then the VF cannot operate. 
+ * If there is a minor version mismatch, then the VF can operate but should + * add a warning to the system log. + * + * This enum element MUST always be specified as == 1, regardless of other + * changes in the API. The PF must always respond to this message without + * error regardless of version mismatch. + */ +#define I40E_VIRTCHNL_VERSION_MAJOR 1 +#define I40E_VIRTCHNL_VERSION_MINOR 1 +#define I40E_VIRTCHNL_VERSION_MINOR_NO_VF_CAPS 0 + +struct i40e_virtchnl_version_info { + u32 major; + u32 minor; +}; + +/* I40E_VIRTCHNL_OP_RESET_VF + * VF sends this request to PF with no parameters + * PF does NOT respond! VF driver must delay then poll VFGEN_RSTAT register + * until reset completion is indicated. The admin queue must be reinitialized + * after this operation. + * + * When reset is complete, PF must ensure that all queues in all VSIs associated + * with the VF are stopped, all queue configurations in the HMC are set to 0, + * and all MAC and VLAN filters (except the default MAC address) on all VSIs + * are cleared. + */ + +/* I40E_VIRTCHNL_OP_GET_VF_RESOURCES + * Version 1.0 VF sends this request to PF with no parameters + * Version 1.1 VF sends this request to PF with u32 bitmap of its capabilities + * PF responds with an indirect message containing + * i40e_virtchnl_vf_resource and one or more + * i40e_virtchnl_vsi_resource structures. 
+ */ + +struct i40e_virtchnl_vsi_resource { + u16 vsi_id; + u16 num_queue_pairs; + enum i40e_vsi_type vsi_type; + u16 qset_handle; + u8 default_mac_addr[I40E_ETH_LENGTH_OF_ADDRESS]; +}; +/* VF offload flags */ +#define I40E_VIRTCHNL_VF_OFFLOAD_L2 0x00000001 +#define I40E_VIRTCHNL_VF_OFFLOAD_IWARP 0x00000002 +#define I40E_VIRTCHNL_VF_OFFLOAD_FCOE 0x00000004 +#define I40E_VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 +#define I40E_VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 +#define I40E_VIRTCHNL_VF_OFFLOAD_VLAN 0x00010000 +#define I40E_VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x00020000 + +struct i40e_virtchnl_vf_resource { + u16 num_vsis; + u16 num_queue_pairs; + u16 max_vectors; + u16 max_mtu; + + u32 vf_offload_flags; + u32 max_fcoe_contexts; + u32 max_fcoe_filters; + + struct i40e_virtchnl_vsi_resource vsi_res[1]; +}; + +/* I40E_VIRTCHNL_OP_CONFIG_TX_QUEUE + * VF sends this message to set up parameters for one TX queue. + * External data buffer contains one instance of i40e_virtchnl_txq_info. + * PF configures requested queue and returns a status code. + */ + +/* Tx queue config info */ +struct i40e_virtchnl_txq_info { + u16 vsi_id; + u16 queue_id; + u16 ring_len; /* number of descriptors, multiple of 8 */ + u16 headwb_enabled; + u64 dma_ring_addr; + u64 dma_headwb_addr; +}; + +/* I40E_VIRTCHNL_OP_CONFIG_RX_QUEUE + * VF sends this message to set up parameters for one RX queue. + * External data buffer contains one instance of i40e_virtchnl_rxq_info. + * PF configures requested queue and returns a status code. + */ + +/* Rx queue config info */ +struct i40e_virtchnl_rxq_info { + u16 vsi_id; + u16 queue_id; + u32 ring_len; /* number of descriptors, multiple of 32 */ + u16 hdr_size; + u16 splithdr_enabled; + u32 databuffer_size; + u32 max_pkt_size; + u64 dma_ring_addr; + enum i40e_hmc_obj_rx_hsplit_0 rx_split_pos; +}; + +/* I40E_VIRTCHNL_OP_CONFIG_VSI_QUEUES + * VF sends this message to set parameters for all active TX and RX queues + * associated with the specified VSI. 
+ * PF configures queues and returns status. + * If the number of queues specified is greater than the number of queues + * associated with the VSI, an error is returned and no queues are configured. + */ +struct i40e_virtchnl_queue_pair_info { + /* NOTE: vsi_id and queue_id should be identical for both queues. */ + struct i40e_virtchnl_txq_info txq; + struct i40e_virtchnl_rxq_info rxq; +}; + +struct i40e_virtchnl_vsi_queue_config_info { + u16 vsi_id; + u16 num_queue_pairs; + struct i40e_virtchnl_queue_pair_info qpair[1]; +}; + +/* I40E_VIRTCHNL_OP_CONFIG_IRQ_MAP + * VF uses this message to map vectors to queues. + * The rxq_map and txq_map fields are bitmaps used to indicate which queues + * are to be associated with the specified vector. + * The "other" causes are always mapped to vector 0. + * PF configures interrupt mapping and returns status. + */ +struct i40e_virtchnl_vector_map { + u16 vsi_id; + u16 vector_id; + u16 rxq_map; + u16 txq_map; + u16 rxitr_idx; + u16 txitr_idx; +}; + +struct i40e_virtchnl_irq_map_info { + u16 num_vectors; + struct i40e_virtchnl_vector_map vecmap[1]; +}; + +/* I40E_VIRTCHNL_OP_ENABLE_QUEUES + * I40E_VIRTCHNL_OP_DISABLE_QUEUES + * VF sends these messages to enable or disable TX/RX queue pairs. + * The queues fields are bitmaps indicating which queues to act upon. + * (Currently, we only support 16 queues per VF, but we make the field + * u32 to allow for expansion.) + * PF performs requested action and returns status. + */ +struct i40e_virtchnl_queue_select { + u16 vsi_id; + u16 pad; + u32 rx_queues; + u32 tx_queues; +}; + +/* I40E_VIRTCHNL_OP_ADD_ETHER_ADDRESS + * VF sends this message in order to add one or more unicast or multicast + * address filters for the specified VSI. + * PF adds the filters and returns status. + */ + +/* I40E_VIRTCHNL_OP_DEL_ETHER_ADDRESS + * VF sends this message in order to remove one or more unicast or multicast + * filters for the specified VSI. + * PF removes the filters and returns status.
+ */ + +struct i40e_virtchnl_ether_addr { + u8 addr[I40E_ETH_LENGTH_OF_ADDRESS]; + u8 pad[2]; +}; + +struct i40e_virtchnl_ether_addr_list { + u16 vsi_id; + u16 num_elements; + struct i40e_virtchnl_ether_addr list[1]; +}; + +/* I40E_VIRTCHNL_OP_ADD_VLAN + * VF sends this message to add one or more VLAN tag filters for receives. + * PF adds the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +/* I40E_VIRTCHNL_OP_DEL_VLAN + * VF sends this message to remove one or more VLAN tag filters for receives. + * PF removes the filters and returns status. + * If a port VLAN is configured by the PF, this operation will return an + * error to the VF. + */ + +struct i40e_virtchnl_vlan_filter_list { + u16 vsi_id; + u16 num_elements; + u16 vlan_id[1]; +}; + +/* I40E_VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE + * VF sends VSI id and flags. + * PF returns status code in retval. + * Note: we assume that broadcast accept mode is always enabled. + */ +struct i40e_virtchnl_promisc_info { + u16 vsi_id; + u16 flags; +}; + +#define I40E_FLAG_VF_UNICAST_PROMISC 0x00000001 +#define I40E_FLAG_VF_MULTICAST_PROMISC 0x00000002 + +/* I40E_VIRTCHNL_OP_GET_STATS + * VF sends this message to request stats for the selected VSI. VF uses + * the i40e_virtchnl_queue_select struct to specify the VSI. The queue_id + * field is ignored by the PF. + * + * PF replies with struct i40e_eth_stats in an external buffer. + */ + +/* I40E_VIRTCHNL_OP_EVENT + * PF sends this message to inform the VF driver of events that may affect it. + * No direct response is expected from the VF, though it may generate other + * messages in response to this one. 
+ */ +enum i40e_virtchnl_event_codes { + I40E_VIRTCHNL_EVENT_UNKNOWN = 0, + I40E_VIRTCHNL_EVENT_LINK_CHANGE, + I40E_VIRTCHNL_EVENT_RESET_IMPENDING, + I40E_VIRTCHNL_EVENT_PF_DRIVER_CLOSE, +}; +#define I40E_PF_EVENT_SEVERITY_INFO 0 +#define I40E_PF_EVENT_SEVERITY_ATTENTION 1 +#define I40E_PF_EVENT_SEVERITY_ACTION_REQUIRED 2 +#define I40E_PF_EVENT_SEVERITY_CERTAIN_DOOM 255 + +struct i40e_virtchnl_pf_event { + enum i40e_virtchnl_event_codes event; + union { + struct { + enum i40e_aq_link_speed link_speed; + bool link_status; + } link_event; + } event_data; + + int severity; +}; + +/* VF reset states - these are written into the RSTAT register: + * I40E_VFGEN_RSTAT1 on the PF + * I40E_VFGEN_RSTAT on the VF + * When the PF initiates a reset, it writes 0 + * When the reset is complete, it writes 1 + * When the PF detects that the VF has recovered, it writes 2 + * VF checks this register periodically to determine if a reset has occurred, + * then polls it to know when the reset is complete. + * If either the PF or VF reads the register while the hardware + * is in a reset state, it will return DEADBEEF, which, when masked + * will result in 3. + */ +enum i40e_vfr_states { + I40E_VFR_INPROGRESS = 0, + I40E_VFR_COMPLETED, + I40E_VFR_VFACTIVE, + I40E_VFR_UNKNOWN, +}; + +#endif /* _I40E_VIRTCHNL_H_ */ diff --git a/usr/src/uts/common/io/i40e/i40e.conf b/usr/src/uts/common/io/i40e/i40e.conf new file mode 100644 index 0000000000..b4c3459931 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e.conf @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. 
+# + +# +# Driver.conf file for Intel XL710 PCIe NIC Driver (i40e) +# See i40e(7D) for valid options. +# diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c new file mode 100644 index 0000000000..0af4c4c71f --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -0,0 +1,1097 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + */ + +/* + * For more information, please see the big theory statement in i40e_main.c. + */ + +#include "i40e_sw.h" + +#define I40E_PROP_RX_DMA_THRESH "_rx_dma_threshold" +#define I40E_PROP_TX_DMA_THRESH "_tx_dma_threshold" +#define I40E_PROP_RX_ITR "_rx_intr_throttle" +#define I40E_PROP_TX_ITR "_tx_intr_throttle" +#define I40E_PROP_OTHER_ITR "_other_intr_throttle" + +char *i40e_priv_props[] = { + I40E_PROP_RX_DMA_THRESH, + I40E_PROP_TX_DMA_THRESH, + I40E_PROP_RX_ITR, + I40E_PROP_TX_ITR, + I40E_PROP_OTHER_ITR, + NULL +}; + +/* + * Remove a unicast MAC address filter. Returns EINVAL if mac_addr is a + * multicast address, ECANCELED if the device is suspended, ENOENT if the + * address is not present in i40e_uaddrs, EIO if the admin queue request + * fails, and 0 on success. The i40e_general_lock is held for the duration. + */ +static int +i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) +{ + i40e_t *i40e = arg; + struct i40e_aqc_remove_macvlan_element_data filt; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int ret, i, last; + i40e_uaddr_t *iua; + + if (I40E_IS_MULTICAST(mac_addr)) + return (EINVAL); + + mutex_enter(&i40e->i40e_general_lock); + + if (i40e->i40e_state & I40E_SUSPENDED) { + ret = ECANCELED; + goto done; + } + + for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) { + if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac, + ETHERADDRL) == 0) + break; + } + + if (i ==
i40e->i40e_resources.ifr_nmacfilt_used) { + ret = ENOENT; + goto done; + } + + iua = &i40e->i40e_uaddrs[i]; + ASSERT(i40e->i40e_resources.ifr_nmacfilt_used > 0); + + bzero(&filt, sizeof (filt)); + bcopy(mac_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH | + I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; + + if (i40e_aq_remove_macvlan(hw, iua->iua_vsi, &filt, 1, NULL) != + I40E_SUCCESS) { + i40e_error(i40e, "failed to remove mac address " + "%02x:%02x:%02x:%02x:%02x:%02x from unicast filter: %d", + mac_addr[0], mac_addr[1], mac_addr[2], mac_addr[3], + mac_addr[4], mac_addr[5], filt.error_code); + ret = EIO; + goto done; + } + + /* + * Keep the in-use entries contiguous: move the final entry into the + * slot that was just vacated. + */ + last = i40e->i40e_resources.ifr_nmacfilt_used - 1; + if (i != last) { + i40e_uaddr_t *src = &i40e->i40e_uaddrs[last]; + bcopy(src, iua, sizeof (i40e_uaddr_t)); + } + + /* + * Set the multicast bit in the last one to indicate to ourselves that + * it's invalid. + */ + bzero(&i40e->i40e_uaddrs[last], sizeof (i40e_uaddr_t)); + i40e->i40e_uaddrs[last].iua_mac[0] = 0x01; + i40e->i40e_resources.ifr_nmacfilt_used--; + ret = 0; +done: + mutex_exit(&i40e->i40e_general_lock); + + return (ret); +} + +static int +i40e_group_add_mac(void *arg, const uint8_t *mac_addr) +{ + i40e_t *i40e = arg; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; + struct i40e_aqc_add_macvlan_element_data filt; + + if (I40E_IS_MULTICAST(mac_addr)) + return (EINVAL); + + mutex_enter(&i40e->i40e_general_lock); + if (i40e->i40e_state & I40E_SUSPENDED) { + ret = ECANCELED; + goto done; + } + + if (i40e->i40e_resources.ifr_nmacfilt == + i40e->i40e_resources.ifr_nmacfilt_used) { + ret = ENOSPC; + goto done; + } + + for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) { + if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac, + ETHERADDRL) == 0) { + ret = EEXIST; + goto done; + } + } + + /* + * Note, the general use of the i40e_vsi_id will have to be refactored + * when we have proper group support.
+ */ + bzero(&filt, sizeof (filt)); + bcopy(mac_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | + I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; + + if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + NULL)) != I40E_SUCCESS) { + i40e_error(i40e, "failed to add mac address " + "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", + mac_addr[0], mac_addr[1], mac_addr[2], mac_addr[3], + mac_addr[4], mac_addr[5], ret); + ret = EIO; + goto done; + } + + iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; + bcopy(mac_addr, iua->iua_mac, ETHERADDRL); + iua->iua_vsi = i40e->i40e_vsi_id; + i40e->i40e_resources.ifr_nmacfilt_used++; + ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= + i40e->i40e_resources.ifr_nmacfilt); + ret = 0; +done: + mutex_exit(&i40e->i40e_general_lock); + return (ret); +} + +static int +i40e_m_start(void *arg) +{ + i40e_t *i40e = arg; + int rc = 0; + + mutex_enter(&i40e->i40e_general_lock); + if (i40e->i40e_state & I40E_SUSPENDED) { + rc = ECANCELED; + goto done; + } + + if (!i40e_start(i40e, B_TRUE)) { + rc = EIO; + goto done; + } + + atomic_or_32(&i40e->i40e_state, I40E_STARTED); +done: + mutex_exit(&i40e->i40e_general_lock); + + return (rc); +} + +static void +i40e_m_stop(void *arg) +{ + i40e_t *i40e = arg; + + mutex_enter(&i40e->i40e_general_lock); + + if (i40e->i40e_state & I40E_SUSPENDED) + goto done; + + atomic_and_32(&i40e->i40e_state, ~I40E_STARTED); + i40e_stop(i40e, B_TRUE); +done: + mutex_exit(&i40e->i40e_general_lock); +} + +/* + * Enable and disable promiscuous mode as requested. We have to toggle both + * unicast and multicast. Note that multicast may already be enabled due to the + * i40e_m_multicast may toggle it itself. See i40e_main.c for more information + * on this. 
+ */ +static int +i40e_m_promisc(void *arg, boolean_t on) +{ + i40e_t *i40e = arg; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int ret = 0, err = 0; + + mutex_enter(&i40e->i40e_general_lock); + if (i40e->i40e_state & I40E_SUSPENDED) { + ret = ECANCELED; + goto done; + } + + + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + on, NULL); + if (ret != I40E_SUCCESS) { + i40e_error(i40e, "failed to %s unicast promiscuity on " + "the default VSI: %d", on == B_TRUE ? "enable" : "disable", + ret); + err = EIO; + goto done; + } + + /* + * If we have a non-zero mcast_promisc_count, then it has already been + * enabled or we need to leave it that way and not touch it. + */ + if (i40e->i40e_mcast_promisc_count > 0) { + i40e->i40e_promisc_on = on; + goto done; + } + + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + on, NULL); + if (ret != I40E_SUCCESS) { + i40e_error(i40e, "failed to %s multicast promiscuity on " + "the default VSI: %d", on == B_TRUE ? "enable" : "disable", + ret); + + /* + * Try our best to put us back into a state that MAC expects us + * to be in. + */ + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + !on, NULL); + if (ret != I40E_SUCCESS) { + i40e_error(i40e, "failed to %s unicast promiscuity on " + "the default VSI after toggling multicast failed: " + "%d", on == B_TRUE ? "disable" : "enable", ret); + } + + err = EIO; + goto done; + } else { + i40e->i40e_promisc_on = on; + } + +done: + mutex_exit(&i40e->i40e_general_lock); + return (err); +} + +/* + * See the big theory statement in i40e_main.c for multicast address management. 
+ */ +static int +i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) +{ + struct i40e_hw *hw = &i40e->i40e_hw_space; + struct i40e_aqc_add_macvlan_element_data filt; + i40e_maddr_t *mc; + int ret; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + if (i40e->i40e_resources.ifr_nmcastfilt_used == + i40e->i40e_resources.ifr_nmcastfilt) { + if (i40e->i40e_mcast_promisc_count == 0 && + i40e->i40e_promisc_on == B_FALSE) { + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, + i40e->i40e_vsi_id, B_TRUE, NULL); + if (ret != I40E_SUCCESS) { + i40e_error(i40e, "failed to enable promiscuous " + "mode on VSI %d: %d", i40e->i40e_vsi_id, + ret); + return (EIO); + } + } + i40e->i40e_mcast_promisc_count++; + return (0); + } + + mc = &i40e->i40e_maddrs[i40e->i40e_resources.ifr_nmcastfilt_used]; + bzero(&filt, sizeof (filt)); + bcopy(multicast_address, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | + I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; + + if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + NULL)) != I40E_SUCCESS) { + i40e_error(i40e, "failed to add mac address " + "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", + multicast_address[0], multicast_address[1], + multicast_address[2], multicast_address[3], + multicast_address[4], multicast_address[5], + ret); + return (EIO); + } + + bcopy(multicast_address, mc->ima_mac, ETHERADDRL); + i40e->i40e_resources.ifr_nmcastfilt_used++; + return (0); +} + +/* + * See the big theory statement in i40e_main.c for multicast address management. 
+ */ +static int +i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) +{ + int i, ret; + struct i40e_hw *hw = &i40e->i40e_hw_space; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + for (i = 0; i < i40e->i40e_resources.ifr_nmcastfilt_used; i++) { + struct i40e_aqc_remove_macvlan_element_data filt; + int last; + + if (bcmp(multicast_address, i40e->i40e_maddrs[i].ima_mac, + ETHERADDRL) != 0) { + continue; + } + + bzero(&filt, sizeof (filt)); + bcopy(multicast_address, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | + I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; + + if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, + &filt, 1, NULL) != I40E_SUCCESS) { + i40e_error(i40e, "failed to remove mac address " + "%2x:%2x:%2x:%2x:%2x:%2x from multicast " + "filter: %d", + multicast_address[0], multicast_address[1], + multicast_address[2], multicast_address[3], + multicast_address[4], multicast_address[5], + filt.error_code); + return (EIO); + } + + last = i40e->i40e_resources.ifr_nmcastfilt_used - 1; + if (i != last) { + bcopy(&i40e->i40e_maddrs[last], &i40e->i40e_maddrs[i], + sizeof (i40e_maddr_t)); + bzero(&i40e->i40e_maddrs[last], sizeof (i40e_maddr_t)); + } + + ASSERT(i40e->i40e_resources.ifr_nmcastfilt_used > 0); + i40e->i40e_resources.ifr_nmcastfilt_used--; + return (0); + } + + if (i40e->i40e_mcast_promisc_count > 0) { + if (i40e->i40e_mcast_promisc_count == 1 && + i40e->i40e_promisc_on == B_FALSE) { + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, + i40e->i40e_vsi_id, B_FALSE, NULL); + if (ret != I40E_SUCCESS) { + i40e_error(i40e, "failed to disable " + "promiscuous mode on VSI %d: %d", + i40e->i40e_vsi_id, ret); + return (EIO); + } + } + i40e->i40e_mcast_promisc_count--; + + return (0); + } + + return (ENOENT); +} + +static int +i40e_m_multicast(void *arg, boolean_t add, const uint8_t *multicast_address) +{ + i40e_t *i40e = arg; + int rc; + + mutex_enter(&i40e->i40e_general_lock); + + if (i40e->i40e_state & I40E_SUSPENDED) { + 
mutex_exit(&i40e->i40e_general_lock); + return (ECANCELED); + } + + if (add == B_TRUE) { + rc = i40e_multicast_add(i40e, multicast_address); + } else { + rc = i40e_multicast_remove(i40e, multicast_address); + } + + mutex_exit(&i40e->i40e_general_lock); + return (rc); +} + +/* ARGSUSED */ +static void +i40e_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + /* + * At this time, we don't support toggling i40e into loopback mode. It's + * questionable how much value this has when there's no clear way to + * toggle this behavior from a supported way in userland. + */ + miocnak(q, mp, 0, EINVAL); +} + +static int +i40e_ring_start(mac_ring_driver_t rh, uint64_t gen_num) +{ + i40e_trqpair_t *itrq = (i40e_trqpair_t *)rh; + + /* + * GLDv3 requires we keep track of a generation number, as it uses + * that number to keep track of whether or not a ring is active. + */ + mutex_enter(&itrq->itrq_rx_lock); + itrq->itrq_rxgen = gen_num; + mutex_exit(&itrq->itrq_rx_lock); + return (0); +} + +/* ARGSUSED */ +static int +i40e_rx_ring_intr_enable(mac_intr_handle_t intrh) +{ + i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh; + i40e_t *i40e = itrq->itrq_i40e; + + mutex_enter(&i40e->i40e_general_lock); + ASSERT(i40e->i40e_intr_poll == B_TRUE); + i40e_intr_rx_queue_enable(i40e, itrq->itrq_index); + i40e->i40e_intr_poll = B_FALSE; + mutex_exit(&i40e->i40e_general_lock); + + return (0); +} + +/* ARGSUSED */ +static int +i40e_rx_ring_intr_disable(mac_intr_handle_t intrh) +{ + i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh; + i40e_t *i40e = itrq->itrq_i40e; + + mutex_enter(&i40e->i40e_general_lock); + i40e_intr_rx_queue_disable(i40e, itrq->itrq_index); + i40e->i40e_intr_poll = B_TRUE; + mutex_exit(&i40e->i40e_general_lock); + + return (0); +} + +/* ARGSUSED */ +static void +i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, + const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + i40e_t *i40e = arg; + mac_intr_t *mintr = &infop->mri_intr; + i40e_trqpair_t 
*itrq = &(i40e->i40e_trqpairs[ring_index]); + + /* + * Note the group index here is expected to be -1 due to the fact that + * we're not actually grouping things tx-wise at this time. + */ + ASSERT(group_index == -1); + ASSERT(ring_index < i40e->i40e_num_trqpairs); + + itrq->itrq_mactxring = rh; + infop->mri_driver = (mac_ring_driver_t)itrq; + infop->mri_start = NULL; + infop->mri_stop = NULL; + infop->mri_tx = i40e_ring_tx; + infop->mri_stat = i40e_tx_ring_stat; + + /* + * We only provide the handle in cases where we have MSI-X interrupts, + * to indicate that we'd actually support retargetting. + */ + if (i40e->i40e_intr_type & DDI_INTR_TYPE_MSIX) { + mintr->mi_ddi_handle = + i40e->i40e_intr_handles[itrq->itrq_tx_intrvec]; + } +} + +/* ARGSUSED */ +static void +i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, + const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + i40e_t *i40e = arg; + mac_intr_t *mintr = &infop->mri_intr; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + + /* + * We assert the group number and ring index to help sanity check + * ourselves and mark that we'll need to rework this when we have + * multiple groups. + */ + ASSERT3S(group_index, ==, 0); + ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + + itrq->itrq_macrxring = rh; + infop->mri_driver = (mac_ring_driver_t)itrq; + infop->mri_start = i40e_ring_start; + infop->mri_stop = NULL; + infop->mri_poll = i40e_ring_rx_poll; + infop->mri_stat = i40e_rx_ring_stat; + mintr->mi_handle = (mac_intr_handle_t)itrq; + mintr->mi_enable = i40e_rx_ring_intr_enable; + mintr->mi_disable = i40e_rx_ring_intr_disable; + + /* + * We only provide the handle in cases where we have MSI-X interrupts, + * to indicate that we'd actually support retargetting. 
+ */ + if (i40e->i40e_intr_type & DDI_INTR_TYPE_MSIX) { + mintr->mi_ddi_handle = + i40e->i40e_intr_handles[itrq->itrq_rx_intrvec]; + } +} + +/* ARGSUSED */ +static void +i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + i40e_t *i40e = arg; + + if (rtype != MAC_RING_TYPE_RX) + return; + + /* + * Note, this is a simplified view of a group, given that we only have a + * single group and a single ring at the moment. We'll want to expand + * upon this as we leverage more hardware functionality. + */ + i40e->i40e_rx_group_handle = gh; + infop->mgi_driver = (mac_group_driver_t)i40e; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = i40e_group_add_mac; + infop->mgi_remmac = i40e_group_remove_mac; + + ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs; +} + +static boolean_t +i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + i40e_t *i40e = arg; + mac_capab_rings_t *cap_rings; + + switch (cap) { + case MAC_CAPAB_HCKSUM: { + uint32_t *txflags = cap_data; + + *txflags = 0; + if (i40e->i40e_tx_hcksum_enable == B_TRUE) + *txflags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; + break; + } + + case MAC_CAPAB_RINGS: + cap_rings = cap_data; + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_TX: + /* + * Note, saying we have no rings, but some number of + * groups indicates to MAC that it should create + * psuedo-groups with one for each TX ring. This may not + * be the long term behavior we want, but it'll work for + * now. 
+ */ + cap_rings->mr_gnum = 0; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rget = i40e_fill_tx_ring; + cap_rings->mr_gget = NULL; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + break; + case MAC_RING_TYPE_RX: + cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rget = i40e_fill_rx_ring; + cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gget = i40e_fill_rx_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + break; + default: + return (B_FALSE); + } + break; + default: + return (B_FALSE); + } + + return (B_TRUE); +} + +/* ARGSUSED */ +static int +i40e_m_setprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize, + const void *pr_val) +{ + int ret; + long val; + char *eptr; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + if ((ret = ddi_strtol(pr_val, &eptr, 10, &val)) != 0 || + *eptr != '\0') { + return (ret); + } + + if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) { + if (val < I40E_MIN_RX_DMA_THRESH || + val > I40E_MAX_RX_DMA_THRESH) { + return (EINVAL); + } + i40e->i40e_rx_dma_min = (uint32_t)val; + return (0); + } + + if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) { + if (val < I40E_MIN_TX_DMA_THRESH || + val > I40E_MAX_TX_DMA_THRESH) { + return (EINVAL); + } + i40e->i40e_tx_dma_min = (uint32_t)val; + return (0); + } + + if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) { + if (val < I40E_MIN_ITR || + val > I40E_MAX_ITR) { + return (EINVAL); + } + i40e->i40e_rx_itr = (uint32_t)val; + i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr); + return (0); + } + + if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) { + if (val < I40E_MIN_ITR || + val > I40E_MAX_ITR) { + return (EINVAL); + } + i40e->i40e_tx_itr = (uint32_t)val; + i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr); + return (0); + } + + if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) { + if (val < I40E_MIN_ITR || + val > I40E_MAX_ITR) { + return (EINVAL); + } + i40e->i40e_tx_itr = (uint32_t)val; + 
i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER, + i40e->i40e_other_itr); + return (0); + } + + return (ENOTSUP); +} + +static int +i40e_m_getprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize, + void *pr_val) +{ + uint32_t val; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) { + val = i40e->i40e_rx_dma_min; + } else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) { + val = i40e->i40e_tx_dma_min; + } else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) { + val = i40e->i40e_rx_itr; + } else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) { + val = i40e->i40e_tx_itr; + } else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) { + val = i40e->i40e_other_itr; + } else { + return (ENOTSUP); + } + + if (snprintf(pr_val, pr_valsize, "%d", val) >= pr_valsize) + return (ERANGE); + return (0); +} + +/* + * Annoyingly for private properties MAC seems to ignore default values that + * aren't strings. That means that we have to translate all of these into + * uint32_t's and instead we size the buffer to be large enough to hold a + * uint32_t. 
+ */ +/* ARGSUSED */ +static void +i40e_m_propinfo_private(i40e_t *i40e, const char *pr_name, + mac_prop_info_handle_t prh) +{ + char buf[64]; + uint32_t def; + + if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + def = I40E_DEF_RX_DMA_THRESH; + mac_prop_info_set_range_uint32(prh, + I40E_MIN_RX_DMA_THRESH, + I40E_MAX_RX_DMA_THRESH); + } else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + def = I40E_DEF_TX_DMA_THRESH; + mac_prop_info_set_range_uint32(prh, + I40E_MIN_TX_DMA_THRESH, + I40E_MAX_TX_DMA_THRESH); + } else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + def = I40E_DEF_RX_ITR; + mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR); + } else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + def = I40E_DEF_TX_ITR; + mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR); + } else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + def = I40E_DEF_OTHER_ITR; + mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR); + } else { + return; + } + + (void) snprintf(buf, sizeof (buf), "%d", def); + mac_prop_info_set_default_str(prh, buf); +} + +static int +i40e_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t new_mtu; + i40e_t *i40e = arg; + int ret = 0; + + mutex_enter(&i40e->i40e_general_lock); + if (i40e->i40e_state & I40E_SUSPENDED) { + mutex_exit(&i40e->i40e_general_lock); + return (ECANCELED); + } + + switch (pr_num) { + /* + * These properties are always read-only across every device. 
+ */ + case MAC_PROP_DUPLEX: + case MAC_PROP_SPEED: + case MAC_PROP_STATUS: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_10GFDX_CAP: + case MAC_PROP_ADV_40GFDX_CAP: + ret = ENOTSUP; + break; + /* + * These are read-only at this time as we don't support configuring + * auto-negotiation. See the theory statement in i40e_main.c. + */ + case MAC_PROP_EN_100FDX_CAP: + case MAC_PROP_EN_1000FDX_CAP: + case MAC_PROP_EN_10GFDX_CAP: + case MAC_PROP_EN_40GFDX_CAP: + case MAC_PROP_AUTONEG: + case MAC_PROP_FLOWCTRL: + ret = ENOTSUP; + break; + + case MAC_PROP_MTU: + bcopy(pr_val, &new_mtu, sizeof (new_mtu)); + if (new_mtu == i40e->i40e_sdu) + break; + + if (new_mtu < I40E_MIN_MTU || + new_mtu > I40E_MAX_MTU) { + ret = EINVAL; + break; + } + + if (i40e->i40e_state & I40E_STARTED) { + ret = EBUSY; + break; + } + + ret = mac_maxsdu_update(i40e->i40e_mac_hdl, new_mtu); + if (ret == 0) { + i40e->i40e_sdu = new_mtu; + i40e_update_mtu(i40e); + } + break; + + case MAC_PROP_PRIVATE: + ret = i40e_m_setprop_private(i40e, pr_name, pr_valsize, pr_val); + break; + default: + ret = ENOTSUP; + break; + } + + mutex_exit(&i40e->i40e_general_lock); + return (ret); +} + +static int +i40e_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + i40e_t *i40e = arg; + uint64_t speed; + int ret = 0; + uint8_t *u8; + link_flowctrl_t fctl; + + mutex_enter(&i40e->i40e_general_lock); + + switch (pr_num) { + case MAC_PROP_DUPLEX: + if (pr_valsize < sizeof (link_duplex_t)) { + ret = EOVERFLOW; + break; + } + bcopy(&i40e->i40e_link_duplex, pr_val, sizeof (link_duplex_t)); + break; + case MAC_PROP_SPEED: + if (pr_valsize < sizeof (uint64_t)) { + ret = EOVERFLOW; + break; + } + speed = i40e->i40e_link_speed * 1000000ULL; + bcopy(&speed, pr_val, sizeof (speed)); + break; + case MAC_PROP_STATUS: + if (pr_valsize < sizeof (link_state_t)) { + ret = EOVERFLOW; + break; + } + bcopy(&i40e->i40e_link_state, pr_val, sizeof 
(link_state_t)); + break; + case MAC_PROP_AUTONEG: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + u8 = pr_val; + *u8 = 1; + break; + case MAC_PROP_FLOWCTRL: + /* + * Because we don't currently support hardware flow control, we + * just hardcode this to be none. + */ + if (pr_valsize < sizeof (link_flowctrl_t)) { + ret = EOVERFLOW; + break; + } + fctl = LINK_FLOWCTRL_NONE; + bcopy(&fctl, pr_val, sizeof (link_flowctrl_t)); + break; + case MAC_PROP_MTU: + if (pr_valsize < sizeof (uint32_t)) { + ret = EOVERFLOW; + break; + } + bcopy(&i40e->i40e_sdu, pr_val, sizeof (uint32_t)); + break; + + /* + * Because we don't let users control the speeds we may auto-negotiate + * to, the values of the ADV_ and EN_ will always be the same. + */ + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_EN_100FDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + u8 = pr_val; + *u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0; + break; + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_EN_1000FDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + u8 = pr_val; + *u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0; + break; + case MAC_PROP_ADV_10GFDX_CAP: + case MAC_PROP_EN_10GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + u8 = pr_val; + *u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0; + break; + case MAC_PROP_ADV_40GFDX_CAP: + case MAC_PROP_EN_40GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + u8 = pr_val; + *u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0; + break; + case MAC_PROP_PRIVATE: + ret = i40e_m_getprop_private(i40e, pr_name, pr_valsize, pr_val); + break; + default: + ret = ENOTSUP; + break; + } + + mutex_exit(&i40e->i40e_general_lock); + + return (ret); +} + +static void +i40e_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + i40e_t *i40e = 
arg; + + mutex_enter(&i40e->i40e_general_lock); + + switch (pr_num) { + case MAC_PROP_DUPLEX: + case MAC_PROP_SPEED: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + break; + case MAC_PROP_FLOWCTRL: + /* + * At the moment, the driver doesn't support flow control, hence + * why this is set to read-only and none. + */ + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_link_flowctrl(prh, + LINK_FLOWCTRL_NONE); + break; + case MAC_PROP_MTU: + mac_prop_info_set_range_uint32(prh, I40E_MIN_MTU, I40E_MAX_MTU); + break; + + /* + * We set the defaults for these based upon the phy's ability to + * support the speeds. Note, auto-negotiation is required for fiber, + * hence it is read-only and always enabled. When we have access to + * copper phys we can revisit this. + */ + case MAC_PROP_AUTONEG: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, 1); + break; + case MAC_PROP_ADV_100FDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0); + break; + case MAC_PROP_EN_100FDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0); + break; + case MAC_PROP_ADV_1000FDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0); + break; + case MAC_PROP_EN_1000FDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0); + break; + case MAC_PROP_ADV_10GFDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0); + break; + case MAC_PROP_EN_10GFDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + 
mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0); + break; + case MAC_PROP_ADV_40GFDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0); + break; + case MAC_PROP_EN_40GFDX_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_uint8(prh, + (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0); + break; + case MAC_PROP_PRIVATE: + i40e_m_propinfo_private(i40e, pr_name, prh); + break; + default: + break; + } + + mutex_exit(&i40e->i40e_general_lock); +} + +#define I40E_M_CALLBACK_FLAGS \ + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) + +static mac_callbacks_t i40e_m_callbacks = { + I40E_M_CALLBACK_FLAGS, + i40e_m_stat, + i40e_m_start, + i40e_m_stop, + i40e_m_promisc, + i40e_m_multicast, + NULL, + NULL, + NULL, + i40e_m_ioctl, + i40e_m_getcapab, + NULL, + NULL, + i40e_m_setprop, + i40e_m_getprop, + i40e_m_propinfo +}; + +boolean_t +i40e_register_mac(i40e_t *i40e) +{ + struct i40e_hw *hw = &i40e->i40e_hw_space; + int status; + mac_register_t *mac = mac_alloc(MAC_VERSION); + + if (mac == NULL) + return (B_FALSE); + + mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + mac->m_driver = i40e; + mac->m_dip = i40e->i40e_dip; + mac->m_src_addr = hw->mac.addr; + mac->m_callbacks = &i40e_m_callbacks; + mac->m_min_sdu = 0; + mac->m_max_sdu = i40e->i40e_sdu; + mac->m_margin = VLAN_TAGSZ; + mac->m_priv_props = i40e_priv_props; + mac->m_v12n = MAC_VIRT_LEVEL1; + + status = mac_register(mac, &i40e->i40e_mac_hdl); + if (status != 0) + i40e_error(i40e, "mac_register() returned %d", status); + mac_free(mac); + + return (status == 0); +} diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c new file mode 100644 index 0000000000..ba9bea7b20 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -0,0 +1,757 @@ +/* + * This file and its contents are 
supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * ------------------------- + * Interrupt Handling Theory + * ------------------------- + * + * There are a couple different sets of interrupts that we need to worry about: + * + * - Interrupts from receive queues + * - Interrupts from transmit queues + * - 'Other Interrupts', such as the administrative queue + * + * 'Other Interrupts' are asynchronous events such as a link status change event + * being posted to the administrative queue, unrecoverable ECC errors, and more. + * If we have something being posted to the administrative queue, then we go + * through and process it, because it's generally enabled as a separate logical + * interrupt. Note, we may need to do more here eventually. To re-enable the + * interrupts from the 'Other Interrupts' section, we need to clear the PBA and + * write ENA to PFINT_ICR0. + * + * Interrupts from the transmit and receive queues indicate that our requests + * have been processed. In the rx case, it means that we have data that we + * should take a look at and send up the stack. In the tx case, it means that + * data which we got from MAC has now been sent out on the wire and we can free + * the associated data. Most of the logic for acting upon the presence of this + * data can be found in i40e_transceiver.c which handles all of the DMA, rx, and + * tx operations. This file is dedicated to handling and dealing with interrupt + * processing.
+ * + * All devices supported by this driver support three kinds of interrupts: + * + * o Extended Message Signaled Interrupts (MSI-X) + * o Message Signaled Interrupts (MSI) + * o Legacy PCI interrupts (INTx) + * + * Generally speaking the hardware logically handles MSI and INTx the same and + * restricts us to only using a single interrupt, which isn't the interesting + * case. With MSI-X available, each physical function of the device provides the + * opportunity for multiple interrupts which is what we'll focus on. + * + * -------------------- + * Interrupt Management + * -------------------- + * + * By default, the admin queue, which consists of the asynchronous other + * interrupts, is always bound to MSI-X vector zero. Next, we spread out all of + * the other interrupts that we have available to us over the remaining + * interrupt vectors. + * + * This means that there may be multiple queues, both tx and rx, which are + * mapped to the same interrupt. When the interrupt fires, we'll have to check + * all of them for servicing, before we go through and indicate that the + * interrupt is claimed. + * + * The hardware provides the means of mapping various queues to MSI-X interrupts + * by programming the I40E_QINT_RQCTL() and I40E_QINT_TQCTL() registers. These + * registers can also be used to enable and disable whether or not the queue is + * a source of interrupts. As part of this, the hardware requires that we + * maintain a linked list of queues for each interrupt vector. While it may seem + * like this is only there for the purposes of ITRs, that's not the case. The + * first queue must be programmed in the I40E_QINT_LNKLSTN(%vector) register. Each + * queue defines the next one in either the I40E_QINT_RQCTL or I40E_QINT_TQCTL + * register. + * + * Because we only have a single queue enabled at the moment and we always have + * two interrupts, we do something pretty simple and just know that there's one + * data queue in the interrupt handler.
Longer term, we'll need to think harder + * about this, but for the moment it'll have to suffice. + * + * Finally, the individual interrupt vector itself has the ability to be enabled + * and disabled. The overall interrupt is controlled through the + * I40E_PFINT_DYN_CTLN() register. This is used to turn on and off the interrupt + * as a whole. + * + * Note that this means that both the individual queue and the interrupt as a + * whole can be toggled and re-enabled. + * + * ------------------- + * Non-MSIX Management + * ------------------- + * + * We may have a case where the Operating System is unable to actually allocate + * any MSI-X to the system. In such a world, there is only one transmit/receive + * queue pair and it is bound to the same interrupt with index zero. The + * hardware doesn't allow us access to additional interrupt vectors in these + * modes. Note that technically we could support more transmit/receive queues if + * we wanted. + * + * In this world, because the interrupts for the admin queue and traffic are + * mixed together, we have to consult ICR0 to determine what has occurred. The + * QINT_TQCTL and QINT_RQCTL registers have a field, 'MSI-X 0 index' which + * allows us to set a specific bit in ICR0. There are up to seven such bits; + * however, we only use the bit 0 and 1 for the rx and tx queue respectively. + * These are contained by the I40E_INTR_NOTX_{R|T}X_QUEUE and + * I40E_INTR_NOTX_{R|T}X_MASK registers respectively. + * + * Unfortunately, these corresponding queue bits have no corresponding entry in + * the ICR0_ENA register. So instead, when enabling interrupts on the queues, we + * end up enabling it on the queue registers rather than on the MSI-X registers. + * In the MSI-X world, because they can be enabled and disabled, this is + * different and the queues can always be enabled and disabled, but the + * interrupts themselves are toggled (ignoring the question of interrupt + * blanking for polling on rings). 
+ * + * Finally, we still have to set up the interrupt linked list, but the list is + * instead rooted at the register I40E_PFINT_LNKLST0, rather than being tied to + * one of the other MSI-X registers. + * + * -------------------- + * Interrupt Moderation + * -------------------- + * + * The XL710 hardware has three different interrupt moderation registers per + * interrupt. Unsurprisingly, we use these for: + * + * o RX interrupts + * o TX interrupts + * o 'Other interrupts' (link status change, admin queue, etc.) + * + * By default, we throttle 'other interrupts' the most, then TX interrupts, and + * then RX interrupts. The default values for these were based on trying to + * reason about both the importance and frequency of events. Generally speaking + * 'other interrupts' are not very frequent and they're not important for the + * I/O data path in and of itself (though they may indicate issues with the I/O + * data path). + * + * On the flip side, when we're not polling, RX interrupts are very important. + * The longer we wait for them, the more latency that we inject into the system. + * However, if we allow interrupts to occur too frequently, we risk a few + * problems: + * + * 1) Abusing system resources. Without proper interrupt blanking and polling, + * we can see upwards of 200k-300k interrupts per second on the system. + * + * 2) Not enough data coalescing to enable polling. In other words, the more + * data that we allow to build up, the more likely we'll be able to enable + * polling mode and allowing us to better handle bulk data. + * + * In-between the 'other interrupts' and the TX interrupts we have the + * reclamation of TX buffers. This operation is not quite as important as we + * generally size the ring large enough that we should be able to reclaim a + * substantial amount of the descriptors that we have used per interrupt. 
So + * while it's important that this interrupt occur, we don't necessarily need it + * firing as frequently as RX; it doesn't, on its own, induce additional latency + * into the system. + * + * Based on all this we currently assign static ITR values for the system. While + * we could move to a dynamic system (the hardware supports that), we'd want to + * make sure that we're seeing problems from this that we believe would be + * generally helped by the added complexity. + * + * Based on this, the default values that we have allow for the following + * interrupt thresholds: + * + * o 20k interrupts/s for RX + * o 5k interrupts/s for TX + * o 2k interupts/s for 'Other Interrupts' + */ + +#include "i40e_sw.h" + +#define I40E_INTR_NOTX_QUEUE 0 +#define I40E_INTR_NOTX_INTR 0 +#define I40E_INTR_NOTX_RX_QUEUE 0 +#define I40E_INTR_NOTX_RX_MASK (1 << I40E_PFINT_ICR0_QUEUE_0_SHIFT) +#define I40E_INTR_NOTX_TX_QUEUE 1 +#define I40E_INTR_NOTX_TX_MASK (1 << I40E_PFINT_ICR0_QUEUE_1_SHIFT) + +void +i40e_intr_set_itr(i40e_t *i40e, i40e_itr_index_t itr, uint_t val) +{ + int i; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + VERIFY3U(val, <=, I40E_MAX_ITR); + VERIFY3U(itr, <, I40E_ITR_INDEX_NONE); + + /* + * No matter the interrupt mode, the ITR for other interrupts is always + * on interrupt zero and the same is true if we're not using MSI-X. + */ + if (itr == I40E_ITR_INDEX_OTHER || + i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { + I40E_WRITE_REG(hw, I40E_PFINT_ITR0(itr), val); + return; + } + + for (i = 1; i < i40e->i40e_intr_count; i++) { + I40E_WRITE_REG(hw, I40E_PFINT_ITRN(itr, i - 1), val); + } +} + +/* + * Re-enable the adminq. Note that the adminq doesn't have a traditional queue + * associated with it from an interrupt perspective and just lives on ICR0. + * However when MSI-X interrupts are not being used, then this also enables and + * disables those interrupts. 
+ */ +static void +i40e_intr_adminq_enable(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + + reg = I40E_PFINT_DYN_CTL0_INTENA_MASK | + I40E_PFINT_DYN_CTL0_CLEARPBA_MASK | + (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT); + I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); + i40e_flush(hw); +} + +static void +i40e_intr_adminq_disable(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + + reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT; + I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); +} + +static void +i40e_intr_io_enable(i40e_t *i40e, int vector) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | + I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | + (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); + I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); +} + +static void +i40e_intr_io_disable(i40e_t *i40e, int vector) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; + I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); +} + +/* + * When MSI-X interrupts are being used, then we can enable the actual + * interrupts themselves. However, when they are not, we instead have to turn + * towards the queue's CAUSE_ENA bit and enable that. 
+ */ +void +i40e_intr_io_enable_all(i40e_t *i40e) +{ + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + int i; + + for (i = 1; i < i40e->i40e_intr_count; i++) { + i40e_intr_io_enable(i40e, i); + } + } else { + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE)); + reg |= I40E_QINT_RQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), reg); + + reg = I40E_READ_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE)); + reg |= I40E_QINT_TQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), reg); + } +} + +/* + * When MSI-X interrupts are being used, then we can disable the actual + * interrupts themselves. However, when they are not, we instead have to turn + * towards the queue's CAUSE_ENA bit and disable that. + */ +void +i40e_intr_io_disable_all(i40e_t *i40e) +{ + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + int i; + + for (i = 1; i < i40e->i40e_intr_count; i++) { + i40e_intr_io_disable(i40e, i); + } + } else { + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE)); + reg &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), reg); + + reg = I40E_READ_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE)); + reg &= ~I40E_QINT_TQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), reg); + } +} + +/* + * As part of disabling the tx and rx queue's we're technically supposed to + * remove the linked list entries. The simplest way is to clear the LNKLSTN + * register by setting it to I40E_QUEUE_TYPE_EOL (0x7FF). + * + * Note all of the FM register access checks are performed by the caller. 
+ */ +void +i40e_intr_io_clear_cause(i40e_t *i40e) +{ + int i; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { + uint32_t reg; + reg = I40E_QUEUE_TYPE_EOL; + I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, reg); + return; + } + + for (i = 1; i < i40e->i40e_intr_count; i++) { + uint32_t reg; +#ifdef DEBUG + /* + * Verify that the interrupt in question is disabled. This is a + * prerequisite of modifying the data in question. + */ + reg = I40E_READ_REG(hw, I40E_PFINT_DYN_CTLN(i - 1)); + VERIFY0(reg & I40E_PFINT_DYN_CTLN_INTENA_MASK); +#endif + reg = I40E_QUEUE_TYPE_EOL; + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i - 1), reg); + } + + i40e_flush(hw); +} + +/* + * Finalize interrupt handling. Mostly this disables the admin queue. + */ +void +i40e_intr_chip_fini(i40e_t *i40e) +{ +#ifdef DEBUG + int i; + uint32_t reg; + + i40e_hw_t *hw = &i40e->i40e_hw_space; + + /* + * Take a look and verify that all other interrupts have been disabled + * and the interrupt linked lists have been zeroed. + */ + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + for (i = 1; i < i40e->i40e_intr_count; i++) { + reg = I40E_READ_REG(hw, I40E_PFINT_DYN_CTLN(i - 1)); + VERIFY0(reg & I40E_PFINT_DYN_CTLN_INTENA_MASK); + + reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(i - 1)); + VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL); + } + } +#endif + + i40e_intr_adminq_disable(i40e); +} + +/* + * Enable all of the queues and set the corresponding LNKLSTN registers. Note + * that we always enable queues as interrupt sources, even though we don't + * enable the MSI-X interrupt vectors. + */ +static void +i40e_intr_init_queue_msix(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + + /* + * Because we only have a single queue, just do something simple now. + * How this all works will need to really be properly redone based on + * the bit maps, etc. Note that we skip the ITR logic for the moment, + * just to make our lives as explicit and simple as possible. 
+ */ + reg = (0 << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(0), reg); + + reg = (1 << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (0 << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(0), reg); + + reg = (1 << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(0), reg); + +} + +/* + * Set up a single queue to share the admin queue interrupt in the non-MSI-X + * world. Note we do not enable the queue as an interrupt cause at this time. We + * don't have any other vector of control here, unlike with the MSI-X interrupt + * case. 
+ */ +static void +i40e_intr_init_queue_shared(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + + VERIFY(i40e->i40e_intr_type == DDI_INTR_TYPE_FIXED || + i40e->i40e_intr_type == DDI_INTR_TYPE_MSI); + + reg = (I40E_INTR_NOTX_QUEUE << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, reg); + + reg = (I40E_INTR_NOTX_INTR << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (I40E_INTR_NOTX_RX_QUEUE << I40E_QINT_RQCTL_MSIX0_INDX_SHIFT) | + (I40E_INTR_NOTX_QUEUE << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), reg); + + reg = (I40E_INTR_NOTX_INTR << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (I40E_INTR_NOTX_TX_QUEUE << I40E_QINT_TQCTL_MSIX0_INDX_SHIFT) | + (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(I40E_INTR_NOTX_QUEUE), reg); +} + +/* + * Enable the specified queue as a valid source of interrupts. Note, this should + * only be used as part of the GLDv3's interrupt blanking routines. The debug + * build assertions are specific to that. + */ +void +i40e_intr_rx_queue_enable(i40e_t *i40e, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + ASSERT(queue < i40e->i40e_num_trqpairs); + + reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue)); + ASSERT0(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK); + reg |= I40E_QINT_RQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); +} + +/* + * Disable the specified queue as a valid source of interrupts. Note, this + * should only be used as part of the GLDv3's interrupt blanking routines. 
The + * debug build assertions are specific to that. + */ +void +i40e_intr_rx_queue_disable(i40e_t *i40e, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + ASSERT(queue < i40e->i40e_num_trqpairs); + + reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue)); + ASSERT3U(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK, ==, + I40E_QINT_RQCTL_CAUSE_ENA_MASK); + reg &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK; + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); +} + +/* + * Start up the various chip's interrupt handling. We not only configure the + * adminq here, but we also go through and configure all of the actual queues, + * the interrupt linked lists, and others. + */ +void +i40e_intr_chip_init(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + + /* + * Ensure that all non adminq interrupts are disabled at the chip level. + */ + i40e_intr_io_disable_all(i40e); + + I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, 0); + (void) I40E_READ_REG(hw, I40E_PFINT_ICR0); + + /* + * Always enable all of the other-class interrupts to be on their own + * ITR. This only needs to be set on interrupt zero, which has its own + * special setting. + */ + reg = I40E_ITR_INDEX_OTHER << I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT; + I40E_WRITE_REG(hw, I40E_PFINT_STAT_CTL0, reg); + + /* + * Enable interrupt types we expect to receive. At the moment, this + * is limited to the adminq; however, we'll want to review 11.2.2.9.22 + * for more types here as we add support for detecting them, handling + * them, and resetting the device as appropriate. + */ + reg = I40E_PFINT_ICR0_ENA_ADMINQ_MASK; + I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, reg); + + /* + * Always set the interrupt linked list to empty. We'll come back and + * change this if MSI-X are actually on the scene. 
+ */ + I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, I40E_QUEUE_TYPE_EOL); + + i40e_intr_adminq_enable(i40e); + + /* + * Set up all of the queues and map them to interrupts based on the bit + * assignments. + */ + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + i40e_intr_init_queue_msix(i40e); + } else { + i40e_intr_init_queue_shared(i40e); + } + + /* + * Finally set all of the default ITRs for the interrupts. Note that the + * queues will have been set up above. + */ + i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr); + i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr); + i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER, i40e->i40e_other_itr); +} + +static void +i40e_intr_adminq_work(i40e_t *i40e) +{ + struct i40e_hw *hw = &i40e->i40e_hw_space; + struct i40e_arq_event_info evt; + uint16_t remain = 1; + + bzero(&evt, sizeof (struct i40e_arq_event_info)); + evt.buf_len = I40E_ADMINQ_BUFSZ; + evt.msg_buf = i40e->i40e_aqbuf; + + while (remain != 0) { + enum i40e_status_code ret; + uint16_t opcode; + + /* + * At the moment, the only error code that seems to be returned + * is one saying that there's no work. In such a case we leave + * this be. + */ + ret = i40e_clean_arq_element(hw, &evt, &remain); + if (ret != I40E_SUCCESS) + break; + + opcode = LE_16(evt.desc.opcode); + switch (opcode) { + case i40e_aqc_opc_get_link_status: + mutex_enter(&i40e->i40e_general_lock); + i40e_link_check(i40e); + mutex_exit(&i40e->i40e_general_lock); + break; + default: + /* + * Longer term we'll want to enable other causes here + * and get these cleaned up and doing something. 
+ */ + break; + } + } +} + +static void +i40e_intr_rx_work(i40e_t *i40e, int queue) +{ + mblk_t *mp; + i40e_trqpair_t *itrq; + + ASSERT(queue < i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[queue]; + + mutex_enter(&itrq->itrq_rx_lock); + mp = i40e_ring_rx(itrq, I40E_POLL_NULL); + mutex_exit(&itrq->itrq_rx_lock); + + if (mp != NULL) { + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); + } +} + +static void +i40e_intr_tx_work(i40e_t *i40e, int queue) +{ + i40e_trqpair_t *itrq; + + itrq = &i40e->i40e_trqpairs[queue]; + i40e_tx_recycle_ring(itrq); +} + +/* + * At the moment, the only 'other' interrupt on ICR0 that we handle is the + * adminq. We should go through and support the other notifications at some + * point. + */ +static void +i40e_intr_other_work(i40e_t *i40e) +{ + struct i40e_hw *hw = &i40e->i40e_hw_space; + uint32_t reg; + + reg = I40E_READ_REG(hw, I40E_PFINT_ICR0); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + return; + } + + if (reg & I40E_PFINT_ICR0_ADMINQ_MASK) + i40e_intr_adminq_work(i40e); + + /* + * Make sure that the adminq interrupt is not masked and then explicitly + * enable the adminq and thus the other interrupt. + */ + reg = I40E_READ_REG(hw, I40E_PFINT_ICR0_ENA); + reg |= I40E_PFINT_ICR0_ENA_ADMINQ_MASK; + I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, reg); + + i40e_intr_adminq_enable(i40e); +} + +uint_t +i40e_intr_msix(void *arg1, void *arg2) +{ + i40e_t *i40e = (i40e_t *)arg1; + int vector_idx = (int)(uintptr_t)arg2; + + /* + * When using MSI-X interrupts, vector 0 is always reserved for the + * adminq at this time. Though longer term, we'll want to also bridge + * some I/O to them. 
+ */ + if (vector_idx == 0) { + i40e_intr_other_work(i40e); + return (DDI_INTR_CLAIMED); + } + + VERIFY(vector_idx == 1); + + /* + * Note that we explicitly do not check this value under the lock even + * though assignments to it are done so. In this case, the cost of + * getting this wrong is at worst a bit of additional contention and + * even more rarely, a duplicated packet. However, the cost on the other + * hand is a lot more. This is something that as we more generally + * implement ring support we should revisit. + */ + if (i40e->i40e_intr_poll != B_TRUE) + i40e_intr_rx_work(i40e, 0); + i40e_intr_tx_work(i40e, 0); + i40e_intr_io_enable(i40e, 1); + + return (DDI_INTR_CLAIMED); +} + +static uint_t +i40e_intr_notx(i40e_t *i40e, boolean_t shared) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint32_t reg; + int ret = DDI_INTR_CLAIMED; + + if (shared == B_TRUE) { + mutex_enter(&i40e->i40e_general_lock); + if (i40e->i40e_state & I40E_SUSPENDED) { + mutex_exit(&i40e->i40e_general_lock); + return (DDI_INTR_UNCLAIMED); + } + mutex_exit(&i40e->i40e_general_lock); + } + + reg = I40E_READ_REG(hw, I40E_PFINT_ICR0); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + return (DDI_INTR_CLAIMED); + } + + if (reg == 0) { + if (shared == B_TRUE) + ret = DDI_INTR_UNCLAIMED; + goto done; + } + + if (reg & I40E_PFINT_ICR0_ADMINQ_MASK) + i40e_intr_adminq_work(i40e); + + if (reg & I40E_INTR_NOTX_RX_MASK) + i40e_intr_rx_work(i40e, 0); + + if (reg & I40E_INTR_NOTX_TX_MASK) + i40e_intr_tx_work(i40e, 0); + +done: + i40e_intr_adminq_enable(i40e); + return (ret); + +} + +/* ARGSUSED */ +uint_t +i40e_intr_msi(void *arg1, void *arg2) +{ + i40e_t *i40e = (i40e_t *)arg1; + + return (i40e_intr_notx(i40e, B_FALSE)); +} + +/* ARGSUSED */ +uint_t +i40e_intr_legacy(void *arg1, void *arg2) +{ + i40e_t *i40e = (i40e_t *)arg1; + + return 
(i40e_intr_notx(i40e, B_TRUE)); +} diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c new file mode 100644 index 0000000000..91164abf87 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -0,0 +1,2883 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + */ + +/* + * i40e - Intel 10/40 Gb Ethernet driver + * + * The i40e driver is the main software device driver for the Intel 40 Gb family + * of devices. Note that these devices come in many flavors with both 40 GbE + * ports and 10 GbE ports. This device is the successor to the 82599 family of + * devices (ixgbe). + * + * Unlike previous generations of Intel 1 GbE and 10 GbE devices, the 40 GbE + * devices defined in the XL710 controller (previously known as Fortville) are a + * rather different beast and have a small switch embedded inside of them. In + * addition, the way that most of the programming is done has been overhauled. + * As opposed to just using PCIe memory mapped registers, it also has an + * administrative queue which is used to communicate with firmware running on + * the chip. + * + * Each physical function in the hardware shows up as a device that this driver + * will bind to. The hardware splits many resources evenly across all of the + * physical functions present on the device, while other resources are instead + * shared across the entire card and its up to the device driver to + * intelligently partition them. 
+ * + * ------------ + * Organization + * ------------ + * + * This driver is made up of several files which have their own theory + * statements spread across them. We'll touch on the high level purpose of each + * file here, and then we'll get into more discussion on how the device is + * generally modelled with respect to the interfaces in illumos. + * + * i40e_gld.c: This file contains all of the bindings to MAC and the networking + * stack. + * + * i40e_intr.c: This file contains all of the interrupt service routines and + * contains logic to enable and disable interrupts on the hardware. + * It also contains the logic to map hardware resources such as the + * rings to and from interrupts and controls their ability to fire. + * + * There is a big theory statement on interrupts present there. + * + * i40e_main.c: The file that you're currently in. It interfaces with the + * traditional OS DDI interfaces and is in charge of configuring + * the device. + * + * i40e_osdep.[ch]: These files contain interfaces and definitions needed to + * work with Intel's common code for the device. + * + * i40e_stats.c: This file contains the general work and logic around our + * kstats. A theory statement on their organization and use of the + * hardware exists there. + * + * i40e_sw.h: This header file contains all of the primary structure definitions + * and constants that are used across the entire driver. + * + * i40e_transceiver.c: This file contains all of the logic for sending and + * receiving data. It contains all of the ring and DMA + * allocation logic, as well as, the actual interfaces to + * send and receive data. + * + * A big theory statement on ring management, descriptors, + * and how it ties into the OS is present there. + * + * -------------- + * General Design + * -------------- + * + * Before we go too far into the general way we've laid out data structures and + * the like, it's worth taking some time to explain how the hardware is + * organized. 
This organization informs a lot of how we do things at this time + * in the driver. + * + * Each physical device consists of a number of one or more ports, which are + * considered physical functions in the PCI sense and thus each get enumerated + * by the system, resulting in an instance being created and attached to. While + * there are many resources that are unique to each physical function eg. + * instance of the device, there are many that are shared across all of them. + * Several resources have an amount reserved for each VSI and then a static pool + * of resources, available for all functions on the card. + * + * The most important resource in hardware are its transmit and receive queue + * pairs (i40e_trqpair_t). These should be thought of as rings in GLDv3 + * parlance. There are a set number of these on each device; however, they are + * statically partitioned among all of the different physical functions. + * + * 'Fortville' (the code name for this device family) is basically a switch. To + * map MAC addresses and other things to queues, we end up having to create + * Virtual Station Interfaces (VSIs) and establish forwarding rules that direct + * traffic to a queue. A VSI owns a collection of queues and has a series of + * forwarding rules that point to it. One way to think of this is to treat it + * like MAC does a VNIC. When MAC refers to a group, a collection of rings and + * classification resources, that is a VSI in i40e. + * + * The sets of VSIs is shared across the entire device, though there may be some + * amount that are reserved to each PF. Because the GLDv3 does not let us change + * the number of groups dynamically, we instead statically divide this amount + * evenly between all the functions that exist. In addition, we have the same + * problem with the mac address forwarding rules. There are a static number that + * exist shared across all the functions. 
+ * + * To handle both of these resources, what we end up doing is going through and + * determining which functions belong to the same device. Nominally one might do + * this by having a nexus driver; however, a prime requirement for a nexus + * driver is identifying the various children and activating them. While it is + * possible to get this information from NVRAM, we would end up duplicating a + * lot of the PCI enumeration logic. Really, at the end of the day, the device + * doesn't give us the traditional identification properties we want from a + * nexus driver. + * + * Instead, we rely on some properties that are guaranteed to be unique. While + * it might be tempting to leverage the PBA or serial number of the device from + * NVRAM, there is nothing that says that two devices can't be mis-programmed to + * have the same values in NVRAM. Instead, we uniquely identify a group of + * functions based on their parent in the /devices tree, their PCI bus and PCI + * function identifiers. Using either on their own may not be sufficient. + * + * For each unique PCI device that we encounter, we'll create an i40e_device_t. + * From there, because we don't have a good way to tell the GLDv3 about sharing + * resources between everything, we'll end up just dividing the resources + * evenly between all of the functions. Longer term, if we don't have to declare + * to the GLDv3 that these resources are shared, then we'll maintain a pool and + * have each PF allocate from the pool in the device, thus if only two of four + * ports are being used, for example, then all of the resources can still be + * used. + * + * ------------------------------------------- + * Transmit and Receive Queue Pair Allocations + * ------------------------------------------- + * + * NVRAM ends up assigning each PF its own share of the transmit and receive LAN + * queue pairs, we have no way of modifying it, only observing it. From there, + * it's up to us to map these queues to VSIs and VFs.
Since we don't support any + * VFs at this time, we only focus on assignments to VSIs. + * + * At the moment, we use a static mapping of transmit/receive queue pairs to a + * given VSI (eg. rings to a group). Though in the fullness of time, we want to + * make this something which is fully dynamic and take advantage of documented, + * but not yet available functionality for adding filters based on VXLAN and + * other encapsulation technologies. + * + * ------------------------------------- + * Broadcast, Multicast, and Promiscuous + * ------------------------------------- + * + * As part of the GLDv3, we need to make sure that we can handle receiving + * broadcast and multicast traffic. As well as enabling promiscuous mode when + * requested. GLDv3 requires that all broadcast and multicast traffic be + * retrieved by the default group, eg. the first one. This is the same thing as + * the default VSI. + * + * To receive broadcast traffic, we enable it through the admin queue, rather + * than use one of our filters for it. For multicast traffic, we reserve a + * certain number of the hash filters and assign them to a given PF. When we + * exceed those, we then switch to using promiscuous mode for multicast traffic. + * + * More specifically, once we exceed the number of filters (indicated because + * the i40e_t`i40e_resources.ifr_nmcastfilt == + * i40e_t`i40e_resources.ifr_nmcastfilt_used), we then instead need to toggle + * promiscuous mode. If promiscuous mode is toggled then we keep track of the + * number of MACs added to it by incrementing i40e_t`i40e_mcast_promisc_count. + * That will stay enabled until that count reaches zero indicating that we have + * only added multicast addresses that we have a corresponding entry for. + * + * Because MAC itself wants to toggle promiscuous mode, which includes both + * unicast and multicast traffic, we go through and keep track of that + * ourselves.
That is maintained through the use of the i40e_t`i40e_promisc_on + * member. + * + * -------------- + * VSI Management + * -------------- + * + * At this time, we currently only support a single MAC group, and thus a single + * VSI. This VSI is considered the default VSI and should be the only one that + * exists after a reset. Currently it is stored as the member + * i40e_t`i40e_vsi_id. While this works for the moment and for an initial + * driver, it's not sufficient for the longer-term path of the driver. Instead, + * we'll want to actually have a unique i40e_vsi_t structure which is used + * everywhere. Note that this means that every place that uses the + * i40e_t`i40e_vsi_id will need to be refactored. + * + * ---------------- + * Structure Layout + * ---------------- + * + * The following images relates the core data structures together. The primary + * structure in the system is the i40e_t. It itself contains multiple rings, + * i40e_trqpair_t's which contain the various transmit and receive data. The + * receive data is stored outside of the i40e_trqpair_t and instead in the + * i40e_rx_data_t. The i40e_t has a corresponding i40e_device_t which keeps + * track of per-physical device state. Finally, for every active descriptor, + * there is a corresponding control block, which is where the + * i40e_rx_control_block_t and the i40e_tx_control_block_t come from. + * + * +-----------------------+ +-----------------------+ + * | Global i40e_t list | | Global Device list | + * | | +--| | + * | i40e_glist | | | i40e_dlist | + * +-----------------------+ | +-----------------------+ + * | v + * | +------------------------+ +-----------------------+ + * | | Device-wide Structure |----->| Device-wide Structure |--> ... + * | | i40e_device_t | | i40e_device_t | + * | | | +-----------------------+ + * | | dev_info_t * ------+--> Parent in devices tree. 
+ * | | uint_t ------+--> PCI bus number + * | | uint_t ------+--> PCI device number + * | | uint_t ------+--> Number of functions + * | | i40e_switch_rsrcs_t ---+--> Captured total switch resources + * | | list_t ------+-------------+ + * | +------------------------+ | + * | ^ | + * | +--------+ | + * | | v + * | +---------------------------+ | +-------------------+ + * +->| GLDv3 Device, per PF |-----|-->| GLDv3 Device (PF) |--> ... + * | i40e_t | | | i40e_t | + * | **Primary Structure** | | +-------------------+ + * | | | + * | i40e_device_t * --+-----+ + * | i40e_state_t --+---> Device State + * | i40e_hw_t --+---> Intel common code structure + * | mac_handle_t --+---> GLDv3 handle to MAC + * | ddi_periodic_t --+---> Link activity timer + * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_func_rsrc_t --+---> Available hardware resources + * | i40e_switch_rsrc_t * --+---> Switch resource snapshot + * | i40e_sdu --+---> Current MTU + * | i40e_frame_max --+---> Current HW frame size + * | i40e_uaddr_t * --+---> Array of assigned unicast MACs + * | i40e_maddr_t * --+---> Array of assigned multicast MACs + * | i40e_mcast_promisccount --+---> Active multicast state + * | i40e_promisc_on --+---> Current promiscuous mode state + * | int --+---> Number of transmit/receive pairs + * | kstat_t * --+---> PF kstats + * | kstat_t * --+---> VSI kstats + * | i40e_pf_stats_t --+---> PF kstat backing data + * | i40e_vsi_stats_t --+---> VSI kstat backing data + * | i40e_trqpair_t * --+---------+ + * +---------------------------+ | + * | + * v + * +-------------------------------+ +-----------------------------+ + * | Transmit/Receive Queue Pair |-------| Transmit/Receive Queue Pair |->... 
+ * | i40e_trqpair_t | | i40e_trqpair_t | + * + Ring Data Structure | +-----------------------------+ + * | | + * | mac_ring_handle_t +--> MAC RX ring handle + * | mac_ring_handle_t +--> MAC TX ring handle + * | i40e_rxq_stat_t --+--> RX Queue stats + * | i40e_txq_stat_t --+--> TX Queue stats + * | uint32_t (tx ring size) +--> TX Ring Size + * | uint32_t (tx free list size) +--> TX Free List Size + * | i40e_dma_buffer_t --------+--> TX Descriptor ring DMA + * | i40e_tx_desc_t * --------+--> TX descriptor ring + * | volatile uint32_t * +--> TX Write back head + * | uint32_t -------+--> TX ring head + * | uint32_t -------+--> TX ring tail + * | uint32_t -------+--> Num TX desc free + * | i40e_tx_control_block_t * --+--> TX control block array ---+ + * | i40e_tx_control_block_t ** --+--> TCB work list ----+ + * | i40e_tx_control_block_t ** --+--> TCB free list ---+ + * | uint32_t -------+--> Free TCB count | + * | i40e_rx_data_t * -------+--+ v + * +-------------------------------+ | +---------------------------+ + * | | Per-TX Frame Metadata | + * | | i40e_tx_control_block_t | + * +--------------------+ | | + * | mblk to transmit <--+--- mblk_t * | + * | type of transmit <--+--- i40e_tx_type_t | + * | TX DMA handle <--+--- ddi_dma_handle_t | + * v TX DMA buffer <--+--- i40e_dma_buffer_t | + * +------------------------------+ +---------------------------+ + * | Core Receive Data | + * | i40e_rx_data_t | + * | | + * | i40e_dma_buffer_t --+--> RX descriptor DMA Data + * | i40e_rx_desc_t --+--> RX descriptor ring + * | uint32_t --+--> Next free desc.
+ * | i40e_rx_control_block_t * --+--> RX Control Block Array ---+ + * | i40e_rx_control_block_t ** --+--> RCB work list ---+ + * | i40e_rx_control_block_t ** --+--> RCB free list ---+ + * +------------------------------+ | + * ^ | + * | +---------------------------+ | + * | | Per-RX Frame Metadata |<---------------+ + * | | i40e_rx_control_block_t | + * | | | + * | | mblk_t * ----+--> Received mblk_t data + * | | uint32_t ----+--> Reference count + * | | i40e_dma_buffer_t ----+--> Receive data DMA info + * | | frtn_t ----+--> mblk free function info + * +-----+-- i40e_rx_data_t * | + * +---------------------------+ + * + * ------------- + * Lock Ordering + * ------------- + * + * In order to ensure that we don't deadlock, the following represents the + * lock order being used. When grabbing locks, follow the following order. Lower + * numbers are more important. Thus, the i40e_glock which is number 0, must be + * taken before any other locks in the driver. On the other hand, the + * i40e_t`i40e_stat_lock, has the highest number because it's the least + * important lock. Note, that just because one lock is higher than another does + * not mean that all intermediary locks are required. + * + * 0) i40e_glock + * 1) i40e_t`i40e_general_lock + * + * 2) i40e_trqpair_t`itrq_rx_lock + * 3) i40e_trqpair_t`itrq_tx_lock + * 4) i40e_t`i40e_rx_pending_lock + * 5) i40e_trqpair_t`itrq_tcb_lock + * + * 6) i40e_t`i40e_stat_lock + * + * Rules and expectations: + * + * 1) A thread holding locks belonging to one PF should not hold locks belonging to + * a second. If for some reason this becomes necessary, locks should be grabbed + * based on the list order in the i40e_device_t, which implies that the + * i40e_glock is held. + * + * 2) When grabbing locks between multiple transmit and receive queues, the + * locks for the lowest number transmit/receive queue should be grabbed first.
 *
 * 3) When grabbing both the transmit and receive lock for a given queue, always
 * grab i40e_trqpair_t`itrq_rx_lock before the i40e_trqpair_t`itrq_tx_lock.
 *
 * 4) The following pairs of locks are not expected to be held at the same time:
 *
 *  o i40e_t`i40e_rx_pending_lock and i40e_trqpair_t`itrq_tcb_lock
 *
 * -----------
 * Future Work
 * -----------
 *
 * At the moment the i40e driver is rather bare bones, allowing us to start
 * getting data flowing and folks using it while we develop additional features.
 * While bugs have been filed to cover this future work, the following gives an
 * overview of expected work:
 *
 *  o TSO support
 *  o RSS / multiple ring support
 *  o Multiple group support
 *  o DMA binding and breaking up the locking in ring recycling.
 *  o Enhanced detection of device errors
 *  o Participation in IRM
 *  o FMA device reset
 *  o Stall detection, temperature error detection, etc.
 *  o More dynamic resource pools
 */

#include "i40e_sw.h"

static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.0";

/*
 * The i40e_glock primarily protects the lists below and the i40e_device_t
 * structures.
 */
static kmutex_t i40e_glock;
static list_t i40e_glist;
static list_t i40e_dlist;

/*
 * Access attributes for register mapping.
 */
static ddi_device_acc_attr_t i40e_regs_acc_attr = {
	DDI_DEVICE_ATTR_V1,
	DDI_STRUCTURE_LE_ACC,
	DDI_STRICTORDER_ACC,
	DDI_FLAGERR_ACC
};

/*
 * Logging function for this driver.
 *
 * The message is formatted into a 1024-byte stack buffer with vsnprintf(), so
 * anything longer is truncated. When i40e is NULL the message is emitted with
 * cmn_err() and prefixed with I40E_MODULE_NAME; otherwise it is emitted with
 * dev_err() against the instance's dev_info_t. When console is B_FALSE the
 * format handed to cmn_err()/dev_err() begins with '!'; see cmn_err(9F) for
 * the meaning of that prefix.
 */
static void
i40e_dev_err(i40e_t *i40e, int level, boolean_t console, const char *fmt,
    va_list ap)
{
	char buf[1024];

	(void) vsnprintf(buf, sizeof (buf), fmt, ap);

	if (i40e == NULL) {
		cmn_err(level, (console) ? "%s: %s" : "!%s: %s",
		    I40E_MODULE_NAME, buf);
	} else {
		dev_err(i40e->i40e_dip, level, (console) ? "%s" : "!%s",
		    buf);
	}
}

/*
 * Because there's the stupid trailing-comma problem with the C preprocessor
 * and variable arguments, I need to instantiate these. Pardon the redundant
 * code.
 *
 * All three are thin wrappers around i40e_dev_err():
 *
 *	i40e_error()	CE_WARN, console == B_FALSE
 *	i40e_log()	CE_NOTE, console == B_FALSE
 *	i40e_notice()	CE_NOTE, console == B_TRUE
 *
 * i40e may be NULL, in which case i40e_dev_err() falls back to cmn_err().
 */
void
i40e_error(i40e_t *i40e, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	i40e_dev_err(i40e, CE_WARN, B_FALSE, fmt, ap);
	va_end(ap);
}

void
i40e_log(i40e_t *i40e, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	i40e_dev_err(i40e, CE_NOTE, B_FALSE, fmt, ap);
	va_end(ap);
}

void
i40e_notice(i40e_t *i40e, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	i40e_dev_err(i40e, CE_NOTE, B_TRUE, fmt, ap);
	va_end(ap);
}

/*
 * Drop this instance's registration with its i40e_device_t, if it has one.
 *
 * Under i40e_glock the instance is taken off the device's list and the
 * registration count is decremented. When the count reaches zero the
 * i40e_device_t is unlinked from i40e_dlist and freed together with its copy
 * of the switch resources. i40e->i40e_device is NULL on return.
 */
static void
i40e_device_rele(i40e_t *i40e)
{
	i40e_device_t *idp = i40e->i40e_device;

	if (idp == NULL)
		return;

	mutex_enter(&i40e_glock);
	VERIFY(idp->id_nreg > 0);
	list_remove(&idp->id_i40e_list, i40e);
	idp->id_nreg--;
	if (idp->id_nreg == 0) {
		/* Last registered function: tear the device entry down. */
		list_remove(&i40e_dlist, idp);
		list_destroy(&idp->id_i40e_list);
		kmem_free(idp->id_rsrcs, sizeof (i40e_switch_rsrc_t) *
		    idp->id_rsrcs_alloc);
		kmem_free(idp, sizeof (i40e_device_t));
	}
	i40e->i40e_device = NULL;
	mutex_exit(&i40e_glock);
}

/*
 * Look up, or create, the i40e_device_t matching the given parent, PCI bus and
 * PCI device, and register this instance on it.
 *
 * An existing entry has its registration count bumped (it is a fatal error for
 * more instances to register than the entry has functions). A new entry starts
 * with a count of one and takes a private copy of the switch resources
 * currently recorded in the i40e_t, so i40e_switch_rsrcs and its counts must
 * already be filled in when the first instance of a device gets here.
 *
 * In both cases the instance is appended to the entry's id_i40e_list. The
 * entry is returned; this function does not store it in the i40e_t.
 */
static i40e_device_t *
i40e_device_find(i40e_t *i40e, dev_info_t *parent, uint_t bus, uint_t device)
{
	i40e_device_t *idp;
	mutex_enter(&i40e_glock);
	for (idp = list_head(&i40e_dlist); idp != NULL;
	    idp = list_next(&i40e_dlist, idp)) {
		if (idp->id_parent == parent && idp->id_pci_bus == bus &&
		    idp->id_pci_device == device) {
			break;
		}
	}

	if (idp != NULL) {
		VERIFY(idp->id_nreg < idp->id_nfuncs);
		idp->id_nreg++;
	} else {
		i40e_hw_t *hw = &i40e->i40e_hw_space;
		ASSERT(hw->num_ports > 0);
		ASSERT(hw->num_partitions > 0);

		/*
		 * The Intel common code doesn't exactly keep the number of PCI
		 * functions. But it calculates it during discovery of
		 * partitions and ports. So what we do is undo the calculation
		 * that it does originally, as functions are evenly spread
		 * across ports in the rare case of partitions.
		 */
		idp = kmem_alloc(sizeof (i40e_device_t), KM_SLEEP);
		idp->id_parent = parent;
		idp->id_pci_bus = bus;
		idp->id_pci_device = device;
		idp->id_nfuncs = hw->num_ports * hw->num_partitions;
		idp->id_nreg = 1;
		idp->id_rsrcs_alloc = i40e->i40e_switch_rsrc_alloc;
		idp->id_rsrcs_act = i40e->i40e_switch_rsrc_actual;
		idp->id_rsrcs = kmem_alloc(sizeof (i40e_switch_rsrc_t) *
		    idp->id_rsrcs_alloc, KM_SLEEP);
		bcopy(i40e->i40e_switch_rsrcs, idp->id_rsrcs,
		    sizeof (i40e_switch_rsrc_t) * idp->id_rsrcs_alloc);
		list_create(&idp->id_i40e_list, sizeof (i40e_t),
		    offsetof(i40e_t, i40e_dlink));

		list_insert_tail(&i40e_dlist, idp);
	}

	list_insert_tail(&idp->id_i40e_list, i40e);
	mutex_exit(&i40e_glock);

	return (idp);
}

/*
 * Record a new link state and tell MAC about it, but only when it differs
 * from the state we already have cached.
 */
static void
i40e_link_state_set(i40e_t *i40e, link_state_t state)
{
	if (i40e->i40e_link_state == state)
		return;

	i40e->i40e_link_state = state;
	mac_link_update(i40e->i40e_mac_hdl, i40e->i40e_link_state);
}

/*
 * This is a basic link check routine. Mostly we're using this just to see
 * if we can get any accurate information about the state of the link being
 * up or down, as well as updating the link state, speed, etc. information.
 *
 * The caller must hold i40e_general_lock. If the link status query fails, the
 * failure is counted and its code recorded, and the cached link state is left
 * as it was.
 */
void
i40e_link_check(i40e_t *i40e)
{
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	boolean_t ls;
	int ret;

	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));

	hw->phy.get_link_info = B_TRUE;
	if ((ret = i40e_get_link_status(hw, &ls)) != I40E_SUCCESS) {
		i40e->i40e_s_link_status_errs++;
		i40e->i40e_s_link_status_lasterr = ret;
		return;
	}

	/*
	 * Firmware abstracts all of the mac and phy information for us, so we
	 * can use i40e_get_link_status to determine the current state.
	 */
	if (ls == B_TRUE) {
		enum i40e_aq_link_speed speed;

		speed = i40e_get_link_speed(hw);

		/*
		 * Translate from an i40e value to a value in Mbits/s.
		 */
		switch (speed) {
		case I40E_LINK_SPEED_100MB:
			i40e->i40e_link_speed = 100;
			break;
		case I40E_LINK_SPEED_1GB:
			i40e->i40e_link_speed = 1000;
			break;
		case I40E_LINK_SPEED_10GB:
			i40e->i40e_link_speed = 10000;
			break;
		case I40E_LINK_SPEED_20GB:
			i40e->i40e_link_speed = 20000;
			break;
		case I40E_LINK_SPEED_40GB:
			i40e->i40e_link_speed = 40000;
			break;
		default:
			/* Anything we don't recognize is reported as 0. */
			i40e->i40e_link_speed = 0;
			break;
		}

		/*
		 * At this time, hardware does not support half-duplex
		 * operation, hence why we don't ask the hardware about our
		 * current duplex.
		 */
		i40e->i40e_link_duplex = LINK_DUPLEX_FULL;
		i40e_link_state_set(i40e, LINK_STATE_UP);
	} else {
		i40e->i40e_link_speed = 0;
		i40e->i40e_link_duplex = 0;
		i40e_link_state_set(i40e, LINK_STATE_DOWN);
	}
}

/*
 * Free every interrupt handle and then the handle array itself. A handle that
 * fails to free is logged and otherwise ignored.
 */
static void
i40e_rem_intrs(i40e_t *i40e)
{
	int i, rc;

	for (i = 0; i < i40e->i40e_intr_count; i++) {
		rc = ddi_intr_free(i40e->i40e_intr_handles[i]);
		if (rc != DDI_SUCCESS) {
			i40e_log(i40e, "failed to free interrupt %d: %d",
			    i, rc);
		}
	}

	kmem_free(i40e->i40e_intr_handles, i40e->i40e_intr_size);
	i40e->i40e_intr_handles = NULL;
}

/*
 * Remove the handler from every interrupt. A handler that fails to come off
 * is logged and otherwise ignored.
 */
static void
i40e_rem_intr_handlers(i40e_t *i40e)
{
	int i, rc;

	for (i = 0; i < i40e->i40e_intr_count; i++) {
		rc = ddi_intr_remove_handler(i40e->i40e_intr_handles[i]);
		if (rc != DDI_SUCCESS) {
			i40e_log(i40e, "failed to remove interrupt %d: %d",
			    i, rc);
		}
	}
}

/*
 * illumos Fault Management Architecture (FMA) support.
+ */ + +int +i40e_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + ddi_fm_acc_err_clear(handle, DDI_FME_VERSION); + return (de.fme_status); +} + +int +i40e_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + return (de.fme_status); +} + +/* + * Fault service error handling callback function. + */ +/* ARGSUSED */ +static int +i40e_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +i40e_fm_init(i40e_t *i40e) +{ + ddi_iblock_cookie_t iblk; + + i40e->i40e_fm_capabilities = ddi_prop_get_int(DDI_DEV_T_ANY, + i40e->i40e_dip, DDI_PROP_DONTPASS, "fm_capable", + DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | + DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); + + if (i40e->i40e_fm_capabilities < 0) { + i40e->i40e_fm_capabilities = 0; + } else if (i40e->i40e_fm_capabilities > 0xf) { + i40e->i40e_fm_capabilities = DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE | + DDI_FM_ERRCB_CAPABLE; + } + + /* + * Only register with IO Fault Services if we have some capability + */ + if (i40e->i40e_fm_capabilities & DDI_FM_ACCCHK_CAPABLE) { + i40e_regs_acc_attr.devacc_attr_access = DDI_FLAGERR_ACC; + } else { + i40e_regs_acc_attr.devacc_attr_access = DDI_DEFAULT_ACC; + } + + if (i40e->i40e_fm_capabilities) { + ddi_fm_init(i40e->i40e_dip, &i40e->i40e_fm_capabilities, &iblk); + + if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities) || + DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) { + pci_ereport_setup(i40e->i40e_dip); + } + + if (DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) { + ddi_fm_handler_register(i40e->i40e_dip, + i40e_fm_error_cb, (void*)i40e); + } + } + + if (i40e->i40e_fm_capabilities & DDI_FM_DMACHK_CAPABLE) { + i40e_init_dma_attrs(i40e, B_TRUE); + } else { + i40e_init_dma_attrs(i40e, B_FALSE); + } +} + +static 
void +i40e_fm_fini(i40e_t *i40e) +{ + if (i40e->i40e_fm_capabilities) { + + if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities) || + DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) + pci_ereport_teardown(i40e->i40e_dip); + + if (DDI_FM_ERRCB_CAP(i40e->i40e_fm_capabilities)) + ddi_fm_handler_unregister(i40e->i40e_dip); + + ddi_fm_fini(i40e->i40e_dip); + } +} + +void +i40e_fm_ereport(i40e_t *i40e, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(i40e->i40e_fm_capabilities)) { + ddi_fm_ereport_post(i40e->i40e_dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL); + } +} + +/* + * Here we're trying to get the ID of the default VSI. In general, when we come + * through and look at this shortly after attach, we expect there to only be a + * single element present, which is the default VSI. Importantly, each PF seems + * to not see any other devices, in part because of the simple switch mode that + * we're using. If for some reason, we see more artifact, we'll need to revisit + * what we're doing here. 
+ */ +static int +i40e_get_vsi_id(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d\n", + rc, hw->aq.asq_last_status); + return (-1); + } + + if (LE_16(sw_config->header.num_reported) != 1) { + i40e_error(i40e, "encountered multiple (%d) switching units " + "during attach, not proceeding", + LE_16(sw_config->header.num_reported)); + return (-1); + } + + return (sw_config->element[0].seid); +} + +/* + * We need to fill the i40e_hw_t structure with the capabilities of this PF. We + * must also provide the memory for it; however, we don't need to keep it around + * to the call to the common code. It takes it and parses it into an internal + * structure. 
+ */ +static boolean_t +i40e_get_hw_capabilities(i40e_t *i40e, i40e_hw_t *hw) +{ + struct i40e_aqc_list_capabilities_element_resp *buf; + int rc; + size_t len; + uint16_t needed; + int nelems = I40E_HW_CAP_DEFAULT; + + for (;;) { + len = nelems * sizeof (*buf); + ASSERT(len > 0); + buf = kmem_alloc(len, KM_SLEEP); + rc = i40e_aq_discover_capabilities(hw, buf, len, + &needed, i40e_aqc_opc_list_func_capabilities, NULL); + kmem_free(buf, len); + + if (hw->aq.asq_last_status == I40E_AQ_RC_ENOMEM && + nelems == I40E_HW_CAP_DEFAULT) { + if (nelems == needed) { + i40e_error(i40e, "Capability discovery failed " + "due to byzantine common code"); + return (B_FALSE); + } + nelems = needed; + continue; + } else if (hw->aq.asq_last_status != I40E_AQ_RC_OK) { + i40e_error(i40e, "Capability discovery failed: %d", rc); + return (B_FALSE); + } + + break; + } + + return (B_TRUE); +} + +/* + * Obtain the switch's capabilities as seen by this PF and keep it around for + * our later use. + */ +static boolean_t +i40e_get_switch_resources(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint8_t cnt = 2; + uint8_t act; + size_t size; + i40e_switch_rsrc_t *buf; + + for (;;) { + enum i40e_status_code ret; + size = cnt * sizeof (i40e_switch_rsrc_t); + ASSERT(size > 0); + if (size > UINT16_MAX) + return (B_FALSE); + buf = kmem_alloc(size, KM_SLEEP); + + ret = i40e_aq_get_switch_resource_alloc(hw, &act, buf, + cnt, NULL); + if (ret == I40E_ERR_ADMIN_QUEUE_ERROR && + hw->aq.asq_last_status == I40E_AQ_RC_EINVAL) { + kmem_free(buf, size); + cnt += I40E_SWITCH_CAP_DEFAULT; + continue; + } else if (ret != I40E_SUCCESS) { + kmem_free(buf, size); + i40e_error(i40e, + "failed to retrieve switch statistics: %d\n", ret); + return (B_FALSE); + } + + break; + } + + i40e->i40e_switch_rsrc_alloc = cnt; + i40e->i40e_switch_rsrc_actual = act; + i40e->i40e_switch_rsrcs = buf; + + return (B_TRUE); +} + +static void +i40e_cleanup_resources(i40e_t *i40e) +{ + if (i40e->i40e_uaddrs != NULL) { + 
kmem_free(i40e->i40e_uaddrs, sizeof (i40e_uaddr_t) * + i40e->i40e_resources.ifr_nmacfilt); + i40e->i40e_uaddrs = NULL; + } + + if (i40e->i40e_maddrs != NULL) { + kmem_free(i40e->i40e_maddrs, sizeof (i40e_maddr_t) * + i40e->i40e_resources.ifr_nmcastfilt); + i40e->i40e_maddrs = NULL; + } + + if (i40e->i40e_switch_rsrcs != NULL) { + size_t sz = sizeof (i40e_switch_rsrc_t) * + i40e->i40e_switch_rsrc_alloc; + ASSERT(sz > 0); + kmem_free(i40e->i40e_switch_rsrcs, sz); + i40e->i40e_switch_rsrcs = NULL; + } + + if (i40e->i40e_device != NULL) + i40e_device_rele(i40e); +} + +static boolean_t +i40e_get_available_resources(i40e_t *i40e) +{ + dev_info_t *parent; + uint16_t bus, device, func; + uint_t nregs; + int *regs, i; + i40e_device_t *idp; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + parent = ddi_get_parent(i40e->i40e_dip); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, i40e->i40e_dip, 0, "reg", + ®s, &nregs) != DDI_PROP_SUCCESS) { + return (B_FALSE); + } + + if (nregs < 1) { + ddi_prop_free(regs); + return (B_FALSE); + } + + bus = PCI_REG_BUS_G(regs[0]); + device = PCI_REG_DEV_G(regs[0]); + func = PCI_REG_FUNC_G(regs[0]); + ddi_prop_free(regs); + + i40e->i40e_hw_space.bus.func = func; + i40e->i40e_hw_space.bus.device = device; + + if (i40e_get_switch_resources(i40e) == B_FALSE) { + return (B_FALSE); + } + + /* + * To calculate the total amount of a resource we have available, we + * need to add how many our i40e_t thinks it has guaranteed, if any, and + * then we need to go through and divide the number of available on the + * device, which was snapshotted before anyone should have allocated + * anything, and use that to derive how many are available from the + * pool. Longer term, we may want to turn this into something that's + * more of a pool-like resource that everything can share (though that + * may require some more assistance from MAC). + * + * Though for transmit and receive queue pairs, we just have to ask + * firmware instead. 
+ */ + idp = i40e_device_find(i40e, parent, bus, device); + i40e->i40e_device = idp; + i40e->i40e_resources.ifr_nvsis = 0; + i40e->i40e_resources.ifr_nvsis_used = 0; + i40e->i40e_resources.ifr_nmacfilt = 0; + i40e->i40e_resources.ifr_nmacfilt_used = 0; + i40e->i40e_resources.ifr_nmcastfilt = 0; + i40e->i40e_resources.ifr_nmcastfilt_used = 0; + + for (i = 0; i < i40e->i40e_switch_rsrc_actual; i++) { + i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i]; + + switch (srp->resource_type) { + case I40E_AQ_RESOURCE_TYPE_VSI: + i40e->i40e_resources.ifr_nvsis += + LE_16(srp->guaranteed); + i40e->i40e_resources.ifr_nvsis_used = LE_16(srp->used); + break; + case I40E_AQ_RESOURCE_TYPE_MACADDR: + i40e->i40e_resources.ifr_nmacfilt += + LE_16(srp->guaranteed); + i40e->i40e_resources.ifr_nmacfilt_used = + LE_16(srp->used); + break; + case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH: + i40e->i40e_resources.ifr_nmcastfilt += + LE_16(srp->guaranteed); + i40e->i40e_resources.ifr_nmcastfilt_used = + LE_16(srp->used); + break; + default: + break; + } + } + + for (i = 0; i < idp->id_rsrcs_act; i++) { + i40e_switch_rsrc_t *srp = &i40e->i40e_switch_rsrcs[i]; + switch (srp->resource_type) { + case I40E_AQ_RESOURCE_TYPE_VSI: + i40e->i40e_resources.ifr_nvsis += + LE_16(srp->total_unalloced) / idp->id_nfuncs; + break; + case I40E_AQ_RESOURCE_TYPE_MACADDR: + i40e->i40e_resources.ifr_nmacfilt += + LE_16(srp->total_unalloced) / idp->id_nfuncs; + break; + case I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH: + i40e->i40e_resources.ifr_nmcastfilt += + LE_16(srp->total_unalloced) / idp->id_nfuncs; + default: + break; + } + } + + i40e->i40e_resources.ifr_nrx_queue = hw->func_caps.num_rx_qp; + i40e->i40e_resources.ifr_ntx_queue = hw->func_caps.num_tx_qp; + + i40e->i40e_uaddrs = kmem_zalloc(sizeof (i40e_uaddr_t) * + i40e->i40e_resources.ifr_nmacfilt, KM_SLEEP); + i40e->i40e_maddrs = kmem_zalloc(sizeof (i40e_maddr_t) * + i40e->i40e_resources.ifr_nmcastfilt, KM_SLEEP); + + /* + * Initialize these a multicast address to 
indicate it's invalid for + * sanity purposes. Think of it like 0xdeadbeef. + */ + for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt; i++) + i40e->i40e_uaddrs[i].iua_mac[0] = 0x01; + + return (B_TRUE); +} + +static boolean_t +i40e_enable_interrupts(i40e_t *i40e) +{ + int i, rc; + + if (i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) { + rc = ddi_intr_block_enable(i40e->i40e_intr_handles, + i40e->i40e_intr_count); + if (rc != DDI_SUCCESS) { + i40e_error(i40e, "Interrupt block-enable failed: %d", + rc); + return (B_FALSE); + } + } else { + for (i = 0; i < i40e->i40e_intr_count; i++) { + rc = ddi_intr_enable(i40e->i40e_intr_handles[i]); + if (rc != DDI_SUCCESS) { + i40e_error(i40e, + "Failed to enable interrupt %d: %d", i, rc); + while (--i >= 0) { + (void) ddi_intr_disable( + i40e->i40e_intr_handles[i]); + } + return (B_FALSE); + } + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_disable_interrupts(i40e_t *i40e) +{ + int i, rc; + + if (i40e->i40e_intr_cap & DDI_INTR_FLAG_BLOCK) { + rc = ddi_intr_block_disable(i40e->i40e_intr_handles, + i40e->i40e_intr_count); + if (rc != DDI_SUCCESS) { + i40e_error(i40e, + "Interrupt block-disabled failed: %d", rc); + return (B_FALSE); + } + } else { + for (i = 0; i < i40e->i40e_intr_count; i++) { + rc = ddi_intr_disable(i40e->i40e_intr_handles[i]); + if (rc != DDI_SUCCESS) { + i40e_error(i40e, + "Failed to disable interrupt %d: %d", + i, rc); + return (B_FALSE); + } + } + } + + return (B_TRUE); +} + +/* + * Free receive & transmit rings. + */ +static void +i40e_free_trqpairs(i40e_t *i40e) +{ + int i; + i40e_trqpair_t *itrq; + + if (i40e->i40e_trqpairs != NULL) { + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + itrq = &i40e->i40e_trqpairs[i]; + mutex_destroy(&itrq->itrq_rx_lock); + mutex_destroy(&itrq->itrq_tx_lock); + mutex_destroy(&itrq->itrq_tcb_lock); + + /* + * Should have already been cleaned up by start/stop, + * etc. 
+ */ + ASSERT(itrq->itrq_txkstat == NULL); + ASSERT(itrq->itrq_rxkstat == NULL); + } + + kmem_free(i40e->i40e_trqpairs, + sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs); + i40e->i40e_trqpairs = NULL; + } + + cv_destroy(&i40e->i40e_rx_pending_cv); + mutex_destroy(&i40e->i40e_rx_pending_lock); + mutex_destroy(&i40e->i40e_general_lock); +} + +/* + * Allocate transmit and receive rings, as well as other data structures that we + * need. + */ +static boolean_t +i40e_alloc_trqpairs(i40e_t *i40e) +{ + int i; + void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); + + /* + * Now that we have the priority for the interrupts, initialize + * all relevant locks. + */ + mutex_init(&i40e->i40e_general_lock, NULL, MUTEX_DRIVER, mutexpri); + mutex_init(&i40e->i40e_rx_pending_lock, NULL, MUTEX_DRIVER, mutexpri); + cv_init(&i40e->i40e_rx_pending_cv, NULL, CV_DRIVER, NULL); + + i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * + i40e->i40e_num_trqpairs, KM_SLEEP); + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + itrq->itrq_i40e = i40e; + mutex_init(&itrq->itrq_rx_lock, NULL, MUTEX_DRIVER, mutexpri); + mutex_init(&itrq->itrq_tx_lock, NULL, MUTEX_DRIVER, mutexpri); + mutex_init(&itrq->itrq_tcb_lock, NULL, MUTEX_DRIVER, mutexpri); + itrq->itrq_index = i; + } + + return (B_TRUE); +} + + + +/* + * Unless a .conf file already overrode i40e_t structure values, they will + * be 0, and need to be set in conjunction with the now-available HW report. + * + * However, at the moment, we cap all of these resources as we only support a + * single receive ring and a single group. + */ +/* ARGSUSED */ +static void +i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e->i40e_num_trqpairs == 0) { + i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + } + + if (i40e->i40e_num_rx_groups == 0) { + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + } +} + +/* + * Free any resources required by, or setup by, the Intel common code. 
 */
static void
i40e_common_code_fini(i40e_t *i40e)
{
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	int rc;

	/*
	 * Both shutdown steps are always attempted; a failure in either is
	 * logged and otherwise ignored.
	 */
	rc = i40e_shutdown_lan_hmc(hw);
	if (rc != I40E_SUCCESS)
		i40e_error(i40e, "failed to shutdown LAN hmc: %d", rc);

	rc = i40e_shutdown_adminq(hw);
	if (rc != I40E_SUCCESS)
		i40e_error(i40e, "failed to shutdown admin queue: %d", rc);
}

/*
 * Initialize and call Intel common-code routines, includes some setup
 * the common code expects from the driver. Also prints on failure, so
 * the caller doesn't have to.
 *
 * The steps run in a fixed order: reset the PF, initialize the shared code
 * and the admin queue, sanity check the firmware API version, discover
 * capabilities and resources, bring up the LAN HMC, stop LLDP, fetch and
 * validate the MAC addresses and finally look up the default VSI.
 *
 * NOTE(review): the failure returns after i40e_init_adminq() and
 * i40e_init_lan_hmc() have succeeded do not undo them here; confirm that the
 * caller takes care of that.
 */
static boolean_t
i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw)
{
	int rc;

	i40e_clear_hw(hw);
	rc = i40e_pf_reset(hw);
	if (rc != 0) {
		i40e_error(i40e, "failed to reset hardware: %d", rc);
		i40e_fm_ereport(i40e, DDI_FM_DEVICE_NO_RESPONSE);
		return (B_FALSE);
	}

	rc = i40e_init_shared_code(hw);
	if (rc != 0) {
		i40e_error(i40e, "failed to initialize i40e core: %d", rc);
		return (B_FALSE);
	}

	/* The admin queue sizes must be in place before it's initialized. */
	hw->aq.num_arq_entries = I40E_DEF_ADMINQ_SIZE;
	hw->aq.num_asq_entries = I40E_DEF_ADMINQ_SIZE;
	hw->aq.arq_buf_size = I40E_ADMINQ_BUFSZ;
	hw->aq.asq_buf_size = I40E_ADMINQ_BUFSZ;

	rc = i40e_init_adminq(hw);
	if (rc != 0) {
		i40e_error(i40e, "failed to initialize firmware admin queue: "
		    "%d, potential firmware version mismatch", rc);
		i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE);
		return (B_FALSE);
	}

	/*
	 * A firmware API version that is newer or older than what we were
	 * built against is only worth a notice; we carry on regardless.
	 */
	if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
	    hw->aq.api_min_ver > I40E_FW_API_VERSION_MINOR) {
		i40e_notice(i40e, "The driver for the device detected a newer "
		    "version of the NVM image (%d.%d) than expected (%d.%d).\n"
		    "Please install the most recent version of the network "
		    "driver.\n", hw->aq.api_maj_ver, hw->aq.api_min_ver,
		    I40E_FW_API_VERSION_MAJOR, I40E_FW_API_VERSION_MINOR);
	} else if (hw->aq.api_maj_ver < I40E_FW_API_VERSION_MAJOR ||
	    hw->aq.api_min_ver < (I40E_FW_API_VERSION_MINOR - 1)) {
		i40e_notice(i40e, "The driver for the device detected an older"
		    " version of the NVM image (%d.%d) than expected (%d.%d)."
		    "\nPlease update the NVM image.\n",
		    hw->aq.api_maj_ver, hw->aq.api_min_ver,
		    I40E_FW_API_VERSION_MAJOR, I40E_FW_API_VERSION_MINOR - 1);
	}

	i40e_clear_pxe_mode(hw);

	/*
	 * We need to call this so that the common code can discover
	 * capabilities of the hardware, which it uses throughout the rest.
	 */
	if (!i40e_get_hw_capabilities(i40e, hw)) {
		i40e_error(i40e, "failed to obtain hardware capabilities");
		return (B_FALSE);
	}

	if (i40e_get_available_resources(i40e) == B_FALSE) {
		i40e_error(i40e, "failed to obtain hardware resources");
		return (B_FALSE);
	}

	i40e_hw_to_instance(i40e, hw);

	rc = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp,
	    hw->func_caps.num_rx_qp, 0, 0);
	if (rc != 0) {
		i40e_error(i40e, "failed to initialize hardware memory cache: "
		    "%d\n", rc);
		return (B_FALSE);
	}

	rc = i40e_configure_lan_hmc(hw, I40E_HMC_MODEL_DIRECT_ONLY);
	if (rc != 0) {
		i40e_error(i40e, "failed to configure hardware memory cache: "
		    "%d\n", rc);
		return (B_FALSE);
	}

	/* Best effort: the result of stopping LLDP is deliberately ignored. */
	(void) i40e_aq_stop_lldp(hw, TRUE, NULL);

	rc = i40e_get_mac_addr(hw, hw->mac.addr);
	if (rc != I40E_SUCCESS) {
		i40e_error(i40e, "failed to retrieve hardware mac address: %d",
		    rc);
		return (B_FALSE);
	}

	rc = i40e_validate_mac_addr(hw->mac.addr);
	if (rc != 0) {
		i40e_error(i40e, "failed to validate internal mac address: "
		    "%d\n", rc);
		return (B_FALSE);
	}
	bcopy(hw->mac.addr, hw->mac.perm_addr, ETHERADDRL);
	if ((rc = i40e_get_port_mac_addr(hw, hw->mac.port_addr)) !=
	    I40E_SUCCESS) {
		i40e_error(i40e, "failed to retrieve port mac address: %d",
		    rc);
		return (B_FALSE);
	}

	/*
	 * We need to obtain the Virtual Station ID (VSI) before we can
	 * perform other operations on the device.
	 */
	i40e->i40e_vsi_id = i40e_get_vsi_id(i40e);
	if (i40e->i40e_vsi_id == -1) {
		i40e_error(i40e, "failed to obtain VSI ID");
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Tear down whatever has been set up for this instance, as recorded by the
 * bits in i40e_attach_progress, and then free the i40e_t itself. Steps whose
 * bit is not set are skipped, with the exception of i40e_cleanup_resources()
 * and the freeing of the admin queue buffer and the i40e_t, which always
 * happen.
 *
 * The i40e_t must not be used once this returns.
 */
static void
i40e_unconfigure(dev_info_t *devinfo, i40e_t *i40e)
{
	int rc;

	if (i40e->i40e_attach_progress & I40E_ATTACH_ENABLE_INTR)
		(void) i40e_disable_interrupts(i40e);

	if ((i40e->i40e_attach_progress & I40E_ATTACH_LINK_TIMER) &&
	    i40e->i40e_periodic_id != 0) {
		ddi_periodic_delete(i40e->i40e_periodic_id);
		i40e->i40e_periodic_id = 0;
	}

	if (i40e->i40e_attach_progress & I40E_ATTACH_MAC) {
		rc = mac_unregister(i40e->i40e_mac_hdl);
		if (rc != 0) {
			/* Logged only; teardown carries on regardless. */
			i40e_error(i40e, "failed to unregister from mac: %d",
			    rc);
		}
	}

	if (i40e->i40e_attach_progress & I40E_ATTACH_STATS) {
		i40e_stats_fini(i40e);
	}

	if (i40e->i40e_attach_progress & I40E_ATTACH_ADD_INTR)
		i40e_rem_intr_handlers(i40e);

	if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_RINGSLOCKS)
		i40e_free_trqpairs(i40e);

	if (i40e->i40e_attach_progress & I40E_ATTACH_ALLOC_INTR)
		i40e_rem_intrs(i40e);

	if (i40e->i40e_attach_progress & I40E_ATTACH_COMMON_CODE)
		i40e_common_code_fini(i40e);

	i40e_cleanup_resources(i40e);

	if (i40e->i40e_attach_progress & I40E_ATTACH_PROPS)
		(void) ddi_prop_remove_all(devinfo);

	if (i40e->i40e_attach_progress & I40E_ATTACH_REGS_MAP &&
	    i40e->i40e_osdep_space.ios_reg_handle != NULL) {
		ddi_regs_map_free(&i40e->i40e_osdep_space.ios_reg_handle);
		i40e->i40e_osdep_space.ios_reg_handle = NULL;
	}

	if ((i40e->i40e_attach_progress & I40E_ATTACH_PCI_CONFIG) &&
	    i40e->i40e_osdep_space.ios_cfg_handle != NULL) {
		pci_config_teardown(&i40e->i40e_osdep_space.ios_cfg_handle);
		i40e->i40e_osdep_space.ios_cfg_handle = NULL;
	}

	if (i40e->i40e_attach_progress & I40E_ATTACH_FM_INIT)
		i40e_fm_fini(i40e);

	kmem_free(i40e->i40e_aqbuf, I40E_ADMINQ_BUFSZ);
	kmem_free(i40e, sizeof (i40e_t));

	ddi_set_driver_private(devinfo, NULL);
}

/*
 * Publish the PBA string and the firmware and API versions as properties on
 * our dev_info_t, set the hardware bus information and make sure the register
 * access handle is still healthy.
 *
 * Failing to read the PBA string is only logged. Returns B_FALSE if
 * i40e_set_hw_bus_info() fails or if the access handle reports a fault, in
 * which case the service impact is reported as DDI_SERVICE_LOST.
 */
static boolean_t
i40e_final_init(i40e_t *i40e)
{
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	struct i40e_osdep *osdep = OS_DEP(hw);
	uint8_t pbanum[I40E_PBANUM_STRLEN];
	enum i40e_status_code irc;
	char buf[I40E_DDI_PROP_LEN];

	pbanum[0] = '\0';
	irc = i40e_read_pba_string(hw, pbanum, sizeof (pbanum));
	if (irc != I40E_SUCCESS) {
		i40e_log(i40e, "failed to read PBA string: %d", irc);
	} else {
		(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
		    "printed-board-assembly", (char *)pbanum);
	}

	/* On DEBUG bits, check that none of the strings below will truncate. */
#ifdef DEBUG
	ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.fw_maj_ver,
	    hw->aq.fw_min_ver) < sizeof (buf));
	ASSERT(snprintf(NULL, 0, "%x", hw->aq.fw_build) < sizeof (buf));
	ASSERT(snprintf(NULL, 0, "%d.%d", hw->aq.api_maj_ver,
	    hw->aq.api_min_ver) < sizeof (buf));
#endif

	(void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.fw_maj_ver,
	    hw->aq.fw_min_ver);
	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
	    "firmware-version", buf);
	(void) snprintf(buf, sizeof (buf), "%x", hw->aq.fw_build);
	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
	    "firmware-build", buf);
	(void) snprintf(buf, sizeof (buf), "%d.%d", hw->aq.api_maj_ver,
	    hw->aq.api_min_ver);
	(void) ddi_prop_update_string(DDI_DEV_T_NONE, i40e->i40e_dip,
	    "api-version", buf);

	if (!i40e_set_hw_bus_info(hw))
		return (B_FALSE);

	if (i40e_check_acc_handle(osdep->ios_reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST);
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Read the PCI vendor, device, revision and subsystem IDs out of config space
 * into the i40e_hw_t and have the common code derive the MAC type from them.
 * Returns B_FALSE if i40e_set_mac_type() fails.
 */
static boolean_t
i40e_identify_hardware(i40e_t *i40e)
{
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	struct i40e_osdep *osdep = &i40e->i40e_osdep_space;

	hw->vendor_id = pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_VENID);
	hw->device_id = pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_DEVID);
	hw->revision_id = pci_config_get8(osdep->ios_cfg_handle,
	    PCI_CONF_REVID);
	hw->subsystem_device_id =
	    pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_SUBSYSID);
	hw->subsystem_vendor_id =
	    pci_config_get16(osdep->ios_cfg_handle, PCI_CONF_SUBVENID);

	/*
	 * Note that we set the hardware's bus information later on, in
	 * i40e_get_available_resources(). The common code doesn't seem to
	 * require that it be set in any ways, it seems to be mostly for
	 * book-keeping.
	 */

	/* Call common code to set the MAC type for this adapter. */
	if (i40e_set_mac_type(hw) != I40E_SUCCESS)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Map the whole of register set I40E_ADAPTER_REGSET using the driver's
 * register access attributes. On success the mapped address is stored in
 * hw->hw_addr and the access handle and mapping size in the osdep structure.
 */
static boolean_t
i40e_regs_map(i40e_t *i40e)
{
	dev_info_t *devinfo = i40e->i40e_dip;
	i40e_hw_t *hw = &i40e->i40e_hw_space;
	struct i40e_osdep *osdep = &i40e->i40e_osdep_space;
	off_t memsize;
	int ret;

	if (ddi_dev_regsize(devinfo, I40E_ADAPTER_REGSET, &memsize) !=
	    DDI_SUCCESS) {
		i40e_error(i40e, "Used invalid register set to map PCIe regs");
		return (B_FALSE);
	}

	if ((ret = ddi_regs_map_setup(devinfo, I40E_ADAPTER_REGSET,
	    (caddr_t *)&hw->hw_addr, 0, memsize, &i40e_regs_acc_attr,
	    &osdep->ios_reg_handle)) != DDI_SUCCESS) {
		i40e_error(i40e, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	osdep->ios_reg_size = memsize;
	return (B_TRUE);
}

/*
 * Update parameters required when a new MTU has been configured. Calculate the
 * maximum frame size, as well as, size our DMA buffers which we size in
 * increments of 1K.
 */
void
i40e_update_mtu(i40e_t *i40e)
{
	uint32_t rx, tx;

	/* The frame is the SDU plus a VLAN-tagged ethernet header and FCS. */
	i40e->i40e_frame_max = i40e->i40e_sdu +
	    sizeof (struct ether_vlan_header) + ETHERFCSL;

	/*
	 * Round each size up to the next multiple of 1024. The receive side
	 * additionally leaves room for I40E_BUF_IPHDR_ALIGNMENT bytes.
	 */
	rx = i40e->i40e_frame_max + I40E_BUF_IPHDR_ALIGNMENT;
	i40e->i40e_rx_buf_size = ((rx >> 10) +
	    ((rx & (((uint32_t)1 << 10) -1)) > 0 ? 1 : 0)) << 10;

	tx = i40e->i40e_frame_max;
	i40e->i40e_tx_buf_size = ((tx >> 10) +
	    ((tx & (((uint32_t)1 << 10) -1)) > 0 ? 1 : 0)) << 10;
}

/*
 * Read an integer property for this instance, using def when
 * ddi_prop_get_int() has nothing better to offer, and clamp the result to the
 * range [min, max]. The upper bound is applied first, so should min ever
 * exceed max the result is min.
 */
static int
i40e_get_prop(i40e_t *i40e, char *prop, int min, int max, int def)
{
	int val;

	val = ddi_prop_get_int(DDI_DEV_T_ANY, i40e->i40e_dip, DDI_PROP_DONTPASS,
	    prop, def);
	if (val > max)
		val = max;
	if (val < min)
		val = min;
	return (val);
}

static void
i40e_init_properties(i40e_t *i40e)
{
	i40e->i40e_sdu = i40e_get_prop(i40e, "default_mtu",
	    I40E_MIN_MTU, I40E_MAX_MTU, I40E_DEF_MTU);

	i40e->i40e_intr_force = i40e_get_prop(i40e, "intr_force",
	    I40E_INTR_NONE, I40E_INTR_LEGACY, I40E_INTR_NONE);

	i40e->i40e_mr_enable = i40e_get_prop(i40e, "mr_enable",
	    B_FALSE, B_TRUE, B_TRUE);

	i40e->i40e_tx_ring_size = i40e_get_prop(i40e, "tx_ring_size",
	    I40E_MIN_TX_RING_SIZE, I40E_MAX_TX_RING_SIZE,
	    I40E_DEF_TX_RING_SIZE);
	if ((i40e->i40e_tx_ring_size % I40E_DESC_ALIGN) != 0) {
		i40e->i40e_tx_ring_size = P2ROUNDUP(i40e->i40e_tx_ring_size,
		    I40E_DESC_ALIGN);
	}

	i40e->i40e_tx_block_thresh = i40e_get_prop(i40e, "tx_resched_threshold",
	    I40E_MIN_TX_BLOCK_THRESH,
	    i40e->i40e_tx_ring_size - I40E_TX_MAX_COOKIE,
	    I40E_DEF_TX_BLOCK_THRESH);

	i40e->i40e_rx_ring_size = i40e_get_prop(i40e, "rx_ring_size",
	    I40E_MIN_RX_RING_SIZE, I40E_MAX_RX_RING_SIZE,
	    I40E_DEF_RX_RING_SIZE);
	if ((i40e->i40e_rx_ring_size % I40E_DESC_ALIGN) != 0) {
		i40e->i40e_rx_ring_size = P2ROUNDUP(i40e->i40e_rx_ring_size,
		    I40E_DESC_ALIGN);
	}

	i40e->i40e_rx_limit_per_intr = i40e_get_prop(i40e, "rx_limit_per_intr",
	    I40E_MIN_RX_LIMIT_PER_INTR, I40E_MAX_RX_LIMIT_PER_INTR,
	    I40E_DEF_RX_LIMIT_PER_INTR);

	i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable",
	    B_FALSE, B_TRUE, B_TRUE);

	i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
	    B_FALSE, B_TRUE, B_TRUE);

	i40e->i40e_rx_dma_min = i40e_get_prop(i40e, "rx_dma_threshold",
	    I40E_MIN_RX_DMA_THRESH, I40E_MAX_RX_DMA_THRESH,
	    I40E_DEF_RX_DMA_THRESH);

	i40e->i40e_tx_dma_min = i40e_get_prop(i40e, "tx_dma_threshold",
	    I40E_MIN_TX_DMA_THRESH,
I40E_MAX_TX_DMA_THRESH, + I40E_DEF_TX_DMA_THRESH); + + i40e->i40e_tx_itr = i40e_get_prop(i40e, "tx_intr_throttle", + I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_TX_ITR); + + i40e->i40e_rx_itr = i40e_get_prop(i40e, "rx_intr_throttle", + I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_RX_ITR); + + i40e->i40e_other_itr = i40e_get_prop(i40e, "other_intr_throttle", + I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_OTHER_ITR); + + if (!i40e->i40e_mr_enable) { + i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; + } + + i40e_update_mtu(i40e); +} + +/* + * There are a few constraints on interrupts that we're currently imposing, some + * of which are restrictions from hardware. For a fuller treatment, see + * i40e_intr.c. + * + * Currently, to use MSI-X we require two interrupts be available though in + * theory we should participate in IRM and happily use more interrupts. + * + * Hardware only supports a single MSI being programmed and therefore if we + * don't have MSI-X interrupts available at this time, then we ratchet down the + * number of rings and groups available. Obviously, we only bother with a single + * fixed interrupt. + */ +static boolean_t +i40e_alloc_intr_handles(i40e_t *i40e, dev_info_t *devinfo, int intr_type) +{ + int request, count, actual, rc, min; + + switch (intr_type) { + case DDI_INTR_TYPE_FIXED: + case DDI_INTR_TYPE_MSI: + request = 1; + min = 1; + break; + case DDI_INTR_TYPE_MSIX: + /* + * At the moment, we always request two MSI-X while we still + * only support a single interrupt. The upper bound on what's + * supported by a given device is defined by MSI_X_PF_N in + * GLPCI_CNF2. When we evolve, we should read it to determine + * what the real max is. 
+ */ + ASSERT(i40e->i40e_num_trqpairs == 1); + request = 2; + min = 2; + break; + default: + panic("bad interrupt type passed to i40e_alloc_intr_handles: " + "%d", intr_type); + return (B_FALSE); + } + + rc = ddi_intr_get_nintrs(devinfo, intr_type, &count); + if (rc != DDI_SUCCESS || count < min) { + i40e_log(i40e, "Get interrupt number failed, " + "returned %d, count %d\n", rc, count); + return (B_FALSE); + } + + rc = ddi_intr_get_navail(devinfo, intr_type, &count); + if (rc != DDI_SUCCESS || count < min) { + i40e_log(i40e, "Get AVAILABLE interrupt number failed, " + "returned %d, count %d\n", rc, count); + return (B_FALSE); + } + + actual = 0; + i40e->i40e_intr_count = 0; + i40e->i40e_intr_count_max = 0; + i40e->i40e_intr_count_min = 0; + + i40e->i40e_intr_size = request * sizeof (ddi_intr_handle_t); + ASSERT(i40e->i40e_intr_size != 0); + i40e->i40e_intr_handles = kmem_alloc(i40e->i40e_intr_size, KM_SLEEP); + + rc = ddi_intr_alloc(devinfo, i40e->i40e_intr_handles, intr_type, 0, + min(request, count), &actual, DDI_INTR_ALLOC_NORMAL); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, "Interrupt allocation failed with %d.", rc); + goto alloc_handle_fail; + } + + i40e->i40e_intr_count = actual; + i40e->i40e_intr_count_max = request; + i40e->i40e_intr_count_min = min; + + if (actual < min) { + i40e_log(i40e, "actual (%d) is less than minimum (%d).", + actual, min); + goto alloc_handle_fail; + } + + /* + * Record the priority and capabilities for our first vector. Once + * we have it, that's our priority until detach time. Even if we + * eventually participate in IRM, our priority shouldn't change. 
+ */ + rc = ddi_intr_get_pri(i40e->i40e_intr_handles[0], &i40e->i40e_intr_pri); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, + "Getting interrupt priority failed with %d.", rc); + goto alloc_handle_fail; + } + + rc = ddi_intr_get_cap(i40e->i40e_intr_handles[0], &i40e->i40e_intr_cap); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, + "Getting interrupt capabilities failed with %d.", rc); + goto alloc_handle_fail; + } + + i40e->i40e_intr_type = intr_type; + return (B_TRUE); + +alloc_handle_fail: + + i40e_rem_intrs(i40e); + return (B_FALSE); +} + +static boolean_t +i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) +{ + int intr_types, rc; + + rc = ddi_intr_get_supported_types(devinfo, &intr_types); + if (rc != DDI_SUCCESS) { + i40e_error(i40e, "failed to get supported interrupt types: %d", + rc); + return (B_FALSE); + } + + i40e->i40e_intr_type = 0; + + if ((intr_types & DDI_INTR_TYPE_MSIX) && + i40e->i40e_intr_force <= I40E_INTR_MSIX) { + if (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX)) + return (B_TRUE); + } + + /* + * We only use multiple transmit/receive pairs when MSI-X interrupts are + * available due to the fact that the device basically only supports a + * single MSI interrupt. + */ + i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; + + if ((intr_types & DDI_INTR_TYPE_MSI) && + (i40e->i40e_intr_force <= I40E_INTR_MSI)) { + if (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSI)) + return (B_TRUE); + } + + if (intr_types & DDI_INTR_TYPE_FIXED) { + if (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_FIXED)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Map different interrupts to MSI-X vectors. + */ +static boolean_t +i40e_map_intrs_to_vectors(i40e_t *i40e) +{ + if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { + return (B_TRUE); + } + + /* + * At the moment, we only have one queue and one interrupt thus both are + * on that one interrupt. 
However, longer term we need to go back to + * using the ixgbe style map of queues to vectors or walk the linked + * list from the device to know what to go handle. Therefore for the + * moment, since we need to map our single set of rings to the one + * I/O interrupt that exists for MSI-X. + */ + ASSERT(i40e->i40e_intr_count == 2); + ASSERT(i40e->i40e_num_trqpairs == 1); + + i40e->i40e_trqpairs[0].itrq_rx_intrvec = 1; + i40e->i40e_trqpairs[0].itrq_tx_intrvec = 1; + + return (B_TRUE); +} + +static boolean_t +i40e_add_intr_handlers(i40e_t *i40e) +{ + int rc, vector; + + switch (i40e->i40e_intr_type) { + case DDI_INTR_TYPE_MSIX: + for (vector = 0; vector < i40e->i40e_intr_count; vector++) { + rc = ddi_intr_add_handler( + i40e->i40e_intr_handles[vector], + (ddi_intr_handler_t *)i40e_intr_msix, i40e, + (void *)(uintptr_t)vector); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, "Add interrupt handler (MSI-X) " + "failed: return %d, vector %d", rc, vector); + for (vector--; vector >= 0; vector--) { + (void) ddi_intr_remove_handler( + i40e->i40e_intr_handles[vector]); + } + return (B_FALSE); + } + } + break; + case DDI_INTR_TYPE_MSI: + rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0], + (ddi_intr_handler_t *)i40e_intr_msi, i40e, NULL); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, "Add interrupt handler (MSI) failed: " + "return %d", rc); + return (B_FALSE); + } + break; + case DDI_INTR_TYPE_FIXED: + rc = ddi_intr_add_handler(i40e->i40e_intr_handles[0], + (ddi_intr_handler_t *)i40e_intr_legacy, i40e, NULL); + if (rc != DDI_SUCCESS) { + i40e_log(i40e, "Add interrupt handler (legacy) failed:" + " return %d", rc); + return (B_FALSE); + } + break; + default: + /* Cast to pacify lint */ + panic("i40e_intr_type %p contains an unknown type: %d", + (void *)i40e, i40e->i40e_intr_type); + } + + return (B_TRUE); +} + +/* + * Perform periodic checks. 
Longer term, we should be thinking about additional + * things here: + * + * o Stall Detection + * o Temperature sensor detection + * o Device resetting + * o Statistics updating to avoid wraparound + */ +static void +i40e_timer(void *arg) +{ + i40e_t *i40e = arg; + + mutex_enter(&i40e->i40e_general_lock); + i40e_link_check(i40e); + mutex_exit(&i40e->i40e_general_lock); +} + +/* + * Get the hardware state, and scribble away anything that needs scribbling. + */ +static void +i40e_get_hw_state(i40e_t *i40e, i40e_hw_t *hw) +{ + int rc; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + (void) i40e_aq_get_link_info(hw, TRUE, NULL, NULL); + i40e_link_check(i40e); + + /* + * Try and determine our PHY. Note that we may have to retry to and + * delay to detect fiber correctly. + */ + rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE, &i40e->i40e_phy, + NULL); + if (rc == I40E_ERR_UNKNOWN_PHY) { + i40e_msec_delay(200); + rc = i40e_aq_get_phy_capabilities(hw, B_FALSE, B_TRUE, + &i40e->i40e_phy, NULL); + } + + if (rc != I40E_SUCCESS) { + if (rc == I40E_ERR_UNKNOWN_PHY) { + i40e_error(i40e, "encountered unknown PHY type, " + "not attaching."); + } else { + i40e_error(i40e, "error getting physical capabilities: " + "%d, %d", rc, hw->aq.asq_last_status); + } + } + + rc = i40e_update_link_info(hw); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "failed to update link information: %d", rc); + } + + /* + * In general, we don't want to mask off (as in stop from being a cause) + * any of the interrupts that the phy might be able to generate. + */ + rc = i40e_aq_set_phy_int_mask(hw, 0, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "failed to update phy link mask: %d\n", rc); + } +} + +/* + * Go through and re-initialize any existing filters that we may have set up for + * this device. Note that we would only expect them to exist if hardware had + * already been initialized and we had just reset it. 
While we're not + * implementing this yet, we're keeping this around for when we add reset + * capabilities, so this isn't forgotten. + */ +/* ARGSUSED */ +static void +i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) +{ +} + +/* + * Configure the hardware for the Virtual Station Interface (VSI). Currently + * we only support one, but in the future we could instantiate more than one + * per attach-point. + */ +static boolean_t +i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +{ + struct i40e_vsi_context context; + int err; + + bzero(&context, sizeof (struct i40e_vsi_context)); + context.seid = i40e->i40e_vsi_id; + context.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &context, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d\n", err); + return (B_FALSE); + } + + /* + * Set the queue and traffic class bits. Keep it simple for now. + */ + context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; + context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; + context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + context.info.tc_mapping[0] = I40E_TRAFFIC_CLASS_NO_QUEUES; + + context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; + context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + I40E_AQ_VSI_PVLAN_EMOD_NOTHING; + + context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); + + i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e) == B_FALSE) + return (B_FALSE); + + err = i40e_aq_update_vsi_params(hw, &context, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + + return (B_TRUE); +} + +/* + * Wrapper to kick the chipset on. 
+ */ +static boolean_t +i40e_chip_start(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_filter_control_settings filter; + int rc; + + if (((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33)) || + (hw->aq.fw_maj_ver < 4)) { + i40e_msec_delay(75); + if (i40e_aq_set_link_restart_an(hw, TRUE, NULL) != + I40E_SUCCESS) { + i40e_error(i40e, "failed to restart link: admin queue " + "error: %d\n", hw->aq.asq_last_status); + return (B_FALSE); + } + } + + /* Determine hardware state */ + i40e_get_hw_state(i40e, hw); + + /* Initialize mac addresses. */ + i40e_init_macaddrs(i40e, hw); + + /* + * Set up the filter control. + */ + bzero(&filter, sizeof (filter)); + filter.enable_ethtype = TRUE; + filter.enable_macvlan = TRUE; + + rc = i40e_set_filter_control(hw, &filter); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_set_filter_control() returned %d", rc); + return (B_FALSE); + } + + i40e_intr_chip_init(i40e); + + if (!i40e_config_vsi(i40e, hw)) + return (B_FALSE); + + i40e_flush(hw); + + return (B_TRUE); +} + +/* + * Take care of tearing down the rx ring. See 8.3.3.1.2 for more information. + */ +static void +i40e_shutdown_rx_rings(i40e_t *i40e) +{ + int i; + uint32_t reg; + + i40e_hw_t *hw = &i40e->i40e_hw_space; + + /* + * Step 1. The interrupt linked list (see i40e_intr.c for more + * information) should have already been cleared before calling this + * function. + */ +#ifdef DEBUG + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + for (i = 1; i < i40e->i40e_intr_count; i++) { + reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(i - 1)); + VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL); + } + } else { + reg = I40E_READ_REG(hw, I40E_PFINT_LNKLST0); + VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL); + } + +#endif /* DEBUG */ + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + /* + * Step 1. Request the queue by clearing QENA_REQ. It may not be + * set due to unwinding from failures and a partially enabled + * ring set. 
+ */ + reg = I40E_READ_REG(hw, I40E_QRX_ENA(i)); + if (!(reg & I40E_QRX_ENA_QENA_REQ_MASK)) + continue; + VERIFY((reg & I40E_QRX_ENA_QENA_REQ_MASK) == + I40E_QRX_ENA_QENA_REQ_MASK); + reg &= ~I40E_QRX_ENA_QENA_REQ_MASK; + I40E_WRITE_REG(hw, I40E_QRX_ENA(i), reg); + } + + /* + * Step 2. Wait for the disable to take, by having QENA_STAT in the FPM + * be cleared. Note that we could still receive data in the queue during + * this time. We don't actually wait for this now and instead defer this + * to i40e_shutdown_rings_wait(), after we've interleaved disabling the + * TX queues as well. + */ +} + +static void +i40e_shutdown_tx_rings(i40e_t *i40e) +{ + int i; + uint32_t reg; + + i40e_hw_t *hw = &i40e->i40e_hw_space; + + /* + * Step 1. The interrupt linked list should already have been cleared. + */ +#ifdef DEBUG + if (i40e->i40e_intr_type == DDI_INTR_TYPE_MSIX) { + for (i = 1; i < i40e->i40e_intr_count; i++) { + reg = I40E_READ_REG(hw, I40E_PFINT_LNKLSTN(i - 1)); + VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL); + } + } else { + reg = I40E_READ_REG(hw, I40E_PFINT_LNKLST0); + VERIFY3U(reg, ==, I40E_QUEUE_TYPE_EOL); + + } +#endif /* DEBUG */ + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + /* + * Step 2. Set the SET_QDIS flag for every queue. + */ + i40e_pre_tx_queue_cfg(hw, i, B_FALSE); + } + + /* + * Step 3. Wait at least 400 usec (can be done once for all queues). + */ + drv_usecwait(500); + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + /* + * Step 4. Clear the QENA_REQ flag which tells hardware to + * quiesce. If QENA_REQ is not already set then that means that + * we likely already tried to disable this queue. + */ + reg = I40E_READ_REG(hw, I40E_QTX_ENA(i)); + if (!(reg & I40E_QTX_ENA_QENA_REQ_MASK)) + continue; + reg &= ~I40E_QTX_ENA_QENA_REQ_MASK; + I40E_WRITE_REG(hw, I40E_QTX_ENA(i), reg); + } + + /* + * Step 5. Wait for all drains to finish. This will be done by the + * hardware removing the QENA_STAT flag from the queue. 
Rather than + * waiting here, we interleave it with all the others in + * i40e_shutdown_rings_wait(). + */ +} + +/* + * Wait for all the rings to be shut down. e.g. Steps 2 and 5 from the above + * functions. + */ +static boolean_t +i40e_shutdown_rings_wait(i40e_t *i40e) +{ + int i, try; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint32_t reg; + + for (try = 0; try < I40E_RING_WAIT_NTRIES; try++) { + reg = I40E_READ_REG(hw, I40E_QRX_ENA(i)); + if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0) + break; + i40e_msec_delay(I40E_RING_WAIT_PAUSE); + } + + if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) != 0) { + i40e_error(i40e, "timed out disabling rx queue %d\n", + i); + return (B_FALSE); + } + + for (try = 0; try < I40E_RING_WAIT_NTRIES; try++) { + reg = I40E_READ_REG(hw, I40E_QTX_ENA(i)); + if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0) + break; + i40e_msec_delay(I40E_RING_WAIT_PAUSE); + } + + if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) != 0) { + i40e_error(i40e, "timed out disabling tx queue %d\n", + i); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_shutdown_rings(i40e_t *i40e) +{ + i40e_shutdown_rx_rings(i40e); + i40e_shutdown_tx_rings(i40e); + return (i40e_shutdown_rings_wait(i40e)); +} + +static void +i40e_setup_rx_descs(i40e_trqpair_t *itrq) +{ + int i; + i40e_rx_data_t *rxd = itrq->itrq_rxdata; + + for (i = 0; i < rxd->rxd_ring_size; i++) { + i40e_rx_control_block_t *rcb; + i40e_rx_desc_t *rdesc; + + rcb = rxd->rxd_work_list[i]; + rdesc = &rxd->rxd_desc_ring[i]; + + rdesc->read.pkt_addr = + CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); + rdesc->read.hdr_addr = 0; + } +} + +static boolean_t +i40e_setup_rx_hmc(i40e_trqpair_t *itrq) +{ + i40e_rx_data_t *rxd = itrq->itrq_rxdata; + i40e_t *i40e = itrq->itrq_i40e; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + struct i40e_hmc_obj_rxq rctx; + int err; + + bzero(&rctx, sizeof (struct i40e_hmc_obj_rxq)); + rctx.base = 
rxd->rxd_desc_area.dmab_dma_address / + I40E_HMC_RX_CTX_UNIT; + rctx.qlen = rxd->rxd_ring_size; + VERIFY(i40e->i40e_rx_buf_size >= I40E_HMC_RX_DBUFF_MIN); + VERIFY(i40e->i40e_rx_buf_size <= I40E_HMC_RX_DBUFF_MAX); + rctx.dbuff = i40e->i40e_rx_buf_size >> I40E_RXQ_CTX_DBUFF_SHIFT; + rctx.hbuff = 0 >> I40E_RXQ_CTX_HBUFF_SHIFT; + rctx.dtype = I40E_HMC_RX_DTYPE_NOSPLIT; + rctx.dsize = I40E_HMC_RX_DSIZE_32BYTE; + rctx.crcstrip = I40E_HMC_RX_CRCSTRIP_ENABLE; + rctx.fc_ena = I40E_HMC_RX_FC_DISABLE; + rctx.l2tsel = I40E_HMC_RX_L2TAGORDER; + rctx.hsplit_0 = I40E_HMC_RX_HDRSPLIT_DISABLE; + rctx.hsplit_1 = I40E_HMC_RX_HDRSPLIT_DISABLE; + rctx.showiv = I40E_HMC_RX_INVLAN_DONTSTRIP; + rctx.rxmax = i40e->i40e_frame_max; + rctx.tphrdesc_ena = I40E_HMC_RX_TPH_DISABLE; + rctx.tphwdesc_ena = I40E_HMC_RX_TPH_DISABLE; + rctx.tphdata_ena = I40E_HMC_RX_TPH_DISABLE; + rctx.tphhead_ena = I40E_HMC_RX_TPH_DISABLE; + rctx.lrxqthresh = I40E_HMC_RX_LOWRXQ_NOINTR; + + /* + * This must be set to 0x1, see Table 8-12 in section 8.3.3.2.2. + */ + rctx.prefena = I40E_HMC_RX_PREFENA; + + err = i40e_clear_lan_rx_queue_context(hw, itrq->itrq_index); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to clear rx queue %d context: %d\n", + itrq->itrq_index, err); + return (B_FALSE); + } + + err = i40e_set_lan_rx_queue_context(hw, itrq->itrq_index, &rctx); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to set rx queue %d context: %d\n", + itrq->itrq_index, err); + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Take care of setting up the descriptor rings and actually programming the + * device. See 8.3.3.1.1 for the full list of steps we need to do to enable the + * rx rings. + */ +static boolean_t +i40e_setup_rx_rings(i40e_t *i40e) +{ + int i; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + i40e_rx_data_t *rxd = itrq->itrq_rxdata; + uint32_t reg; + + /* + * Step 1. 
Program all receive ring descriptors. + */ + i40e_setup_rx_descs(itrq); + + /* + * Step 2. Program the queue's FPM/HMC context. + */ + if (i40e_setup_rx_hmc(itrq) == B_FALSE) + return (B_FALSE); + + /* + * Step 3. Clear the queue's tail pointer and set it to the end + * of the space. + */ + I40E_WRITE_REG(hw, I40E_QRX_TAIL(i), 0); + I40E_WRITE_REG(hw, I40E_QRX_TAIL(i), rxd->rxd_ring_size - 1); + + /* + * Step 4. Enable the queue via the QENA_REQ. + */ + reg = I40E_READ_REG(hw, I40E_QRX_ENA(i)); + VERIFY0(reg & (I40E_QRX_ENA_QENA_REQ_MASK | + I40E_QRX_ENA_QENA_STAT_MASK)); + reg |= I40E_QRX_ENA_QENA_REQ_MASK; + I40E_WRITE_REG(hw, I40E_QRX_ENA(i), reg); + } + + /* + * Note, we wait for every queue to be enabled before we start checking. + * This will hopefully cause most queues to be enabled at this point. + */ + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint32_t j, reg; + + /* + * Step 5. Verify that QENA_STAT has been set. It's promised + * that this should occur within about 10 us, but like other + * systems, we give the card a bit more time. 
+ */ + for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) { + reg = I40E_READ_REG(hw, I40E_QRX_ENA(i)); + + if (reg & I40E_QRX_ENA_QENA_STAT_MASK) + break; + i40e_msec_delay(I40E_RING_WAIT_PAUSE); + } + + if ((reg & I40E_QRX_ENA_QENA_STAT_MASK) == 0) { + i40e_error(i40e, "failed to enable rx queue %d, timed " + "out."); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_setup_tx_hmc(i40e_trqpair_t *itrq) +{ + i40e_t *i40e = itrq->itrq_i40e; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + struct i40e_hmc_obj_txq tctx; + struct i40e_vsi_context context; + int err; + + bzero(&tctx, sizeof (struct i40e_hmc_obj_txq)); + tctx.new_context = I40E_HMC_TX_NEW_CONTEXT; + tctx.base = itrq->itrq_desc_area.dmab_dma_address / + I40E_HMC_TX_CTX_UNIT; + tctx.fc_ena = I40E_HMC_TX_FC_DISABLE; + tctx.timesync_ena = I40E_HMC_TX_TS_DISABLE; + tctx.fd_ena = I40E_HMC_TX_FD_DISABLE; + tctx.alt_vlan_ena = I40E_HMC_TX_ALT_VLAN_DISABLE; + tctx.head_wb_ena = I40E_HMC_TX_WB_ENABLE; + tctx.qlen = itrq->itrq_tx_ring_size; + tctx.tphrdesc_ena = I40E_HMC_TX_TPH_DISABLE; + tctx.tphrpacket_ena = I40E_HMC_TX_TPH_DISABLE; + tctx.tphwdesc_ena = I40E_HMC_TX_TPH_DISABLE; + tctx.head_wb_addr = itrq->itrq_desc_area.dmab_dma_address + + sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; + + /* + * This field isn't actually documented, like crc, but it suggests that + * it should be zeroed. We leave both of these here because of that for + * now. We should check with Intel on why these are here even. + */ + tctx.crc = 0; + tctx.rdylist_act = 0; + + /* + * We're supposed to assign the rdylist field with the value of the + * traffic class index for the first device. We query the VSI parameters + * again to get what the handle is. Note that every queue is always + * assigned to traffic class zero, because we don't actually use them. 
+ */ + bzero(&context, sizeof (struct i40e_vsi_context)); + context.seid = i40e->i40e_vsi_id; + context.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &context, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d\n", err); + return (B_FALSE); + } + tctx.rdylist = LE_16(context.info.qs_handle[0]); + + err = i40e_clear_lan_tx_queue_context(hw, itrq->itrq_index); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to clear tx queue %d context: %d\n", + itrq->itrq_index, err); + return (B_FALSE); + } + + err = i40e_set_lan_tx_queue_context(hw, itrq->itrq_index, &tctx); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to set tx queue %d context: %d\n", + itrq->itrq_index, err); + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Take care of setting up the descriptor rings and actually programming the + * device. See 8.4.3.1.1 for what we need to do here. + */ +static boolean_t +i40e_setup_tx_rings(i40e_t *i40e) +{ + int i; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + uint32_t reg; + + /* + * Step 1. Clear the queue disable flag and verify that the + * index is set correctly. + */ + i40e_pre_tx_queue_cfg(hw, i, B_TRUE); + + /* + * Step 2. Prepare the queue's FPM/HMC context. + */ + if (i40e_setup_tx_hmc(itrq) == B_FALSE) + return (B_FALSE); + + /* + * Step 3. Verify that it's clear that this PF owns this queue. + */ + reg = I40E_QTX_CTL_PF_QUEUE; + reg |= (hw->pf_id << I40E_QTX_CTL_PF_INDX_SHIFT) & + I40E_QTX_CTL_PF_INDX_MASK; + I40E_WRITE_REG(hw, I40E_QTX_CTL(itrq->itrq_index), reg); + i40e_flush(hw); + + /* + * Step 4. Set the QENA_REQ flag. 
+ */ + reg = I40E_READ_REG(hw, I40E_QTX_ENA(i)); + VERIFY0(reg & (I40E_QTX_ENA_QENA_REQ_MASK | + I40E_QTX_ENA_QENA_STAT_MASK)); + reg |= I40E_QTX_ENA_QENA_REQ_MASK; + I40E_WRITE_REG(hw, I40E_QTX_ENA(i), reg); + } + + /* + * Note, we wait for every queue to be enabled before we start checking. + * This will hopefully cause most queues to be enabled at this point. + */ + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint32_t j, reg; + + /* + * Step 5. Verify that QENA_STAT has been set. It's promised + * that this should occur within about 10 us, but like BSD, + * we'll try for up to 100 ms for this queue. + */ + for (j = 0; j < I40E_RING_WAIT_NTRIES; j++) { + reg = I40E_READ_REG(hw, I40E_QTX_ENA(i)); + + if (reg & I40E_QTX_ENA_QENA_STAT_MASK) + break; + i40e_msec_delay(I40E_RING_WAIT_PAUSE); + } + + if ((reg & I40E_QTX_ENA_QENA_STAT_MASK) == 0) { + i40e_error(i40e, "failed to enable tx queue %d, timed " + "out"); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +void +i40e_stop(i40e_t *i40e, boolean_t free_allocations) +{ + int i; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + /* + * Shutdown and drain the tx and rx pipeline. We do this using the + * following steps. + * + * 1) Shutdown interrupts to all the queues (trying to keep the admin + * queue alive). + * + * 2) Remove all of the interrupt tx and rx causes by setting the + * interrupt linked lists to zero. + * + * 2) Shutdown the tx and rx rings. Because i40e_shutdown_rings() should + * wait for all the queues to be disabled, once we reach that point + * it should be safe to free associated data. + * + * 4) Wait 50ms after all that is done. This ensures that the rings are + * ready for programming again and we don't have to think about this + * in other parts of the driver. + * + * 5) Disable remaining chip interrupts, (admin queue, etc.) + * + * 6) Verify that FM is happy with all the register accesses we + * performed. 
+ */ + i40e_intr_io_disable_all(i40e); + i40e_intr_io_clear_cause(i40e); + + if (i40e_shutdown_rings(i40e) == B_FALSE) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST); + } + + delay(50 * drv_usectohz(1000)); + + i40e_intr_chip_fini(i40e); + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + mutex_enter(&i40e->i40e_trqpairs[i].itrq_rx_lock); + mutex_enter(&i40e->i40e_trqpairs[i].itrq_tx_lock); + } + + /* + * We should consider refactoring this to be part of the ring start / + * stop routines at some point. + */ + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + i40e_stats_trqpair_fini(&i40e->i40e_trqpairs[i]); + } + + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST); + } + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + i40e_tx_cleanup_ring(&i40e->i40e_trqpairs[i]); + } + + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + mutex_exit(&i40e->i40e_trqpairs[i].itrq_rx_lock); + mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); + } + + i40e_stat_vsi_fini(i40e); + + i40e->i40e_link_speed = 0; + i40e->i40e_link_duplex = 0; + i40e_link_state_set(i40e, LINK_STATE_UNKNOWN); + + if (free_allocations) { + i40e_free_ring_mem(i40e, B_FALSE); + } +} + +boolean_t +i40e_start(i40e_t *i40e, boolean_t alloc) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + boolean_t rc = B_TRUE; + int i, err; + + ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); + + if (alloc) { + if (i40e_alloc_ring_mem(i40e) == B_FALSE) { + i40e_error(i40e, + "Failed to allocate ring memory"); + return (B_FALSE); + } + } + + /* + * This should get refactored to be part of ring start and stop at + * some point, along with most of the logic here. 
+ */ + for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + if (i40e_stats_trqpair_init(&i40e->i40e_trqpairs[i]) == + B_FALSE) { + int j; + + for (j = 0; j < i; j++) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[j]; + i40e_stats_trqpair_fini(itrq); + } + return (B_FALSE); + } + } + + if (!i40e_chip_start(i40e)) { + i40e_fm_ereport(i40e, DDI_FM_DEVICE_INVAL_STATE); + rc = B_FALSE; + goto done; + } + + if (i40e_setup_rx_rings(i40e) == B_FALSE) { + rc = B_FALSE; + goto done; + } + + if (i40e_setup_tx_rings(i40e) == B_FALSE) { + rc = B_FALSE; + goto done; + } + + /* + * Enable broadcast traffic; however, do not enable multicast traffic. + * That's handle exclusively through MAC's mc_multicst routines. + */ + err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to set default VSI: %d\n", err); + rc = B_FALSE; + goto done; + } + + err = i40e_aq_set_mac_config(hw, i40e->i40e_frame_max, B_TRUE, 0, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "failed to set MAC config: %d\n", err); + rc = B_FALSE; + goto done; + } + + /* + * Finally, make sure that we're happy from an FM perspective. + */ + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + rc = B_FALSE; + goto done; + } + + /* Clear state bits prior to final interrupt enabling. */ + atomic_and_32(&i40e->i40e_state, + ~(I40E_ERROR | I40E_STALL | I40E_OVERTEMP)); + + i40e_intr_io_enable_all(i40e); + +done: + if (rc == B_FALSE) { + i40e_stop(i40e, B_FALSE); + if (alloc == B_TRUE) { + i40e_free_ring_mem(i40e, B_TRUE); + } + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST); + } + + return (rc); +} + +/* + * We may have loaned up descriptors to the stack. As such, if we still have + * them outstanding, then we will not continue with detach. 
+ */ +static boolean_t +i40e_drain_rx(i40e_t *i40e) +{ + mutex_enter(&i40e->i40e_rx_pending_lock); + while (i40e->i40e_rx_pending > 0) { + if (cv_reltimedwait(&i40e->i40e_rx_pending_cv, + &i40e->i40e_rx_pending_lock, + drv_usectohz(I40E_DRAIN_RX_WAIT), TR_CLOCK_TICK) == -1) { + mutex_exit(&i40e->i40e_rx_pending_lock); + return (B_FALSE); + } + } + mutex_exit(&i40e->i40e_rx_pending_lock); + + return (B_TRUE); +} + +static int +i40e_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) +{ + i40e_t *i40e; + struct i40e_osdep *osdep; + i40e_hw_t *hw; + int instance; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(devinfo); + i40e = kmem_zalloc(sizeof (i40e_t), KM_SLEEP); + + i40e->i40e_aqbuf = kmem_zalloc(I40E_ADMINQ_BUFSZ, KM_SLEEP); + i40e->i40e_instance = instance; + i40e->i40e_dip = devinfo; + + hw = &i40e->i40e_hw_space; + osdep = &i40e->i40e_osdep_space; + hw->back = osdep; + osdep->ios_i40e = i40e; + + ddi_set_driver_private(devinfo, i40e); + + i40e_fm_init(i40e); + i40e->i40e_attach_progress |= I40E_ATTACH_FM_INIT; + + if (pci_config_setup(devinfo, &osdep->ios_cfg_handle) != DDI_SUCCESS) { + i40e_error(i40e, "Failed to map PCI configurations."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_PCI_CONFIG; + + if (!i40e_identify_hardware(i40e)) { + i40e_error(i40e, "Failed to identify hardware"); + goto attach_fail; + } + + if (!i40e_regs_map(i40e)) { + i40e_error(i40e, "Failed to map device registers."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_REGS_MAP; + + i40e_init_properties(i40e); + i40e->i40e_attach_progress |= I40E_ATTACH_PROPS; + + if (!i40e_common_code_init(i40e, hw)) + goto attach_fail; + i40e->i40e_attach_progress |= I40E_ATTACH_COMMON_CODE; + + /* + * When we participate in IRM, we should make sure that we register + * ourselves with it before callbacks. 
+ */ + if (!i40e_alloc_intrs(i40e, devinfo)) { + i40e_error(i40e, "Failed to allocate interrupts."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_INTR; + + if (!i40e_alloc_trqpairs(i40e)) { + i40e_error(i40e, + "Failed to allocate receive & transmit rings."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_ALLOC_RINGSLOCKS; + + if (!i40e_map_intrs_to_vectors(i40e)) { + i40e_error(i40e, "Failed to map interrupts to vectors."); + goto attach_fail; + } + + if (!i40e_add_intr_handlers(i40e)) { + i40e_error(i40e, "Failed to add the interrupt handlers."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_ADD_INTR; + + if (!i40e_final_init(i40e)) { + i40e_error(i40e, "Final initialization failed."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_INIT; + + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_cfg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_LOST); + goto attach_fail; + } + + if (!i40e_stats_init(i40e)) { + i40e_error(i40e, "Stats initialization failed."); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_STATS; + + if (!i40e_register_mac(i40e)) { + i40e_error(i40e, "Failed to register to MAC/GLDv3"); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_MAC; + + i40e->i40e_periodic_id = ddi_periodic_add(i40e_timer, i40e, + I40E_CYCLIC_PERIOD, DDI_IPL_0); + if (i40e->i40e_periodic_id == 0) { + i40e_error(i40e, "Failed to add the link-check timer"); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_LINK_TIMER; + + if (!i40e_enable_interrupts(i40e)) { + i40e_error(i40e, "Failed to enable DDI interrupts"); + goto attach_fail; + } + i40e->i40e_attach_progress |= I40E_ATTACH_ENABLE_INTR; + + atomic_or_32(&i40e->i40e_state, I40E_INITIALIZED); + + mutex_enter(&i40e_glock); + list_insert_tail(&i40e_glist, i40e); + mutex_exit(&i40e_glock); + + return (DDI_SUCCESS); + +attach_fail: + 
i40e_unconfigure(devinfo, i40e); + return (DDI_FAILURE); +} + +static int +i40e_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) +{ + i40e_t *i40e; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + i40e = (i40e_t *)ddi_get_driver_private(devinfo); + if (i40e == NULL) { + i40e_log(NULL, "i40e_detach() called with no i40e pointer!"); + return (DDI_FAILURE); + } + + if (i40e_drain_rx(i40e) == B_FALSE) { + i40e_log(i40e, "timed out draining DMA resources, %d buffers " + "remain", i40e->i40e_rx_pending); + return (DDI_FAILURE); + } + + mutex_enter(&i40e_glock); + list_remove(&i40e_glist, i40e); + mutex_exit(&i40e_glock); + + i40e_unconfigure(devinfo, i40e); + + return (DDI_SUCCESS); +} + +static struct cb_ops i40e_cb_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +static struct dev_ops i40e_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + NULL, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + i40e_attach, /* devo_attach */ + i40e_detach, /* devo_detach */ + nodev, /* devo_reset */ + &i40e_cb_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + ddi_power, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv i40e_modldrv = { + &mod_driverops, + i40e_ident, + &i40e_dev_ops +}; + +static struct modlinkage i40e_modlinkage = { + MODREV_1, + &i40e_modldrv, + NULL +}; + +/* + * Module Initialization Functions. 
+ */ +int +_init(void) +{ + int status; + + list_create(&i40e_glist, sizeof (i40e_t), offsetof(i40e_t, i40e_glink)); + list_create(&i40e_dlist, sizeof (i40e_device_t), + offsetof(i40e_device_t, id_link)); + mutex_init(&i40e_glock, NULL, MUTEX_DRIVER, NULL); + mac_init_ops(&i40e_dev_ops, I40E_MODULE_NAME); + + status = mod_install(&i40e_modlinkage); + if (status != DDI_SUCCESS) { + mac_fini_ops(&i40e_dev_ops); + mutex_destroy(&i40e_glock); + list_destroy(&i40e_dlist); + list_destroy(&i40e_glist); + } + + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&i40e_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int status; + + status = mod_remove(&i40e_modlinkage); + if (status == DDI_SUCCESS) { + mac_fini_ops(&i40e_dev_ops); + mutex_destroy(&i40e_glock); + list_destroy(&i40e_dlist); + list_destroy(&i40e_glist); + } + + return (status); +} diff --git a/usr/src/uts/common/io/i40e/i40e_osdep.c b/usr/src/uts/common/io/i40e/i40e_osdep.c new file mode 100644 index 0000000000..41a13ee4ec --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_osdep.c @@ -0,0 +1,236 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
+ */ + +#include "i40e_sw.h" +#include "i40e_type.h" +#include "i40e_alloc.h" +#include "i40e_osdep.h" + +#include <sys/dtrace.h> + +/* ARGSUSED */ +i40e_status +i40e_allocate_virt_mem(struct i40e_hw *hw, struct i40e_virt_mem *mem, u32 size) +{ + mem->va = kmem_zalloc(size, KM_SLEEP); + mem->size = size; + return (I40E_SUCCESS); +} + +/* ARGSUSED */ +i40e_status +i40e_free_virt_mem(struct i40e_hw *hw, struct i40e_virt_mem *mem) +{ + if (mem->va != NULL) + kmem_free(mem->va, mem->size); + return (I40E_SUCCESS); +} + +/* ARGSUSED */ +i40e_status +i40e_allocate_dma_mem(struct i40e_hw *hw, struct i40e_dma_mem *mem, + enum i40e_memory_type type, u64 size, u32 alignment) +{ + int rc; + i40e_t *i40e = OS_DEP(hw)->ios_i40e; + dev_info_t *dip = i40e->i40e_dip; + size_t len; + ddi_dma_cookie_t cookie; + uint_t cookie_num; + ddi_dma_attr_t attr; + + /* + * Because we need to honor the specified alignment, we need to + * dynamically construct the attributes. We save the alignment for + * debugging purposes. + */ + bcopy(&i40e->i40e_static_dma_attr, &attr, sizeof (ddi_dma_attr_t)); + attr.dma_attr_align = alignment; + mem->idm_alignment = alignment; + rc = ddi_dma_alloc_handle(dip, &i40e->i40e_static_dma_attr, + DDI_DMA_DONTWAIT, NULL, &mem->idm_dma_handle); + if (rc != DDI_SUCCESS) { + mem->idm_dma_handle = NULL; + i40e_error(i40e, "failed to allocate DMA handle for common " + "code: %d", rc); + + /* + * Swallow unknown errors and treat them like we do + * DDI_DMA_NORESOURCES, in other words, a memory error. 
+ */ + if (rc == DDI_DMA_BADATTR) + return (I40E_ERR_PARAM); + return (I40E_ERR_NO_MEMORY); + } + + rc = ddi_dma_mem_alloc(mem->idm_dma_handle, size, + &i40e->i40e_buf_acc_attr, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, + NULL, (caddr_t *)&mem->va, &len, &mem->idm_acc_handle); + if (rc != DDI_SUCCESS) { + mem->idm_acc_handle = NULL; + mem->va = NULL; + ASSERT(mem->idm_dma_handle != NULL); + ddi_dma_free_handle(&mem->idm_dma_handle); + mem->idm_dma_handle = NULL; + + i40e_error(i40e, "failed to allocate %d bytes of DMA memory " + "for common code", size); + return (I40E_ERR_NO_MEMORY); + } + + bzero(mem->va, len); + + rc = ddi_dma_addr_bind_handle(mem->idm_dma_handle, NULL, mem->va, len, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL, + &cookie, &cookie_num); + if (rc != DDI_DMA_MAPPED) { + mem->pa = NULL; + ASSERT(mem->idm_acc_handle != NULL); + ddi_dma_mem_free(&mem->idm_acc_handle); + mem->idm_acc_handle = NULL; + mem->va = NULL; + ASSERT(mem->idm_dma_handle != NULL); + ddi_dma_free_handle(&mem->idm_dma_handle); + mem->idm_dma_handle = NULL; + + i40e_error(i40e, "failed to bind %d byte sized dma region: %d", + len, rc); + switch (rc) { + case DDI_DMA_INUSE: + return (I40E_ERR_NOT_READY); + case DDI_DMA_TOOBIG: + return (I40E_ERR_INVALID_SIZE); + case DDI_DMA_NOMAPPING: + case DDI_DMA_NORESOURCES: + default: + return (I40E_ERR_NO_MEMORY); + } + } + + ASSERT(cookie_num == 1); + mem->pa = cookie.dmac_laddress; + /* + * Lint doesn't like this because the common code gives us a uint64_t as + * input, but the common code then asks us to assign it to a size_t. So + * lint's right, but in this case there isn't much we can do. 
+ */ + mem->size = (size_t)size; + + return (I40E_SUCCESS); +} + +/* ARGSUSED */ +i40e_status +i40e_free_dma_mem(struct i40e_hw *hw, struct i40e_dma_mem *mem) +{ + if (mem->pa != 0) { + VERIFY(mem->idm_dma_handle != NULL); + (void) ddi_dma_unbind_handle(mem->idm_dma_handle); + mem->pa = 0; + mem->size = 0; + } + + if (mem->idm_acc_handle != NULL) { + ddi_dma_mem_free(&mem->idm_acc_handle); + mem->idm_acc_handle = NULL; + mem->va = NULL; + } + + if (mem->idm_dma_handle != NULL) { + ddi_dma_free_handle(&mem->idm_dma_handle); + mem->idm_dma_handle = NULL; + } + + /* + * Watch out for sloppiness. + */ + ASSERT(mem->pa == 0); + ASSERT(mem->va == NULL); + ASSERT(mem->size == 0); + mem->idm_alignment = UINT32_MAX; + + return (I40E_SUCCESS); +} + +/* + * The common code wants to initialize its 'spinlocks' here, aka adaptive + * mutexes. At this time these are only used to maintain the adminq's data and + * as such it will only be used outside of interrupt context and even then, + * we're not going to actually end up ever doing anything above lock level and + * up in doing stuff with high level interrupts. 
+ */ +void +i40e_init_spinlock(struct i40e_spinlock *lock) +{ + mutex_init(&lock->ispl_mutex, NULL, MUTEX_DRIVER, NULL); +} + +void +i40e_acquire_spinlock(struct i40e_spinlock *lock) +{ + mutex_enter(&lock->ispl_mutex); +} + +void +i40e_release_spinlock(struct i40e_spinlock *lock) +{ + mutex_exit(&lock->ispl_mutex); +} + +void +i40e_destroy_spinlock(struct i40e_spinlock *lock) +{ + mutex_destroy(&lock->ispl_mutex); +} + +boolean_t +i40e_set_hw_bus_info(struct i40e_hw *hw) +{ + uint8_t pcie_id = PCI_CAP_ID_PCI_E; + uint16_t pcie_cap, value; + int status; + + /* locate the pci-e capability block */ + status = pci_lcap_locate((OS_DEP(hw))->ios_cfg_handle, pcie_id, + &pcie_cap); + if (status != DDI_SUCCESS) { + i40e_error(OS_DEP(hw)->ios_i40e, "failed to locate PCIe " + "capability block: %d", + status); + return (B_FALSE); + } + + value = pci_config_get16(OS_DEP(hw)->ios_cfg_handle, + pcie_cap + PCIE_LINKSTS); + + i40e_set_pci_config_data(hw, value); + + return (B_TRUE); +} + +/* ARGSUSED */ +void +i40e_debug(void *hw, u32 mask, char *fmt, ...) +{ + char buf[1024]; + va_list args; + + va_start(args, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, args); + va_end(args); + + DTRACE_PROBE2(i40e__debug, uint32_t, mask, char *, buf); +} diff --git a/usr/src/uts/common/io/i40e/i40e_osdep.h b/usr/src/uts/common/io/i40e/i40e_osdep.h new file mode 100644 index 0000000000..12f498bc72 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_osdep.h @@ -0,0 +1,201 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. 
+ * Copyright 2016 Joyent, Inc. + */ + +#ifndef _I40E_OSDEP_H +#define _I40E_OSDEP_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci_cap.h> +#include <sys/sysmacros.h> + +/* + * For the moment, we use this to basically deal with a few custom changes + * particularly around mutex initialization. This is used to indicate that we + * should take illumos variants. + */ +#define I40E_ILLUMOS 1 + +#define DEBUGOUT(S) i40e_debug(NULL, 0, S) +#define DEBUGOUT1(S, A) i40e_debug(NULL, 0, S, A) +#define DEBUGOUT2(S, A, B) i40e_debug(NULL, 0, S, A, B) +#define DEBUGOUT3(S, A, B, C) i40e_debug(NULL, 0, S, A, B, C) +#define DEBUGOUT4(S, A, B, C, D) \ + i40e_debug(NULL, 0, S, A, B, C, D) +#define DEBUGOUT5(S, A, B, C, D, E) \ + i40e_debug(NULL, 0, S, A, B, C, D, E) +#define DEBUGOUT6(S, A, B, C, D, E, F) \ + i40e_debug(NULL, 0, S, A, B, C, D, E, F) +#define DEBUGOUT7(S, A, B, C, D, E, F, G) \ + i40e_debug(NULL, 0, S, A, B, C, D, E, F, G) +#define DEBUGFUNC(F) DEBUGOUT(F); + + +#define UNREFERENCED_PARAMETER(x) _NOTE(ARGUNUSED(x)) +#define UNREFERENCED_1PARAMETER(_p) UNREFERENCED_PARAMETER(_p) +#define UNREFERENCED_2PARAMETER(_p, _q) _NOTE(ARGUNUSED(_p, _q)) +#define UNREFERENCED_3PARAMETER(_p, _q, _r) _NOTE(ARGUNUSED(_p, _q, _r)) +#define UNREFERENCED_4PARAMETER(_p, _q, _r, _s) _NOTE(ARGUNUSED(_p, _q,_r, _s)) + +#define INLINE inline + +/* + * The mdb dmod needs to use this code as well, but mdb already defines TRUE and + * FALSE in the module API. Thus we don't define these if we're building the + * dmod, as indicated by _I40E_MDB_DMOD. However, if we don't define these, then + * the shared code will be upset. 
+ */ +#ifndef _I40E_MDB_DMOD +#define FALSE B_FALSE +#define false B_FALSE +#define TRUE B_TRUE +#define true B_TRUE +#endif /* _I40E_MDB_DMOD */ + + +#define CPU_TO_LE16(o) LE_16(o) +#define CPU_TO_LE32(s) LE_32(s) +#define CPU_TO_LE64(h) LE_64(h) +#define LE16_TO_CPU(a) LE_16(a) +#define LE32_TO_CPU(c) LE_32(c) +#define LE64_TO_CPU(k) LE_64(k) + +#define I40E_NTOHS(a) ntohs(a) +#define I40E_NTOHL(a) ntohl(a) +#define I40E_HTONS(a) htons(a) +#define I40E_HTONL(a) htonl(a) + +#define i40e_memset(a, b, c, d) memset((a), (b), (c)) +#define i40e_memcpy(a, b, c, d) bcopy((b), (a), (c)) + +#define i40e_usec_delay(x) drv_usecwait(x) +#define i40e_msec_delay(x) drv_usecwait(1000 * (x)) + +#define FIELD_SIZEOF(x, y) (sizeof (((x*)0)->y)) + +#define BIT(a) (1UL << (a)) +#define BIT_ULL(a) (1ULL << (a)) + +typedef boolean_t bool; + +typedef uint8_t u8; +typedef int8_t s8; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint64_t u64; + +/* long string relief */ +typedef enum i40e_status_code i40e_status; + +#define __le16 u16 +#define __le32 u32 +#define __le64 u64 +#define __be16 u16 +#define __be32 u32 +#define __be64 u64 + +/* + * Most other systems use spin locks for interrupts. However, illumos always + * uses a single kmutex_t for both and we decide what to do based on IPL (hint: + * it's not going to be a true spin lock, we'll use an adaptive mutex). + */ +struct i40e_spinlock { + kmutex_t ispl_mutex; +}; + +/* + * Note, while prefetch is strictly not present on all architectures, (it's an + * SSE extension on i386), it is expected that the platforms provide it. + */ +#define prefetch(x) prefetch_read_many(x) + +struct i40e_osdep { + off_t ios_reg_size; + ddi_acc_handle_t ios_reg_handle; + ddi_acc_handle_t ios_cfg_handle; + struct i40e *ios_i40e; +}; + +/* + * This structure and its members are defined by the common code. This means we + * cannot structure prefix it, even if we want to. 
+ */ +struct i40e_virt_mem { + void *va; + u32 size; +}; + +/* + * The first three members of this structure are defined by the common code. + * This means we cannot structure prefix them, even if we wanted to. + */ +struct i40e_dma_mem { + void *va; /* Virtual address. */ + u64 pa; /* Physical (DMA/Hardware) address. */ + size_t size; /* Buffer size. */ + + /* illumos-private members */ + ddi_acc_handle_t idm_acc_handle; /* Data access handle */ + ddi_dma_handle_t idm_dma_handle; /* DMA handle */ + uint32_t idm_alignment; /* Requested alignment */ +}; + +struct i40e_hw; /* forward decl */ + +#define OS_DEP(hw) ((struct i40e_osdep *)((hw)->back)) +#define i40e_read_pci_cfg(hw, reg) \ + (pci_config_get16(OS_DEP(hw)->ios_cfg_handle, (reg))) +#define i40e_write_pci_cfg(hw, reg, value) \ + (pci_config_put16(OS_DEP(hw)->ios_cfg_handle, (reg), (value))) + +/* + * Intel expects that the symbol wr32 and r32 be defined to something which can + * read and write the 32-bit register in PCI space. + * + * To make it easier for readers and satisfy the general agreement that macros + * should be in all capitals, we use our own versions of these macros. + */ +#define wr32(hw, reg, value) \ + ddi_put32(OS_DEP(hw)->ios_reg_handle, \ + (uint32_t *)((uintptr_t)(hw)->hw_addr + (reg)), (value)) +#define rd32(hw, reg) \ + ddi_get32(OS_DEP(hw)->ios_reg_handle, \ + (uint32_t *)((uintptr_t)(hw)->hw_addr + (reg))) +#define I40E_WRITE_REG wr32 +#define I40E_READ_REG rd32 + +/* + * The use of GLEN_STAT presumes that we're only using this file for a PF + * driver. If we end up doing a VF driver, then we'll want to logically change + * this. 
+ */ +#define i40e_flush(hw) (void) rd32(hw, I40E_GLGEN_STAT) + +extern void i40e_debug(void *, u32, char *, ...); +extern boolean_t i40e_set_hw_bus_info(struct i40e_hw *); + +#ifdef __cplusplus +} +#endif + +#endif /* _I40E_OSDEP_H */ diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c new file mode 100644 index 0000000000..c7dd403fc8 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -0,0 +1,1310 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + */ + +#include "i40e_sw.h" + +/* + * ------------------- + * Statistics Overview + * ------------------- + * + * As part of managing the driver and understanding what's going on, we keep + * track of statistics from two different sources: + * + * - Statistics from the device + * - Statistics maintained by the driver + * + * Generally, the hardware provides us traditional IETF and MIB Ethernet + * statistics, for example, the total packets in and out, various errors in + * packets, the negotiated status etc. The driver, on the other hand, primarily + * contains statistics around driver-specific issues, such as information about + * checksumming on receive and transmit and the data in and out of a specific + * ring. + * + * We export statistics in two different forms. The first form is the required + * GLDv3 endpoints, specifically: + * + * - The general GLDv3 mc_getstat interface + * - The GLDv3 ring mri_stat interface + * + * The second form that we export statistics is through kstats. 
kstats are + * exported in different ways. Particularly we arrange the kstats to monitor the + * layout of the device. Currently we have kstats which capture both the IEEE + * and driver-implementation specific stats. There are kstats for each of the + * following structures: + * + * - Each physical function + * - Each VSI + * - Each Queue + * + * The PF's kstat is called 'pfstats' so as not to collide with other system + * provided kstats. Thus, for instance 0, usually the first PF, the full kstat + * would be: i40e:0:pfstats:. + * + * The kstat for each VSI is called vsi_%instance. So for the first PF, which is + * instance zero and the first vsi, which has id 0, it will be named vsi_0 and + * the full kstat would be i40e:0:vsi_0:. + * + * The kstat for each queue is trqpair_tx_%queue and trqpair_rx_%queue. Note + * that these are labeled based on their local index, which may mean that + * different instances have overlapping sets of queues. This isn't a problem as + * the kstats will always use the instance number of the pf to distinguish it in + * the kstat tuple. + * + * --------------------- + * Hardware Arrangements + * --------------------- + * + * The hardware keeps statistics at each physical function/MAC (PF) and it keeps + * statistics on each virtual station interface (VSI). Currently we only use one + * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited + * number of statistics units available. While every PF is guaranteed to have a + * statistics unit, it is possible that we will run out for a given VSI. We'll + * have to figure out an appropriate strategy here when we end up supporting + * multiple VSIs. + * + * The hardware keeps these statistics as 32-bit and 48-bit counters. We are + * required to read them and then compute the differences between them. The + * 48-bit counters span more than one 32-bit register in the BAR. 
The hardware + * suggests that to read them, we perform 64-bit reads of the lower of the two + * registers that make up a 48-bit stat. The hardware guarantees that the reads + * of those two registers will be atomic and we'll get a consistent value, not a + * property it has for every read of two registers. + * + * For every kstat we have based on this, we have a corresponding uint64_t that + * we keep around as a base value in a separate structure. Whenever we read a + * value, we end up grabbing the current value, calculating a difference between + * the previously stored value and the current one, and updating the kstat with + * that difference. After which, we go through and update the base value that we + * stored. This is all encapsulated in i40e_stat_get_uint32() and + * i40e_stat_get_uint48(). + * + * The only unfortunate thing here is that the hardware doesn't give us any kind + * of overflow counter. It just tries to make sure that the uint32_t and + * uint48_t counters are large enough to hopefully not overflow right away. This + * isn't the most reassuring statement and we should investigate ways of + * ensuring that if a system is active, but not actively measured, we don't lose + * data. + * + * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the + * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in + * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All + * of this data is protected by the i40e_stat_lock, which should be taken last, + * when acquiring locks. 
+ */ + +static void +i40e_stat_get_uint48(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, + uint64_t *base, boolean_t init) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint64_t raw, delta; + + ASSERT(MUTEX_HELD(&i40e->i40e_stat_lock)); + + raw = ddi_get64(i40e->i40e_osdep_space.ios_reg_handle, + (uint64_t *)((uintptr_t)hw->hw_addr + reg)); + + if (init == B_TRUE) { + *base = raw; + return; + } + + /* + * Check for wraparound, note that the counter is actually only 48-bits, + * even though it has two uint32_t regs present. + */ + if (raw >= *base) { + delta = raw - *base; + } else { + delta = 0x1000000000000ULL - *base + raw; + } + + kstat->value.ui64 += delta; + *base = raw; +} + +static void +i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, + uint64_t *base, boolean_t init) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint64_t raw, delta; + + ASSERT(MUTEX_HELD(&i40e->i40e_stat_lock)); + + raw = ddi_get32(i40e->i40e_osdep_space.ios_reg_handle, + (uint32_t *)((uintptr_t)hw->hw_addr + reg)); + + if (init == B_TRUE) { + *base = raw; + return; + } + + /* + * Watch out for wraparound as we only have a 32-bit counter. 
+ */ + if (raw >= *base) { + delta = raw - *base; + } else { + delta = 0x100000000ULL - *base + raw; + } + + kstat->value.ui64 += delta; + *base = raw; + +} + +static void +i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +{ + i40e_vsi_stats_t *ivs; + i40e_vsi_kstats_t *ivk; + int id = i40e->i40e_vsi_stat_id; + + ASSERT(i40e->i40e_vsi_kstat != NULL); + ivs = &i40e->i40e_vsi_stat; + ivk = i40e->i40e_vsi_kstat->ks_data; + + mutex_enter(&i40e->i40e_stat_lock); + + i40e_stat_get_uint48(i40e, I40E_GLV_GORCL(id), &ivk->ivk_rx_bytes, + &ivs->ivs_rx_bytes, init); + i40e_stat_get_uint48(i40e, I40E_GLV_UPRCL(id), &ivk->ivk_rx_unicast, + &ivs->ivs_rx_unicast, init); + i40e_stat_get_uint48(i40e, I40E_GLV_MPRCL(id), &ivk->ivk_rx_multicast, + &ivs->ivs_rx_multicast, init); + i40e_stat_get_uint48(i40e, I40E_GLV_BPRCL(id), &ivk->ivk_rx_broadcast, + &ivs->ivs_rx_broadcast, init); + + i40e_stat_get_uint32(i40e, I40E_GLV_RDPC(id), &ivk->ivk_rx_discards, + &ivs->ivs_rx_discards, init); + i40e_stat_get_uint32(i40e, I40E_GLV_RUPP(id), + &ivk->ivk_rx_unknown_protocol, + &ivs->ivs_rx_unknown_protocol, + init); + + i40e_stat_get_uint48(i40e, I40E_GLV_GOTCL(id), &ivk->ivk_tx_bytes, + &ivs->ivs_tx_bytes, init); + i40e_stat_get_uint48(i40e, I40E_GLV_UPTCL(id), &ivk->ivk_tx_unicast, + &ivs->ivs_tx_unicast, init); + i40e_stat_get_uint48(i40e, I40E_GLV_MPTCL(id), &ivk->ivk_tx_multicast, + &ivs->ivs_tx_multicast, init); + i40e_stat_get_uint48(i40e, I40E_GLV_BPTCL(id), &ivk->ivk_tx_broadcast, + &ivs->ivs_tx_broadcast, init); + + i40e_stat_get_uint32(i40e, I40E_GLV_TEPC(id), &ivk->ivk_tx_errors, + &ivs->ivs_tx_errors, init); + + mutex_exit(&i40e->i40e_stat_lock); + + /* + * We follow ixgbe's lead here and that if a kstat update didn't work + * 100% then we mark service unaffected as opposed to when fetching + * things for MAC directly. 
+ */ + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_UNAFFECTED); + } +} + +static int +i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) +{ + i40e_t *i40e; + + if (rw == KSTAT_WRITE) + return (EACCES); + + i40e = ksp->ks_private; + i40e_stat_vsi_update(i40e, B_FALSE); + return (0); +} + +void +i40e_stat_vsi_fini(i40e_t *i40e) +{ + if (i40e->i40e_vsi_kstat != NULL) { + kstat_delete(i40e->i40e_vsi_kstat); + i40e->i40e_vsi_kstat = NULL; + } +} + +boolean_t +i40e_stat_vsi_init(i40e_t *i40e) +{ + kstat_t *ksp; + i40e_vsi_kstats_t *ivk; + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + + ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), + buf, "net", KSTAT_TYPE_NAMED, + sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); + + if (ksp == NULL) { + i40e_error(i40e, "Failed to create kstats for VSI %d", + i40e->i40e_vsi_id); + return (B_FALSE); + } + + i40e->i40e_vsi_kstat = ksp; + ivk = ksp->ks_data; + ksp->ks_update = i40e_stat_vsi_kstat_update; + ksp->ks_private = i40e; + + kstat_named_init(&ivk->ivk_rx_bytes, "rx_bytes", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_rx_unicast, "rx_unicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_rx_multicast, "rx_multicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_rx_broadcast, "rx_broadcast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_rx_discards, "rx_discards", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_rx_unknown_protocol, "rx_unknown_protocol", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_tx_bytes, "tx_bytes", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_tx_unicast, "tx_unicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_tx_multicast, "tx_multicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_tx_broadcast, "tx_broadcast", + KSTAT_DATA_UINT64); + kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", + 
KSTAT_DATA_UINT64); + + bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, B_TRUE); + kstat_install(i40e->i40e_vsi_kstat); + + return (B_TRUE); +} + +static void +i40e_stat_pf_update(i40e_t *i40e, boolean_t init) +{ + i40e_pf_stats_t *ips; + i40e_pf_kstats_t *ipk; + int port = i40e->i40e_hw_space.port; + int i; + + ASSERT(i40e->i40e_pf_kstat != NULL); + ips = &i40e->i40e_pf_stat; + ipk = i40e->i40e_pf_kstat->ks_data; + + mutex_enter(&i40e->i40e_stat_lock); + + /* 64-bit PCIe regs */ + i40e_stat_get_uint48(i40e, I40E_GLPRT_GORCL(port), + &ipk->ipk_rx_bytes, &ips->ips_rx_bytes, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_UPRCL(port), + &ipk->ipk_rx_unicast, &ips->ips_rx_unicast, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port), + &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port), + &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_GOTCL(port), + &ipk->ipk_tx_bytes, &ips->ips_tx_bytes, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_UPTCL(port), + &ipk->ipk_tx_unicast, &ips->ips_tx_unicast, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_MPTCL(port), + &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port), + &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, init); + + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC64L(port), + &ipk->ipk_rx_size_64, &ips->ips_rx_size_64, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC127L(port), + &ipk->ipk_rx_size_127, &ips->ips_rx_size_127, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC255L(port), + &ipk->ipk_rx_size_255, &ips->ips_rx_size_255, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC511L(port), + &ipk->ipk_rx_size_511, &ips->ips_rx_size_511, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC1023L(port), + &ipk->ipk_rx_size_1023, &ips->ips_rx_size_1023, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC1522L(port), + 
&ipk->ipk_rx_size_1522, &ips->ips_rx_size_1522, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PRC9522L(port), + &ipk->ipk_rx_size_9522, &ips->ips_rx_size_9522, init); + + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC64L(port), + &ipk->ipk_tx_size_64, &ips->ips_tx_size_64, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC127L(port), + &ipk->ipk_tx_size_127, &ips->ips_tx_size_127, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC255L(port), + &ipk->ipk_tx_size_255, &ips->ips_tx_size_255, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC511L(port), + &ipk->ipk_tx_size_511, &ips->ips_tx_size_511, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC1023L(port), + &ipk->ipk_tx_size_1023, &ips->ips_tx_size_1023, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC1522L(port), + &ipk->ipk_tx_size_1522, &ips->ips_tx_size_1522, init); + i40e_stat_get_uint48(i40e, I40E_GLPRT_PTC9522L(port), + &ipk->ipk_tx_size_9522, &ips->ips_tx_size_9522, init); + + /* 32-bit PCIe regs */ + i40e_stat_get_uint32(i40e, I40E_GLPRT_LXONRXC(port), + &ipk->ipk_link_xon_rx, &ips->ips_link_xon_rx, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_LXOFFRXC(port), + &ipk->ipk_link_xoff_rx, &ips->ips_link_xoff_rx, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_LXONTXC(port), + &ipk->ipk_link_xon_tx, &ips->ips_link_xon_tx, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_LXOFFTXC(port), + &ipk->ipk_link_xoff_tx, &ips->ips_link_xoff_tx, init); + + for (i = 0; i < 8; i++) { + i40e_stat_get_uint32(i40e, I40E_GLPRT_PXONRXC(port, i), + &ipk->ipk_priority_xon_rx[i], &ips->ips_priority_xon_rx[i], + init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_PXOFFRXC(port, i), + &ipk->ipk_priority_xoff_rx[i], + &ips->ips_priority_xoff_rx[i], + init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_PXONTXC(port, i), + &ipk->ipk_priority_xon_tx[i], &ips->ips_priority_xon_tx[i], + init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_PXOFFTXC(port, i), + &ipk->ipk_priority_xoff_tx[i], + &ips->ips_priority_xoff_tx[i], + init); + 
i40e_stat_get_uint32(i40e, I40E_GLPRT_RXON2OFFCNT(port, i), + &ipk->ipk_priority_xon_2_xoff[i], + &ips->ips_priority_xon_2_xoff[i], + init); + } + + i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port), + &ipk->ipk_crc_errors, &ips->ips_crc_errors, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port), + &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_MLFC(port), + &ipk->ipk_mac_local_faults, &ips->ips_mac_local_faults, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_MRFC(port), + &ipk->ipk_mac_remote_faults, &ips->ips_mac_remote_faults, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port), + &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port), + &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port), + &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port), + &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port), + &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RDPC(port), + &ipk->ipk_rx_discards, &ips->ips_rx_discards, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_LDPC(port), + &ipk->ipk_rx_vm_discards, &ips->ips_rx_vm_discards, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port), + &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards, init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_TDOLD(port), + &ipk->ipk_tx_dropped_link_down, &ips->ips_tx_dropped_link_down, + init); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RUPP(port), + &ipk->ipk_rx_unknown_protocol, &ips->ips_rx_unknown_protocol, init); + + /* 64-bit */ + i40e_stat_get_uint48(i40e, I40E_GL_RXERR1_L(port), &ipk->ipk_rx_err1, + &ips->ips_rx_err1, init); + i40e_stat_get_uint48(i40e, I40E_GL_RXERR2_L(port), &ipk->ipk_rx_err2, + &ips->ips_rx_err2, init); + + mutex_exit(&i40e->i40e_stat_lock); + + /* 
+ * We follow ixgbe's lead here and that if a kstat update didn't work + * 100% then we mark service unaffected as opposed to when fetching + * things for MAC directly. + */ + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_UNAFFECTED); + } +} + +static int +i40e_stat_pf_kstat_update(kstat_t *ksp, int rw) +{ + i40e_t *i40e; + + if (rw == KSTAT_WRITE) + return (EACCES); + + i40e = ksp->ks_private; + i40e_stat_pf_update(i40e, B_FALSE); + return (0); +} + + +static boolean_t +i40e_stat_pf_init(i40e_t *i40e) +{ + kstat_t *ksp; + i40e_pf_kstats_t *ipk; + + ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), + "pfstats", "net", KSTAT_TYPE_NAMED, + sizeof (i40e_pf_kstats_t) / sizeof (kstat_named_t), 0); + if (ksp == NULL) { + i40e_error(i40e, "Could not create kernel statistics."); + return (B_FALSE); + } + + i40e->i40e_pf_kstat = ksp; + ipk = ksp->ks_data; + ksp->ks_update = i40e_stat_pf_kstat_update; + ksp->ks_private = i40e; + + kstat_named_init(&ipk->ipk_rx_bytes, "rx_bytes", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_unicast, "rx_unicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_multicast, "rx_multicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_broadcast, "rx_broadcast", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_bytes, "tx_bytes", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_unicast, "tx_unicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_multicast, "tx_multicast", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_broadcast, "tx_broadcast", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_rx_size_64, "rx_size_64", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_size_127, "rx_size_127", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_size_255, "rx_size_255", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_size_511, "rx_size_511", + KSTAT_DATA_UINT64); + 
kstat_named_init(&ipk->ipk_rx_size_1023, "rx_size_1023", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_size_1522, "rx_size_1522", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_size_9522, "rx_size_9522", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_tx_size_64, "tx_size_64", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_127, "tx_size_127", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_255, "tx_size_255", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_511, "tx_size_511", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_1023, "tx_size_1023", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_1522, "tx_size_1522", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_size_9522, "tx_size_9522", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_link_xon_rx, "link_xon_rx", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_link_xoff_rx, "link_xoff_rx", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_link_xon_tx, "link_xon_tx", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_link_xoff_tx, "link_xoff_tx", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[0], "priority_xon_rx[0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[0], "priority_xoff_rx[0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[0], "priority_xon_tx[0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[0], "priority_xoff_tx[0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[0], + "priority_xon_2_xoff[0]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[1], "priority_xon_rx[1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[1], "priority_xoff_rx[1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[1], "priority_xon_tx[1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[1], "priority_xoff_tx[1]", + KSTAT_DATA_UINT64); + 
kstat_named_init(&ipk->ipk_priority_xon_2_xoff[1], + "priority_xon_2_xoff[1]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[2], "priority_xon_rx[2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[2], "priority_xoff_rx[2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[2], "priority_xon_tx[2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[2], "priority_xoff_tx[2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[2], + "priority_xon_2_xoff[2]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[3], "priority_xon_rx[3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[3], "priority_xoff_rx[3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[3], "priority_xon_tx[3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[3], "priority_xoff_tx[3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[3], + "priority_xon_2_xoff[3]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[4], "priority_xon_rx[4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[4], "priority_xoff_rx[4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[4], "priority_xon_tx[4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[4], "priority_xoff_tx[4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[4], + "priority_xon_2_xoff[4]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[5], "priority_xon_rx[5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[5], "priority_xoff_rx[5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[5], "priority_xon_tx[5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[5], "priority_xoff_tx[5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[5], + 
"priority_xon_2_xoff[5]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[6], "priority_xon_rx[6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[6], "priority_xoff_rx[6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[6], "priority_xon_tx[6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[6], "priority_xoff_tx[6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[6], + "priority_xon_2_xoff[6]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_priority_xon_rx[7], "priority_xon_rx[7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_rx[7], "priority_xoff_rx[7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_tx[7], "priority_xon_tx[7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xoff_tx[7], "priority_xoff_tx[7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_priority_xon_2_xoff[7], + "priority_xon_2_xoff[7]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ipk->ipk_crc_errors, "crc_errors", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_illegal_bytes, "illegal_bytes", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_mac_local_faults, "mac_local_faults", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_mac_remote_faults, "mac_remote_faults", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_length_errors, "rx_length_errors", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_undersize, "rx_undersize", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_fragments, "rx_fragments", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_oversize, "rx_oversize", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_jabber, "rx_jabber", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_discards, "rx_discards", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_vm_discards, "rx_vm_discards", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_short_discards, "rx_short_discards", + 
KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_tx_dropped_link_down, "tx_dropped_link_down", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_unknown_protocol, "rx_unknown_protocol", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_err1, "rx_err1", + KSTAT_DATA_UINT64); + kstat_named_init(&ipk->ipk_rx_err2, "rx_err2", + KSTAT_DATA_UINT64); + + + bzero(&i40e->i40e_pf_stat, sizeof (i40e_pf_stats_t)); + i40e_stat_pf_update(i40e, B_TRUE); + + kstat_install(i40e->i40e_pf_kstat); + + return (B_TRUE); +} + +/* + * Tear down the per-instance statistics state: delete the PF kstat if it + * exists and destroy i40e_stat_lock. The VSI kstat is asserted to already be + * NULL by the time this is called. + */ +void +i40e_stats_fini(i40e_t *i40e) +{ + ASSERT(i40e->i40e_vsi_kstat == NULL); + if (i40e->i40e_pf_kstat != NULL) { + kstat_delete(i40e->i40e_pf_kstat); + i40e->i40e_pf_kstat = NULL; + } + + mutex_destroy(&i40e->i40e_stat_lock); +} + +/* + * Initialize i40e_stat_lock and create the PF kstat. If the PF kstat cannot + * be created the lock is destroyed again and B_FALSE is returned; otherwise + * B_TRUE is returned. + */ +boolean_t +i40e_stats_init(i40e_t *i40e) +{ + mutex_init(&i40e->i40e_stat_lock, NULL, MUTEX_DRIVER, NULL); + if (i40e_stat_pf_init(i40e) == B_FALSE) { + mutex_destroy(&i40e->i40e_stat_lock); + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * For Nemo/GLDv3. + * + * Store the value of the requested statistic in *val. Returns ECANCELED while + * the instance is suspended, ENOTSUP for statistics that are not handled + * here, EIO if the register access handle check fails after the read, and 0 + * otherwise. + */ +int +i40e_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + i40e_t *i40e = (i40e_t *)arg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + int port = i40e->i40e_hw_space.port; + i40e_pf_stats_t *ips; + i40e_pf_kstats_t *ipk; + + + ASSERT(i40e->i40e_pf_kstat != NULL); + ips = &i40e->i40e_pf_stat; + ipk = i40e->i40e_pf_kstat->ks_data; + + /* + * We need both locks, as various stats are protected by different + * things here. + */ + mutex_enter(&i40e->i40e_general_lock); + + if (i40e->i40e_state & I40E_SUSPENDED) { + mutex_exit(&i40e->i40e_general_lock); + return (ECANCELED); + } + + mutex_enter(&i40e->i40e_stat_lock); + + /* + * Unfortunately the GLDv3 conflates two rather different things here. + * We're combining statistics about the physical port represented by + * this instance with statistics that describe the properties of the + * logical interface.
As such, we're going to use the various aspects of + * the port to describe these stats as they represent what the physical + * instance is doing, even though that means some tools may be + * confused and that to see the logical traffic on the interface itself + * sans VNICs and the like will require more work. + * + * Stats which are not listed in this switch statement are unimplemented + * at this time in hardware or don't currently apply to the device. + */ + switch (stat) { + /* MIB-II stats (RFC 1213 and RFC 1573) */ + case MAC_STAT_IFSPEED: + *val = i40e->i40e_link_speed * 1000000ull; + break; + case MAC_STAT_MULTIRCV: + i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port), + &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, B_FALSE); + *val = ipk->ipk_rx_multicast.value.ui64; + break; + case MAC_STAT_BRDCSTRCV: + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port), + &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, B_FALSE); + *val = ipk->ipk_rx_broadcast.value.ui64; + break; + case MAC_STAT_MULTIXMT: + i40e_stat_get_uint48(i40e, I40E_GLPRT_MPTCL(port), + &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, B_FALSE); + *val = ipk->ipk_tx_multicast.value.ui64; + break; + case MAC_STAT_BRDCSTXMT: + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port), + &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, B_FALSE); + *val = ipk->ipk_tx_broadcast.value.ui64; + break; + case MAC_STAT_NORCVBUF: + i40e_stat_get_uint32(i40e, I40E_GLPRT_RDPC(port), + &ipk->ipk_rx_discards, &ips->ips_rx_discards, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_LDPC(port), + &ipk->ipk_rx_vm_discards, &ips->ips_rx_vm_discards, + B_FALSE); + *val = ipk->ipk_rx_discards.value.ui64 + + ipk->ipk_rx_vm_discards.value.ui64; + break; + /* + * Note, that some RXERR2 stats are also duplicated by the switch filter + * stats; however, since we're not using those at this time, it seems + * reasonable to include them.
+ */ + case MAC_STAT_IERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port), + &ipk->ipk_crc_errors, &ips->ips_crc_errors, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port), + &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port), + &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors, + B_FALSE); + i40e_stat_get_uint48(i40e, I40E_GL_RXERR1_L(port), + &ipk->ipk_rx_err1, &ips->ips_rx_err1, B_FALSE); + i40e_stat_get_uint48(i40e, I40E_GL_RXERR2_L(port), + &ipk->ipk_rx_err2, &ips->ips_rx_err2, B_FALSE); + + *val = ipk->ipk_crc_errors.value.ui64 + + ipk->ipk_illegal_bytes.value.ui64 + + ipk->ipk_rx_length_errors.value.ui64 + + ipk->ipk_rx_err1.value.ui64 + + ipk->ipk_rx_err2.value.ui64; + break; + case MAC_STAT_UNKNOWNS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_RUPP(port), + &ipk->ipk_rx_unknown_protocol, + &ips->ips_rx_unknown_protocol, + B_FALSE); + *val = ipk->ipk_rx_unknown_protocol.value.ui64; + break; + case MAC_STAT_RBYTES: + i40e_stat_get_uint48(i40e, I40E_GLPRT_GORCL(port), + &ipk->ipk_rx_bytes, &ips->ips_rx_bytes, B_FALSE); + *val = ipk->ipk_rx_bytes.value.ui64; + break; + case MAC_STAT_IPACKETS: + i40e_stat_get_uint48(i40e, I40E_GLPRT_UPRCL(port), + &ipk->ipk_rx_unicast, &ips->ips_rx_unicast, B_FALSE); + i40e_stat_get_uint48(i40e, I40E_GLPRT_MPRCL(port), + &ipk->ipk_rx_multicast, &ips->ips_rx_multicast, B_FALSE); + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPRCL(port), + &ipk->ipk_rx_broadcast, &ips->ips_rx_broadcast, B_FALSE); + *val = ipk->ipk_rx_unicast.value.ui64 + + ipk->ipk_rx_multicast.value.ui64 + + ipk->ipk_rx_broadcast.value.ui64; + break; + case MAC_STAT_OBYTES: + i40e_stat_get_uint48(i40e, I40E_GLPRT_GOTCL(port), + &ipk->ipk_tx_bytes, &ips->ips_tx_bytes, B_FALSE); + *val = ipk->ipk_tx_bytes.value.ui64; + break; + case MAC_STAT_OPACKETS: + i40e_stat_get_uint48(i40e, I40E_GLPRT_UPTCL(port), + &ipk->ipk_tx_unicast, &ips->ips_tx_unicast, B_FALSE); + i40e_stat_get_uint48(i40e, 
I40E_GLPRT_MPTCL(port), + &ipk->ipk_tx_multicast, &ips->ips_tx_multicast, B_FALSE); + i40e_stat_get_uint48(i40e, I40E_GLPRT_BPTCL(port), + &ipk->ipk_tx_broadcast, &ips->ips_tx_broadcast, B_FALSE); + *val = ipk->ipk_tx_unicast.value.ui64 + + ipk->ipk_tx_multicast.value.ui64 + + ipk->ipk_tx_broadcast.value.ui64; + break; + case MAC_STAT_UNDERFLOWS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port), + &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port), + &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port), + &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards, + B_FALSE); + *val = ipk->ipk_rx_undersize.value.ui64 + + ipk->ipk_rx_fragments.value.ui64 + + ipk->ipk_rx_short_discards.value.ui64; + break; + case MAC_STAT_OVERFLOWS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port), + &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port), + &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, B_FALSE); + /* + * Report the two counters refreshed just above (oversize and + * jabber). rx_fragments is not refreshed for this statistic and + * is already accounted for under MAC_STAT_UNDERFLOWS. + */ + *val = ipk->ipk_rx_oversize.value.ui64 + + ipk->ipk_rx_jabber.value.ui64; + break; + + /* RFC 1643 stats */ + case ETHER_STAT_FCS_ERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_CRCERRS(port), + &ipk->ipk_crc_errors, &ips->ips_crc_errors, B_FALSE); + *val = ipk->ipk_crc_errors.value.ui64; + break; + case ETHER_STAT_TOOLONG_ERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_ROC(port), + &ipk->ipk_rx_oversize, &ips->ips_rx_oversize, B_FALSE); + *val = ipk->ipk_rx_oversize.value.ui64; + break; + case ETHER_STAT_MACRCV_ERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_ILLERRC(port), + &ipk->ipk_illegal_bytes, &ips->ips_illegal_bytes, B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RLEC(port), + &ipk->ipk_rx_length_errors, &ips->ips_rx_length_errors, + B_FALSE); + i40e_stat_get_uint32(i40e, I40E_GLPRT_RFC(port), + &ipk->ipk_rx_fragments, &ips->ips_rx_fragments, B_FALSE); + *val = ipk->ipk_illegal_bytes.value.ui64 + +
ipk->ipk_rx_length_errors.value.ui64 + + ipk->ipk_rx_fragments.value.ui64; + break; + /* MII/GMII stats */ + + /* + * The transceiver address is apparently the same as the port number. + */ + case ETHER_STAT_XCVR_ADDR: + *val = i40e->i40e_hw_space.port; + break; + case ETHER_STAT_XCVR_ID: + switch (hw->phy.media_type) { + case I40E_MEDIA_TYPE_BASET: + /* + * Transform the data here into the ID. Note, generally + * the revision is left out. + * + * Each element is widened before shifting: assuming + * phy_id[] holds byte-sized values (TODO confirm against + * the adminq definitions), a plain "phy_id[3] << 24" is + * evaluated as an int and a byte with its top bit set + * would shift into the sign bit and then sign-extend + * into the 64-bit *val. + */ + *val = (uint64_t)i40e->i40e_phy.phy_id[3] << 24 | + (uint64_t)i40e->i40e_phy.phy_id[2] << 16 | + (uint64_t)i40e->i40e_phy.phy_id[1] << 8; + break; + case I40E_MEDIA_TYPE_FIBER: + case I40E_MEDIA_TYPE_BACKPLANE: + case I40E_MEDIA_TYPE_CX4: + case I40E_MEDIA_TYPE_DA: + case I40E_MEDIA_TYPE_VIRTUAL: + *val = (uint64_t)i40e->i40e_phy.phy_id[0] | + (uint64_t)i40e->i40e_phy.phy_id[1] << 8 | + (uint64_t)i40e->i40e_phy.phy_id[2] << 16; + break; + case I40E_MEDIA_TYPE_UNKNOWN: + default: + goto unimpl; + } + break; + case ETHER_STAT_XCVR_INUSE: + switch (hw->phy.link_info.phy_type) { + case I40E_PHY_TYPE_100BASE_TX: + *val = XCVR_100T2; + break; + case I40E_PHY_TYPE_1000BASE_T: + *val = XCVR_1000T; + break; + default: + *val = XCVR_UNDEFINED; + break; + } + break; + + /* + * This group answers the question of do we support a given speed in + * theory. + */ + case ETHER_STAT_CAP_100FDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0; + break; + case ETHER_STAT_CAP_1000FDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0; + break; + case ETHER_STAT_CAP_10GFDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0; + break; + case ETHER_STAT_CAP_40GFDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0; + break; + + /* + * These ask are we currently advertising these speeds and abilities. + * Until we support setting these because we're working with a copper + * PHY, then the only things we advertise are based on the link PHY + * speeds.
In other words, we advertise everything we support. + */ + case ETHER_STAT_ADV_CAP_100FDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_100MB) != 0; + break; + case ETHER_STAT_ADV_CAP_1000FDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_1GB) != 0; + break; + case ETHER_STAT_ADV_CAP_10GFDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_10GB) != 0; + break; + case ETHER_STAT_ADV_CAP_40GFDX: + *val = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0; + break; + + /* + * These ask if the peer supports these speeds, e.g. what did they tell + * us in auto-negotiation. Unfortunately, hardware doesn't appear to + * give us a way to determine whether or not they actually support + * something, only what they have enabled. This means that all we can + * tell the user is the speed that we're currently at, unfortunately. + */ + case ETHER_STAT_LP_CAP_100FDX: + *val = i40e->i40e_link_speed == 100; + break; + case ETHER_STAT_LP_CAP_1000FDX: + *val = i40e->i40e_link_speed == 1000; + break; + case ETHER_STAT_LP_CAP_10GFDX: + *val = i40e->i40e_link_speed == 10000; + break; + case ETHER_STAT_LP_CAP_40GFDX: + *val = i40e->i40e_link_speed == 40000; + break; + + /* + * Statistics for unsupported speeds. Note that these often have the + * same constraints as the other ones. For example, we can't answer the + * question of the ETHER_STAT_LP_CAP family because hardware doesn't + * give us any way of knowing whether or not it does. 
+ */ + case ETHER_STAT_CAP_100HDX: + case ETHER_STAT_CAP_1000HDX: + case ETHER_STAT_CAP_10FDX: + case ETHER_STAT_CAP_10HDX: + case ETHER_STAT_CAP_100T4: + case ETHER_STAT_CAP_100GFDX: + case ETHER_STAT_CAP_2500FDX: + case ETHER_STAT_CAP_5000FDX: + case ETHER_STAT_ADV_CAP_1000HDX: + case ETHER_STAT_ADV_CAP_100HDX: + case ETHER_STAT_ADV_CAP_10FDX: + case ETHER_STAT_ADV_CAP_10HDX: + case ETHER_STAT_ADV_CAP_100T4: + case ETHER_STAT_ADV_CAP_100GFDX: + case ETHER_STAT_ADV_CAP_2500FDX: + case ETHER_STAT_ADV_CAP_5000FDX: + case ETHER_STAT_LP_CAP_1000HDX: + case ETHER_STAT_LP_CAP_100HDX: + case ETHER_STAT_LP_CAP_10FDX: + case ETHER_STAT_LP_CAP_10HDX: + case ETHER_STAT_LP_CAP_100T4: + case ETHER_STAT_LP_CAP_100GFDX: + case ETHER_STAT_LP_CAP_2500FDX: + case ETHER_STAT_LP_CAP_5000FDX: + *val = 0; + break; + + case ETHER_STAT_LINK_DUPLEX: + *val = i40e->i40e_link_duplex; + break; + case ETHER_STAT_TOOSHORT_ERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_RUC(port), + &ipk->ipk_rx_undersize, &ips->ips_rx_undersize, B_FALSE); + + i40e_stat_get_uint32(i40e, I40E_GLPRT_MSPDC(port), + &ipk->ipk_rx_short_discards, &ips->ips_rx_short_discards, + B_FALSE); + *val = ipk->ipk_rx_undersize.value.ui64 + + ipk->ipk_rx_short_discards.value.ui64; + break; + case ETHER_STAT_JABBER_ERRORS: + i40e_stat_get_uint32(i40e, I40E_GLPRT_RJC(port), + &ipk->ipk_rx_jabber, &ips->ips_rx_jabber, B_FALSE); + *val = ipk->ipk_rx_jabber.value.ui64; + break; + + /* + * Non-Link speed related capabilities. + */ + case ETHER_STAT_CAP_AUTONEG: + *val = 1; + break; + + case ETHER_STAT_ADV_CAP_AUTONEG: + *val = 1; + break; + + case ETHER_STAT_LP_CAP_AUTONEG: + *val = (hw->phy.link_info.an_info & I40E_AQ_LP_AN_ABILITY) != 0; + break; + + case ETHER_STAT_LINK_AUTONEG: + *val = 1; + break; + + /* + * Note that while the hardware does support the pause functionality, at + * this time we do not use it at all and effectively disable it. 
+ */ + case ETHER_STAT_CAP_ASMPAUSE: + *val = (i40e->i40e_phy.abilities & + I40E_AQ_PHY_FLAG_PAUSE_RX) != 0; + break; + case ETHER_STAT_CAP_PAUSE: + *val = (i40e->i40e_phy.abilities & + I40E_AQ_PHY_FLAG_PAUSE_TX) != 0; + break; + + /* + * Because we don't support these at this time, they are always + * hard-coded to zero. + */ + case ETHER_STAT_ADV_CAP_ASMPAUSE: + case ETHER_STAT_ADV_CAP_PAUSE: + *val = 0; + break; + + /* + * Like the other LP fields, we can only answer the question have we + * enabled it, not whether the other end actually supports it. + */ + case ETHER_STAT_LP_CAP_ASMPAUSE: + case ETHER_STAT_LINK_ASMPAUSE: + *val = (hw->phy.link_info.an_info & I40E_AQ_LINK_PAUSE_RX) != 0; + break; + case ETHER_STAT_LP_CAP_PAUSE: + case ETHER_STAT_LINK_PAUSE: + *val = (hw->phy.link_info.an_info & I40E_AQ_LINK_PAUSE_TX) != 0; + break; + + default: + unimpl: + mutex_exit(&i40e->i40e_stat_lock); + mutex_exit(&i40e->i40e_general_lock); + return (ENOTSUP); + } + + mutex_exit(&i40e->i40e_stat_lock); + mutex_exit(&i40e->i40e_general_lock); + + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + return (EIO); + } + + return (0); +} + +int +i40e_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) +{ + i40e_trqpair_t *itrq = (i40e_trqpair_t *)rh; + i40e_t *i40e = itrq->itrq_i40e; + + if (i40e->i40e_state & I40E_SUSPENDED) { + return (ECANCELED); + } + + switch (stat) { + case MAC_STAT_RBYTES: + *val = itrq->itrq_rxstat.irxs_bytes.value.ui64; + break; + case MAC_STAT_IPACKETS: + *val = itrq->itrq_rxstat.irxs_packets.value.ui64; + break; + default: + *val = 0; + return (ENOTSUP); + } + + return (0); +} + +int +i40e_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) +{ + i40e_trqpair_t *itrq = (i40e_trqpair_t *)rh; + i40e_t *i40e = itrq->itrq_i40e; + + if (i40e->i40e_state & I40E_SUSPENDED) { + return (ECANCELED); + } + + switch (stat) { + case 
MAC_STAT_OBYTES: + *val = itrq->itrq_txstat.itxs_bytes.value.ui64; + break; + case MAC_STAT_OPACKETS: + *val = itrq->itrq_txstat.itxs_packets.value.ui64; + break; + default: + *val = 0; + return (ENOTSUP); + } + + return (0); +} + +/* + * Delete the TX and RX kstats of a transmit/receive queue pair, if present, + * and clear the corresponding pointers. + * + * When we end up refactoring all of the queue assignments and have non-static + * queue to VSI mappings, then we may need to revisit the general locking + * strategy that we employ and have the kstat creation / deletion be part of the + * ring start and stop routines. + */ +void +i40e_stats_trqpair_fini(i40e_trqpair_t *itrq) +{ + if (itrq->itrq_txkstat != NULL) { + kstat_delete(itrq->itrq_txkstat); + itrq->itrq_txkstat = NULL; + } + + if (itrq->itrq_rxkstat != NULL) { + kstat_delete(itrq->itrq_rxkstat); + itrq->itrq_rxkstat = NULL; + } +} + +/* + * Create, zero and install the per-queue-pair "trqpair_tx_<index>" and + * "trqpair_rx_<index>" named kstats. Both are created as virtual kstats whose + * data is the queue pair's own itrq_txstat and itrq_rxstat. Returns B_FALSE if + * either kstat cannot be created; a TX kstat created before an RX failure is + * deleted again. + */ +boolean_t +i40e_stats_trqpair_init(i40e_trqpair_t *itrq) +{ + char buf[128]; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *tsp = &itrq->itrq_txstat; + i40e_rxq_stat_t *rsp = &itrq->itrq_rxstat; + + (void) snprintf(buf, sizeof (buf), "trqpair_tx_%d", itrq->itrq_index); + itrq->itrq_txkstat = kstat_create(I40E_MODULE_NAME, + ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, + sizeof (i40e_txq_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (itrq->itrq_txkstat == NULL) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), "trqpair_rx_%d", itrq->itrq_index); + itrq->itrq_rxkstat = kstat_create(I40E_MODULE_NAME, + ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, + sizeof (i40e_rxq_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (itrq->itrq_rxkstat == NULL) { + kstat_delete(itrq->itrq_txkstat); + itrq->itrq_txkstat = NULL; + return (B_FALSE); + } + + itrq->itrq_txkstat->ks_data = &itrq->itrq_txstat; + itrq->itrq_rxkstat->ks_data = &itrq->itrq_rxstat; + + kstat_named_init(&tsp->itxs_bytes, "tx_bytes", + KSTAT_DATA_UINT64); + tsp->itxs_bytes.value.ui64 = 0; + kstat_named_init(&tsp->itxs_packets, "tx_packets", + KSTAT_DATA_UINT64);
+ tsp->itxs_packets.value.ui64 = 0; + kstat_named_init(&tsp->itxs_descriptors, "tx_descriptors", + KSTAT_DATA_UINT64); + tsp->itxs_descriptors.value.ui64 = 0; + kstat_named_init(&tsp->itxs_recycled, "tx_recycled", + KSTAT_DATA_UINT64); + tsp->itxs_recycled.value.ui64 = 0; + + kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail", + KSTAT_DATA_UINT64); + tsp->itxs_hck_meoifail.value.ui64 = 0; + kstat_named_init(&tsp->itxs_hck_nol2info, "tx_hck_nol2info", + KSTAT_DATA_UINT64); + tsp->itxs_hck_nol2info.value.ui64 = 0; + kstat_named_init(&tsp->itxs_hck_nol3info, "tx_hck_nol3info", + KSTAT_DATA_UINT64); + tsp->itxs_hck_nol3info.value.ui64 = 0; + kstat_named_init(&tsp->itxs_hck_nol4info, "tx_hck_nol4info", + KSTAT_DATA_UINT64); + tsp->itxs_hck_nol4info.value.ui64 = 0; + kstat_named_init(&tsp->itxs_hck_badl3, "tx_hck_badl3", + KSTAT_DATA_UINT64); + tsp->itxs_hck_badl3.value.ui64 = 0; + kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", + KSTAT_DATA_UINT64); + tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", + KSTAT_DATA_UINT64); + tsp->itxs_err_notcb.value.ui64 = 0; + kstat_named_init(&tsp->itxs_err_nodescs, "tx_err_nodescs", + KSTAT_DATA_UINT64); + tsp->itxs_err_nodescs.value.ui64 = 0; + kstat_named_init(&tsp->itxs_err_context, "tx_err_context", + KSTAT_DATA_UINT64); + tsp->itxs_err_context.value.ui64 = 0; + kstat_named_init(&tsp->itxs_num_unblocked, "tx_num_unblocked", + KSTAT_DATA_UINT64); + tsp->itxs_num_unblocked.value.ui64 = 0; + + + kstat_named_init(&rsp->irxs_bytes, "rx_bytes", + KSTAT_DATA_UINT64); + rsp->irxs_bytes.value.ui64 = 0; + kstat_named_init(&rsp->irxs_packets, "rx_packets", + KSTAT_DATA_UINT64); + rsp->irxs_packets.value.ui64 = 0; + kstat_named_init(&rsp->irxs_rx_desc_error, "rx_desc_error", + KSTAT_DATA_UINT64); + rsp->irxs_rx_desc_error.value.ui64 = 0; + kstat_named_init(&rsp->irxs_rx_intr_limit, "rx_intr_limit", + KSTAT_DATA_UINT64); + rsp->irxs_rx_intr_limit.value.ui64 = 0; + 
kstat_named_init(&rsp->irxs_rx_bind_norcb, "rx_bind_norcb", + KSTAT_DATA_UINT64); + rsp->irxs_rx_bind_norcb.value.ui64 = 0; + kstat_named_init(&rsp->irxs_rx_bind_nomp, "rx_bind_nomp", + KSTAT_DATA_UINT64); + rsp->irxs_rx_bind_nomp.value.ui64 = 0; + kstat_named_init(&rsp->irxs_rx_copy_nomem, "rx_copy_nomem", + KSTAT_DATA_UINT64); + rsp->irxs_rx_copy_nomem.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_v4hdrok, "rx_hck_v4hdrok", + KSTAT_DATA_UINT64); + rsp->irxs_hck_v4hdrok.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_l4hdrok, "rx_hck_l4hdrok", + KSTAT_DATA_UINT64); + rsp->irxs_hck_l4hdrok.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_unknown, "rx_hck_unknown", + KSTAT_DATA_UINT64); + rsp->irxs_hck_unknown.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_nol3l4p, "rx_hck_nol3l4p", + KSTAT_DATA_UINT64); + rsp->irxs_hck_nol3l4p.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_iperr, "rx_hck_iperr", + KSTAT_DATA_UINT64); + rsp->irxs_hck_iperr.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_eiperr, "rx_hck_eiperr", + KSTAT_DATA_UINT64); + rsp->irxs_hck_eiperr.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_l4err, "rx_hck_l4err", + KSTAT_DATA_UINT64); + rsp->irxs_hck_l4err.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_v6skip, "rx_hck_v6skip", + KSTAT_DATA_UINT64); + rsp->irxs_hck_v6skip.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_set, "rx_hck_set", + KSTAT_DATA_UINT64); + rsp->irxs_hck_set.value.ui64 = 0; + kstat_named_init(&rsp->irxs_hck_miss, "rx_hck_miss", + KSTAT_DATA_UINT64); + rsp->irxs_hck_miss.value.ui64 = 0; + + kstat_install(itrq->itrq_txkstat); + kstat_install(itrq->itrq_rxkstat); + + return (B_TRUE); +} diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h new file mode 100644 index 0000000000..04959b1590 --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -0,0 +1,974 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License 
("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + */ + +/* + * Please see i40e_main.c for an introduction to the device driver, its layout, + * and more. + */ + +#ifndef _I40E_SW_H +#define _I40E_SW_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strlog.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/kstat.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/dlpi.h> +#include <sys/mac_provider.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> +#include <sys/pcie.h> +#include <sys/sdt.h> +#include <sys/ethernet.h> +#include <sys/pattr.h> +#include <sys/strsubr.h> +#include <sys/netlb.h> +#include <sys/random.h> +#include <inet/common.h> +#include <inet/tcp.h> +#include <inet/ip.h> +#include <inet/mi.h> +#include <inet/nd.h> +#include <netinet/udp.h> +#include <netinet/sctp.h> +#include <sys/bitmap.h> +#include <sys/cpuvar.h> +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/disp.h> +#include <sys/fm/io/ddi.h> +#include <sys/list.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include "i40e_type.h" +#include "i40e_osdep.h" +#include "i40e_prototype.h" +#include "i40e_xregs.h" + +#define I40E_MODULE_NAME "i40e" + +#define I40E_ADAPTER_REGSET 1 + +/* + * Configuration constants. 
Note that the hardware defines a minimum bound of 32 + * descriptors and requires that the programming of the descriptor lengths be + * aligned in units of 32 descriptors. + */ +#define I40E_MIN_TX_RING_SIZE 64 +#define I40E_MAX_TX_RING_SIZE 4096 +#define I40E_DEF_TX_RING_SIZE 1024 + +#define I40E_MIN_RX_RING_SIZE 64 +#define I40E_MAX_RX_RING_SIZE 4096 +#define I40E_DEF_RX_RING_SIZE 1024 + +#define I40E_DESC_ALIGN 32 + +/* + * Sizes used for asynchronous processing of the adminq. We allocate a fixed + * size buffer for each instance of the device during attach time, rather than + * allocating and freeing one during interrupt processing. + * + * We also define the descriptor size of the admin queue here. + */ +#define I40E_ADMINQ_BUFSZ 4096 +#define I40E_MAX_ADMINQ_SIZE 1024 +#define I40E_DEF_ADMINQ_SIZE 256 + +/* + * Note, while the min and maximum values are based upon the sizing of the ring + * itself, the default is taken from ixgbe without much thought. It's basically + * been cargo culted. See i40e_transceiver.c for a bit more information. + */ +#define I40E_MIN_RX_LIMIT_PER_INTR 16 +#define I40E_MAX_RX_LIMIT_PER_INTR 4096 +#define I40E_DEF_RX_LIMIT_PER_INTR 256 + +/* + * Valid MTU ranges. Note that the XL710's maximum payload is actually 9728. + * However, we need to adjust for the ETHERFCSL (4 bytes) and the Ethernet VLAN + * header size (18 bytes) to get the actual maximum frame we can use. If + * different adapters end up with different sizes, we should make this value a + * bit more dynamic. + */ +#define I40E_MAX_MTU 9706 +#define I40E_MIN_MTU ETHERMIN +#define I40E_DEF_MTU ETHERMTU + +/* + * Interrupt throttling related values. Interrupt throttling values are defined + * in two microsecond increments. Note that a value of zero basically says do no + * ITR activity. A helpful way to think about these is that setting the ITR to a + * value will allow a certain number of interrupts per second. 
+ * + * Our default values for RX allow 20k interrupts per second while our default + * values for TX allow for 5k interrupts per second. For other class interrupts, + * we limit ourselves to a rate of 2k/s. + */ +#define I40E_MIN_ITR 0x0000 +#define I40E_MAX_ITR 0x0FF0 +#define I40E_DEF_RX_ITR 0x0019 +#define I40E_DEF_TX_ITR 0x0064 +#define I40E_DEF_OTHER_ITR 0x00FA + +/* + * Indexes into the three ITR registers that we have. + */ +typedef enum i40e_itr_index { + I40E_ITR_INDEX_RX = 0x0, + I40E_ITR_INDEX_TX = 0x1, + I40E_ITR_INDEX_OTHER = 0x2, + I40E_ITR_INDEX_NONE = 0x3 +} i40e_itr_index_t; + + +/* + * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + */ +#define I40E_LSO_MAXLEN (256 * 1024) + +#define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ +#define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ + +/* + * All the other queue types are defined by the common code. However, this + * is the constant to indicate that it's terminated. + */ +#define I40E_QUEUE_TYPE_EOL 0x7FF + +/* + * See the comments in i40e_buf.c as to the purpose of this value and how it's + * used to ensure that the IP header is eventually aligned when it's received by + * the OS. + */ +#define I40E_BUF_IPHDR_ALIGNMENT 2 + +/* + * The XL710 controller has a limit of eight buffers being allowed to be used + * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * Packet in System Memory. + */ +#define I40E_TX_MAX_COOKIE 8 + +/* + * Sizing to determine the amount of available descriptors at which we'll + * consider ourselves blocked. Also, when we have these available, we'll then + * consider ourselves available to transmit to MAC again. Strictly speaking, the + * MAX is based on the ring size. The default sizing is based on ixgbe. + */ +#define I40E_MIN_TX_BLOCK_THRESH I40E_TX_MAX_COOKIE +#define I40E_DEF_TX_BLOCK_THRESH I40E_MIN_TX_BLOCK_THRESH + +/* + * Sizing for DMA thresholds. 
These are used to indicate whether or not we + * should perform a bcopy or a DMA binding of a given message block. The range + * allows for setting things such that we'll always do a bcopy (a high value) or + * always perform a DMA binding (a low value). + */ +#define I40E_MIN_RX_DMA_THRESH 0 +#define I40E_DEF_RX_DMA_THRESH 256 +#define I40E_MAX_RX_DMA_THRESH INT32_MAX + +#define I40E_MIN_TX_DMA_THRESH 0 +#define I40E_DEF_TX_DMA_THRESH 256 +#define I40E_MAX_TX_DMA_THRESH INT32_MAX + +/* + * Resource sizing counts. There are various aspects of hardware where we may + * have some variable number of elements that we need to handle. Such as the + * hardware capabilities and switch capacities. We cannot know a priori how many + * elements to do, so instead we take a starting guess and then will grow it up + * to an upper bound on a number of elements, to limit memory consumption in + * case of a hardware bug. + */ +#define I40E_HW_CAP_DEFAULT 40 +#define I40E_SWITCH_CAP_DEFAULT 25 + +/* + * Host Memory Context related constants. + */ +#define I40E_HMC_RX_CTX_UNIT 128 +#define I40E_HMC_RX_DBUFF_MIN 1024 +#define I40E_HMC_RX_DBUFF_MAX (16 * 1024 - 128) +#define I40E_HMC_RX_DTYPE_NOSPLIT 0 +#define I40E_HMC_RX_DSIZE_32BYTE 1 +#define I40E_HMC_RX_CRCSTRIP_ENABLE 1 +#define I40E_HMC_RX_FC_DISABLE 0 +#define I40E_HMC_RX_L2TAGORDER 1 +#define I40E_HMC_RX_HDRSPLIT_DISABLE 0 +#define I40E_HMC_RX_INVLAN_DONTSTRIP 0 +#define I40E_HMC_RX_TPH_DISABLE 0 +#define I40E_HMC_RX_LOWRXQ_NOINTR 0 +#define I40E_HMC_RX_PREFENA 1 + +#define I40E_HMC_TX_CTX_UNIT 128 +#define I40E_HMC_TX_NEW_CONTEXT 1 +#define I40E_HMC_TX_FC_DISABLE 0 +#define I40E_HMC_TX_TS_DISABLE 0 +#define I40E_HMC_TX_FD_DISABLE 0 +#define I40E_HMC_TX_ALT_VLAN_DISABLE 0 +#define I40E_HMC_TX_WB_ENABLE 1 +#define I40E_HMC_TX_TPH_DISABLE 0 + +/* + * Whenever we establish and create a VSI, we need to assign some number of + * queues that it's allowed to access from the PF. 
Because we only have a single + * VSI per PF at this time, we assign it all the queues. + * + * Many of the devices support what's called Data-center Bridging. Which is a + * feature that we don't have much use of at this time. However, we still need + * to fill in this information. We follow the guidance of the note in Table 7-80 + * which talks about bytes 62-77. It says that if we don't want to assign + * anything to traffic classes, we should set the field to zero. Effectively + * this means that everything in the system is assigned to traffic class zero. + */ +#define I40E_ASSIGN_ALL_QUEUES 0 +#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 + +/* + * This defines the error mask that we care about from rx descriptors. Currently + * we're only concerned with the general errors and oversize errors. + */ +#define I40E_RX_ERR_BITS ((1 << I40E_RX_DESC_ERROR_RXE_SHIFT) | \ + (1 << I40E_RX_DESC_ERROR_OVERSIZE_SHIFT)) + +/* + * Property sizing macros for firmware versions, etc. They need to be large + * enough to hold 32-bit quantities transformed to strings as %d.%d or %x. + */ +#define I40E_DDI_PROP_LEN 64 + +/* + * We currently consolidate some overrides that we use in the code here. These + * will be gone in the fullness of time, but as we're bringing up the device, + * this is what we use. + */ +#define I40E_GROUP_MAX 1 +#define I40E_TRQPAIR_MAX 1 + +#define I40E_GROUP_NOMSIX 1 +#define I40E_TRQPAIR_NOMSIX 1 + +/* + * It seems reasonable to cast this to void because the only reason that we + * should be getting a DDI_FAILURE is due to the fact that we specify addresses + * out of range. Because we specify no offset or address, it shouldn't happen. 
+ */ +#ifdef DEBUG +#define I40E_DMA_SYNC(handle, flag) ASSERT0(ddi_dma_sync( \ + (handle)->dmab_dma_handle, 0, 0, \ + (flag))) +#else /* !DEBUG */ +#define I40E_DMA_SYNC(handle, flag) ((void) ddi_dma_sync( \ + (handle)->dmab_dma_handle, 0, 0, \ + (flag))) +#endif /* DEBUG */ + +/* + * Constants related to ring startup and teardown. These refer to the amount of + * time that we're willing to wait for a ring to spin up and spin down. + */ +#define I40E_RING_WAIT_NTRIES 10 +#define I40E_RING_WAIT_PAUSE 10 /* ms */ + +/* + * Bit flags for attach_progress + */ +typedef enum i40e_attach_state { + I40E_ATTACH_PCI_CONFIG = 0x0001, /* PCI config setup */ + I40E_ATTACH_REGS_MAP = 0x0002, /* Registers mapped */ + I40E_ATTACH_PROPS = 0x0004, /* Properties initialized */ + I40E_ATTACH_ALLOC_INTR = 0x0008, /* Interrupts allocated */ + I40E_ATTACH_ALLOC_RINGSLOCKS = 0x0010, /* Rings & locks allocated */ + I40E_ATTACH_ADD_INTR = 0x0020, /* Intr handlers added */ + I40E_ATTACH_COMMON_CODE = 0x0040, /* Intel code initialized */ + I40E_ATTACH_INIT = 0x0080, /* Device initialized */ + I40E_ATTACH_STATS = 0x0200, /* Kstats created */ + I40E_ATTACH_MAC = 0x0800, /* MAC registered */ + I40E_ATTACH_ENABLE_INTR = 0x1000, /* DDI interrupts enabled */ + I40E_ATTACH_FM_INIT = 0x2000, /* FMA initialized */ + I40E_ATTACH_LINK_TIMER = 0x4000, /* link check timer */ +} i40e_attach_state_t; + + +/* + * State flags that describe what's going on in the device. Some of these state + * flags indicate some aspirational work that needs to happen in the driver. + * + * I40E_UNKNOWN: The device has yet to be started. + * I40E_INITIALIZED: The device has been fully attached. + * I40E_STARTED: The device has come out of the GLDV3 start routine. + * I40E_SUSPENDED: The device is suspended and I/O among other things + * should not occur. This happens because of an actual + * DDI_SUSPEND or interrupt adjustments. + * I40E_STALL: The tx stall detection logic has found a stall. 
+ * I40E_OVERTEMP: The device has encountered a temperature alarm. + * I40E_INTR_ADJUST: Our interrupts are being manipulated and therefore we + * shouldn't be manipulating their state. + * I40E_ERROR: We've detected an FM error and degraded the device. + */ +typedef enum i40e_state { + I40E_UNKNOWN = 0x00, + I40E_INITIALIZED = 0x01, + I40E_STARTED = 0x02, + I40E_SUSPENDED = 0x04, + I40E_STALL = 0x08, + I40E_OVERTEMP = 0x20, + I40E_INTR_ADJUST = 0x40, + I40E_ERROR = 0x80 +} i40e_state_t; + + +/* + * Definitions for common Intel things that we use and some slightly more usable + * names. + */ +typedef struct i40e_hw i40e_hw_t; +typedef struct i40e_aqc_switch_resource_alloc_element_resp i40e_switch_rsrc_t; + +/* + * Handles and addresses of DMA buffers. + */ +typedef struct i40e_dma_buffer { + caddr_t dmab_address; /* Virtual address */ + uint64_t dmab_dma_address; /* DMA (Hardware) address */ + ddi_acc_handle_t dmab_acc_handle; /* Data access handle */ + ddi_dma_handle_t dmab_dma_handle; /* DMA handle */ + size_t dmab_size; /* Buffer size */ + size_t dmab_len; /* Data length in the buffer */ +} i40e_dma_buffer_t; + +/* + * RX Control Block + */ +typedef struct i40e_rx_control_block { + mblk_t *rcb_mp; + uint32_t rcb_ref; + i40e_dma_buffer_t rcb_dma; + frtn_t rcb_free_rtn; + struct i40e_rx_data *rcb_rxd; +} i40e_rx_control_block_t; + +typedef enum { + I40E_TX_NONE, + I40E_TX_COPY, + I40E_TX_DMA +} i40e_tx_type_t; + +typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef union i40e_32byte_rx_desc i40e_rx_desc_t; + +typedef struct i40e_tx_control_block { + struct i40e_tx_control_block *tcb_next; + mblk_t *tcb_mp; + i40e_tx_type_t tcb_type; + ddi_dma_handle_t tcb_dma_handle; + i40e_dma_buffer_t tcb_dma; +} i40e_tx_control_block_t; + +/* + * Receive ring data (used below). 
+ */ +typedef struct i40e_rx_data { + struct i40e *rxd_i40e; + + /* + * RX descriptor ring definitions + */ + i40e_dma_buffer_t rxd_desc_area; /* DMA buffer of rx desc ring */ + i40e_rx_desc_t *rxd_desc_ring; /* Rx desc ring */ + uint32_t rxd_desc_next; /* Index of next rx desc */ + + /* + * RX control block list definitions + */ + kmutex_t rxd_free_lock; /* Lock to protect free data */ + i40e_rx_control_block_t *rxd_rcb_area; /* Array of control blocks */ + i40e_rx_control_block_t **rxd_work_list; /* Work list of rcbs */ + i40e_rx_control_block_t **rxd_free_list; /* Free list of rcbs */ + uint32_t rxd_rcb_free; /* Number of free rcbs */ + + /* + * RX software ring settings + */ + uint32_t rxd_ring_size; /* Rx descriptor ring size */ + uint32_t rxd_free_list_size; /* Rx free list size */ + + /* + * RX outstanding data. This is used to keep track of outstanding loaned + * descriptors after we've shut down receiving information. Note these + * are protected by the i40e_t`i40e_rx_pending_lock. + */ + uint32_t rxd_rcb_pending; + boolean_t rxd_shutdown; +} i40e_rx_data_t; + +/* + * Structures for unicast and multicast addresses. Note that we keep the VSI id + * around for unicast addresses, since they may belong to different VSIs. + * However, since all multicast addresses belong to the default VSI, we don't + * duplicate that information. + */ +typedef struct i40e_uaddr { + uint8_t iua_mac[ETHERADDRL]; + int iua_vsi; +} i40e_uaddr_t; + +typedef struct i40e_maddr { + uint8_t ima_mac[ETHERADDRL]; +} i40e_maddr_t; + +/* + * Collection of RX statistics on a given queue. + */ +typedef struct i40e_rxq_stat { + /* + * The i40e hardware does not maintain statistics on a per-ring basis, + * only on a per-PF and per-VSI level. As such, to satisfy the GLDv3, we + * need to maintain our own stats for packets and bytes. 
+ */ + kstat_named_t irxs_bytes; /* Bytes in on queue */ + kstat_named_t irxs_packets; /* Packets in on queue */ + + /* + * The following set of stats cover non-checksum data path issues. + */ + kstat_named_t irxs_rx_desc_error; /* Error bit set on desc */ + kstat_named_t irxs_rx_copy_nomem; /* allocb failure for copy */ + kstat_named_t irxs_rx_intr_limit; /* Hit i40e_rx_limit_per_intr */ + kstat_named_t irxs_rx_bind_norcb; /* No replacement rcb free */ + kstat_named_t irxs_rx_bind_nomp; /* No mblk_t in bind rcb */ + + /* + * The following set of statistics covers rx checksum related activity. + * These are all primarily set in i40e_rx_hcksum. If rx checksum + * activity is disabled, then these should all be zero. + */ + kstat_named_t irxs_hck_v4hdrok; /* Valid IPv4 Header */ + kstat_named_t irxs_hck_l4hdrok; /* Valid L4 Header */ + kstat_named_t irxs_hck_unknown; /* !pinfo.known */ + kstat_named_t irxs_hck_nol3l4p; /* Missing L3L4P bit in desc */ + kstat_named_t irxs_hck_iperr; /* IPE error bit set */ + kstat_named_t irxs_hck_eiperr; /* EIPE error bit set */ + kstat_named_t irxs_hck_l4err; /* L4E error bit set */ + kstat_named_t irxs_hck_v6skip; /* IPv6 case hw fails on */ + kstat_named_t irxs_hck_set; /* Total times we set cksum */ + kstat_named_t irxs_hck_miss; /* Times with zero cksum bits */ +} i40e_rxq_stat_t; + +/* + * Collection of TX Statistics on a given queue + */ +typedef struct i40e_txq_stat { + kstat_named_t itxs_bytes; /* Bytes out on queue */ + kstat_named_t itxs_packets; /* Packets out on queue */ + kstat_named_t itxs_descriptors; /* Descriptors issued */ + kstat_named_t itxs_recycled; /* Descriptors reclaimed */ + /* + * Various failure conditions. 
+ */ + kstat_named_t itxs_hck_meoifail; /* ether offload failures */ + kstat_named_t itxs_hck_nol2info; /* Missing l2 info */ + kstat_named_t itxs_hck_nol3info; /* Missing l3 info */ + kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ + kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ + kstat_named_t itxs_hck_badl4; /* Bad L4 Payload */ + + kstat_named_t itxs_err_notcb; /* No tcb's available */ + kstat_named_t itxs_err_nodescs; /* No descriptors available */ + kstat_named_t itxs_err_context; /* Total context failures */ + + kstat_named_t itxs_num_unblocked; /* Number of MAC unblocks */ +} i40e_txq_stat_t; + +/* + * An instance of an XL710 transmit/receive queue pair. This currently + * represents a combination of both a transmit and receive ring, though they + * should really be split apart into separate logical structures. Unfortunately, + * during initial work we mistakenly joined them together. + */ +typedef struct i40e_trqpair { + struct i40e *itrq_i40e; + + /* Receive-side structures. */ + kmutex_t itrq_rx_lock; + mac_ring_handle_t itrq_macrxring; /* Receive ring handle. */ + i40e_rx_data_t *itrq_rxdata; /* Receive ring rx data. */ + uint64_t itrq_rxgen; /* Generation number for mac/GLDv3. */ + uint32_t itrq_index; /* Queue index in the PF */ + uint32_t itrq_rx_intrvec; /* Receive interrupt vector. */ + + /* Receive-side stats. */ + i40e_rxq_stat_t itrq_rxstat; + kstat_t *itrq_rxkstat; + + /* Transmit-side structures. */ + kmutex_t itrq_tx_lock; + mac_ring_handle_t itrq_mactxring; /* Transmit ring handle. */ + uint32_t itrq_tx_intrvec; /* Transmit interrupt vector. */ + boolean_t itrq_tx_blocked; /* Does MAC think we're blocked? 
*/ + + /* + * TX data sizing + */ + uint32_t itrq_tx_ring_size; + uint32_t itrq_tx_free_list_size; + + /* + * TX descriptor ring data + */ + i40e_dma_buffer_t itrq_desc_area; /* DMA buffer of tx desc ring */ + i40e_tx_desc_t *itrq_desc_ring; /* TX Desc ring */ + volatile uint32_t *itrq_desc_wbhead; /* TX write-back index */ + uint32_t itrq_desc_head; /* Last index hw freed */ + uint32_t itrq_desc_tail; /* Index of next free desc */ + uint32_t itrq_desc_free; /* Number of free descriptors */ + + /* + * TX control block (tcb) data + */ + kmutex_t itrq_tcb_lock; + i40e_tx_control_block_t *itrq_tcb_area; /* Array of control blocks */ + i40e_tx_control_block_t **itrq_tcb_work_list; /* In use tcb */ + i40e_tx_control_block_t **itrq_tcb_free_list; /* Available tcb */ + uint32_t itrq_tcb_free; /* Count of free tcb */ + + /* Transmit-side stats. */ + i40e_txq_stat_t itrq_txstat; + kstat_t *itrq_txkstat; + +} i40e_trqpair_t; + +/* + * VSI statistics. + * + * This mirrors the i40e_eth_stats structure but transforms it into a kstat. + * Note that the stock statistic structure also includes entries for tx + * discards. However, this is not actually implemented for the VSI (see Table + * 7-221), hence why we don't include the member which would always have a value + * of zero. This choice was made to minimize confusion to someone looking at + * these, as a value of zero does not necessarily equate to the fact that it's + * not implemented. 
+ */ +typedef struct i40e_vsi_stats { + uint64_t ivs_rx_bytes; /* gorc */ + uint64_t ivs_rx_unicast; /* uprc */ + uint64_t ivs_rx_multicast; /* mprc */ + uint64_t ivs_rx_broadcast; /* bprc */ + uint64_t ivs_rx_discards; /* rdpc */ + uint64_t ivs_rx_unknown_protocol; /* rupp */ + uint64_t ivs_tx_bytes; /* gotc */ + uint64_t ivs_tx_unicast; /* uptc */ + uint64_t ivs_tx_multicast; /* mptc */ + uint64_t ivs_tx_broadcast; /* bptc */ + uint64_t ivs_tx_errors; /* tepc */ +} i40e_vsi_stats_t; + +typedef struct i40e_vsi_kstats { + kstat_named_t ivk_rx_bytes; + kstat_named_t ivk_rx_unicast; + kstat_named_t ivk_rx_multicast; + kstat_named_t ivk_rx_broadcast; + kstat_named_t ivk_rx_discards; + kstat_named_t ivk_rx_unknown_protocol; + kstat_named_t ivk_tx_bytes; + kstat_named_t ivk_tx_unicast; + kstat_named_t ivk_tx_multicast; + kstat_named_t ivk_tx_broadcast; + kstat_named_t ivk_tx_errors; +} i40e_vsi_kstats_t; + +/* + * For pf statistics, we opt not to use the standard statistics as defined by + * the Intel common code. This also currently combines statistics that are + * global across the entire device. 
+ */ +typedef struct i40e_pf_stats { + uint64_t ips_rx_bytes; /* gorc */ + uint64_t ips_rx_unicast; /* uprc */ + uint64_t ips_rx_multicast; /* mprc */ + uint64_t ips_rx_broadcast; /* bprc */ + uint64_t ips_tx_bytes; /* gotc */ + uint64_t ips_tx_unicast; /* uptc */ + uint64_t ips_tx_multicast; /* mptc */ + uint64_t ips_tx_broadcast; /* bptc */ + + uint64_t ips_rx_size_64; /* prc64 */ + uint64_t ips_rx_size_127; /* prc127 */ + uint64_t ips_rx_size_255; /* prc255 */ + uint64_t ips_rx_size_511; /* prc511 */ + uint64_t ips_rx_size_1023; /* prc1023 */ + uint64_t ips_rx_size_1522; /* prc1522 */ + uint64_t ips_rx_size_9522; /* prc9522 */ + + uint64_t ips_tx_size_64; /* ptc64 */ + uint64_t ips_tx_size_127; /* ptc127 */ + uint64_t ips_tx_size_255; /* ptc255 */ + uint64_t ips_tx_size_511; /* ptc511 */ + uint64_t ips_tx_size_1023; /* ptc1023 */ + uint64_t ips_tx_size_1522; /* ptc1522 */ + uint64_t ips_tx_size_9522; /* ptc9522 */ + + uint64_t ips_link_xon_rx; /* lxonrxc */ + uint64_t ips_link_xoff_rx; /* lxoffrxc */ + uint64_t ips_link_xon_tx; /* lxontxc */ + uint64_t ips_link_xoff_tx; /* lxofftxc */ + uint64_t ips_priority_xon_rx[8]; /* pxonrxc[8] */ + uint64_t ips_priority_xoff_rx[8]; /* pxoffrxc[8] */ + uint64_t ips_priority_xon_tx[8]; /* pxontxc[8] */ + uint64_t ips_priority_xoff_tx[8]; /* pxofftxc[8] */ + uint64_t ips_priority_xon_2_xoff[8]; /* rxon2offcnt[8] */ + + uint64_t ips_crc_errors; /* crcerrs */ + uint64_t ips_illegal_bytes; /* illerrc */ + uint64_t ips_mac_local_faults; /* mlfc */ + uint64_t ips_mac_remote_faults; /* mrfc */ + uint64_t ips_rx_length_errors; /* rlec */ + uint64_t ips_rx_undersize; /* ruc */ + uint64_t ips_rx_fragments; /* rfc */ + uint64_t ips_rx_oversize; /* roc */ + uint64_t ips_rx_jabber; /* rjc */ + uint64_t ips_rx_discards; /* rdpc */ + uint64_t ips_rx_vm_discards; /* ldpc */ + uint64_t ips_rx_short_discards; /* mspdc */ + uint64_t ips_tx_dropped_link_down; /* tdold */ + uint64_t ips_rx_unknown_protocol; /* rupp */ + uint64_t ips_rx_err1; /* 
rxerr1 */ + uint64_t ips_rx_err2; /* rxerr2 */ +} i40e_pf_stats_t; + +typedef struct i40e_pf_kstats { + kstat_named_t ipk_rx_bytes; /* gorc */ + kstat_named_t ipk_rx_unicast; /* uprc */ + kstat_named_t ipk_rx_multicast; /* mprc */ + kstat_named_t ipk_rx_broadcast; /* bprc */ + kstat_named_t ipk_tx_bytes; /* gotc */ + kstat_named_t ipk_tx_unicast; /* uptc */ + kstat_named_t ipk_tx_multicast; /* mptc */ + kstat_named_t ipk_tx_broadcast; /* bptc */ + + kstat_named_t ipk_rx_size_64; /* prc64 */ + kstat_named_t ipk_rx_size_127; /* prc127 */ + kstat_named_t ipk_rx_size_255; /* prc255 */ + kstat_named_t ipk_rx_size_511; /* prc511 */ + kstat_named_t ipk_rx_size_1023; /* prc1023 */ + kstat_named_t ipk_rx_size_1522; /* prc1522 */ + kstat_named_t ipk_rx_size_9522; /* prc9522 */ + + kstat_named_t ipk_tx_size_64; /* ptc64 */ + kstat_named_t ipk_tx_size_127; /* ptc127 */ + kstat_named_t ipk_tx_size_255; /* ptc255 */ + kstat_named_t ipk_tx_size_511; /* ptc511 */ + kstat_named_t ipk_tx_size_1023; /* ptc1023 */ + kstat_named_t ipk_tx_size_1522; /* ptc1522 */ + kstat_named_t ipk_tx_size_9522; /* ptc9522 */ + + kstat_named_t ipk_link_xon_rx; /* lxonrxc */ + kstat_named_t ipk_link_xoff_rx; /* lxoffrxc */ + kstat_named_t ipk_link_xon_tx; /* lxontxc */ + kstat_named_t ipk_link_xoff_tx; /* lxofftxc */ + kstat_named_t ipk_priority_xon_rx[8]; /* pxonrxc[8] */ + kstat_named_t ipk_priority_xoff_rx[8]; /* pxoffrxc[8] */ + kstat_named_t ipk_priority_xon_tx[8]; /* pxontxc[8] */ + kstat_named_t ipk_priority_xoff_tx[8]; /* pxofftxc[8] */ + kstat_named_t ipk_priority_xon_2_xoff[8]; /* rxon2offcnt[8] */ + + kstat_named_t ipk_crc_errors; /* crcerrs */ + kstat_named_t ipk_illegal_bytes; /* illerrc */ + kstat_named_t ipk_mac_local_faults; /* mlfc */ + kstat_named_t ipk_mac_remote_faults; /* mrfc */ + kstat_named_t ipk_rx_length_errors; /* rlec */ + kstat_named_t ipk_rx_undersize; /* ruc */ + kstat_named_t ipk_rx_fragments; /* rfc */ + kstat_named_t ipk_rx_oversize; /* roc */ + kstat_named_t 
ipk_rx_jabber; /* rjc */ + kstat_named_t ipk_rx_discards; /* rdpc */ + kstat_named_t ipk_rx_vm_discards; /* ldpc */ + kstat_named_t ipk_rx_short_discards; /* mspdc */ + kstat_named_t ipk_tx_dropped_link_down; /* tdold */ + kstat_named_t ipk_rx_unknown_protocol; /* rupp */ + kstat_named_t ipk_rx_err1; /* rxerr1 */ + kstat_named_t ipk_rx_err2; /* rxerr2 */ +} i40e_pf_kstats_t; + +/* + * Resources that are pooled and specific to a given i40e_t. + */ +typedef struct i40e_func_rsrc { + uint_t ifr_nrx_queue; + uint_t ifr_nrx_queue_used; + uint_t ifr_ntx_queue; + uint_t ifr_trx_queue_used; + uint_t ifr_nvsis; + uint_t ifr_nvsis_used; + uint_t ifr_nmacfilt; + uint_t ifr_nmacfilt_used; + uint_t ifr_nmcastfilt; + uint_t ifr_nmcastfilt_used; +} i40e_func_rsrc_t; + +/* + * Main i40e per-instance state. + */ +typedef struct i40e { + list_node_t i40e_glink; /* Global list link */ + list_node_t i40e_dlink; /* Device list link */ + kmutex_t i40e_general_lock; /* General device lock */ + + /* + * General Data and management + */ + dev_info_t *i40e_dip; + int i40e_instance; + int i40e_fm_capabilities; + uint_t i40e_state; + i40e_attach_state_t i40e_attach_progress; + mac_handle_t i40e_mac_hdl; + ddi_periodic_t i40e_periodic_id; + + /* + * Pointers to common code data structures and memory for the common + * code. + */ + struct i40e_hw i40e_hw_space; + struct i40e_osdep i40e_osdep_space; + struct i40e_aq_get_phy_abilities_resp i40e_phy; + void *i40e_aqbuf; + + /* + * Device state, switch information, and resources. 
+ */ + int i40e_vsi_id; + struct i40e_device *i40e_device; + i40e_func_rsrc_t i40e_resources; + uint16_t i40e_switch_rsrc_alloc; + uint16_t i40e_switch_rsrc_actual; + i40e_switch_rsrc_t *i40e_switch_rsrcs; + i40e_uaddr_t *i40e_uaddrs; + i40e_maddr_t *i40e_maddrs; + int i40e_mcast_promisc_count; + boolean_t i40e_promisc_on; + link_state_t i40e_link_state; + uint32_t i40e_link_speed; /* In Mbps */ + link_duplex_t i40e_link_duplex; + uint_t i40e_sdu; + uint_t i40e_frame_max; + + /* + * Transmit and receive information, tunables, and MAC info. + */ + i40e_trqpair_t *i40e_trqpairs; + boolean_t i40e_mr_enable; + int i40e_num_trqpairs; + uint_t i40e_other_itr; + + int i40e_num_rx_groups; + int i40e_num_rx_descs; + mac_group_handle_t i40e_rx_group_handle; + uint32_t i40e_rx_ring_size; + uint32_t i40e_rx_buf_size; + boolean_t i40e_rx_hcksum_enable; + uint32_t i40e_rx_dma_min; + uint32_t i40e_rx_limit_per_intr; + uint_t i40e_rx_itr; + + int i40e_num_tx_descs; + uint32_t i40e_tx_ring_size; + uint32_t i40e_tx_buf_size; + uint32_t i40e_tx_block_thresh; + boolean_t i40e_tx_hcksum_enable; + uint32_t i40e_tx_dma_min; + uint_t i40e_tx_itr; + + /* + * Interrupt state + * + * Note that the use of a single boolean_t for i40e_intr_poll isn't + * really the best design. When we have more than a single ring on the + * device working, we'll transition to using something more + * sophisticated. + */ + uint_t i40e_intr_pri; + uint_t i40e_intr_force; + uint_t i40e_intr_type; + int i40e_intr_cap; + uint32_t i40e_intr_count; + uint32_t i40e_intr_count_max; + uint32_t i40e_intr_count_min; + size_t i40e_intr_size; + ddi_intr_handle_t *i40e_intr_handles; + ddi_cb_handle_t i40e_callback_handle; + boolean_t i40e_intr_poll; + + /* + * DMA attributes. See i40e_buf.c for why we have copies of them in the + * i40e_t. 
+ */ + ddi_dma_attr_t i40e_static_dma_attr; + ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_device_acc_attr_t i40e_desc_acc_attr; + ddi_device_acc_attr_t i40e_buf_acc_attr; + + /* + * The following two fields are used to protect and keep track of + * outstanding, loaned buffers to MAC. If we have these, we can't + * detach as we have active DMA memory outstanding. + */ + kmutex_t i40e_rx_pending_lock; + kcondvar_t i40e_rx_pending_cv; + uint32_t i40e_rx_pending; + + /* + * PF statistics and VSI statistics. + */ + kmutex_t i40e_stat_lock; + kstat_t *i40e_pf_kstat; + kstat_t *i40e_vsi_kstat; + i40e_pf_stats_t i40e_pf_stat; + i40e_vsi_stats_t i40e_vsi_stat; + uint16_t i40e_vsi_stat_id; + + /* + * Misc. stats and counters that should maybe one day be kstats. + */ + uint64_t i40e_s_link_status_errs; + uint32_t i40e_s_link_status_lasterr; +} i40e_t; + +/* + * The i40e_device represents a PCI device which encapsulates multiple physical + * functions which are represented as an i40e_t. This is used to track the use + * of pooled resources throughout all of the various devices. + */ +typedef struct i40e_device { + list_node_t id_link; + dev_info_t *id_parent; + uint_t id_pci_bus; + uint_t id_pci_device; + uint_t id_nfuncs; /* Total number of functions */ + uint_t id_nreg; /* Total number present */ + list_t id_i40e_list; /* List of i40e_t's registered */ + i40e_switch_rsrc_t *id_rsrcs; /* Switch resources for this PF */ + uint_t id_rsrcs_alloc; /* Total allocated resources */ + uint_t id_rsrcs_act; /* Actual number of resources */ +} i40e_device_t; + +/* Values for the interrupt forcing on the NIC. */ +#define I40E_INTR_NONE 0 +#define I40E_INTR_MSIX 1 +#define I40E_INTR_MSI 2 +#define I40E_INTR_LEGACY 3 + +/* Hint that we don't want to do any polling... */ +#define I40E_POLL_NULL -1 + +/* + * Logging functions. 
+ */ +extern void i40e_error(i40e_t *, const char *, ...); +extern void i40e_notice(i40e_t *, const char *, ...); +extern void i40e_log(i40e_t *, const char *, ...); + +/* + * General link handling functions. + */ +extern void i40e_link_check(i40e_t *); +extern void i40e_update_mtu(i40e_t *); + +/* + * FMA functions. + */ +extern int i40e_check_acc_handle(ddi_acc_handle_t); +extern int i40e_check_dma_handle(ddi_dma_handle_t); +extern void i40e_fm_ereport(i40e_t *, char *); + +/* + * Interrupt handlers and interrupt handler setup. + */ +extern void i40e_intr_chip_init(i40e_t *); +extern void i40e_intr_chip_fini(i40e_t *); +extern uint_t i40e_intr_msix(void *, void *); +extern uint_t i40e_intr_msi(void *, void *); +extern uint_t i40e_intr_legacy(void *, void *); +extern void i40e_intr_io_enable_all(i40e_t *); +extern void i40e_intr_io_disable_all(i40e_t *); +extern void i40e_intr_io_clear_cause(i40e_t *); +extern void i40e_intr_rx_queue_disable(i40e_t *, uint_t); +extern void i40e_intr_rx_queue_enable(i40e_t *, uint_t); +extern void i40e_intr_set_itr(i40e_t *, i40e_itr_index_t, uint_t); + +/* + * Receive-side functions + */ +extern mblk_t *i40e_ring_rx(i40e_trqpair_t *, int); +extern mblk_t *i40e_ring_rx_poll(void *, int); +extern void i40e_rx_recycle(caddr_t); + +/* + * Transmit-side functions + */ +mblk_t *i40e_ring_tx(void *, mblk_t *); +extern void i40e_tx_recycle_ring(i40e_trqpair_t *); +extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); + +/* + * Statistics functions. 
+ */ +extern boolean_t i40e_stats_init(i40e_t *); +extern void i40e_stats_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *); +extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); +extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); +extern int i40e_m_stat(void *, uint_t, uint64_t *); +extern int i40e_rx_ring_stat(mac_ring_driver_t, uint_t, uint64_t *); +extern int i40e_tx_ring_stat(mac_ring_driver_t, uint_t, uint64_t *); + +/* + * MAC/GLDv3 functions, and functions called by MAC/GLDv3 support code. + */ +extern boolean_t i40e_register_mac(i40e_t *); +extern boolean_t i40e_start(i40e_t *, boolean_t); +extern void i40e_stop(i40e_t *, boolean_t); + +/* + * DMA & buffer functions and attributes + */ +extern void i40e_init_dma_attrs(i40e_t *, boolean_t); +extern boolean_t i40e_alloc_ring_mem(i40e_t *); +extern void i40e_free_ring_mem(i40e_t *, boolean_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _I40E_SW_H */ diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c new file mode 100644 index 0000000000..06f82f856e --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -0,0 +1,2266 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
+ */ + +#include "i40e_sw.h" + +/* + * --------------------------------------------------------- + * Buffer and Memory Management, Receiving, and Transmitting + * --------------------------------------------------------- + * + * Each physical function (PF), which is what we think of as an instance of the + * device driver, has a series of associated transmit and receive queue pairs. + * Effectively, what we think of in MAC as rings. Each of these has their own + * ring of descriptors which is used as part of doing DMA activity. + * + * The transmit ring of descriptors are 16-byte entries which are used to send + * packets, program filters, etc. The receive ring of descriptors are either + * 16-byte or 32-bytes each. At the moment, we opt to use the larger descriptor + * format so that we're in a better position if we ever want to leverage that + * information later on. + * + * However, these rings are just for descriptors, they don't talk or deal with + * how we actually store the memory that we need for DMA or the associated + * information that we need for keeping track of message blocks. To correspond + * to the hardware descriptor ring which is how we communicate with hardware, we + * introduce a control block which keeps track of our required metadata like DMA + * mappings. + * + * There are two main considerations that dictate how much memory and buffers + * we end up allocating. Those are: + * + * o The size of the ring (controlled through the driver.conf file) + * + * o The maximum size frame we can receive. + * + * The size of the rings currently defaults to 1024 descriptors and is stored in + * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size. + * + * While the size of the rings is controlled by the driver.conf, the maximum + * size frame is informed primarily through the use of dladm and the setting of + * the MTU property on the device. From the MTU, we then go and do some + * machinations. 
The first thing we do is we then have to add in space for the + * Ethernet header, potentially a VLAN header, and the FCS check. This value is + * what's stored as i40e_t`i40e_frame_max and is derived any time + * i40e_t`i40e_sdu changes. + * + * This size is then rounded up to the nearest 1k chunk, which represents the + * actual amount of memory that we'll allocate for a single frame. + * + * Note, that for rx, we do something that might be unexpected. We always add + * an extra two bytes to the frame size that we allocate. We then offset the DMA + * address that we receive a packet into by two bytes. This ensures that the IP + * header will always be 4 byte aligned because the MAC header is either 14 or + * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's + * and MAC's lives easier. + * + * Both the rx and tx descriptor rings (which are what we use to communicate + * with hardware) are allocated as a single region of DMA memory which is the + * size of the descriptor (32 bytes and 16 bytes respectively) times the total + * number of descriptors for an rx and tx ring. + * + * While the rx and tx descriptors are allocated using DMA-based memory, the + * control blocks for each of them are allocated using normal kernel memory. + * They aren't special from a DMA perspective. We'll go over the design of both + * receiving and transmitting separately, as they have slightly different + * control blocks and different ways that we manage the relationship between + * control blocks and descriptors. + * + * --------------------------------- + * RX Descriptors and Control Blocks + * --------------------------------- + * + * For every descriptor in the ring that the driver has, we need some associated + * memory, which means that we need to have the receive specific control block. 
+ * We have a couple different, but related goals: + * + * o Once we've completed the mc_start GLDv3 endpoint, we do not want to do + * any additional memory allocations or DMA allocations if we don't have to. + * + * o We'd like to try and do as much zero-copy as possible, while taking into + * account the cost of mapping in DMA resources. + * + * o We'd like to have every receive descriptor available. + * + * Now, these rules are a bit in tension with one another. The act of mapping in + * is an exercise of trying to find the break-even point between page table + * updates and bcopy. We currently start by using the same metrics that ixgbe + * used; however, it should be known that this value has effectively been + * cargo-culted across to yet another driver, sorry. + * + * If we receive a packet which is larger than our copy threshold, we'll create + * a message block out of the DMA memory via desballoc(9F) and send that up to + * MAC that way. This will cause us to be notified when the message block is + * then freed because it has been consumed, dropped, or otherwise. Otherwise, if + * it's less than the threshold, we'll try to use allocb and bcopy it into the + * block, thus allowing us to immediately reuse the DMA resource. Note, on debug + * builds, we allow someone to whack the variable i40e_debug_rx_mode to override + * the behavior and always do a bcopy or a DMA bind. + * + * To try and ensure that the device always has blocks that it can receive data + * into, we maintain two lists of control blocks, a working list and a free + * list. Each list is sized equal to the number of descriptors in the rx ring. + * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * equal to twice the number of descriptors in the ring and we assign them + * equally to the free list and to the working list. Each control block also has + * DMA memory allocated and associated with which it will be used to receive the + * actual packet data. 
All of a received frame's data will end up in a single + * DMA buffer. + * + * During operation, we always maintain the invariant that each rx descriptor + * has an associated rx control block which lives in the working list. If we + * feel that we should loan up DMA memory to MAC in the form of a message block, + * we can only do so if we can maintain this invariant. To do that, we swap in + * one of the buffers from the free list. If none are available, then we resort + * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the + * size. + * + * Loaned message blocks come back to us when freemsg(9F) or freeb(9F) is + * called on the block, at which point we restore the rx control block to the + * free list and are able to reuse the DMA memory again. While the scheme may + * seem odd, it importantly keeps us out of trying to do any DMA allocations in + * the normal path of operation, even though we may still have to allocate + * message blocks and copy. + * + * The following state machine describes the lifetime of a rx control block. In + * the diagram we abbreviate the rx ring descriptor entry as rxd and the rx + * control block entry as rcb. + * + * | | + * * ... 1/2 of all initial rcb's ... * + * | | + * v v + * +------------------+ +------------------+ + * | rcb on free list |---*---------->| rcb on work list | + * +------------------+ . +------------------+ + * ^ . moved to | + * | replace rcb * . . Frame received, + * | loaned to | entry on free list + * | MAC + co. | available. rcb's + * | | memory made into mblk_t + * * . freemsg(9F) | and sent up to MAC. + * | called on | + * | loaned rcb | + * | and it is v + * | recycled. +-------------------+ + * +--------------------<-----| rcb loaned to MAC | + * +-------------------+ + * + * Finally, note that every rx control block has a reference count on it. One + * reference is added as long as the driver has had the GLDv3 mc_start endpoint + * called. 
If the GLDv3 mc_stop entry point is called, IP has been unplumbed and + * no other DLPI consumers remain, then we'll decrement the reference count by + * one. Whenever we loan up the rx control block and associated buffer to MAC, + * then we bump the reference count again. Even though the device is stopped, + * there may still be loaned frames in upper levels that we'll want to account + * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure + * that it is cleaned up. + * + * -------------------- + * Managing the RX Ring + * -------------------- + * + * The receive ring descriptors are arranged in a circular buffer with a head + * and tail pointer. There are both the conventional head and tail pointers + * which are used to partition the ring into two portions, a portion that we, + * the operating system, manage and a portion that is managed by hardware. When + * hardware owns a descriptor in the ring, it means that it is waiting for data + * to be filled in. However, when a portion of the ring is owned by the driver, + * then that means that the descriptor has been consumed and we need to go take + * a look at it. + * + * The initial head is configured to be zero by writing it as such in the + * receive queue context in the FPM (function private memory from the host). The + * initial tail is written to be the last descriptor. This is written to via the + * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between + * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD, + * the only values we ever consult ourselves are the TAIL register and our own + * state tracking. Effectively, we cache the HEAD register and then update it + * ourselves based on our work. + * + * When we iterate over the rx descriptors and thus the received frames, we are + * either in an interrupt context or we've been asked by MAC to poll on the + * ring. 
If we've been asked to poll on the ring, we have a maximum number of + * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * exceed that count, then we do not process it. When in interrupt context, we + * don't have a strict byte count. However, to ensure liveness, we limit the + * amount of data based on a configuration value + * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this + * is based on similar numbers that are used for ixgbe. After some additional + * time in the field, we'll have a sense as to whether or not it should be + * changed. + * + * When processing, we start at our own HEAD pointer + * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start + * processing. Every RX descriptor has what's described as the DD bit. This bit + * (the LSB of the second 8-byte word), indicates whether or not the descriptor + * is done. When we give descriptors to the hardware, this value is always + * zero. When the hardware has finished a descriptor, it will always be one. + * + * The first thing that we check is whether the DD bit indicates that the + * current HEAD is ready. If it isn't, then we're done. That's the primary + * invariant of processing a frame. If it's done, then there are a few other + * things that we want to look at. In the same status word as the DD bit, there + * are two other important bits: + * + * o End of Packet (EOP) + * o Error bits + * + * The end of packet indicates that we have reached the last descriptor. Now, + * you might ask when would there be more than one descriptor. The reason for + * that might be due to large receive offload (lro) or header splitting + * functionality, which presently isn't supported in the driver. The error bits + * in the frame are only valid when EOP is set. + * + * If error bits are set on the frame, then we still consume it; however, we + * will not generate an mblk_t to send up to MAC. 
If there are no error bits + * set, then we'll consume the descriptor either using bcopy or DMA binding. See + * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information + * on how that selection is made. + * + * Regardless of whether we construct an mblk_t or encounter an error, we end up + * resetting the descriptor. This re-arms the descriptor for hardware and in the + * process, we may end up assigning it a new receive control block. After we do + * this, we always update our HEAD pointer, no matter what. + * + * Finally, once we've consumed as much as we will in a given window, we go and + * update the TAIL register to indicate all the frames we've consumed. We only + * do a single bulk write for the ring. + * + * --------------------------------- + * TX Descriptors and Control Blocks + * --------------------------------- + * + * While the transmit path is similar in spirit to the receive path, it works + * differently due to the fact that all data is originated by the operating + * system and not by the device. + * + * Like rx, there is both a descriptor ring that we use to communicate with the + * device and which points to the memory used to transmit a frame. Similarly, + * there is a corresponding transmit control block. Each transmit control block + * has a region of DMA memory allocated to it; however, the way we use it + * varies. + * + * The driver is asked to process a single frame at a time. That message block + * may be made up of multiple fragments linked together by the mblk_t`b_cont + * member. The device has a hard limit of up to 8 buffers being allowed for use + * for a single logical frame. For each fragment, we'll try and use an entry + * from the tx descriptor ring and then we'll allocate a corresponding tx + * control block. Depending on the size of the fragment, we may copy it around + * or we might instead try to do DMA binding of the fragment. 
+ * + * If we exceed the number of blocks that fit, we'll try to pull up the block + * and then we'll do a DMA bind and send it out. + * + * If we don't have enough space in the ring or tx control blocks available, + * then we'll return the unprocessed message block to MAC. This will induce flow + * control and once we recycle enough entries, we'll once again enable sending + * on the ring. + * + * We size the working list as equal to the number of descriptors in the ring. + * We size the free list as equal to 1.5 times the number of descriptors in the + * ring. We'll allocate a number of tx control block entries equal to the number + * of entries in the free list. By default, all entries are placed in the free + * list. As we come along and try to send something, we'll allocate entries from + * the free list and add them to the working list, where they'll stay until the + * hardware indicates that all of the data has been written back to us. The + * reason that we start with 1.5x is to help facilitate having more than one TX + * buffer associated with the DMA activity. + * + * -------------------- + * Managing the TX Ring + * -------------------- + * + * The transmit descriptor ring is driven by us. We maintain our own notion of a + * HEAD and TAIL register and we update the hardware with updates to the TAIL + * register. When the hardware is done writing out data, it updates us by + * writing back to a specific address, not by updating the individual + * descriptors. That address is a 4-byte region after the main transmit + * descriptor ring. This is why the descriptor ring has an extra descriptor's + * worth allocated to it. + * + * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and + * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames, + * we'll update the tail there and in the I40E_QTX_TAIL() register. 
At various + * points in time, through both interrupts, and our own internal checks, we'll + * sync the write-back head portion of the DMA space. Based on the index it + * reports back, we'll free everything between our current HEAD and the + * indicated index and update HEAD to the new index. + * + * When a frame comes in, we try to use a number of transmit control blocks and + * we'll transition them from the free list to the work list. They'll get moved + * to the entry on the work list that corresponds with the transmit descriptor + * they correspond to. Once we are told that the corresponding descriptor + * has been freed, we'll return it to the free list. + * + * The transmit control block free list is managed by keeping track of the number + * of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to index + * into the free list and add things to it. In effect, we always push and pop + * from the tail and protect it with a single lock, + * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not + * stand up to further performance testing; however, it does allow us to get off + * the ground with the device driver. + * + * The following image describes where a given transmit control block lives in + * its lifetime: + * + * | + * * ... Initial placement for all tcb's + * | + * v + * +------------------+ +------------------+ + * | tcb on free list |---*------------------>| tcb on work list | + * +------------------+ . +------------------+ + * ^ . tcb allocated | + * | to send frame v + * | or fragment on | + * | wire, mblk from | + * | MAC associated. | + * | | + * +------*-------------------------------<----+ + * . + * . Hardware indicates + * entry transmitted. + * tcb recycled, mblk + * from MAC freed. + * + * ------------ + * Blocking MAC + * ------------ + * + * When performing transmit, we can run out of descriptors and ring entries. 
When
+ * such a case happens, we return the mblk_t to MAC to indicate that we've been
+ * blocked. At that point in time, MAC becomes blocked and will not transmit
+ * anything out that specific ring until we notify MAC. To indicate that we're
+ * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
+ *
+ * When we recycle tx descriptors then we'll end up signaling MAC by calling
+ * mac_tx_ring_update() if we were blocked, letting it know that it's safe to
+ * start sending frames out to us again.
+ */
+
+/*
+ * We set our DMA alignment requests based on the smallest supported page size
+ * of the corresponding platform. Any other architecture is rejected at compile
+ * time rather than guessing at an alignment.
+ */
+#if defined(__sparc)
+#define	I40E_DMA_ALIGNMENT 0x2000ull
+#elif defined(__x86)
+#define	I40E_DMA_ALIGNMENT 0x1000ull
+#else
+#error	"unknown architecture for i40e"
+#endif
+
+/*
+ * This structure is used to maintain information and flags related to
+ * transmitting a frame. The first member is the set of flags we need to or into
+ * the command word (generally checksumming related). The second member controls
+ * the word offsets which is required for IP and L4 checksumming.
+ */
+typedef struct i40e_tx_context {
+	enum i40e_tx_desc_cmd_bits	itc_cmdflags;	/* OR'd into command word */
+	uint32_t			itc_offsets;	/* IP / L4 checksum offsets */
+} i40e_tx_context_t;
+
+/*
+ * Toggles on debug builds which can be used to override our RX behaviour based
+ * on thresholds. The default keeps the threshold-based choice; the other two
+ * values force every received frame down the bcopy path or the DMA bind
+ * (loaned buffer) path respectively.
+ */
+#ifdef DEBUG
+typedef enum {
+	I40E_DEBUG_RX_DEFAULT	= 0,	/* choose based on the copy threshold */
+	I40E_DEBUG_RX_BCOPY	= 1,	/* always copy */
+	I40E_DEBUG_RX_DMABIND	= 2	/* always DMA bind */
+} i40e_debug_rx_t;
+
+i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
+#endif	/* DEBUG */
+
+/*
+ * Notes on the following pair of DMA attributes. The first attribute,
+ * i40e_static_dma_attr, is designed to be used for both the descriptor rings
+ * and the static buffers that we associate with control blocks. For this
+ * reason, we force an SGL length of one.
While technically the driver supports
+ * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
+ * management here. In addition, when the Intel common code wants to allocate
+ * memory via the i40e_allocate_virt_mem osdep function, we have it leverage
+ * the static dma attr.
+ *
+ * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
+ * binding a bunch of mblk_t fragments to go out the door. Note that the main
+ * difference here is that we're allowed a larger SGL length -- eight.
+ *
+ * Note, we default to setting ourselves to be DMA capable here. However,
+ * because we could have multiple instances which have different FMA error
+ * checking capabilities, or end up on different buses, we make these static
+ * and const and copy them into the i40e_t for the given device with the actual
+ * values that reflect the actual capabilities.
+ */
+static const ddi_dma_attr_t i40e_g_static_dma_attr = {
+	DMA_ATTR_V0,			/* version number */
+	0x0000000000000000ull,		/* low address */
+	0xFFFFFFFFFFFFFFFFull,		/* high address */
+	0x00000000FFFFFFFFull,		/* dma counter max */
+	I40E_DMA_ALIGNMENT,		/* alignment */
+	0x00000FFF,			/* burst sizes */
+	0x00000001,			/* minimum transfer size */
+	0x00000000FFFFFFFFull,		/* maximum transfer size */
+	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
+	1,				/* scatter/gather list length */
+	0x00000001,			/* granularity */
+	DDI_DMA_FLAGERR			/* DMA flags */
+};
+
+/*
+ * Template used when binding mblk_t fragments for transmit. It matches the
+ * static template above in every field except the scatter/gather list length.
+ */
+static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
+	DMA_ATTR_V0,			/* version number */
+	0x0000000000000000ull,		/* low address */
+	0xFFFFFFFFFFFFFFFFull,		/* high address */
+	0x00000000FFFFFFFFull,		/* dma counter max */
+	I40E_DMA_ALIGNMENT,		/* alignment */
+	0x00000FFF,			/* burst sizes */
+	0x00000001,			/* minimum transfer size */
+	0x00000000FFFFFFFFull,		/* maximum transfer size */
+	0xFFFFFFFFFFFFFFFFull,		/* maximum segment size */
+	I40E_TX_MAX_COOKIE,		/* scatter/gather list length */
+	0x00000001,			/* granularity */
+	DDI_DMA_FLAGERR			/* DMA flags */
+};
+
+/*
+ * Next, we have the attributes for these structures. The descriptor rings are
+ * all strictly little endian, while the data buffers are just arrays of bytes
+ * representing frames. Because of this, we purposefully simplify the driver
+ * programming life by programming the descriptor ring as little endian, while
+ * for the buffer data we keep it as unstructured.
+ *
+ * Note, that to keep the Intel common code operating in a reasonable way, when
+ * we allocate DMA memory for it, we do not use byte swapping and thus use the
+ * standard i40e_buf_acc_attr.
+ */
+static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_STRUCTURE_LE_ACC,
+	DDI_STRICTORDER_ACC
+};
+
+static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,
+	DDI_STRICTORDER_ACC
+};
+
+/*
+ * The next two functions are designed to be type-safe versions of macros that
+ * are used to increment and decrement a descriptor index in the loop. Note,
+ * these are marked inline to try and keep the data path hot and they were
+ * effectively inlined in their previous life as macros.
+ *
+ * i40e_next_desc() returns the index that is count entries after base in a
+ * ring of size entries. Only a single wrap is performed (one subtraction of
+ * size), so the caller must keep base + count below twice the ring size; the
+ * trailing ASSERT checks the result on debug builds.
+ */
+static inline int
+i40e_next_desc(int base, int count, int size)
+{
+	int out;
+
+	ASSERT(base >= 0);
+	ASSERT(count > 0);
+	ASSERT(size > 0);
+
+	if (base + count < size) {
+		out = base + count;
+	} else {
+		/* Walked off the end of the ring: wrap around once. */
+		out = base + count - size;
+	}
+
+	ASSERT(out >= 0 && out < size);
+	return (out);
+}
+
+/*
+ * i40e_prev_desc() is the mirror image of i40e_next_desc(): it returns the
+ * index that is count entries before base in a ring of size entries, again
+ * wrapping at most once.
+ */
+static inline int
+i40e_prev_desc(int base, int count, int size)
+{
+	int out;
+
+	ASSERT(base >= 0);
+	ASSERT(count > 0);
+	ASSERT(size > 0);
+
+	if (base >= count) {
+		out = base - count;
+	} else {
+		/* Walked off the start of the ring: wrap around once. */
+		out = base - count + size;
+	}
+
+	ASSERT(out >= 0 && out < size);
+	return (out);
+}
+
+/*
+ * Free DMA memory that is represented by a i40e_dma_buffer_t.
+ *
+ * Teardown runs in the reverse order of allocation: unbind the handle, free
+ * the DMA memory, then free the DMA handle. Each step is skipped when the
+ * corresponding resource is not present, which is what lets
+ * i40e_alloc_dma_buffer() call this from its failure paths.
+ */
+static void
+i40e_free_dma_buffer(i40e_dma_buffer_t *dmap)
+{
+	/* A non-zero device address means the handle is currently bound. */
+	if (dmap->dmab_dma_address != NULL) {
+		VERIFY(dmap->dmab_dma_handle != NULL);
+		(void) ddi_dma_unbind_handle(dmap->dmab_dma_handle);
+		dmap->dmab_dma_address = NULL;
+		dmap->dmab_size = 0;
+	}
+
+	/* The access handle is what tracks the DMA memory itself. */
+	if (dmap->dmab_acc_handle != NULL) {
+		ddi_dma_mem_free(&dmap->dmab_acc_handle);
+		dmap->dmab_acc_handle = NULL;
+		dmap->dmab_address = NULL;
+	}
+
+	if (dmap->dmab_dma_handle != NULL) {
+		ddi_dma_free_handle(&dmap->dmab_dma_handle);
+		dmap->dmab_dma_handle = NULL;
+	}
+
+	/*
+	 * These should only be set if we have valid handles allocated and
+	 * therefore should always be NULLed out due to the above code. This
+	 * is here to catch us acting sloppy.
+	 */
+	ASSERT(dmap->dmab_dma_address == NULL);
+	ASSERT(dmap->dmab_address == NULL);
+	ASSERT(dmap->dmab_size == 0);
+	dmap->dmab_len = 0;
+}
+
+/*
+ * Allocate size bytes of DMA memory based on the passed in attributes. This
+ * fills in the information in dmap and is designed for all of our single cookie
+ * allocations.
+ *
+ * stream selects DDI_DMA_STREAMING instead of DDI_DMA_CONSISTENT and zero
+ * requests that the memory be bzero'd before it is bound. All DDI calls are
+ * made with DDI_DMA_DONTWAIT. On success B_TRUE is returned and dmap describes
+ * the bound buffer, with dmab_size set to the length the DDI actually gave us.
+ * On failure whatever had been allocated so far is released and B_FALSE is
+ * returned.
+ */
+static boolean_t
+i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap,
+    ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream,
+    boolean_t zero, size_t size)
+{
+	int ret;
+	uint_t flags;
+	size_t len;
+	ddi_dma_cookie_t cookie;
+	uint_t ncookies;
+
+	if (stream == B_TRUE)
+		flags = DDI_DMA_STREAMING;
+	else
+		flags = DDI_DMA_CONSISTENT;
+
+	/*
+	 * Step one: Allocate the DMA handle
+	 */
+	ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT,
+	    NULL, &dmap->dmab_dma_handle);
+	if (ret != DDI_SUCCESS) {
+		i40e_error(i40e, "failed to allocate dma handle for I/O "
+		    "buffers: %d", ret);
+		dmap->dmab_dma_handle = NULL;
+		return (B_FALSE);
+	}
+
+	/*
+	 * Step two: Allocate the DMA memory
+	 */
+	ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags,
+	    DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len,
+	    &dmap->dmab_acc_handle);
+	if (ret != DDI_SUCCESS) {
+		/*
+		 * size is a size_t; print it as an unsigned long rather than
+		 * handing it to %d, which has the wrong width on LP64.
+		 */
+		i40e_error(i40e, "failed to allocate %lu bytes of DMA for I/O "
+		    "buffers", (ulong_t)size);
+		dmap->dmab_address = NULL;
+		dmap->dmab_acc_handle = NULL;
+		i40e_free_dma_buffer(dmap);
+		return (B_FALSE);
+	}
+
+	/*
+	 * Step three: Optionally zero
+	 */
+	if (zero == B_TRUE)
+		bzero(dmap->dmab_address, len);
+
+	/*
+	 * Step four: Bind the memory
+	 */
+	ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL,
+	    dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT,
+	    NULL, &cookie, &ncookies);
+	if (ret != DDI_DMA_MAPPED) {
+		i40e_error(i40e, "failed to bind %lu bytes of DMA for I/O "
+		    "buffers: %d", (ulong_t)size, ret);
+		i40e_free_dma_buffer(dmap);
+		return (B_FALSE);
+	}
+
+	/* Callers pass attributes with an SGL length of one. */
+	VERIFY(ncookies == 1);
+	dmap->dmab_dma_address = cookie.dmac_laddress;
+	dmap->dmab_size = len;
+	dmap->dmab_len = 0;
+	return (B_TRUE);
+}
+
+/*
+ * This function is called once the last pending rcb has been freed by the upper
+ * levels of the system.
+ * It is also used to unwind a partially constructed i40e_rx_data_t. It frees
+ * the rcb area and the free and work list arrays when present, and then the
+ * i40e_rx_data_t itself. No rcb may still be pending when this is called.
+ */
+static void
+i40e_free_rx_data(i40e_rx_data_t *rxd)
+{
+	VERIFY(rxd->rxd_rcb_pending == 0);
+
+	/* The rcb area backs both the work list and the free list. */
+	if (rxd->rxd_rcb_area != NULL) {
+		kmem_free(rxd->rxd_rcb_area,
+		    sizeof (i40e_rx_control_block_t) *
+		    (rxd->rxd_free_list_size + rxd->rxd_ring_size));
+		rxd->rxd_rcb_area = NULL;
+	}
+
+	if (rxd->rxd_free_list != NULL) {
+		kmem_free(rxd->rxd_free_list,
+		    sizeof (i40e_rx_control_block_t *) *
+		    rxd->rxd_free_list_size);
+		rxd->rxd_free_list = NULL;
+	}
+
+	if (rxd->rxd_work_list != NULL) {
+		kmem_free(rxd->rxd_work_list,
+		    sizeof (i40e_rx_control_block_t *) *
+		    rxd->rxd_ring_size);
+	}
+
+	kmem_free(rxd, sizeof (i40e_rx_data_t));
+}
+
+/*
+ * Allocate the i40e_rx_data_t for a ring along with its work list, free list
+ * and rcb area, all with KM_NOSLEEP. The free list is sized the same as the
+ * ring and the rcb area holds one control block per work and free list slot.
+ * On failure everything allocated here is released again, itrq_rxdata is reset
+ * to NULL and B_FALSE is returned.
+ */
+static boolean_t
+i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
+{
+	i40e_rx_data_t *rxd;
+
+	rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP);
+	if (rxd == NULL)
+		return (B_FALSE);
+	itrq->itrq_rxdata = rxd;
+	rxd->rxd_i40e = i40e;
+
+	rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
+	rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
+
+	rxd->rxd_rcb_free = rxd->rxd_free_list_size;
+
+	rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
+	    rxd->rxd_ring_size, KM_NOSLEEP);
+	if (rxd->rxd_work_list == NULL) {
+		i40e_error(i40e, "failed to allocate rx work list for a ring "
+		    "of %d entries for ring %d", rxd->rxd_ring_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+
+	rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
+	    rxd->rxd_free_list_size, KM_NOSLEEP);
+	if (rxd->rxd_free_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry rx free list "
+		    "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) *
+	    (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP);
+	if (rxd->rxd_rcb_area == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry rcb area for "
+		    "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+
+	return (B_TRUE);
+
+cleanup:
+	/* rxd was zeroed, so only what was actually allocated is freed. */
+	i40e_free_rx_data(rxd);
+	itrq->itrq_rxdata = NULL;
+	return (B_FALSE);
+}
+
+/*
+ * Free all of the memory that we've allocated for DMA. Note that we may have
+ * buffers that we've loaned up to the OS which are still outstanding. We'll
+ * always free up the descriptor ring, because we no longer need that. For each
+ * rcb, we'll iterate over it and if we send the reference count to zero, then
+ * we'll free the message block and DMA related resources. However, if we don't
+ * take the last one, then we'll go ahead and keep track that we'll have pending
+ * data and clean it up when we get there.
+ */
+static void
+i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init)
+{
+	uint32_t i, count, ref;
+
+	i40e_rx_control_block_t *rcb;
+	i40e_t *i40e = rxd->rxd_i40e;
+
+	i40e_free_dma_buffer(&rxd->rxd_desc_area);
+	rxd->rxd_desc_ring = NULL;
+	rxd->rxd_desc_next = 0;
+
+	/* The pending counts below are updated under this lock. */
+	mutex_enter(&i40e->i40e_rx_pending_lock);
+
+	rcb = rxd->rxd_rcb_area;
+	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
+
+	for (i = 0; i < count; i++, rcb++) {
+		VERIFY(rcb != NULL);
+
+		/*
+		 * If we're cleaning up from a failed creation attempt, then an
+		 * entry may never have been assembled which would mean that
+		 * its reference count is zero. If we find that, we leave it
+		 * be, because nothing else should be modifying it at this
+		 * point. We're not at the point that any more references can be
+		 * added, just removed.
+		 */
+		if (failed_init == B_TRUE && rcb->rcb_ref == 0)
+			continue;
+
+		/* Drop the reference taken when the rcb was set up. */
+		ref = atomic_dec_32_nv(&rcb->rcb_ref);
+		if (ref == 0) {
+			freemsg(rcb->rcb_mp);
+			rcb->rcb_mp = NULL;
+			i40e_free_dma_buffer(&rcb->rcb_dma);
+		} else {
+			/* Still referenced elsewhere; account for it. */
+			atomic_inc_32(&rxd->rxd_rcb_pending);
+			atomic_inc_32(&i40e->i40e_rx_pending);
+		}
+	}
+	mutex_exit(&i40e->i40e_rx_pending_lock);
+}
+
+/*
+ * Initialize the DMA memory for the descriptor ring and for each frame in the
+ * control block list.
+ *
+ * The first rxd_ring_size control blocks are placed on the work list and the
+ * remainder on the free list. On an allocation failure this returns B_FALSE
+ * without unwinding; the caller, i40e_alloc_ring_mem(), cleans up through
+ * i40e_free_ring_mem().
+ */
+static boolean_t
+i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
+{
+	int i, count;
+	size_t dmasz;
+	i40e_rx_control_block_t *rcb;
+	i40e_t *i40e = rxd->rxd_i40e;
+
+	/*
+	 * First allocate the rx descriptor ring.
+	 */
+	dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
+	VERIFY(dmasz > 0);
+	if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area,
+	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
+	    B_TRUE, dmasz) == B_FALSE) {
+		i40e_error(i40e, "failed to allocate DMA resources "
+		    "for rx descriptor ring");
+		return (B_FALSE);
+	}
+	rxd->rxd_desc_ring =
+	    (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address;
+	rxd->rxd_desc_next = 0;
+
+	count = rxd->rxd_ring_size + rxd->rxd_free_list_size;
+	rcb = rxd->rxd_rcb_area;
+
+	dmasz = i40e->i40e_rx_buf_size;
+	VERIFY(dmasz > 0);
+	for (i = 0; i < count; i++, rcb++) {
+		i40e_dma_buffer_t *dmap;
+		VERIFY(rcb != NULL);
+
+		if (i < rxd->rxd_ring_size) {
+			rxd->rxd_work_list[i] = rcb;
+		} else {
+			rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb;
+		}
+
+		dmap = &rcb->rcb_dma;
+		if (i40e_alloc_dma_buffer(i40e, dmap,
+		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
+		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
+			i40e_error(i40e, "failed to allocate rx dma buffer");
+			return (B_FALSE);
+		}
+
+		/*
+		 * Initialize the control block and offset the DMA address. See
+		 * the note in the big theory statement that explains how this
+		 * helps IP deal with alignment. Note, we don't worry about
+		 * whether or not we successfully get an mblk_t from desballoc,
+		 * it's a common case that we have to handle later on in the
+		 * system.
+		 */
+		dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT;
+		dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT;
+		dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT;
+
+		rcb->rcb_ref = 1;
+		rcb->rcb_rxd = rxd;
+		rcb->rcb_free_rtn.free_func = i40e_rx_recycle;
+		rcb->rcb_free_rtn.free_arg = (caddr_t)rcb;
+		rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address,
+		    dmap->dmab_size, 0, &rcb->rcb_free_rtn);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Release everything that i40e_alloc_tx_dma() sets up for a ring: the DMA
+ * buffer and bind handle of every tcb, the tcb area, the free and work lists
+ * and finally the descriptor ring. Each piece is only freed when it is present
+ * and is reset afterwards, so this may be called on a partially constructed
+ * ring.
+ */
+static void
+i40e_free_tx_dma(i40e_trqpair_t *itrq)
+{
+	size_t fsz;
+
+	if (itrq->itrq_tcb_area != NULL) {
+		uint32_t i;
+		i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area;
+
+		for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
+			i40e_free_dma_buffer(&tcb->tcb_dma);
+			if (tcb->tcb_dma_handle != NULL) {
+				ddi_dma_free_handle(&tcb->tcb_dma_handle);
+				tcb->tcb_dma_handle = NULL;
+			}
+		}
+
+		fsz = sizeof (i40e_tx_control_block_t) *
+		    itrq->itrq_tx_free_list_size;
+		kmem_free(itrq->itrq_tcb_area, fsz);
+		itrq->itrq_tcb_area = NULL;
+	}
+
+	if (itrq->itrq_tcb_free_list != NULL) {
+		fsz = sizeof (i40e_tx_control_block_t *) *
+		    itrq->itrq_tx_free_list_size;
+		kmem_free(itrq->itrq_tcb_free_list, fsz);
+		itrq->itrq_tcb_free_list = NULL;
+	}
+
+	if (itrq->itrq_tcb_work_list != NULL) {
+		fsz = sizeof (i40e_tx_control_block_t *) *
+		    itrq->itrq_tx_ring_size;
+		kmem_free(itrq->itrq_tcb_work_list, fsz);
+		itrq->itrq_tcb_work_list = NULL;
+	}
+
+	i40e_free_dma_buffer(&itrq->itrq_desc_area);
+	itrq->itrq_desc_ring = NULL;
+}
+
+/*
+ * Allocate the tx descriptor ring (plus one extra descriptor's worth of space
+ * for the write-back head), the work and free lists, the tcb area and, for
+ * every tcb, both a bind handle and a copy buffer. All tcbs start out on the
+ * free list. This runs as part of the GLDv3 start routine, so no allocation
+ * here may block. Once the descriptor ring exists, any failure unwinds through
+ * i40e_free_tx_dma(). Returns B_TRUE on success and B_FALSE otherwise.
+ */
+static boolean_t
+i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
+{
+	int i, ret;
+	size_t dmasz;
+	i40e_tx_control_block_t *tcb;
+	i40e_t *i40e = itrq->itrq_i40e;
+
+	/* The free list holds 1.5 times as many entries as the ring. */
+	itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size;
+	itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size +
+	    (i40e->i40e_tx_ring_size >> 1);
+
+	/*
+	 * Allocate an additional tx descriptor for the writeback head.
+	 */
+	dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
+	dmasz += sizeof (i40e_tx_desc_t);
+
+	VERIFY(dmasz > 0);
+	if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
+	    &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
+	    B_FALSE, B_TRUE, dmasz) == B_FALSE) {
+		i40e_error(i40e, "failed to allocate DMA resources for tx "
+		    "descriptor ring");
+		return (B_FALSE);
+	}
+	itrq->itrq_desc_ring =
+	    (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address;
+	itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring +
+	    itrq->itrq_tx_ring_size);
+	itrq->itrq_desc_head = 0;
+	itrq->itrq_desc_tail = 0;
+	itrq->itrq_desc_free = itrq->itrq_tx_ring_size;
+
+	itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
+	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
+	if (itrq->itrq_tcb_work_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tx work list "
+		    "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * Like every other allocation on this path this must be KM_NOSLEEP;
+	 * a sleeping allocation could block the start routine and would make
+	 * the NULL check below unreachable.
+	 */
+	itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
+	    sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
+	if (itrq->itrq_tcb_free_list == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tx free list "
+		    "for ring %d", itrq->itrq_tx_free_list_size,
+		    itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * We allocate enough tx control blocks to cover the free list.
+	 */
+	itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
+	    itrq->itrq_tx_free_list_size, KM_NOSLEEP);
+	if (itrq->itrq_tcb_area == NULL) {
+		i40e_error(i40e, "failed to allocate a %d entry tcb area for "
+		    "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index);
+		goto cleanup;
+	}
+
+	/*
+	 * For each tcb, allocate DMA memory.
+	 */
+	dmasz = i40e->i40e_tx_buf_size;
+	VERIFY(dmasz > 0);
+	tcb = itrq->itrq_tcb_area;
+	for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) {
+		VERIFY(tcb != NULL);
+
+		/*
+		 * Allocate both a DMA buffer which we'll use for when we copy
+		 * packets for transmission and allocate a DMA handle which
+		 * we'll use when we bind data.
+		 */
+		ret = ddi_dma_alloc_handle(i40e->i40e_dip,
+		    &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
+		    &tcb->tcb_dma_handle);
+		if (ret != DDI_SUCCESS) {
+			i40e_error(i40e, "failed to allocate DMA handle for tx "
+			    "data binding on ring %d: %d", itrq->itrq_index,
+			    ret);
+			tcb->tcb_dma_handle = NULL;
+			goto cleanup;
+		}
+
+		if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
+		    &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
+		    B_TRUE, B_FALSE, dmasz) == B_FALSE) {
+			/*
+			 * There is no error code to report here, so the format
+			 * string carries exactly the two values we pass.
+			 */
+			i40e_error(i40e, "failed to allocate %lu bytes of "
+			    "DMA for tx data binding on ring %d",
+			    (ulong_t)dmasz, itrq->itrq_index);
+			goto cleanup;
+		}
+
+		itrq->itrq_tcb_free_list[i] = tcb;
+	}
+
+	itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size;
+
+	return (B_TRUE);
+
+cleanup:
+	i40e_free_tx_dma(itrq);
+	return (B_FALSE);
+}
+
+/*
+ * Free all memory associated with all of the rings on this i40e instance. Note,
+ * this is done as part of the GLDv3 stop routine. It is also the unwind path
+ * for i40e_alloc_ring_mem(), in which case failed_init is B_TRUE and some
+ * rings may only be partially set up, or not set up at all.
+ */
+void
+i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
+{
+	int i;
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
+
+		/*
+		 * When i40e_alloc_rx_data() fails it leaves itrq_rxdata set to
+		 * NULL, and i40e_alloc_ring_mem() unwinds through here as soon
+		 * as any ring fails. A ring without rx data has nothing rx
+		 * related to release, and touching it would dereference NULL.
+		 * NOTE(review): this relies on itrq_rxdata also being NULL for
+		 * rings that allocation never reached -- confirm.
+		 */
+		if (rxd != NULL) {
+			/*
+			 * Clean up our rx data. We have to free DMA resources
+			 * first and then if we have no more pending RCB's,
+			 * then we'll go ahead and clean things up. Note, we
+			 * can't set the stopped flag on the rx data until
+			 * after we've done the first pass of the pending
+			 * resources. Otherwise we might race with
+			 * i40e_rx_recycle on determining who should free the
+			 * i40e_rx_data_t above.
+			 */
+			i40e_free_rx_dma(rxd, failed_init);
+
+			mutex_enter(&i40e->i40e_rx_pending_lock);
+			rxd->rxd_shutdown = B_TRUE;
+			if (rxd->rxd_rcb_pending == 0) {
+				i40e_free_rx_data(rxd);
+				i40e->i40e_trqpairs[i].itrq_rxdata = NULL;
+			}
+			mutex_exit(&i40e->i40e_rx_pending_lock);
+		}
+
+		i40e_free_tx_dma(&i40e->i40e_trqpairs[i]);
+	}
+}
+
+/*
+ * Allocate all of the resources associated with all of the rings on this i40e
+ * instance. Note this is done as part of the GLDv3 start routine and thus we
+ * should not use blocking allocations. This takes care of both DMA and non-DMA
+ * related resources. If any ring fails, everything allocated so far is
+ * released through i40e_free_ring_mem() and B_FALSE is returned.
+ */
+boolean_t
+i40e_alloc_ring_mem(i40e_t *i40e)
+{
+	int i;
+
+	for (i = 0; i < i40e->i40e_num_trqpairs; i++) {
+		if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) ==
+		    B_FALSE)
+			goto unwind;
+
+		if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) ==
+		    B_FALSE)
+			goto unwind;
+
+		if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE)
+			goto unwind;
+	}
+
+	return (B_TRUE);
+
+unwind:
+	i40e_free_ring_mem(i40e, B_TRUE);
+	return (B_FALSE);
+}
+
+
+/*
+ * Because every instance of i40e may have different support for FMA
+ * capabilities, we copy the DMA attributes into the i40e_t and set them that
+ * way and use them for determining attributes.
+ */ +void +i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) +{ + bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr, + sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, + sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, + sizeof (ddi_device_acc_attr_t)); + bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, + sizeof (ddi_device_acc_attr_t)); + + if (fma == B_TRUE) { + i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + } else { + i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + } +} + +static void +i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb) +{ + mutex_enter(&rxd->rxd_free_lock); + ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size); + ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL); + rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb; + rxd->rxd_rcb_free++; + mutex_exit(&rxd->rxd_free_lock); +} + +static i40e_rx_control_block_t * +i40e_rcb_alloc(i40e_rx_data_t *rxd) +{ + i40e_rx_control_block_t *rcb; + + mutex_enter(&rxd->rxd_free_lock); + if (rxd->rxd_rcb_free == 0) { + mutex_exit(&rxd->rxd_free_lock); + return (NULL); + } + rxd->rxd_rcb_free--; + rcb = rxd->rxd_free_list[rxd->rxd_rcb_free]; + VERIFY(rcb != NULL); + rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL; + mutex_exit(&rxd->rxd_free_lock); + + return (rcb); +} + +/* + * This is the callback that we get from the OS when freemsg(9F) has been called + * on a loaned descriptor. In addition, if we take the last reference count + * here, then we have to tear down all of the rx data. 
+ */ +void +i40e_rx_recycle(caddr_t arg) +{ + uint32_t ref; + i40e_rx_control_block_t *rcb; + i40e_rx_data_t *rxd; + i40e_t *i40e; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + rcb = (i40e_rx_control_block_t *)arg; + rxd = rcb->rcb_rxd; + i40e = rxd->rxd_i40e; + + /* + * It's possible for this to be called with a reference count of zero. + * That will happen when we're doing the freemsg after taking the last + * reference because we're tearing down everything and this rcb is not + * outstanding. + */ + if (rcb->rcb_ref == 0) + return; + + /* + * Don't worry about failure of desballoc here. It'll only become fatal + * if we're trying to use it and we can't in i40e_rx_bind(). + */ + rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, + rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); + i40e_rcb_free(rxd, rcb); + + /* + * It's possible that the rcb was being used while we are shutting down + * the device. In that case, we'll take the final reference from the + * device here. + */ + ref = atomic_dec_32_nv(&rcb->rcb_ref); + if (ref == 0) { + freemsg(rcb->rcb_mp); + rcb->rcb_mp = NULL; + i40e_free_dma_buffer(&rcb->rcb_dma); + + mutex_enter(&i40e->i40e_rx_pending_lock); + atomic_dec_32(&rxd->rxd_rcb_pending); + atomic_dec_32(&i40e->i40e_rx_pending); + + /* + * If this was the last block and it's been indicated that we've + * passed the shutdown point, we should clean up. 
+ */ + if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) { + i40e_free_rx_data(rxd); + cv_broadcast(&i40e->i40e_rx_pending_cv); + } + + mutex_exit(&i40e->i40e_rx_pending_lock); + } +} + +static mblk_t * +i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, + uint32_t plen) +{ + mblk_t *mp; + i40e_t *i40e = rxd->rxd_i40e; + i40e_rx_control_block_t *rcb, *rep_rcb; + + ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); + + if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) { + itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++; + return (NULL); + } + + rcb = rxd->rxd_work_list[index]; + + /* + * Check to make sure we have a mblk_t. If we don't, this is our last + * chance to try and get one. + */ + if (rcb->rcb_mp == NULL) { + rcb->rcb_mp = + desballoc((unsigned char *)rcb->rcb_dma.dmab_address, + rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); + if (rcb->rcb_mp == NULL) { + itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++; + i40e_rcb_free(rxd, rcb); + return (NULL); + } + } + + I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); + + if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + i40e_rcb_free(rxd, rcb); + return (NULL); + } + + /* + * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT. + */ + mp = rcb->rcb_mp; + atomic_inc_32(&rcb->rcb_ref); + mp->b_wptr = mp->b_rptr + plen; + mp->b_next = mp->b_cont = NULL; + + rxd->rxd_work_list[index] = rep_rcb; + return (mp); +} + +/* + * We're going to allocate a new message block for this frame and attempt to + * receive it. See the big theory statement for more information on when we copy + * versus bind. 
+ */ +static mblk_t * +i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, + uint32_t plen) +{ + i40e_t *i40e = rxd->rxd_i40e; + i40e_rx_control_block_t *rcb; + mblk_t *mp; + + ASSERT(index < rxd->rxd_ring_size); + rcb = rxd->rxd_work_list[index]; + + I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); + + if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + return (NULL); + } + + mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0); + if (mp == NULL) { + itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++; + return (NULL); + } + + mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT; + bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen); + mp->b_wptr = mp->b_rptr + plen; + + return (mp); +} + +/* + * Determine if the device has enabled any checksum flags for us. The level of + * checksum computed will depend on the type packet that we have, which is + * contained in ptype. For example, the checksum logic it does will vary + * depending on whether or not the packet is considered tunneled, whether it + * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are + * valid. + * + * While there are additional checksums that we could recognize here, we'll need + * to get some additional GLDv3 enhancements to be able to properly describe + * them. + */ +static void +i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err, + uint32_t ptype) +{ + uint32_t cksum; + struct i40e_rx_ptype_decoded pinfo; + + ASSERT(ptype <= 255); + pinfo = decode_rx_desc_ptype(ptype); + + cksum = 0; + + /* + * If the ptype isn't something that we know in the driver, then we + * shouldn't even consider moving forward. + */ + if (pinfo.known == 0) { + itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++; + return; + } + + /* + * If hardware didn't set the L3L4P bit on the frame, then there is no + * checksum offload to consider. 
+ */ + if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) { + itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++; + return; + } + + /* + * The device tells us that IPv6 checksums where a Destination Options + * Header or a Routing header shouldn't be trusted. Discard all + * checksums in this case. + */ + if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && + pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 && + (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) { + itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++; + return; + } + + /* + * The hardware denotes three kinds of possible errors. Two are reserved + * for inner and outer IP checksum errors (IPE and EIPE) and the latter + * is for L4 checksum errors (L4E). If there is only one IP header, then + * the only thing that we care about is IPE. Note that since we don't + * support inner checksums, we will ignore IPE being set on tunneled + * packets and only care about EIPE. + */ + if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && + pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) { + if (pinfo.tunnel_type == I40E_RX_PTYPE_OUTER_NONE) { + if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { + itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; + } else { + itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; + cksum |= HCK_IPV4_HDRCKSUM_OK; + } + } else { + if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) { + itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++; + } else { + itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; + cksum |= HCK_IPV4_HDRCKSUM_OK; + } + } + } + + /* + * We only have meaningful L4 checksums in the case of IP->L4 and + * IP->IP->L4. There is not outer L4 checksum data available in any + * other case. Further, we don't bother reporting the valid checksum in + * the case of IP->IP->L4 set. 
+ */ + if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && + pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE && + (pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || + pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || + pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_ICMP || + pinfo.inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)) { + ASSERT(pinfo.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4); + if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { + itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; + } else { + itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; + cksum |= HCK_FULLCKSUM_OK; + } + } + + if (cksum != 0) { + itrq->itrq_rxstat.irxs_hck_set.value.ui64++; + mac_hcksum_set(mp, 0, 0, 0, 0, cksum); + } else { + itrq->itrq_rxstat.irxs_hck_miss.value.ui64++; + } +} + +mblk_t * +i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes) +{ + i40e_t *i40e; + i40e_hw_t *hw; + i40e_rx_data_t *rxd; + uint32_t cur_head; + i40e_rx_desc_t *cur_desc; + i40e_rx_control_block_t *rcb; + uint64_t rx_bytes, rx_frames; + uint64_t stword; + mblk_t *mp, *mp_head, **mp_tail; + + ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); + rxd = itrq->itrq_rxdata; + i40e = itrq->itrq_i40e; + hw = &i40e->i40e_hw_space; + + if (!(i40e->i40e_state & I40E_STARTED) || + (i40e->i40e_state & I40E_OVERTEMP) || + (i40e->i40e_state & I40E_SUSPENDED) || + (i40e->i40e_state & I40E_ERROR)) + return (NULL); + + /* + * Before we do anything else, we have to make sure that all of the DMA + * buffers are synced up and then check to make sure that they're + * actually good from an FM perspective. + */ + I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL); + if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + return (NULL); + } + + /* + * Prepare our stats. We do a limited amount of processing in both + * polling and interrupt context. 
The limit in interrupt context is + * based on frames, in polling context based on bytes. + */ + rx_bytes = rx_frames = 0; + mp_head = NULL; + mp_tail = &mp_head; + + /* + * At this point, the descriptor ring is available to check. We'll try + * and process until we either run out of poll_bytes or descriptors. + */ + cur_head = rxd->rxd_desc_next; + cur_desc = &rxd->rxd_desc_ring[cur_head]; + stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); + + /* + * Note, the primary invariant of this loop should be tha cur_head, + * cur_desc, and stword always point to the currently processed + * descriptor. When we leave the loop, it should point to a descriptor + * that HAS NOT been processed. Meaning, that if we haven't consumed the + * frame, the descriptor should not be advanced. + */ + while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) { + uint32_t error, eop, plen, ptype; + + /* + * The DD, PLEN, and EOP bits are the only ones that are valid + * in every frame. The error information is only valid when EOP + * is set in the same frame. + * + * At this time, because we don't do any LRO or header + * splitting. We expect that every frame should have EOP set in + * it. When later functionality comes in, we'll want to + * re-evaluate this. + */ + eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT); + VERIFY(eop != 0); + + error = (stword & I40E_RXD_QW1_ERROR_MASK) >> + I40E_RXD_QW1_ERROR_SHIFT; + if (error & I40E_RX_ERR_BITS) { + itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++; + goto discard; + } + + plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> + I40E_RXD_QW1_LENGTH_PBUF_SHIFT; + + ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >> + I40E_RXD_QW1_PTYPE_SHIFT; + + /* + * This packet contains valid data. We should check to see if + * we're actually going to consume it based on its length (to + * ensure that we don't overshoot our quota). We determine + * whether to bcopy or bind the DMA resources based on the size + * of the frame. 
However, if on debug, we allow it to be + * overridden for testing purposes. + * + * We should be smarter about this and do DMA binding for + * larger frames, but for now, it's really more important that + * we actually just get something simple working. + */ + + /* + * Ensure we don't exceed our polling quota by reading this + * frame. Note we only bump bytes now, we bump frames later. + */ + if ((poll_bytes != I40E_POLL_NULL) && + (rx_bytes + plen) > poll_bytes) + break; + rx_bytes += plen; + + mp = NULL; + if (plen >= i40e->i40e_rx_dma_min) + mp = i40e_rx_bind(itrq, rxd, cur_head, plen); + if (mp == NULL) + mp = i40e_rx_copy(itrq, rxd, cur_head, plen); + + if (mp != NULL) { + if (i40e->i40e_rx_hcksum_enable) + i40e_rx_hcksum(itrq, mp, stword, error, ptype); + *mp_tail = mp; + mp_tail = &mp->b_next; + } + + /* + * Now we need to prepare this frame for use again. See the + * discussion in the big theory statements. + * + * However, right now we're doing the simple version of this. + * Normally what we'd do would depend on whether or not we were + * doing DMA binding or bcopying. But because we're always doing + * bcopying, we can just always use the current index as a key + * for what to do and reassign the buffer based on the ring. + */ +discard: + rcb = rxd->rxd_work_list[cur_head]; + cur_desc->read.pkt_addr = + CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); + cur_desc->read.hdr_addr = 0; + + /* + * Finally, update our loop invariants. + */ + cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size); + cur_desc = &rxd->rxd_desc_ring[cur_head]; + stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); + + /* + * To help provide liveness, we limit the amount of data that + * we'll end up counting. Note that in these cases, an interrupt + * is not dissimilar from a polling request. 
+ */ + rx_frames++; + if (rx_frames > i40e->i40e_rx_limit_per_intr) { + itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++; + break; + } + } + + /* + * As we've modified the ring, we need to make sure that we sync the + * descriptor ring for the device. Next, we update the hardware and + * update our notion of where the head for us to read from hardware is + * next. + */ + I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV); + if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + } + + if (rx_frames != 0) { + uint32_t tail; + ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle; + rxd->rxd_desc_next = cur_head; + tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size); + + I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail); + if (i40e_check_acc_handle(rh) != DDI_FM_OK) { + ddi_fm_service_impact(i40e->i40e_dip, + DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + } + + itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes; + itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames; + } + +#ifdef DEBUG + if (rx_frames == 0) { + ASSERT(rx_bytes == 0); + } +#endif + + return (mp_head); +} + +/* + * This function is called by the GLDv3 when it wants to poll on a ring. The + * only primary difference from when we call this during an interrupt is that we + * have a limit on the number of bytes that we should consume. + */ +mblk_t * +i40e_ring_rx_poll(void *arg, int poll_bytes) +{ + i40e_trqpair_t *itrq = arg; + mblk_t *mp; + + ASSERT(poll_bytes > 0); + if (poll_bytes == 0) + return (NULL); + + mutex_enter(&itrq->itrq_rx_lock); + mp = i40e_ring_rx(itrq, poll_bytes); + mutex_exit(&itrq->itrq_rx_lock); + + return (mp); +} + +/* + * This is a structure I wish someone would fill out for me for dorking with the + * checksums. 
 * When we get some more experience with this, we should go ahead and
 * consider adding this to MAC.
 */
/* Flags recording which parts of a mac_ether_offload_info_t are valid. */
typedef enum mac_ether_offload_flags {
	MEOI_L2INFO_SET		= 0x01,
	MEOI_VLAN_TAGGED	= 0x02,
	MEOI_L3INFO_SET		= 0x04,
	MEOI_L3CKSUM_SET	= 0x08,
	MEOI_L4INFO_SET		= 0x10,
	MEOI_L4CKSUM_SET	= 0x20
} mac_ether_offload_flags_t;

/* Header layout of a single Ethernet frame, filled in layer by layer. */
typedef struct mac_ether_offload_info {
	mac_ether_offload_flags_t	meoi_flags;
	uint8_t		meoi_l2hlen;	/* How long is the Ethernet header? */
	uint16_t	meoi_l3proto;	/* What's the Ethertype */
	uint8_t		meoi_l3hlen;	/* How long is the header? */
	uint8_t		meoi_l4proto;	/* What is the payload type? */
	uint8_t		meoi_l4hlen;	/* How long is the L4 header */
	mblk_t		*meoi_l3ckmp;	/* Which mblk has the l3 checksum */
	off_t		meoi_l3ckoff;	/* What's the offset to it */
	mblk_t		*meoi_l4ckmp;	/* Which mblk has the L4 checksum */
	off_t		meoi_l4off;	/* What is the offset to it? */
} mac_ether_offload_info_t;

/*
 * This is something that we'd like to make a general MAC function. Before we do
 * that, we should add support for TSO.
 *
 * We should really keep track of our offset and not walk everything every
 * time. I can't imagine that this will be kind to us at high packet rates;
 * however, for the moment, let's leave that.
 *
 * This walks a message block chain without pulling up to fill in the context
 * information. Note that the data we care about could be hidden across more
 * than one mblk_t.
+ */ +static int +i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (-1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + bp = mp->b_rptr + off; + *out = *bp; + return (0); + +} + +static int +i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (-1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); + +} + +static int +mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) +{ + size_t off; + uint16_t ether; + uint8_t ipproto, iplen, l4len, maclen; + + bzero(meoi, sizeof (mac_ether_offload_info_t)); + + off = offsetof(struct ether_header, ether_type); + if (i40e_meoi_get_uint16(mp, off, ðer) != 0) + return (-1); + + if (ether == ETHERTYPE_VLAN) { + off = offsetof(struct ether_vlan_header, ether_type); + if (i40e_meoi_get_uint16(mp, off, ðer) != 0) + return (-1); + meoi->meoi_flags |= MEOI_VLAN_TAGGED; + maclen = sizeof (struct ether_vlan_header); + } else { + maclen = sizeof (struct ether_header); + } + meoi->meoi_flags |= MEOI_L2INFO_SET; + meoi->meoi_l2hlen = maclen; + meoi->meoi_l3proto = ether; + + switch (ether) { + case ETHERTYPE_IP: + /* + * For IPv4 we need to get the length of the header, as it can + * be variable. 
+ */ + off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen; + if (i40e_meoi_get_uint8(mp, off, &iplen) != 0) + return (-1); + iplen &= 0x0f; + if (iplen < 5 || iplen > 0x0f) + return (-1); + iplen *= 4; + off = offsetof(ipha_t, ipha_protocol) + maclen; + if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1) + return (-1); + break; + case ETHERTYPE_IPV6: + iplen = 40; + off = offsetof(ip6_t, ip6_nxt) + maclen; + if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1) + return (-1); + break; + default: + return (0); + } + meoi->meoi_l3hlen = iplen; + meoi->meoi_l4proto = ipproto; + meoi->meoi_flags |= MEOI_L3INFO_SET; + + switch (ipproto) { + case IPPROTO_TCP: + off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen; + if (i40e_meoi_get_uint8(mp, off, &l4len) == -1) + return (-1); + l4len = (l4len & 0xf0) >> 4; + if (l4len < 5 || l4len > 0xf) + return (-1); + l4len *= 4; + break; + case IPPROTO_UDP: + l4len = sizeof (struct udphdr); + break; + case IPPROTO_SCTP: + l4len = sizeof (sctp_hdr_t); + break; + default: + return (0); + } + + meoi->meoi_l4hlen = l4len; + meoi->meoi_flags |= MEOI_L4INFO_SET; + return (0); +} + +/* + * Attempt to put togther the information we'll need to feed into a descriptor + * to properly program the hardware for checksum offload as well as the + * generally required flags. + * + * The i40e_tx_contex_t`itc_cmdflags contains the set of flags we need to or + * into the descriptor based on the checksum flags for this mblk_t and the + * actual information we care about. 
 */
/*
 * Returns 0 on success, including when no offload was requested or tx
 * checksum offload is disabled, in which case tctx is left zeroed. Returns a
 * negative value, after bumping the matching itxs_hck_* statistic, when the
 * frame's headers could not be parsed or do not fit the requested offload.
 */
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
    i40e_tx_context_t *tctx)
{
	int ret;
	uint32_t flags, start;
	mac_ether_offload_info_t meo;
	i40e_txq_stat_t *txs = &itrq->itrq_txstat;

	bzero(tctx, sizeof (i40e_tx_context_t));

	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
		return (0);

	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
	if (flags == 0)
		return (0);

	if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
		txs->itxs_hck_meoifail.value.ui64++;
		return (ret);
	}

	/*
	 * Have we been asked to checksum an IPv4 header. If so, verify that we
	 * have sufficient information and then set the proper fields in the
	 * command structure.
	 */
	if (flags & HCK_IPV4_HDRCKSUM) {
		if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
			txs->itxs_hck_nol2info.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
			txs->itxs_hck_nol3info.value.ui64++;
			return (-1);
		}
		if (meo.meoi_l3proto != ETHERTYPE_IP) {
			txs->itxs_hck_badl3.value.ui64++;
			return (-1);
		}
		tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
		/* MACLEN is stored in 2-byte units, IPLEN in 4-byte units. */
		tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
		tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
	}

	/*
	 * We've been asked to provide an L4 header, first, set up the IP
	 * information in the descriptor if we haven't already before moving
	 * onto seeing if we have enough information for the L4 checksum
	 * offload.
	 */
	if (flags & HCK_PARTIALCKSUM) {
		if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
			txs->itxs_hck_nol4info.value.ui64++;
			return (-1);
		}

		if (!(flags & HCK_IPV4_HDRCKSUM)) {
			if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
				txs->itxs_hck_nol2info.value.ui64++;
				return (-1);
			}
			if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
				txs->itxs_hck_nol3info.value.ui64++;
				return (-1);
			}

			if (meo.meoi_l3proto == ETHERTYPE_IP) {
				tctx->itc_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV4;
			} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
				tctx->itc_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV6;
			} else {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
			tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
			tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
		}

		switch (meo.meoi_l4proto) {
		case IPPROTO_TCP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
			break;
		case IPPROTO_UDP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
			break;
		case IPPROTO_SCTP:
			tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
			break;
		default:
			txs->itxs_hck_badl4.value.ui64++;
			return (-1);
		}

		/* The L4 header length is stored in 4-byte units. */
		tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
		    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
	}

	return (0);
}

/*
 * Push a tx control block back onto the ring's free list, which is managed as
 * a stack under itrq_tcb_lock.
 */
static void
i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb)
{
	ASSERT(tcb != NULL);

	mutex_enter(&itrq->itrq_tcb_lock);
	ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size);
	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb;
	itrq->itrq_tcb_free++;
	mutex_exit(&itrq->itrq_tcb_lock);
}

/*
 * Pop a tx control block off of the ring's free list. Returns NULL when the
 * free list is empty.
 */
static i40e_tx_control_block_t *
i40e_tcb_alloc(i40e_trqpair_t *itrq)
{
	i40e_tx_control_block_t *ret;

	mutex_enter(&itrq->itrq_tcb_lock);
	if (itrq->itrq_tcb_free == 0) {
		mutex_exit(&itrq->itrq_tcb_lock);
		return (NULL);
	}

	itrq->itrq_tcb_free--;
	ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free];
	itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL;
	mutex_exit(&itrq->itrq_tcb_lock);

	ASSERT(ret != NULL);
	return (ret);
}

/*
 * This should be used to free any DMA resources, associated mblk_t's, etc. It's
 * used as part of recycling the message blocks when we have either an interrupt
 * or other activity that indicates that we need to take a look.
 *
 * The tcb must be in use: a type of I40E_TX_NONE, or any unknown type,
 * panics.
 */
static void
i40e_tcb_reset(i40e_tx_control_block_t *tcb)
{
	switch (tcb->tcb_type) {
	case I40E_TX_COPY:
		/* The copy buffer is kept for reuse; just mark it empty. */
		tcb->tcb_dma.dmab_len = 0;
		break;
	case I40E_TX_DMA:
		(void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
		break;
	case I40E_TX_NONE:
		/* Cast to pacify lint */
		panic("trying to free tcb %p with bad type none\n", (void *)tcb);
	default:
		panic("unknown i40e tcb type: %d", tcb->tcb_type);
	}

	tcb->tcb_type = I40E_TX_NONE;
	freemsg(tcb->tcb_mp);
	tcb->tcb_mp = NULL;
	tcb->tcb_next = NULL;
}

/*
 * This is called as part of shutting down to clean up all outstanding
 * descriptors. Similar to recycle, except we don't re-arm anything and instead
 * just return control blocks to the free list.
 *
 * The caller must hold itrq_tx_lock.
 */
void
i40e_tx_cleanup_ring(i40e_trqpair_t *itrq)
{
	uint32_t index;

	ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);

	/*
	 * Because we should have shut down the chip at this point, it should be
	 * safe to just clean up all the entries between our head and tail.
	 */
#ifdef DEBUG
	index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space,
	    I40E_QTX_ENA(itrq->itrq_index));
	VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK |
	    I40E_QTX_ENA_QENA_STAT_MASK));
#endif

	/* Walk from the head until every descriptor is accounted as free. */
	index = itrq->itrq_desc_head;
	while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) {
		i40e_tx_control_block_t *tcb;

		tcb = itrq->itrq_tcb_work_list[index];
		VERIFY(tcb != NULL);
		itrq->itrq_tcb_work_list[index] = NULL;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);

		bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t));
		index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size);
		itrq->itrq_desc_free++;
	}

	ASSERT(index == itrq->itrq_desc_tail);
	itrq->itrq_desc_head = index;
}

/*
 * We're here either by hook or by crook. We need to see if there are transmit
 * descriptors available for us to go and clean up and return to the hardware.
 * We may also be blocked, and if so, we should make sure that we let it know
 * we're good to go.
 *
 * This takes and drops itrq_tx_lock itself. The control blocks are collected
 * under the lock but are only reset and freed after it has been dropped.
 */
void
i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
{
	uint32_t wbhead, toclean, count;
	i40e_tx_control_block_t *tcbhead;
	i40e_t *i40e = itrq->itrq_i40e;

	mutex_enter(&itrq->itrq_tx_lock);

	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
	/* Nothing outstanding: at most we need to unblock MAC. */
	if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) {
		if (itrq->itrq_tx_blocked == B_TRUE) {
			itrq->itrq_tx_blocked = B_FALSE;
			mac_tx_ring_update(i40e->i40e_mac_hdl,
			    itrq->itrq_mactxring);
			itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
		}
		mutex_exit(&itrq->itrq_tx_lock);
		return;
	}

	/*
	 * Now we need to try and see if there's anything available. The driver
	 * will write to the head location and it guarantees that it does not
	 * use relaxed ordering.
	 */
	VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle,
	    (uintptr_t)itrq->itrq_desc_wbhead,
	    sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL));

	if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) !=
	    DDI_FM_OK) {
		mutex_exit(&itrq->itrq_tx_lock);
		ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
		return;
	}

	wbhead = *itrq->itrq_desc_wbhead;
	toclean = itrq->itrq_desc_head;
	count = 0;
	tcbhead = NULL;

	/* Unhook every completed tcb onto a private list via tcb_next. */
	while (toclean != wbhead) {
		i40e_tx_control_block_t *tcb;

		tcb = itrq->itrq_tcb_work_list[toclean];
		itrq->itrq_tcb_work_list[toclean] = NULL;
		ASSERT(tcb != NULL);
		tcb->tcb_next = tcbhead;
		tcbhead = tcb;

		/*
		 * We zero this out for sanity purposes.
		 */
		bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
		toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
		count++;
	}

	itrq->itrq_desc_head = wbhead;
	itrq->itrq_desc_free += count;
	itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
	ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);

	if (itrq->itrq_tx_blocked == B_TRUE &&
	    itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) {
		itrq->itrq_tx_blocked = B_FALSE;

		mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring);
		itrq->itrq_txstat.itxs_num_unblocked.value.ui64++;
	}

	mutex_exit(&itrq->itrq_tx_lock);

	/*
	 * Now clean up the tcb.
	 */
	while (tcbhead != NULL) {
		i40e_tx_control_block_t *tcb = tcbhead;

		tcbhead = tcb->tcb_next;
		i40e_tcb_reset(tcb);
		i40e_tcb_free(itrq, tcb);
	}

	DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
}

/*
 * We've been asked to send a message block on the wire. We'll only have a
 * single chain. There will not be any b_next pointers; however, there may be
 * multiple b_cont blocks.
+ * + * We may do one of three things with any given mblk_t chain: + * + * 1) Drop it + * 2) Transmit it + * 3) Return it + * + * If we return it to MAC, then MAC will flow control on our behalf. In other + * words, it won't send us anything until we tell it that it's okay to send us + * something. + */ +mblk_t * +i40e_ring_tx(void *arg, mblk_t *mp) +{ + const mblk_t *nmp; + size_t mpsize; + i40e_tx_control_block_t *tcb; + i40e_tx_desc_t *txdesc; + i40e_tx_context_t tctx; + int cmd, type; + + i40e_trqpair_t *itrq = arg; + i40e_t *i40e = itrq->itrq_i40e; + i40e_hw_t *hw = &i40e->i40e_hw_space; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + ASSERT(mp->b_next == NULL); + + if (!(i40e->i40e_state & I40E_STARTED) || + (i40e->i40e_state & I40E_OVERTEMP) || + (i40e->i40e_state & I40E_SUSPENDED) || + (i40e->i40e_state & I40E_ERROR) || + (i40e->i40e_link_state != LINK_STATE_UP)) { + freemsg(mp); + return (NULL); + } + + /* + * Figure out the relevant context about this frame that we might need + * for enabling checksum, lso, etc. This also fills in information that + * we might set around the packet type, etc. + */ + if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { + freemsg(mp); + itrq->itrq_txstat.itxs_err_context.value.ui64++; + return (NULL); + } + + /* + * For the primordial driver we can punt on doing any recycling right + * now; however, longer term we need to probably do some more pro-active + * recycling to cut back on stalls in the tx path. + */ + + /* + * Do a quick size check to make sure it fits into what we think it + * should for this device. Note that longer term this will be false, + * particularly when we have the world of TSO. + */ + mpsize = 0; + for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { + mpsize += MBLKL(nmp); + } + + /* + * First we allocate our tx control block and prepare the packet for + * transmit before we do a final check for descriptors. We do it this + * way to minimize the time under the tx lock. 
+ */ + tcb = i40e_tcb_alloc(itrq); + if (tcb == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + + /* + * For transmitting a block, we're currently going to use just a + * single control block and bcopy all of the fragments into it. We + * should be more intelligent about doing DMA binding or otherwise, but + * for getting off the ground this will have to do. + */ + ASSERT(tcb->tcb_dma.dmab_len == 0); + ASSERT(tcb->tcb_dma.dmab_size >= mpsize); + for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { + size_t clen = MBLKL(nmp); + void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + + bcopy(nmp->b_rptr, coff, clen); + tcb->tcb_dma.dmab_len += clen; + } + ASSERT(tcb->tcb_dma.dmab_len == mpsize); + + /* + * While there's really no need to keep the mp here, but let's just do + * it to help with our own debugging for now. + */ + tcb->tcb_mp = mp; + tcb->tcb_type = I40E_TX_COPY; + I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); + + mutex_enter(&itrq->itrq_tx_lock); + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + txs->itxs_err_nodescs.value.ui64++; + mutex_exit(&itrq->itrq_tx_lock); + goto txfail; + } + + /* + * Build up the descriptor and send it out. Thankfully at the moment + * we only need a single desc, because we're not doing anything fancy + * yet. + */ + ASSERT(itrq->itrq_desc_free > 0); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + /* + * Note, we always set EOP and RS which indicates that this is the last + * data frame and that we should ask for it to be transmitted. We also + * must always set ICRC, because that is an internal bit that must be + * set to one for data descriptors. The remaining bits in the command + * descriptor depend on checksumming and are determined based on the + * information set up in i40e_tx_context(). 
+ */ + type = I40E_TX_DESC_DTYPE_DATA; + cmd = I40E_TX_DESC_CMD_EOP | + I40E_TX_DESC_CMD_RS | + I40E_TX_DESC_CMD_ICRC | + tctx.itc_cmdflags; + txdesc->buffer_addr = + CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address); + txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | + ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + + /* + * Now, finally, sync the DMA data and alert hardware. + */ + I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV); + + I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), + itrq->itrq_desc_tail); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != + DDI_FM_OK) { + /* + * Note, we can't really go through and clean this up very well, + * because the memory has been given to the device, so just + * indicate it's been transmitted. + */ + ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); + atomic_or_32(&i40e->i40e_state, I40E_ERROR); + } + + txs->itxs_bytes.value.ui64 += mpsize; + txs->itxs_packets.value.ui64++; + txs->itxs_descriptors.value.ui64++; + + mutex_exit(&itrq->itrq_tx_lock); + + return (NULL); + +txfail: + /* + * We ran out of resources. Return it to MAC and indicate that we'll + * need to signal MAC. If there are allocated tcb's, return them now. + * Make sure to reset their message block's, since we'll return them + * back to MAC. 
+ */ + if (tcb != NULL) { + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } + + mutex_enter(&itrq->itrq_tx_lock); + itrq->itrq_tx_blocked = B_TRUE; + mutex_exit(&itrq->itrq_tx_lock); + + return (mp); +} diff --git a/usr/src/uts/common/io/i40e/i40e_xregs.h b/usr/src/uts/common/io/i40e/i40e_xregs.h new file mode 100644 index 0000000000..1bf3a1f0be --- /dev/null +++ b/usr/src/uts/common/io/i40e/i40e_xregs.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _I40E_XREGS_H +#define _I40E_XREGS_H + +/* + * This file contains extra register definitions and other things that would + * nominally come from the Intel common code, but do not due to bugs, errata, + * etc. Ideally we'll get to a point where we can remove this file. + */ +#include "i40e_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The MSPDC register is missing from the current datasheet. + */ +#define I40E_GLPRT_MSPDC(_i) (0x00300060 + ((_i) * 8)) /* _i=0...3 */ +#define I40E_GLPRT_MSDPC_MAX_INDEX 3 +#define I40E_GLPRT_MSPDC_MSPDC_SHIFT 0 +#define I40E_GLPRT_MSPDC_MSPDC_MASK \ + I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MSPDC_MSPDC_SHIFT) + +/* + * The RXERR* registers are technically correct from the perspective of their + * addresses; however, the other associated constants are not correct. Instead, + * we have new definitions here in the interim.
+ */ + +#define I40E_X_GL_RXERR1_L(_i) (0x00318000 + ((_i) * 8)) + +#define I40E_X_GL_RXERR2_L(_i) (0x0031c000 + ((_i) * 8)) + +#ifdef __cplusplus +} +#endif + +#endif /* _I40E_XREGS_H */ diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c new file mode 100644 index 0000000000..baa36cfc8d --- /dev/null +++ b/usr/src/uts/common/io/inotify.c @@ -0,0 +1,1504 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. + */ + +/* + * Support for the inotify facility, a Linux-borne facility for asynchronous + * notification of certain events on specified files or directories. Our + * implementation broadly leverages the file event monitoring facility, and + * would actually be quite straightforward were it not for a very serious + * blunder in the inotify interface: in addition to allowing for one to be + * notified on events on a particular file or directory, inotify also allows + * for one to be notified on certain events on files _within_ a watched + * directory -- even though those events have absolutely nothing to do with + * the directory itself. This leads to all sorts of madness because file + * operations are (of course) not undertaken on paths but rather on open + * files -- and the relationships between open files and the paths that resolve + * to those files are neither static nor isomorphic. We implement this + * concept by having _child watches_ when directories are watched with events + * in IN_CHILD_EVENTS. 
We add child watches when a watch on a directory is + * first added, and we modify those child watches dynamically as files are + * created, deleted, moved into or moved out of the specified directory. This + * mechanism works well, absent hard links. Hard links, unfortunately, break + * this rather badly, and the user is warned that watches on directories that + * have multiple directory entries referring to the same file may behave + * unexpectedly. + */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + 
+#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + ddi_periodic_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. 
+ */ +static kmutex_t inotify_lock; /* lock protecting state */ +static dev_info_t *inotify_devi; /* device info */ +static fem_t *inotify_femp; /* FEM pointer */ +static vmem_t *inotify_minor; /* minor number arena */ +static void *inotify_softstate; /* softstate pointer */ +static inotify_state_t *inotify_state; /* global list of state */ + +static void inotify_watch_event(inotify_watch_t *, uint64_t, char *); +static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *); +static void inotify_watch_delete(inotify_watch_t *, uint32_t); +static void inotify_watch_remove(inotify_state_t *state, + inotify_watch_t *watch); + +static int +inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset, + cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) { + inotify_watch_event(watch, flag & FWRITE ? + IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL); + } + + return (rval); +} + +static int +inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_create(vf, name, vap, excl, mode, + vpp, cr, flag, ct, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE, name); + } + + return (rval); +} + +static int +inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) { + inotify_watch_insert(watch, svp, tnm); + inotify_watch_event(watch, IN_CREATE, tnm); + } + + return (rval); +} + +static int +inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ +
inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_mkdir(vf, name, vap, vpp, cr, + ct, flags, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name); + } + + return (rval); +} + +static int +inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_open(vf, mode, cr, ct)) == 0) + inotify_watch_event(watch, IN_OPEN, NULL); + + return (rval); +} + +static int +inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_read(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_ACCESS, NULL); + + return (rval); +} + +static int +inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags); + inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL); + + return (rval); +} + +int +inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE, nm); + + return (rval); +} + +int +inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, 
vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). + * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. 
+ */ + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_CREATE: + case VE_TRUNCATE: + case VE_RESIZE: + inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL); + break; + case VE_LINK: + inotify_watch_event(watch, IN_ATTRIB, NULL); + break; + case VE_RENAME_SRC_DIR: + inotify_watch_event(watch, IN_MOVED_FROM, name); + break; + case VE_RENAME_DEST_DIR: + if (name == NULL) + name = dvp->v_path; + + inotify_watch_insert(watch, dvp, name); + inotify_watch_event(watch, IN_MOVED_TO, name); + break; + case VE_SUPPORT: + case VE_MOUNTEDOVER: + case VE_PRE_RENAME_SRC: + case VE_PRE_RENAME_DEST: + case VE_PRE_RENAME_DEST_DIR: + break; + } + + return (vnext_vnevent(vf, vnevent, dvp, name, ct)); +} + +const fs_operation_def_t inotify_vnodesrc_template[] = { + VOPNAME_CLOSE, { .femop_close = inotify_fop_close }, + VOPNAME_CREATE, { .femop_create = inotify_fop_create }, + VOPNAME_LINK, { .femop_link = inotify_fop_link }, + VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir }, + VOPNAME_OPEN, { .femop_open = inotify_fop_open }, + VOPNAME_READ, { .femop_read = inotify_fop_read }, + VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir }, + VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove }, + VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir }, + VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr }, + VOPNAME_WRITE, { .femop_write = inotify_fop_write }, + VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent }, + NULL, NULL +}; + +static int +inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + if (lhs->inw_wd < rhs->inw_wd) + return (-1); + + if (lhs->inw_wd > rhs->inw_wd) + return (1); + + return (0); +} + +static int +inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp; + + if (lvp < rvp) + return (-1); + + if (lvp > rvp) + return (1); + + return (0); +} + +static void 
+inotify_watch_hold(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 0); + watch->inw_refcnt++; + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_release(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 1); + + if (--watch->inw_refcnt == 1 && watch->inw_zombie) { + /* + * We're down to our last reference; kick anyone that might be + * waiting. + */ + cv_signal(&watch->inw_cv); + } + + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name) +{ + inotify_kevent_t *event, *tail; + inotify_state_t *state = watch->inw_state; + uint32_t wd = watch->inw_wd, cookie = 0, len; + boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE; + inotify_watch_t *source = watch; + + if (!(mask &= watch->inw_mask) || mask == IN_ISDIR) + return; + + if (watch->inw_parent != NULL) { + /* + * This is an event on the child; if this isn't a valid child + * event, return. Otherwise, we move our watch to be our + * parent (which we know is around because we have a hold on + * it) and continue. + */ + if (!(mask & IN_CHILD_EVENTS)) + return; + + name = watch->inw_name; + watch = watch->inw_parent; + wd = watch->inw_wd; + } + + if (!removal) { + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || + watch->inw_fired || !watch->inw_active) { + mutex_exit(&state->ins_lock); + return; + } + } else { + if (!watch->inw_active) + return; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + } + + /* + * If this is an operation on a directory and it's a child event + * (event if it's not on a child), we specify IN_ISDIR. 
+ */ + if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS)) + mask |= IN_ISDIR; + + if (mask & (IN_MOVED_FROM | IN_MOVED_TO)) + cookie = (uint32_t)curthread->t_did; + + if (state->ins_nevents >= state->ins_maxevents) { + /* + * We're at our maximum number of events -- turn our event + * into an IN_Q_OVERFLOW event, which will be coalesced if + * it's already the tail event. + */ + mask = IN_Q_OVERFLOW; + wd = (uint32_t)-1; + cookie = 0; + len = 0; + } + + if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd && + tail->ine_event.mask == mask && tail->ine_event.cookie == cookie && + ((tail->ine_event.len == 0 && len == 0) || + (name != NULL && tail->ine_event.len != 0 && + strcmp(tail->ine_event.name, name) == 0))) { + /* + * This is an implicitly coalesced event; we're done. + */ + if (!removal) + mutex_exit(&state->ins_lock); + return; + } + + if (name != NULL) { + len = strlen(name) + 1; + len = roundup(len, sizeof (struct inotify_event)); + } else { + len = 0; + } + + event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP); + event->ine_event.wd = wd; + event->ine_event.mask = (uint32_t)mask; + event->ine_event.cookie = cookie; + event->ine_event.len = len; + + if (name != NULL) + strcpy(event->ine_event.name, name); + + if (tail != NULL) { + tail->ine_next = event; + } else { + VERIFY(state->ins_head == NULL); + state->ins_head = event; + cv_broadcast(&state->ins_cv); + } + + state->ins_tail = event; + state->ins_nevents++; + state->ins_size += sizeof (event->ine_event) + len; + + if (removal) + return; + + if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) { + /* + * If this is a one-shot, we need to remove the watch. (Note + * that this will recurse back into inotify_watch_event() to + * fire the IN_IGNORED event -- but with "removal" set.) 
+ */ + watch->inw_fired = 1; + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); +} + +/* + * Destroy a watch. By the time we're in here, the watch must have exactly + * one reference. + */ +static void +inotify_watch_destroy(inotify_watch_t *watch) +{ + VERIFY(MUTEX_HELD(&watch->inw_lock)); + + if (watch->inw_name != NULL) + kmem_free(watch->inw_name, strlen(watch->inw_name) + 1); + + kmem_free(watch, sizeof (inotify_watch_t)); +} + +/* + * Zombify a watch. By the time we come in here, it must be true that the + * watch has already been fem_uninstall()'d -- the only reference should be + * in the state's data structure. If we can get away with freeing it, we'll + * do that -- but if the reference count is greater than one due to an active + * vnode operation, we'll put this watch on the zombie list on the state + * structure. + */ +static void +inotify_watch_zombify(inotify_watch_t *watch) +{ + inotify_state_t *state = watch->inw_state; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(!watch->inw_zombie); + + watch->inw_zombie = 1; + + if (watch->inw_parent != NULL) { + inotify_watch_release(watch->inw_parent); + } else { + avl_remove(&state->ins_byvp, watch); + avl_remove(&state->ins_bywd, watch); + vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1); + watch->inw_wd = -1; + } + + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + /* + * There are no operations in flight and there is no way + * for anyone to discover this watch -- we can destroy it. + */ + inotify_watch_destroy(watch); + } else { + /* + * There are operations in flight; we will need to enqueue + * this for later destruction. 
+ */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. 
+ */ +static void +inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) +{ + inotify_watch_t *child; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(watch->inw_parent == NULL); + + err = fem_uninstall(watch->inw_vp, inotify_femp, watch); + VERIFY(err == 0); + + /* + * If we have children, we're going to remove them all and set them + * all to be zombies. + */ + while ((child = avl_first(&watch->inw_children)) != NULL) { + VERIFY(child->inw_parent == watch); + avl_remove(&watch->inw_children, child); + + err = fem_uninstall(child->inw_vp, inotify_femp, child); + VERIFY(err == 0); + + /* + * If this child watch has been orphaned, remove it from the + * state's list of orphans. + */ + if (child->inw_orphaned) { + list_remove(&state->ins_orphans, child); + crfree(child->inw_cred); + } + + VN_RELE(child->inw_vp); + + /* + * We're down (or should be down) to a single reference to + * this child watch; it's safe to zombify it. + */ + inotify_watch_zombify(child); + } + + inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- we know that the only reference + * can come from operations in flight. + */ + inotify_watch_zombify(watch); +} + +/* + * Delete a watch. Should only be called from VOP context. + */ +static void +inotify_watch_delete(inotify_watch_t *watch, uint32_t event) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent; + int err; + + if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie) { + mutex_exit(&state->ins_lock); + return; + } + + if ((parent = watch->inw_parent) == NULL) { + if (event == IN_DELETE_SELF) { + /* + * If we're here because we're being deleted and we + * are not a child watch, we need to delete the entire + * watch, children and all. 
+ */ + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + return; + } else { + if (event == IN_DELETE_SELF && + !(parent->inw_mask & IN_EXCL_UNLINK)) { + /* + * This is a child watch for a file that is being + * removed and IN_EXCL_UNLINK has not been specified; + * indicate that it is orphaned and add it to the list + * of orphans. (This list will be checked by the + * cleaning cyclic to determine when the watch has + * become the only hold on the vnode, at which point + * the watch can be zombified.) Note that we check + * if the watch is orphaned before we orphan it: hard + * links make it possible for VE_REMOVE to be called + * multiple times on the same vnode. (!) + */ + if (!watch->inw_orphaned) { + watch->inw_orphaned = 1; + watch->inw_cred = CRED(); + crhold(watch->inw_cred); + list_insert_head(&state->ins_orphans, watch); + } + + mutex_exit(&state->ins_lock); + return; + } + + if (watch->inw_orphaned) { + /* + * If we're here, a file was orphaned and then later + * moved -- which almost certainly means that hard + * links are on the scene. We choose the orphan over + * the move because we don't want to spuriously + * drop events if we can avoid it. + */ + crfree(watch->inw_cred); + list_remove(&state->ins_orphans, watch); + } + } + + if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) { + /* + * This watch has already been deleted from the parent. + */ + mutex_exit(&state->ins_lock); + return; + } + + avl_remove(&parent->inw_children, watch); + err = fem_uninstall(watch->inw_vp, inotify_femp, watch); + VERIFY(err == 0); + + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- which won't actually delete + * it as we know that the reference count is greater than 1. + */ + inotify_watch_zombify(watch); + mutex_exit(&state->ins_lock); +} + +/* + * Insert a new child watch. Should only be called from VOP context when + * a child is created in a watched directory. 
+ */ +static void +inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = vp }; + + if (!(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) { + mutex_exit(&state->ins_lock); + return; + } + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + return; + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); + VERIFY(watch != NULL); + + mutex_exit(&state->ins_lock); +} + + +static int +inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, + int32_t *wdp) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + uint32_t set; + + set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE; + + /* + * Lookup our vnode to determine if we already have a watch on it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * We don't have this watch; allocate a new one, provided that + * we have fewer than our limit. + */ + if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) { + mutex_exit(&state->ins_lock); + return (ENOSPC); + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, NULL, NULL, vp, set); + *wdp = watch->inw_wd; + mutex_exit(&state->ins_lock); + + return (0); + } + + VERIFY(!watch->inw_zombie); + + if (!(mask & IN_MASK_ADD)) { + /* + * Note that if we're resetting our event mask and we're + * transitioning from an event mask that includes child events + * to one that doesn't, there will be potentially some stale + * child watches. This is basically fine: they won't fire, + * and they will correctly be removed when the watch is + * removed. 
+ */ + watch->inw_mask = 0; + } + + watch->inw_mask |= set; + + *wdp = watch->inw_wd; + + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + vnode_t *cvp; + int err; + + /* + * Verify that the specified child doesn't have a directory component + * within it. + */ + if (strchr(name, '/') != NULL) + return (EINVAL); + + /* + * Lookup the underlying file. Note that this will succeed even if + * we don't have permissions to actually read the file. + */ + if ((err = lookupnameat(name, + UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) { + return (err); + } + + /* + * Use our vnode to find our watch, and then add our child watch to it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * This is unexpected -- it means that we don't have the + * watch that we thought we had. + */ + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (ENXIO); + } + + /* + * Now lookup the child vnode in the watch; we'll only add it if it + * isn't already there. 
+ */ + cmp.inw_vp = cvp; + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (0); + } + + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); + VERIFY(watch != NULL); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_rm_watch(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + inotify_watch_remove(state, watch); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_activate(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + watch->inw_active = 1; + + mutex_exit(&state->ins_lock); + + return (0); +} + +/* + * Called periodically as a cyclic to process the orphans and zombies. + */ +static void +inotify_clean(void *arg) +{ + inotify_state_t *state = arg; + inotify_watch_t *watch, *parent, *next, **prev; + cred_t *savecred; + int err; + + mutex_enter(&state->ins_lock); + + for (watch = list_head(&state->ins_orphans); + watch != NULL; watch = next) { + next = list_next(&state->ins_orphans, watch); + + VERIFY(!watch->inw_zombie); + VERIFY((parent = watch->inw_parent) != NULL); + + if (watch->inw_vp->v_count > 1) + continue; + + avl_remove(&parent->inw_children, watch); + err = fem_uninstall(watch->inw_vp, inotify_femp, watch); + VERIFY(err == 0); + + list_remove(&state->ins_orphans, watch); + + /* + * For purposes of releasing the vnode, we need to switch our + * cred to be the cred of the orphaning thread (which we held + * at the time this watch was orphaned). 
+ */ + savecred = curthread->t_cred; + curthread->t_cred = watch->inw_cred; + VN_RELE(watch->inw_vp); + crfree(watch->inw_cred); + curthread->t_cred = savecred; + + inotify_watch_zombify(watch); + } + + prev = &state->ins_zombies; + + while ((watch = *prev) != NULL) { + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + *prev = watch->inw_parent; + inotify_watch_destroy(watch); + continue; + } + + prev = &watch->inw_parent; + mutex_exit(&watch->inw_lock); + } + + mutex_exit(&state->ins_lock); +} + +/*ARGSUSED*/ +static int +inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + int instances = 0; + char c[64]; + + if (minor != INOTIFYMNRN_INOTIFY) + return (ENXIO); + + mutex_enter(&inotify_lock); + + for (state = inotify_state; state != NULL; state = state->ins_next) { + if (state->ins_cred == cred_p) + instances++; + } + + if (instances >= inotify_maxinstances) { + mutex_exit(&inotify_lock); + return (EMFILE); + } + + minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) { + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&inotify_lock); + return (NULL); + } + + state = ddi_get_soft_state(inotify_softstate, minor); + *devp = makedevice(major, minor); + + crhold(cred_p); + state->ins_cred = cred_p; + state->ins_next = inotify_state; + inotify_state = state; + + (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor); + state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + avl_create(&state->ins_bywd, + (int(*)(const void *, const void *))inotify_watch_cmpwd, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_bywd)); + + avl_create(&state->ins_byvp, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + 
offsetof(inotify_watch_t, inw_byvp)); + + list_create(&state->ins_orphans, sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_orphan)); + + state->ins_maxwatches = inotify_maxwatches; + state->ins_maxevents = inotify_maxevents; + + mutex_exit(&inotify_lock); + + state->ins_cleaner = ddi_periodic_add(inotify_clean, + state, NANOSEC, DDI_IPL_0); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + inotify_state_t *state; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + int err = 0, nevents = 0; + size_t len; + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + while (state->ins_head == NULL) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->ins_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) { + mutex_exit(&state->ins_lock); + return (EINTR); + } + } + + /* + * We have events and we have our lock; return as many as we can. 
+ */ + while ((event = state->ins_head) != NULL) { + len = sizeof (event->ine_event) + event->ine_event.len; + + if (uio->uio_resid < len) { + if (nevents == 0) + err = EINVAL; + break; + } + + nevents++; + + if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0) + break; + + VERIFY(state->ins_nevents > 0); + state->ins_nevents--; + + VERIFY(state->ins_size > 0); + state->ins_size -= len; + + if ((state->ins_head = event->ine_next) == NULL) { + VERIFY(event == state->ins_tail); + VERIFY(state->ins_nevents == 0); + state->ins_tail = NULL; + } + + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + mutex_exit(&state->ins_lock); + + return (err); +} + +/*ARGSUSED*/ +static int +inotify_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + if (state->ins_head != NULL) { + *reventsp = events & (POLLRDNORM | POLLIN); + } else { + *reventsp = 0; + + if (!anyyet) + *phpp = &state->ins_pollhd; + } + + mutex_exit(&state->ins_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + file_t *fp; + int rval; + + state = ddi_get_soft_state(inotify_softstate, minor); + + switch (cmd) { + case INOTIFYIOC_ADD_WATCH: { + inotify_addwatch_t addwatch; + file_t *fp; + + if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0) + return (EFAULT); + + if ((fp = getf(addwatch.inaw_fd)) == NULL) + return (EBADF); + + rval = inotify_add_watch(state, fp->f_vnode, + addwatch.inaw_mask, rv); + + releasef(addwatch.inaw_fd); + return (rval); + } + + case INOTIFYIOC_ADD_CHILD: { + inotify_addchild_t addchild; + char name[MAXPATHLEN]; + + if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0) + return (EFAULT); + + if (copyinstr(addchild.inac_name, name, 
MAXPATHLEN, NULL) != 0) + return (EFAULT); + + if ((fp = getf(addchild.inac_fd)) == NULL) + return (EBADF); + + rval = inotify_add_child(state, fp->f_vnode, name); + + releasef(addchild.inac_fd); + return (rval); + } + + case INOTIFYIOC_RM_WATCH: + return (inotify_rm_watch(state, arg)); + + case INOTIFYIOC_ACTIVATE: + return (inotify_activate(state, arg)); + + case FIONREAD: { + int32_t size; + + mutex_enter(&state->ins_lock); + size = state->ins_size; + mutex_exit(&state->ins_lock); + + if (copyout(&size, (void *)arg, sizeof (size)) != 0) + return (EFAULT); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state, **sp; + inotify_watch_t *watch, *zombies; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + if (state->ins_pollhd.ph_list != NULL) { + pollwakeup(&state->ins_pollhd, POLLERR); + pollhead_clean(&state->ins_pollhd); + } + + mutex_enter(&state->ins_lock); + + /* + * First, destroy all of our watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. + */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. 
+ */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + if (state->ins_cleaner != NULL) { + ddi_periodic_delete(state->ins_cleaner); + state->ins_cleaner = NULL; + } + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + vmem_destroy(state->ins_wds); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + /* + * Only DDI_ATTACH performs one-time setup. inotify_detach() accepts + * DDI_SUSPEND without tearing anything down, so the matching + * DDI_RESUME must not repeat the initialization below. + */ + switch (cmd) { + case DDI_ATTACH: + break; + + case DDI_RESUME: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int 
+inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); 
+} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 848e3470c7..c29a762e06 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -271,7 +271,7 @@ static adapter_info_t ixgbe_82599eb_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -302,7 +302,7 @@ static adapter_info_t ixgbe_X540_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -1792,6 +1792,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + 
int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -1805,7 +1806,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 8944fcbff3..25da45be39 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,7 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/file.h> @@ -932,3 +932,19 @@ ksocket_rele(ksocket_t ks) cv_signal(&so->so_closing_cv); } } + +int +ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg) +{ + return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg)); +} + +/* + * Hand the request to so_krecv_unblock() on the underlying sonode. This + * function returns void, so the call is made as a plain statement. + */ +void +ksocket_krecv_unblock(ksocket_t ks) +{ + so_krecv_unblock(KSTOSO(ks)); +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h index ac5251540f..516a68d358 100644 --- a/usr/src/uts/common/io/ksocket/ksocket_impl.h +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -22,11 +22,17 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #ifndef _INET_KSOCKET_KSOCKET_IMPL_H #define _INET_KSOCKET_KSOCKET_IMPL_H +/* + * Note that if this relationship ever changes, the logic in ksocket_krecv_set + * must be updated and we must maintain local state about this on whatever the + * new ksocket object is. 
+ */ #define KSTOSO(ks) ((struct sonode *)(ks)) #define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c index c9f0c63b69..5233fcd0b4 100644 --- a/usr/src/uts/common/io/ksyms.c +++ b/usr/src/uts/common/io/ksyms.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred) char *addr; void *hptr = NULL; ksyms_buflist_hdr_t hdr; + + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); + bzero(&hdr, sizeof (struct ksyms_buflist_hdr)); list_create(&hdr.blist, PAGESIZE, offsetof(ksyms_buflist_t, buflist_node)); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 1d30dc3478..1bf49a5b44 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -3141,6 +3141,9 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_WL_MLME: minsize = sizeof (wl_mlme_t); break; + case MAC_PROP_VN_PROMISC_FILTERED: + minsize = sizeof (boolean_t); + break; } return (valsize >= minsize); diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 8f0ec9eb67..18a6613424 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ /* @@ -3263,6 +3263,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mac_cb_info_t *mcbi; int rc; + if ((flags & MAC_PROMISC_FLAGS_NO_COPY) && + (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) { + return (EINVAL); + } + i_mac_perim_enter(mip); if ((rc = mac_start((mac_handle_t)mip)) != 0) { @@ -3271,7 +3276,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, } if ((mcip->mci_state_flags & MCIS_IS_VNIC) && - type == MAC_CLIENT_PROMISC_ALL) { + type == MAC_CLIENT_PROMISC_ALL && + (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED)) { /* * The function is being invoked by the upper MAC client * of a VNIC. The VNIC should only see the traffic @@ -3308,6 +3314,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_strip_vlan_tag = ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0); + mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -3944,15 +3951,22 @@ mac_client_get_effective_resources(mac_client_handle_t mch, static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, - boolean_t loopback) + boolean_t loopback, boolean_t local) { mblk_t *mp_copy, *mp_next; - if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { + if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || + (mpip->mpi_do_fixups && local)) { mp_copy = copymsg(mp); if (mp_copy == NULL) return; + if (mpip->mpi_do_fixups && local) { + mp_copy = mac_fix_cksum(mp_copy); + if (mp_copy == NULL) + return; + } + if (mpip->mpi_strip_vlan_tag) { mp_copy = mac_strip_vlan_tag_chain(mp_copy); if (mp_copy == NULL) @@ -4009,7 +4023,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp) */ void mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, - mac_client_impl_t *sender) + mac_client_impl_t *sender, boolean_t local) { mac_promisc_impl_t *mpip; mac_cb_t *mcb; @@ -4049,8 +4063,10 @@ 
mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) - mac_promisc_dispatch_one(mpip, mp, is_sender); + is_mcast) { + mac_promisc_dispatch_one(mpip, mp, is_sender, + local); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4079,7 +4095,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain) mpip = (mac_promisc_impl_t *)mcb->mcb_objp; if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && !is_mcast) { - mac_promisc_dispatch_one(mpip, mp, B_FALSE); + mac_promisc_dispatch_one(mpip, mp, B_FALSE, + B_FALSE); } } } @@ -4150,16 +4167,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. 
*/ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -4338,7 +4354,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t @@ -5552,3 +5574,28 @@ mac_client_set_rings(mac_client_handle_t mch, int rxrings, int txrings) mrp->mrp_ntxrings = txrings; } } + +/* + * Report whether MPT_FLAG_PROMISC_FILTERED is set for this client. The + * masked bits are compared against zero so the result is 0 or 1 rather + * than the raw flag bits, as mac_is_vnic() and mac_is_overlay() do. + */ +boolean_t +mac_get_promisc_filtered(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + return ((mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED) != 0); +} + +void +mac_set_promisc_filtered(mac_client_handle_t mch, boolean_t enable) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + if (enable) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; + else + mcip->mci_protect_flags &= ~MPT_FLAG_PROMISC_FILTERED; +} diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 14d94981cd..0459506784 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -603,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. 
*/ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -610,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -637,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? */ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index 805b5d36f9..da83dc643e 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -2576,6 +2576,9 @@ mac_protect_init(mac_client_impl_t *mcip) sizeof (dhcpv6_addr_t), offsetof(dhcpv6_addr_t, da_node)); avl_create(&mcip->mci_v6_slaac_ip, compare_slaac_ip, sizeof (slaac_addr_t), offsetof(slaac_addr_t, sla_node)); + + if (mcip->mci_state_flags & MCIS_IS_VNIC) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; } void diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 57d1996d84..98b770786a 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -350,6 +351,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); /* @@ -670,7 +674,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp) mac_impl_t *mip = (mac_impl_t *)mh; if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, NULL); + mac_promisc_dispatch(mip, mp, NULL, B_FALSE); } /* @@ -691,7 +695,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * this MAC, pass them a copy if appropriate. */ if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp_chain, NULL); + mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); if (mr != NULL) { /* diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index 148f739d52..0e2cb864c9 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -1370,7 +1370,7 @@ int mac_srs_worker_wakeup_ticks = 0; * said, the constant is left as a static variable to allow it to be * dynamically tuned in the field if and as needed. */ -static uintptr_t mac_rx_srs_stack_needed = 10240; +static uintptr_t mac_rx_srs_stack_needed = 13312; static uint_t mac_rx_srs_stack_toodeep; #ifndef STACK_GROWTH_DOWN @@ -2310,7 +2310,7 @@ check_again: if (smcip->mci_mip->mi_promisc_list != NULL) { mutex_exit(lock); mac_promisc_dispatch(smcip->mci_mip, - head, NULL); + head, NULL, B_FALSE); mutex_enter(lock); } } @@ -4450,8 +4450,10 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. 
*/ - if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, src_mcip); + if (mip->mi_promisc_list != NULL) { + mac_promisc_dispatch(mip, mp, src_mcip, + B_TRUE); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..c1a5c9c069 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ /* @@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index cdbeb0d422..8955b3d935 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ /* @@ -221,10 +221,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred) switch (getminor(*devp)) { case M_NULL: case M_ZERO: + /* standard devices */ + break; + case M_MEM: case M_KMEM: case M_ALLKMEM: - /* standard devices */ + /* + * These devices should never be visible in a zone, but if they + * somehow do get created we refuse to allow the zone to use + * them. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); break; default: diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". # mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git 
a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, /* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message 
control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) + * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* 
device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. + * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); 
+ if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs to be mutex'ed - an SMP can call us while we're still + * running! 
+ */ + mutex_enter(&pdev->low_mutex); + ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled ); + mutex_exit(&pdev->low_mutex); + + if( !ne && handled ) + return DDI_INTR_CLAIMED; + if (ne) + nfp_log( NFP_DBG1, "nfp_isr: failed"); + else + nfp_log( NFP_DBG3, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + +static u_int nfp_soft_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + int rd, wr; + + nfp_log( NFP_DBG3, "nfp_soft_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + rd= wr= 0; + + mutex_enter(&pdev->high_mutex); + if(pdev->high_read) { + pdev->high_read= 0; + mutex_exit(&pdev->high_mutex); + rd= 1; + } + if(pdev->high_write) { + pdev->high_write= 0; + wr= 1; + } + mutex_exit(&pdev->high_mutex); + + if(rd) { + nfp_log( NFP_DBG3, "nfp_soft_isr: read done"); + nfp_read_complete_final(pdev, pdev->rd_ok); + } + if(wr) { + nfp_log( NFP_DBG3, "nfp_soft_isr: write done"); + nfp_write_complete_final(pdev, pdev->wr_ok); + } + if( rd || wr ) + return DDI_INTR_CLAIMED; + + nfp_log( NFP_DBG2, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + + +/*-------------------------*/ +/* nfp_read */ +/*-------------------------*/ + +void nfp_read_complete(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete: entering"); + + if(pdev->high_intr) { + nfp_log(NFP_DBG2, "nfp_read_complete: high_intr"); + mutex_enter(&pdev->high_mutex); + nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered"); + if(pdev->high_read) + nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!"); + pdev->high_read= 1; + pdev->rd_ok= ok; + nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex"); + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_read_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_read_complete: exiting"); +} + +static void nfp_read_complete_final(nfp_dev *pdev, int ok) { + nfp_log( 
NFP_DBG2,"nfp_read_complete_final: entering"); + if(pdev->rdtimeout) + (void) untimeout(pdev->rdtimeout); + if(!pdev->rd_outstanding) { + nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding"); + } + nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok); + mutex_enter(&pdev->isr_mutex); + pdev->rd_outstanding= 0; + pdev->rd_ready= 1; + pdev->rd_ok= ok; + cv_broadcast(&pdev->rd_cv); + mutex_exit(&pdev->isr_mutex); + pollwakeup (&pdev->pollhead, POLLRDNORM); + nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting"); +} + +static void nfp_rdtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." ); + return; + } + pdev->rdtimeout= 0; + nfp_read_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_read: entered" ); + if (ddi_get_soft_state(state_head, getminor(dev)) != NULL) { + nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev"); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_read: about to physio." 
); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_write */ +/*-------------------------*/ + +void nfp_write_complete( nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_write_complete: entering"); + + if(pdev->high_intr) { + mutex_enter(&pdev->high_mutex); + if(pdev->high_write) + nfp_log(NFP_DBG1, "nfp_write_complete: high_write allread set!"); + pdev->high_write= 1; + pdev->wr_ok= ok; + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_write_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_write_complete: exiting"); +} + +static void nfp_write_complete_final( nfp_dev *pdev, int ok) { + struct buf *local_wr_bp; + nfp_log( NFP_DBG2,"nfp_write_complete_final: entering"); + if(pdev->wrtimeout) + (void) untimeout(pdev->wrtimeout); + + if (!pdev->wr_bp) { + nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." ); + return; + } + + bp_mapout(pdev->wr_bp); + pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount; + /* Make sure we set wr_ready before calling biodone to avoid a race */ + pdev->wr_ready = 1; + bioerror(pdev->wr_bp, ok ? 0 : ENXIO); + local_wr_bp = pdev->wr_bp; + pdev->wr_bp = 0; + biodone(local_wr_bp); + nfp_log( NFP_DBG2, "nfp_write_complete_final: isr_mutex extited"); + pollwakeup (&pdev->pollhead, POLLWRNORM); + + nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving"); +} + +static void nfp_wrtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." ); + return; + } + pdev->wrtimeout= 0; + nfp_write_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_write: entered." 
); + if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) { + nfp_log( NFP_DBG1, "nfp_chread: unable to get nfp_dev."); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_write: about to physio." ); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_strategy */ +/*-------------------------*/ + +#define NFP_STRAT_ERR(thebp,err,txt) \ + nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \ + (thebp)->b_resid = (thebp)->b_bcount; \ + bioerror ((thebp), err); \ + biodone ((thebp)); + +static int nfp_strategy(struct buf *bp) { + register struct nfp_dev *pdev; + nfp_err ne; + + nfp_log( NFP_DBG2, "nfp_strategy: entered." ); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin 
(bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int 
mode, cred_t *credp, int *rvalp) { + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_ioctl: entered." ); + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev."); + return (ENXIO); + } + + switch (cmd) { + case NFDEV_IOCTL_ENQUIRY: + { + long *outp; + int outlen; + nfdev_enquiry_str enq_data; + + enq_data.busno = (unsigned int)-1; + enq_data.slotno = (unsigned char)-1; + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." ); + if( outlen > 0 ) { + enq_data.busno = ((*outp)>>16) & 0xff; + enq_data.slotno = ((*outp)>>11) & 0x1f; + nfp_log( NFP_DBG2, "busno %d, slotno %d.", + enq_data.busno, enq_data.slotno ); + } + } else + nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." ); + + if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + case NFDEV_IOCTL_ENSUREREADING: + { + unsigned int addr, len; + nfp_err ret; + if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." 
); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." 
); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." ); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + 
mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
); + return (error); +} + +/*-------------------------*/ +/* nfp_release */ +/*-------------------------*/ + +static int nfp_release_dev( dev_info_t *dip ) { + nfp_dev *pdev; + int instance, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_release_dev: entering" ); + + instance = ddi_get_instance(dip); + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if (pdev) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing device" ); + + nfp_free_pci_push(pdev); + + if( pdev->cmddev ) { + nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" ); + ret = pdev->cmddev->destroy(pdev->common.cmdctx); + if (ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed "); + return nfp_oserr( ret ); + } + } + + if(pdev->high_iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" ); + ddi_remove_softintr(pdev->soft_int_id); + ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + mutex_destroy( &pdev->high_mutex ); + } else if(pdev->iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" ); + ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + } + if(pdev->low_iblock_cookie) { + ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie); + mutex_destroy( &pdev->low_mutex); + } + + for(i=0;i<6;i++) { + if( pdev->common.extra[i] ) { + nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i ); + ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]); + } + } + + ddi_remove_minor_node(dip, NULL); + + if (pdev->conf_handle) + pci_config_teardown( &pdev->conf_handle ); + + ddi_soft_state_free(state_head, instance); + } + nfp_log( NFP_DBG2, "nfp_release: finished" ); + + return DDI_SUCCESS; +} + + +/*-------------------------*/ +/* nfp_attach */ +/*-------------------------*/ + +static 
int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + int instance; + nfp_dev *pdev = NULL; + int intres; + uint16_t device, vendor, sub_device, sub_vendor; + long *outp; + nfpcmd_dev const *cmddev; + int index, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_attach: entered." ); + + if (cmd != DDI_ATTACH) { + nfp_log( NFP_DBG1, "nfp_attach: bad command." ); + goto bailout; + } + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(state_head, instance) != 0) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." ); + goto bailout; + } + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_attach: cannot find dev."); + return ENODEV; + } + pdev->dip = dip; + + /* map in pci config registers */ + if (pci_config_setup(dip, &pdev->conf_handle)) { + nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." ); + goto bailout; + } + + /* find out what we have got */ + vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID ); + device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID ); + sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID ); + sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID ); + + index= 0; + while( (cmddev = nfp_drvlist[index++]) != NULL ) { + if( cmddev->vendorid == vendor && + cmddev->deviceid == device && + cmddev->sub_vendorid == sub_vendor && + cmddev->sub_deviceid == sub_device ) + break; + } + if( !cmddev ) { + nfp_log( NFP_DBG1, "nfp_attach: unknonw device." 
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." 
); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." ); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." 
); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." 
); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ +/* Uses the nfp_cdev itself as the command context, then writes DOORBELL_ENABLE | POSTLIST_ENABLE to the interrupt mask register through the I/O BAR. Returns NFP_ENOMEM when the I/O BAR is not set, NFP_SUCCESS otherwise. */ +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ +/* Counterpart of i21285_create: writes DOORBELL_DISABLE | POSTLIST_DISABLE to the interrupt mask register. Returns NFP_ENODEV for a NULL context and NFP_ENOMEM when the I/O BAR is not set. */ +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + 
+/* open ------------------------------------------------------- */ +/* No device-specific work is done on open; the call is logged and NFP_SUCCESS returned. */ +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ +/* No device-specific work is done on close; the call is logged and NFP_SUCCESS returned. */ +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ +/* Reads the low 16 bits of the doorbell register and loops while they are neither 0 nor 0xffff. Write-done and read-ack bits are written back to the doorbell register (presumably to clear them - confirm against the 21285 data sheet) and reported through nfp_write_complete()/nfp_read_complete(); any other bits are logged as unexpected and written back. *handled is set to 1 once the loop body has run. NOTE(review): unlike i21555_isr, bar[ IOBAR ] is not checked here before use. */ +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ +/* Copies len bytes from the caller's block to NFPCI_JOBS_WR_DATA in the memory BAR, writes a two-word control header (NFPCI_JOB_CONTROL, len) to NFPCI_JOBS_WR_CONTROL, reads the length word back to confirm it was stored (NFP_EIO if not), and finally writes NFAST_INT_HOST_WRITE_REQUEST to the doorbell register. Returns NFP_ENODEV for a NULL context and NFP_ENOMEM when the memory BAR is not set. */ +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( 
NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; + + nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); + + if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return -NFP_EINVAL; + } + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", 
MEMBAR ); + return NFP_ENXIO; + } + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }; + TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST ); + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + +/* Command-driver table for the Gen 1 card. Initialiser order: name, PCI vendor/device id, subsystem vendor/device id, per-BAR sizes (the I/O bit is ORed into the BAR 1 entry), flags, the nine entry points defined in this file, and finally a debug hook, which this device does not provide (0). */ +const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI", + PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, + PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21285_create, + i21285_destroy, + i21285_open, + i21285_close, + i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate, + i21285_ensure_reading, + 0, /* no debug */ +}; + diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_I21285_H +#define NFP_I21285_H +/* PCI ids matched by the Gen 1 command driver; each is only defined here if a system header has not already provided it. */ +#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif +#ifndef 
PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif + +#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 + +#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 + +#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x80 +#define MEMSIZE 0x100000 + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@ +/* + +i21555.c: nCipher PCI HSM intel 21555 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, ®32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + 
/* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ +/* Uses the nfp_cdev itself as the command context, then writes I21555_DOORBELL_PRI_ENABLE to both the primary doorbell set-mask and clear-mask registers through the I/O BAR. Returns NFP_ENOMEM when the I/O BAR is not set. */ +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ +/* Counterpart of i21555_create: writes I21555_DOORBELL_PRI_DISABLE to both primary doorbell mask registers. Returns NFP_ENODEV for a NULL context and NFP_ENOMEM when the I/O BAR is not set. */ +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ +/* No device-specific work is done on open; the call is logged and NFP_SUCCESS returned. */ +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + 
+/* close ------------------------------------------------------- */ +/* No device-specific work is done on close; the call is logged and NFP_SUCCESS returned. */ +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ +/* Counts the interrupt in stats.isr, returns early (leaving *handled at 0) unless i21555_started() succeeds, then loops on the 16-bit primary doorbell set register while it is neither 0 nor 0xffff. Write-done and read-ack bits are written to the primary doorbell clear register and reported through nfp_write_complete()/nfp_read_complete(); any other bits are written to the clear register and logged as unexpected. */ +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE16_IO(&tmp16,doorbell); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell ); + } + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + } + nfp_log( NFP_DBG3, "i21555_isr: exiting"); + return 0; +} + +/* write ------------------------------------------------------- */ +/* Same sequence as the Gen 1 write (copy data, write control header, read the length back, ring the doorbell) with two differences: the device must pass i21555_started() first, and the doorbell value is the upper 16 bits of NFAST_INT_HOST_WRITE_REQUEST written to the secondary doorbell set register. stats.write_fail is incremented on entry and decremented again on success, so every early return leaves a failure counted. NOTE(review): only bar[ IOBAR ] is checked although the copies go through MEMBAR - confirm the copy helpers validate it. */ +static nfp_err i21555_write( const char *block, int len, void *ctx) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned short tmp16; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_write: NULL cdev"); + return NFP_ENODEV; + } + + cdev->stats.write_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_write: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, 
NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return NFP_EIO; + } + TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; + + nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ +/* Reads the device's byte count from NFPCI_JOBS_RD_LENGTH, rejects it with NFP_EIO when it is negative or larger than len, then copies that many bytes from NFPCI_JOBS_RD_DATA to the caller's block. *rcount is 0 on every failure path and the copied byte count on success. stats.read_fail is incremented on entry and decremented again on success; stats.read_byte accumulates the bytes actually copied. */ +static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.read_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count; + cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += count; /* bytes actually copied, not the size of the caller's buffer */ + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ +/* Channel update is not implemented for this device: the call is logged and NFP_SUCCESS returned. */ +/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure 
reading -------------------------------------------------- */ +/* Posts a read request of len bytes once i21555_started() succeeds. With a non-zero addr a three-word header (NFPCI_JOB_CONTROL_PCI_PUSH, len, addr) is written; otherwise the two-word (NFPCI_JOB_CONTROL, len) header is used. The length word is read back to confirm it was stored (NFP_EIO if not), then the upper 16 bits of NFAST_INT_HOST_READ_REQUEST are written to the secondary doorbell set register. stats.ensure_fail is incremented on entry and decremented again on success. */ +static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[3]; + unsigned short tmp16; + unsigned int tmp32; + nfp_err ne; + int hdr_len; + + nfp_log( NFP_DBG2, "i21555_ensure_reading: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.ensure_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + if(addr) { + nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr); + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH); + TO_LE32_MEM(&hdr[1], len); + TO_LE32_MEM(&hdr[2], addr); + hdr_len= 12; + } else { + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + hdr_len= 8; + } + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written"); + return NFP_EIO; + } + TO_LE16_IO( &tmp16, NFAST_INT_HOST_READ_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.ensure_fail--; + cdev->stats.ensure++; + + return NFP_SUCCESS; +} + +/* command 
device structure ------------------------------------- */ +/* Command-driver table for the Gen 2 card. Initialiser order: name, PCI vendor/device id, subsystem vendor/device id, per-BAR sizes (the I/O bit is ORed into the BAR 1 entry), flags, the nine entry points defined in this file, and the debug hook i21555_debug declared in i21555.h. */ +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H +/* PCI ids matched by the Gen 2 command driver; each is only defined here if a system header has not already provided it. */ +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif +/* 16-bit doorbell registers in the I/O BAR: the host reads PRI_SET and writes PRI_CLEAR in its ISR, and writes SEC_SET to signal the device. */ +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 +/* Values written to both primary doorbell mask registers by i21555_create (enable) and i21555_destroy (disable). */ +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF +/* Config-space offset read by i21555_started, and the bit it tests there. */ +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 +/* Indices into nfp_cdev.bar[] used by this command driver. */ +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ 
b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" +/* Debug hook for the 21555 command driver: logs entry and always returns NFP_EUNKNOWN; both arguments are unused. */ +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. + */ +typedef struct nfdev_enquiry_str { + unsigned int busno; /**< Which bus is the PCI device on. */ + unsigned char slotno; /**< Which slot is the PCI device in. */ + unsigned char reserved[3]; /**< for consistent struct alignment */ +} nfdev_enquiry_str; + +/** + * Result of the STATS ioctl. + */ +typedef struct nfdev_stats_str { + unsigned long isr; /**< Count interrupts. */ + unsigned long isr_read; /**< Count read interrupts. */ + unsigned long isr_write; /**< Count write interrupts. */ + unsigned long write_fail; /**< Count write failures. */ + unsigned long write_block; /**< Count blocks written. */ + unsigned long write_byte; /**< Count bytes written. 
*/ + unsigned long read_fail; /**< Count read failures. */ + unsigned long read_block; /**< Count blocks read. */ + unsigned long read_byte; /**< Count bytes read. */ + unsigned long ensure_fail; /**< Count read request failures. */ + unsigned long ensure; /**< Count read requests. */ +} nfdev_stats_str; + +/** + * Input to the CONTROL ioctl. + */ +typedef struct nfdev_control_str { + unsigned control; /**< Control flags. */ +} nfdev_control_str; + +/** Control bit indicating host supports MOI control */ +#define NFDEV_CONTROL_HOST_MOI 0x0001 + +/** Index of control bits indicating desired mode + * + * Desired mode follows the M_ModuleMode enumeration. + */ +#define NFDEV_CONTROL_MODE_SHIFT 1 + +/** Detect a backwards-compatible control value + * + * Returns true if the requested control value "makes no difference", i.e. + * the failure of an attempt to set it is therefore uninteresting. + */ +#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1) + +/** + * Result of the STATUS ioctl. + */ +typedef struct nfdev_status_str { + unsigned status; /**< Status flags. */ + char error[8]; /**< Error string. */ +} nfdev_status_str; + +/** Monitor firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_MONITOR_MOI 0x0001 + +/** Application firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_APPLICATION_MOI 0x0002 + +/** Application firmware running and supports error reporting */ +#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004 + +/** HSM failed + * + * Consult error[] for additional information. + */ +#define NFDEV_STATUS_FAILED 0x0008 + +/** Standard PCI interface. */ +#define NFDEV_IF_STANDARD 0x01 + +/** PCI interface with results pushed from device + * via DMA. + */ +#define NFDEV_IF_PCI_PUSH 0x02 + +/* platform independent base ioctl numbers */ + +/** Enquiry ioctl. + * \return nfdev_enquiry_str describing the attached device. */ +#define NFDEV_IOCTL_NUM_ENQUIRY 0x01 +/** Channel Update ioctl. 
+ * \deprecated */ +#define NFDEV_IOCTL_NUM_CHUPDATE 0x02 +/** Ensure Reading ioctl. + * Signal a read request to the device. + * \param (unsigned int) Length of data to be read. + */ +#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03 +/** Device Count ioctl. + * Not implemented on all platforms. + * \return (int) the number of attached devices. */ +#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04 +/** Internal Debug ioctl. + * Not implemented in release drivers. */ +#define NFDEV_IOCTL_NUM_DEBUG 0x05 +/** PCI Interface Version ioctl. + * \param (int) Maximum PCI interface version + * supported by the user of the device. */ +#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06 +/** Statistics ioctl. + * \return nfdev_stats_str holding the driver's counters. */ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. 
+ +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" +/* Each Solaris ioctl command is NFDEV_IOCTL_TYPE ('n' in bits 8-15) ORed with the matching NFDEV_IOCTL_NUM_* value. NOTE(review): no command is defined here for the CHUPDATE, CONTROL or STATUS numbers - confirm that is intentional. */ +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif +/* Kernel-version shim: before CH_KERNELVER 260 the DDI_* / PCI_CONFIG_* names map onto the getl/putw style accessors and ioctlptr_t is int; from 260 on they map onto the get32/put16 style accessors and ioctlptr_t is intptr_t. */ +#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t; +#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw +#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl +#define PCI_CONFIG_GET16 pci_config_getw +#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t; +#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16 +#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32 +#define PCI_CONFIG_GET16 pci_config_get16 +#endif +/* Before CH_KERNELVER 270: timeout ids are int, no extra cb flags, and VSXPRINTF falls back to vsprintf(), ignoring the buffer size n. From 270 on: timeout_id_t, D_64BIT, and a bounded vsnprintf(). */ +#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t; +#define EXTRA_CB_FLAGS 0 +#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap) +#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t; +#define EXTRA_CB_FLAGS D_64BIT +#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap) +#endif +/* Per-instance driver state; this is the structure sized by ddi_soft_state_init() in _init. */ +typedef struct nfp_dev { + int rd_ok; + int wr_ok; + + int ifvers; + + /* for PCI push read interface */ + unsigned char *read_buf; + ddi_dma_handle_t read_dma_handle; + ddi_dma_cookie_t read_dma_cookie; + + ddi_acc_handle_t acchandle; + + int rd_dma_ok; + + nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; + + struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; + + struct pollhead pollhead; + dev_info_t *dip; + + ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */ + ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */ + kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr; + ddi_softintr_t soft_int_id; + int high_read; + int high_write; + + ddi_iblock_cookie_t iblock_cookie; /* for mutex */ + kmutex_t isr_mutex; + + kmutex_t busy_mutex; + int busy; + + 
ddi_acc_handle_t conf_handle; + + nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; + +extern struct nfp_dev *nfp_dev_list[]; + +#endif /* NFP_H */ diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@ +/* + +nfp_cmd.h: nCipher PCI HSM command driver decalrations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFPCMD_H +#define NFPCMD_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* read and write called with userspace buffer */ + +typedef struct nfpcmd_dev { + const char *name; + unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid; + unsigned int bar_sizes[6]; /* includes IO bit */ + unsigned int flags; + nfp_err (*create)(struct nfp_cdev *pdev); + nfp_err (*destroy)(void * ctx); + nfp_err (*open)(void * ctx); + nfp_err (*close)(void * ctx); + nfp_err (*isr)(void *ctx, int *handled); + nfp_err (*write_block)( const char *ublock, int len, void *ctx ); + nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount); + nfp_err (*channel_update)( char *data, int len, void *ctx); + nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx ); + nfp_err (*debug)( int cmd, void *ctx); +} nfpcmd_dev; + +#define NFP_CMD_FLG_NEED_IOBUF 0x1 + +/* list of all supported drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits 
of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher 
PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@ +/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H + +#include "nfdev-common.h" + +struct nfp_dev; + +/* common device structure */ + +typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; + + int busno; + int slotno; + + void *cmdctx; + + char *iobuf; + + struct nfp_dev* dev; + + struct nfdev_stats_str stats; + +} nfp_cdev; + +/* callbacks from command drivers -------------------------------------- */ + +void nfp_read_complete( struct nfp_dev *pdev, int ok); +void 
nfp_write_complete( struct nfp_dev *pdev, int ok); + +#define NFP_READ_MAX (8 * 1024) +#define NFP_READBUF_SIZE (NFP_READ_MAX + 8) +#define NFP_TIMEOUT_SEC 10 + +#define NFP_DRVNAME "nCipher nFast PCI driver" + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* + * nfp_ifervs.c - common pci interface versioning + * + * uses: + * + * int pdev->ifvers + * device interface version + * + * int nfp_ifvers + * interface version limit + * + * int nfp_alloc_pci_push( nfp_dev *pdev ) + * allocates resources needed for PCI Push, + * if not already allocated, and return True if successful + * + * void nfp_free_pci_push( nfp_dev *pdev ) { + * frees any resources allocated to PCI Push + */ + +void nfp_set_ifvers( nfp_dev *pdev, int vers ) { + if( nfp_ifvers != 0 && vers > nfp_ifvers ) { + nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers); + return; + } + if( vers >= NFDEV_IF_PCI_PUSH ) { + if(!nfp_alloc_pci_push(pdev)) { + nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers); + return; + } + } else { + nfp_free_pci_push(pdev); + } + pdev->ifvers= vers; + nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers); +} diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@ +/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales 
e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_OSIF_H +#define NFP_OSIF_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* general typedefs ----------------------------------------------- */ + +typedef volatile unsigned int reg32; +typedef volatile unsigned short reg16; +typedef volatile unsigned char reg8; + +/* sempaphores, mutexs and events --------------------------------- */ + +#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial); +extern void nfp_sema_destroy( nfp_sema *sema ); +extern void nfp_sema_post( nfp_sema *sema ); +extern void nfp_sema_wait( nfp_sema *sema ); +extern int nfp_sema_wait_sig( nfp_sema *sema ); + +extern nfp_err nfp_mutex_init( nfp_mutex *mutex ); +extern void nfp_mutex_destroy( nfp_mutex *mutex ); +extern void nfp_mutex_enter( nfp_mutex *mutex ); +extern void nfp_mutex_exit( nfp_mutex *mutex ); + +extern nfp_err nfp_event_init( nfp_event *event ); +extern void nfp_event_destroy( nfp_event *event ); +extern void nfp_event_set( nfp_event *event ); +extern void nfp_event_clear( nfp_event *event ); +extern void nfp_event_wait( nfp_event *event ); +extern void nfp_event_wait_sig( nfp_event *event ); + +#endif + +/* timeouts ------------------------------------------------------ */ + +extern void nfp_sleep( int ms ); + +/* memory handling ----------------------------------------------- */ + +#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 + +extern void *nfp_kmalloc( int size, int flags ); +extern void *nfp_krealloc( void *ptr, int size, int flags ); +extern void nfp_kfree( void * ); + +/* config space access ------------------------------------------------ */ + +/* return Little Endian 32 bit config register */ +extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); + +/* io space access ------------------------------------------------ */ + +extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int 
offset ); +extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ); +extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ); +extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); + +/* user and device memory space access ---------------------------- */ + +/* NB these 2 functions are not guarenteed to be re-entrant for a given device */ +extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len); +extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); + +extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len ); +extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); + +extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len ); +extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); + +/* debug ------------------------------------------------------------ */ + +#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 + +#ifdef STRANGE_VARARGS +extern void nfp_log(); +#else +extern void nfp_log( int severity, const char *format, ...); +#endif + +extern int nfp_debug; + +#endif diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +* +* 1998.06.09 IH Started +* +* The interface presented by nFast PCI devices consists of: +* +* A region of shared RAM used for data transfer & control information +* A doorbell interrupt register, so both sides can give 
each other interrupts +* A number of DMA channels for transferring data +*/ + +#ifndef NFPCI_H +#define NFPCI_H + +/* Sizes of some regions */ +#define NFPCI_RAM_MINSIZE 0x00100000 +/* This is the minimum size of shared RAM. In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */ +#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */ +#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ + +/* Offsets within shared memory space. + The following main regions are: + jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ + +#define NFPCI_OFFSET_JOBS 0x00000000 +#define NFPCI_OFFSET_JOBS_WR 0x00000000 +#define NFPCI_OFFSET_JOBS_RD 0x00010000 +#define NFPCI_OFFSET_KERN 0x00020000 +#define NFPCI_OFFSET_KERN_WR 0x00020000 +#define NFPCI_OFFSET_KERN_RD 0x00030000 + +/* Interrupts, defined by bit position in doorbell register */ + +/* Interrupts from device to host */ +#define NFAST_INT_DEVICE_WRITE_OK 0x00000001 +#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002 +#define NFAST_INT_DEVICE_READ_OK 0x00000004 +#define NFAST_INT_DEVICE_READ_FAILED 0x00000008 +#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010 +#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020 +#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040 +#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 + +/* Interrupts from host to device */ +#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000 +#define NFAST_INT_HOST_READ_REQUEST 0x00020000 +#define NFAST_INT_HOST_DEBUG 0x00040000 +#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000 +#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 + +/* Ordinary job submission ------------------------ */ + +/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined + by the following (byte) address offsets... 
*/ + +#define NFPCI_OFFSET_CONTROL 0x0 +#define NFPCI_OFFSET_LENGTH 0x4 +#define NFPCI_OFFSET_DATA 0x8 +#define NFPCI_OFFSET_PUSH_ADDR 0x8 + +#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) + +#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) + +/* Kernel inferface job submission ---------------- */ + +#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) + +#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) + +#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{ + UINT32 controlword; + UINT32 length; /* length of data to follow */ + union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + + +#define NFPCI_JOB_CONTROL 0x00000001 +#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002 +/* + The 'Control' word is analogous to the SCSI read/write address; + 1 = standard push/pull IO + 2 = push/push IO + + To 
submit a block of job data, the host: + - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data + - copies the data to NFPCI_JOBS_WR_DATA + - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register + - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back + + To read a block of jobs back, the host: + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at + NFPCI_JOBS_RD_LENGTH to its actual length. + + Optionally the host can request the PCI read data to be pushed to host PCI mapped ram: + - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max + size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8 + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of + the buffer + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The + module will set NFPCI_OFFSET_LENGTH to the actual length. 
+*/ + +#define NFPCI_SCRATCH_CONTROL 0 + +#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0) +#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1 +#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) + +#define NFPCI_SCRATCH_STATUS 1 + +#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0) +#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1) +#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2) +#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) + +#define NFPCI_SCRATCH_ERROR_LO 2 +#define NFPCI_SCRATCH_ERROR_HI 3 + +#endif diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" + +/* config space access ---------------------------------- */ + +nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) { + unsigned int tmp32; + if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; + +/* pci_config_get32() does byte swapping, so put back to LE */ + tmp32 = pci_config_get32( pdev->dev->conf_handle, offset ); + TO_LE32_IO(res, tmp32); + + return NFP_SUCCESS; +} + +/* user space memory access ---------------------------------- */ + +nfp_err nfp_copy_from_user( char 
*kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} + +nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} + +nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */ + return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */ + return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR); + else + /* LINTED: alignment */ + DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR); + return NFP_SUCCESS; +} + +nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR ); + else + /* LINTED: alignment */ + DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR ); + return NFP_SUCCESS; +} + +/* pci io space access --------------------------------------- */ + +unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} + +unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) 
pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} + +void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} + +void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} + +/* logging ---------------------------------------------------- */ + +void nfp_log( int level, const char *fmt, ...) +{ + auto char buf[256]; + va_list ap; + + switch (level) { + case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/ + case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/ + case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/ + case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/ + default: + va_start(ap, fmt); + (void) vsnprintf(buf, 256, fmt, ap); + va_end(ap); + cmn_err(CE_CONT, "!" 
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} + +struct errstr { + int oserr; + nfp_err nferr; +}; + + +static struct errstr errtab[] = { + { EFAULT, NFP_EFAULT }, + { ENOMEM, NFP_ENOMEM }, + { EINVAL, NFP_EINVAL }, + { EIO, NFP_EIO }, + { ENXIO, NFP_ENXIO }, + { ENODEV, NFP_ENODEV }, + { EINVAL, NFP_EUNKNOWN }, + { 0, 0 } +}; + +nfp_err nfp_error( int oserr ) +{ + struct errstr *perr; + if(!oserr) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + } + return NFP_EUNKNOWN; +} + +int nfp_oserr( nfp_err nferr ) +{ + struct errstr *perr; + if(nferr == NFP_SUCCESS) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + } + return EIO; +} diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..3f34ec3b58 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay Devices + * + * Overlay devices provide a means for creating overlay networks, a means of + * multiplexing multiple logical, isolated, and discrete layer two and layer + * three networks on top of one physical network. + * + * In general, these overlay devices encapsulate the logic to answer two + * different questions: + * + * 1) How should I transform a packet to put it on the wire? + * 2) Where should I send a transformed packet? + * + * Each overlay device is presented to the user as a GLDv3 device. 
While the + * link itself cannot have an IP interface created on top of it, it allows for + * additional GLDv3 devices, such as a VNIC, to be created on top of it which + * can be plumbed up with IP interfaces. + * + * + * -------------------- + * General Architecture + * -------------------- + * + * The logical overlay device that a user sees in dladm(1M) is a combination of + * two different components that work together. The first component is this + * kernel module, which is responsible for answering question one -- how should + * I transform a packet to put it on the wire. + * + * The second component is what we call the virtual ARP daemon, or varpd. It is + * a userland component that is responsible for answering the second question -- + * Where should I send a transformed packet. Instances of the kernel overlay + * GLDv3 device ask varpd the question of where should a packet go. + * + * The split was done for a few reasons. Importantly, we wanted to keep the act + * of generating encapsulated packets in the kernel so as to ensure that the + * general data path was fast and also kept simple. On the flip side, while the + * question of where should something go may be simple, it may often be + * complicated and need to interface with several different external or + * distributed systems. In those cases, it's simpler to allow for the full + * flexibility of userland to be brought to bear to solve that problem and in + * general, the path isn't very common. + * + * The following is what makes up the logical overlay device that a user would + * create with dladm(1M). + * + * Kernel Userland + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . +--------+ +--------+ +--------+ . . . + * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . + * . +--------+ +--------+ +--------+ . . . + * . | | | . . . + * . | | | . . . + * . +------------+-----------+ . . . + * . | . . /dev/overlay . + * . +--------------+ . . . +------------+ . + * . | | . . . | | . 
+ * . | Overlay |======*=================| Virtual | . + * . | GLDv3 Device |========================| ARP Daemon | . + * . | | . . | | . + * . +--------------+ . . +------------+ . + * . | . . | . + * . | . . | . + * . +----------------+ . . +--------+ . + * . | Overlay | . . | varpd | . + * . | Encapsulation | . . | Lookup | . + * . | Plugin | . . | Plugin | . + * . +----------------+ . . +--------+ . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * + * + * This image shows the two different components and where they live. + * Importantly, it also shows that the kernel overlay device and the + * userland varpd both support plugins. The plugins actually implement the + * things that users care about and the APIs have been designed to try to + * minimize the amount of things that a module writer needs to worry about. + * + * IDENTIFIERS + * + * Every overlay device is defined by a unique identifier which is the overlay + * identifier. Its purpose is similar to that of a VLAN identifier, it's a + * unique number that is used to differentiate between different entries on the + * wire. + * + * ENCAPSULATION + * + * An overlay encapsulation plugin is a kernel miscellaneous module whose + * purpose is to contain knowledge about how to transform packets to put them + * onto the wire and to take them off. An example of an encapsulation plugin is + * vxlan. It's also how support for things like nvgre or geneve would be brought + * into the system. + * + * Each encapsulation plugin defines a series of operation vectors and + * properties. For the full details on everything they should provide, please + * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible + * for telling the system what information is required to send a packet. 
For + * example, vxlan is defined to send everything over a UDP packet and therefore + * requires a port and an IP address, while nvgre on the other hand is its own + * IP type and therefore just requires an IP address. In addition, it also + * provides information about the kind of socket that should be created. This is + * used by the kernel multiplexor, more of that in the Kernel Components + * section. + * + * LOOKUPS + * + * The kernel communicates requests for lookups over the character device + * /dev/overlay. varpd is responsible for listening for requests on that device + * and answering them. The character device is specific to the target path and + * varpd. + * + * Much as the kernel overlay module handles the bulk of the scaffolding but + * leaves the important work to the encapsulation plugin, varpd provides a + * similar role and leaves the full brunt of lookups to a userland dynamic + * shared object which implements the logic of lookups. + * + * Each lookup plugin defines a series of operation vectors and properties. For + * the full details on everything that they should provide, please read + * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC + * address and asked to give an address on the physical network that it should + * be sent to. In addition, they handle questions related to how to handle + * things like broadcast and multicast traffic, etc. + * + * ---------- + * Properties + * ---------- + * + * A device from a dladm perspective has a unique set of properties that are + * combined from three different sources: + * + * 1) Generic properties that every overlay device has + * 2) Properties that are specific to the encapsulation plugin + * 3) Properties that are specific to the lookup plugin + * + * All of these are exposed in a single set of properties in dladm. Note that + * these are not necessarily traditional link properties. 
However, if something + * is both a traditional GLDv3 link property, say the MTU of a device, and a + * specific property here, then the driver ensures that all existing GLDv3 + * specific means of manipulating it are used and wraps up its private property + * interfaces to ensure that works. + * + * Properties in the second and third category are prefixed with the name of + * their module. For example, the vxlan encapsulation module has a property + * called the 'listen_ip'. This property would show up in dladm as + * 'vxlan/listen_ip'. This allows different plugins to both use similar names + * for similar properties and to also have independent name spaces so that + * overlapping names do not conflict with anything else. + * + * While the kernel combines both sets one and two into a single coherent view, + * it does not do anything with respect to the properties that are owned by the + * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in + * charge of bridging these two worlds into one magical experience for the user. + * It carries the burden of knowing about both overlay specific and varpd + * specific properties. Importantly, we want to maintain this distinction. We + * don't want to treat the kernel as an arbitrary key/value store for varpd and + * we want the kernel to own its own data and not have to ask userland for + * information that it owns. + * + * Every property in the system has the following attributes: + * + * o A name + * o A type + * o A size + * o Permissions + * o Default value + * o Valid value ranges + * o A value + * + * Everything except for the value is obtained by callers through the propinfo + * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, + * currently 256 bytes. + * + * The following are the supported types of properties: + * + * OVERLAY_PROP_T_INT + * + * A signed integer, its length is 8 bytes, corresponding to an + * int64_t. 
+ * + * OVERLAY_PROP_T_UINT + * + * An unsigned integer, its length is 8 bytes, corresponding to a + * uint64_t. + * + * OVERLAY_PROP_T_IP + * + * A struct in6_addr, it has a fixed size. + * + * OVERLAY_PROP_T_STRING + * + * A null-terminated character string encoded in either ASCII or + * UTF-8. Note that the size of the string includes the null + * terminator. + * + * The next thing that we apply to a property is its permission. The permissions + * are put together by the bitwise or of the following flags and values. + * + * OVERLAY_PROP_PERM_REQ + * + * This indicates a required property. A property that is required + * must be set by a consumer before the device can be created. If a + * required property has a default value, this constraint is + * loosened because the default value defines the value. + * + * OVERLAY_PROP_PERM_READ + * + * This indicates that a property can be read. All properties will + * have this value set. + * + * OVERLAY_PROP_PERM_WRITE + * + * This indicates that a property can be written to and thus + * updated by userland. Properties that are only intended to + * display information will not have OVERLAY_PROP_PERM_WRITE set. + * + * In addition, a few additional values are defined as a convenience to + * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of + * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, + * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, + * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a + * property should generally be a constant across its lifetime. + * + * A property may optionally have a default value. If it does have a default + * value, and that property is not set to be a different value, then the default + * value is inherited automatically. It also means that if the default value is + * acceptable, there is no need to set the value for a required property. 
For + * example, the vxlan module has the vxlan/listen_port property which is + * required, but has a default value of 4789 (the IANA assigned port). Because + * of that default value, there is no need for it to be set. + * + * Finally, a property may declare a list of valid values. These valid values + * are used for display purposes, they are not enforced by the broader system, + * but merely allow a means for the information to be communicated to the user + * through dladm(1M). Like a default value, this is optional. + * + * The general scaffolding does not do very much with respect to the getting and + * setting of properties. That is really owned by the individual plugins + * themselves. + * + * ----------------------------- + * Destinations and Plugin Types + * ----------------------------- + * + * Both encapsulation and lookup plugins define the kinds of destinations that + * they know how to support. There are three different pieces of information + * that can be used to address to a destination currently, all of which is + * summarized in the type overlay_point_t. Any combination of these is + * supported. + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * An Ethernet MAC address is required. + * + * OVERLAY_PLUGIN_D_IP + * + * An IP address is required. All IP addresses used by the overlay + * system are transmitted as IPv6 addresses. IPv4 addresses can be + * represented by using IPv4-mapped IPv6 addresses. + * + * OVERLAY_PLUGIN_D_PORT + * + * A TCP/UDP port is required. + * + * A kernel encapsulation plugin declares which of these that it requires, it's + * a static set. On the other hand, a userland lookup plugin can be built to + * support all of these or any combination thereof. It gets passed the required + * destination type, based on the kernel encapsulation method, and then it makes + * the determination as to whether or not it supports it. 
For example, the + * direct plugin can support either an IP or both an IP and a port, it simply + * doesn't display the direct/dest_port property in the cases where a port is + * not required to support this. + * + * The user lookup plugins have two different modes of operation which + * determines how they interact with the broader system and how look ups are + * performed. These types are: + * + * OVERLAY_TARGET_POINT + * + * A point to point plugin has a single static definition for where + * to send all traffic. Every packet in the system always gets sent + * to the exact same destination which is programmed into the + * kernel when the general device is activated. + * + * OVERLAY_TARGET_DYNAMIC + * + * A dynamic plugin does not have a single static definition. + * Instead, for each destination, the kernel makes an asynchronous + * request to varpd to determine where the packet should be routed, + * and if a specific destination is found, then that destination is + * cached in the overlay device's target cache. + * + * This distinction, while important for the general overlay device's operation, + * is not important to the encapsulation plugins. They don't need to know about + * any of these pieces. It's just a concern for varpd, the userland plugin, and + * the general overlay scaffolding. + * + * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not + * maintain a target cache, and instead just keeps track of the destination and + * always sends encapsulated packets to that address. When the target type is of + * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such + * destinations. These destinations are kept around in an instance of a + * reference hash that is specific to the given overlay device. Entries in the + * cache can be invalidated and replaced by varpd and its lookup plugins. 
+ * + * ---------------------------------- + * Kernel Components and Architecture + * ---------------------------------- + * + * There are multiple pieces inside the kernel that work together, there is the + * general overlay_dev_t structure, which is the logical GLDv3 device, but it + * itself has references to things like an instance of an encapsulation plugin, + * a pointer to a mux and a target cache. It can roughly be summarized in the + * following image: + * + * +------------------+ + * | global | + * | overlay list | + * | overlay_dev_list | + * +------------------+ + * | + * | +-----------------------+ +---------------+ + * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... + * | overlay_dev_t | | overlay_dev_t | + * | | +---------------+ + * | | + * | mac_handle_t -----+---> GLDv3 handle to MAC + * | datalink_id_t -----+---> Datalink ID used by DLS + * | overlay_dev_flag_t ---+---> Device state + * | uint_t -----+---> Curent device MTU + * | uint_t -----+---> In-progress RX operations + * | uint_t -----+---> In-progress TX operations + * | char[] -----+---> FMA degraded message + * | void * -----+---> plugin private data + * | overlay_target_t * ---+---------------------+ + * | overlay_plugin_t * ---+---------+ | + * +-----------------------+ | | + * ^ | | + * +--------------------+ | | | + * | Kernel Socket | | | | + * | Multiplexor | | | | + * | overlay_mux_t | | | | + * | | | | | + * | avl_tree_t -+--+ | | + * | uint_t -+--> socket family | | + * | uint_t -+--> socket type | | + * | uint_t -+--> socket protocol | | + * | ksocket_t -+--> I/O socket | | + * | struct sockaddr * -+--> ksocket address | | + * | overlay_plugin_t --+--------+ | | + * +--------------------+ | | | + * | | | + * +-------------------------+ | | | + * | Encap Plugin |<--+-----------+ | + * | overlay_plugin_t | | + * | | | + * | char * ---+--> plugin name | + * | overlay_plugin_ops_t * -+--> plugin downcalls | + * | char ** (props) ---+--> property list | + * | uint_t ---+--> 
id length | + * | overlay_plugin_flags_t -+--> plugin flags | + * | overlay_plugin_dest_t --+--> destination type v + * +-------------------------+ +-------------------------+ + * | Target Cache | + * | overlay_target_t | + * | | + * cache mode <--+- overlay_target_mode_t | + * dest type <--+- overlay_plugin_dest_t | + * cache flags <--+- overlay_target_flag_t | + * varpd id <--+- uint64_t | + * outstanding varpd reqs. <--+- uint_t | + * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | + * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | + * | +-------------------------+ + * +-----------------------+ + * | + * v + * +-------------------------------+ +------------------------+ + * | Target Entry |-->| Target Entry |--> ... + * | overlay_target_entry_t | | overlay_target_entry_t | + * | | +------------------------+ + * | | + * | overlay_target_entry_flags_t -+--> Entry flags + * | uint8_t[ETHERADDRL] ---+--> Target MAC address + * | overlay_target_point_t ---+--> Target underlay address + * | mblk_t * ---+--> outstanding mblk head + * | mblk_t * ---+--> outstanding mblk tail + * | size_t ---+--> outstanding mblk size + * +-------------------------------+ + * + * The primary entries that we care about are the overlay_dev_t, which + * correspond to each overlay device that is created with dladm(1M). Globally, + * these devices are maintained in a simple list_t which is protected with a + * lock. Hence, these include important information such as the mac_handle_t + * and a datalink_id_t which is used to interact with the broader MAC and DLS + * ecosystem. We also maintain additional information such as the current state, + * outstanding operations, the mtu, and importantly, the plugin's private data. + * This is the instance of an encapsulation plugin that gets created as part of + * creating an overlay device. Another aspect of this is that the overlay_dev_t + * also includes information with respect to FMA. 
For more information, see the + * FMA section. + * + * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin + * is the encapsulation plugin. This allows the device to make downcalls into it + * based on doing things like getting and setting properties. Otherwise, the + * plugin itself is a fairly straightforward entity. They are maintained in a + * list (not pictured above). The plugins themselves mostly maintain things like + * the static list of properties, what kind of destination they require, and the + * operations vector. A given module may contain more if necessary. + * + * The next piece of the puzzle is the mux, or a multiplexor. The mux itself + * maintains a ksocket and it is through the mux that we send and receive + * message blocks. The mux represents a socket type and address, as well as a + * plugin. Multiple overlay_dev_t devices may then share the same mux. For + * example, consider the case where you have different instances of vxlan all on + * the same underlay network. These would all logically share the same IP + * address and port that packets are sent and received on; however, what differs + * is the decapsulation ID. + * + * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike + * a socket, we enable a direct callback on the ksocket. This means that + * whenever a message block chain is received, rather than sitting there and + * getting a callback in a context and kicking that back out to a taskq, the + * data instead comes directly into the callback function overlay_mux_recv(). + * + * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx + * function) to transmit. It receives encapsulated packets, decapsulates them to + * determine the overlay identifier, looks up the given device that matches that + * identifier, and then causes the broader MAC world to receive the packet with + * a call to mac_rx(). 
+ * + * Today, we don't do too much that's special with the ksocket; however, as + * hardware is gaining understanding for these encapsulation protocols, we'll + * probably want to think of better ways to get those capabilities passed down + * and potentially better ways to program receive filters so they get directly + * to us. Though, that's all fantasy future land. + * + * The next part of the puzzle is the target cache. The purpose of the target + * cache is to cache where we should send a packet on the underlay network, + * given its mac address. The target cache operates in two modes depending on + * whether the lookup module was declared to be OVERLAY_TARGET_POINT or + * OVERLAY_TARGET_DYNAMIC. + * + * In the case where the target cache has been programmed to be + * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t + * which has the destination to which we send everything, no matter the + * destination mac address. + * + * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things + * are much more interesting and as a result, more complicated. We primarily + * store lists of overlay_target_entry_t's which are stored in both an avl tree + * and a refhash_t. The primary look up path uses the refhash_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we + * can get a consistent iteration order for things like dladm show-overlay -t. + * The key that we use for the reference hashtable is based on the mac address + * in the cache and currently we just do a simple CRC32 to transform it into a + * hash. + * + * Each entry maintains a set of flags to indicate the current status of the + * request. The flags may indicate one of three states: that the current cache + * entry is valid, that the current cache entry has been directed to drop all + * output, and that the current cache entry is invalid and may be being looked up. 
In + * the case where it's valid, we just take the destination address and run with + * it. + * + * If it's invalid and a lookup has not been made, then we start the process + * that prepares a query that will make its way up to varpd. The cache entry + * maintains a message block chain of outstanding message blocks and a + * size. These lists are populated only when we don't know the answer as to + * where these should be sent. The size entry is used to cap the amount of + * outstanding data that we don't know the answer to. If we exceed a cap on the + * amount of outstanding data (currently 1 Mb), then we'll drop any additional + * packets. Once we get an answer indicating a valid destination, we transmit + * any outstanding data to that place. The full story on how we look that up + * is discussed in the section on the Target Cache Life Cycle. + * + * ------------------------ + * FMA and Degraded Devices + * ------------------------ + * + * Every kernel overlay device keeps track of its FMA state. Today in FMA we + * cannot represent partitions between resources nor can we represent that a + * given minor node of a pseudo device has failed -- if we degrade the overlay + * device, then the entire dev_info_t is degraded. However, we still want to be + * able to indicate to administrators that things may go wrong. + * + * To this end, we've added a notion of a degraded state to every overlay + * device. This state is primarily dictated by userland and it can happen for + * various reasons. Generally, because a userland lookup plugin has been + * partitioned, or something has gone wrong such that there is no longer any + * userland lookup module for a device, then we'll mark it degraded. + * + * As long as any of our minor instances is degraded, then we'll fire off the + * FMA event to note that. Once the last degraded instance is no longer + * degraded, then we'll end up telling FMA that we're all clean. 
+ * + * To help administrators get a better sense of which of the various minor + * devices is wrong, we store the odd_fmamsg[] character array. This character + * array can be fetched with doing a dladm show-overlay -f. + * + * Note, that it's important that we do not update the link status of the + * devices. We want to remain up as much as possible. By changing the link in a + * degraded state, this may end up making things worse. We may still actually + * have information in the target cache and if we mark the link down, that'll + * result in not being able to use it. The reason being that this'll mark all + * the downstream VNICs down which will go to IP and from there we end up + * dealing with sadness. + * + * ----------------------- + * Target Cache Life Cycle + * ----------------------- + * + * This section only applies when we have a lookup plugin of + * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type + * OVERLAY_TARGET_POINT. + * + * While we got into the target cache in the general architecture section, it's + * worth going into more details as to how this actually works and showing some + * examples and state machines. Recall that a target cache entry basically has + * the following state transition diagram: + * + * Initial state + * . . . . . . first access . . . varpd lookup enqueued + * . . . + * . . . + * +-------+ . +----------+ . + * | No |------*---->| Invalid |-------*----+ + * | Entry | | Entry | | + * +-------+ +----------+ | + * varpd ^ ^ varpd | + * invalidate | | drop | + * . . . * * . . v + * +-------+ | | +---------+ + * | Entry |--->-----+ +----<----| Entry | + * | Valid |<----------*---------<----| Pending |->-+ varpd + * +-------+ . +---------+ * . . drop, but + * . varpd ^ | other queued + * . success | | entries + * +-----+ + * + * When the table is first created, it is empty. As we attempt to lookup entries + * and we find there is no entry at all, we'll create a new table entry for it. 
+ * At that point the entry is technically in an invalid state, that means that + * we have no valid data from varpd. In that case, we'll go ahead and queue the + * packet into the entry's pending chain, and queue a varpd lookup, setting the + * OVERLAY_ENTRY_F_PENDING flag in the process. + * + * If additional mblk_t's come in for this entry, we end up appending them to + * the tail of the chain, if and only if, we don't exceed the threshold for the + * amount of space they can take up. An entry remains pending until we get a + * varpd reply. If varpd replies with a valid result, we move to the valid + * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one + * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. + * + * Once an entry is valid, it stays valid until user land tells us to invalidate + * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and + * OVERLAY_TARG_CACHE_SET respectively. + * + * If the lookup fails with a call to drop the packet, then the next state is + * determined by the state of the queue. If the set of outstanding entries is + * empty, then we just transition back to the invalid state. If instead, the + * set of outstanding entries is not empty, then we'll queue another entry and + * stay in the same state, repeating this until the number of requests is + * drained. + * + * The following images describe the flow of a given lookup and where the + * overlay_target_entry_t is at any given time. + * + * +-------------------+ + * | Invalid Entry | An entry starts off as an invalid entry + * | de:ad:be:ef:00:00 | and only exists in the target cache. + * +-------------------+ + * + * ~~~~ + * + * +---------------------+ + * | Global list_t | A mblk_t comes in for an entry. We + * | overlay_target_list | append it to the overlay_target_list. + * +---------------------+ + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry |--->... 
+ * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +--------------------------+ + * | /dev/overlay minor state | User land said that it would look up an + * | overlay_target_hdl_t | entry for us. We remove it from the + * +--------------------------+ global list and add it to the handle's + * | outstanding list. + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry | + * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +-------------------+ + * | Valid Entry | varpd returned an answer with + * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache + * | 10.169.23.42:4789 | entry is now populated with a + * +-------------------+ destination and marked as valid + * + * + * The lookup mechanism is performed via a series of operations on the character + * psuedo-device /dev/overlay. The only thing that uses this device is the + * userland daemon varpd. /dev/overlay is a cloneable device, each open of it + * granting a new minor number which maintains its own state. We maintain this + * state so that way if an outstanding lookup was queued to something that + * crashed or closed its handle without responding, we can know about this and + * thus handle it appropriately. + * + * When a lookup is first created it's added to our global list of outstanding + * lookups. To service requests, userland is required to perform an ioctl to ask + * for a request. We will block it in the kernel a set amount of time waiting + * for a request. When we give a request to a given minor instance of the + * device, we remove it from the global list and append the request to the + * device's list of outstanding entries, for the reasons we discussed above. + * When a lookup comes in, we give user land a smaller amount of information + * specific to that packet, the overlay_targ_lookup_t. 
It includes a request id + * to identify this, and then the overlay id, the varpd id, the header and + * packet size, the source and destination mac address, the SAP, and any + * potential VLAN header. + * + * At that point, it stays in that outstanding list until one of two ioctls are + * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, + * userland may also perform other operations. For example, it may use + * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth + * analysis of what to do beyond what we gave it initially. This is useful for + * providing proxy arp and the like. Finally, there are two other ioctls that + * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the + * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which + * causes us to encapsulate and send out the packet they've given us. + * + * + * Finally, through the target cache, several ioctls are provided to allow for + * interrogation and management of the cache. They allow for individual entries + * to be retrieved, set, or have the entire table flushed. For the full set of + * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. + * + * ------------------ + * Sample Packet Flow + * ------------------ + * + * There's a lot of pieces here, hopefully an example of how this all fits + * together will help clarify and elucidate what's going on. We're going to + * first track an outgoing packet, eg. one that is sent from an IP interface on + * a VNIC on top of an overlay device, and then we'll look at what it means to + * respond to that. + * + * + * +----------------+ +--------------+ +------------------+ + * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | + * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | + * +----------------+ | VNIC device | | overlay_m_tx() | + * +--------------+ +------------------+ + * | + * . lookup . cache | + * . drop . 
miss v + * +---------+ . +--------+ . +------------------+ + * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | + * | mblk_t | | lookup | | in the target | + * +---------+ | queued | | cache | + * ^ +--------+ +------------------+ + * on send | | | cache + * error . . * *. . lookup * . . hit + * | | success v + * | | +------------------+ + * +-----------------+ +--------------->| call plugin | + * | Send out | | ovpo_encap() to | + * | overlay_mux_t's |<----------------------------------| get encap mblk_t | + * | ksocket | +------------------+ + * +-----------------+ + * + * The receive end point looks a little different and looks more like: + * + * +------------------+ +----------------+ +-----------+ + * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ + * | the physical | | IP stack | | to | * . . direct + * | device | +----------------+ | ksocket | | callback + * +------------------+ +-----------+ | + * . overlay id | + * . not found v + * +-----------+ . +-----------------+ +--------------------+ + * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | + * | mblk_t | | ovpo_decap() to | +--------------------+ + * +-----------+ | decap mblk_t | + * +-----------------+ + * | + * * . . overlay id + * v found + * +--------+ +----------------+ + * | adjust |----->| call mac_rx | + * | mblk_t | | on original | + * +--------+ | decaped packet | + * +----------------+ + * + * ------------------ + * Netstack Awareness + * ------------------ + * + * In the above image we note that this enters a netstack. Today the only + * netstack that can be is the global zone as the overlay driver itself is not + * exactly netstack aware. What this really means is that varpd cannot run in a + * non-global zone and an overlay device cannot belong to a non-global zone. 
+ * Non-global zones can still have a VNIC assigned to them that's been created + * over the overlay device the same way they would if it had been created over + * an etherstub or a physical device. + * + * The majority of the work to make it netstack aware is straightforward and the + * biggest thing is to create a netstack module that allows us to hook into + * netstack (and thus zone) creation and destruction. From there, we need to + * amend the target cache lookup routines that we discussed earlier to not have + * a global outstanding list and a global list of handles, but rather, one per + * netstack. + * + * For the mux, we'll need to open the ksocket in the context of the zone, we + * can likely do this with a properly composed credential, but we'll need to do + * some more work on that path. Finally, we'll want to make sure the dld ioctls + * are aware of the zoneid of the caller and we use that appropriately and store + * it in the overlay_dev_t. + * + * ----------- + * GLDv3 Notes + * ----------- + * + * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more + * relevant and other parts are much less relevant for us. For example, the + * GLDv3 is used to toggle the device being put into and out of promiscuous + * mode, to program MAC addresses for unicast and multicast hardware filters. + * Today, an overlay device doesn't have a notion of promiscuous mode nor does + * it have a notion of unicast and multicast addresses programmed into the + * device. Instead, for the purposes of the hardware filter, we don't do + * anything and just always accept new addresses being added and removed. + * + * If the GLDv3 start function has not been called, then we will not use this + * device for I/O purposes. Any calls to transmit or receive should be dropped, + * though the GLDv3 guarantees us that transmit will not be called without + * calling start. Similarly, once stop is called, then no packets can be dealt + * with. 
+ * + * Today we don't support the stat interfaces, though there's no good reason + * that we shouldn't assemble some of the stats based on what we have in the + * future. + * + * When it comes to link properties, many of the traditional link properties do + * not apply and many others MAC handles for us. For example, we don't need to + * implement anything for overlay_m_getprop() to deal with returning the MTU, as + * MAC never calls into us for that. As such, there isn't much of anything to + * support in terms of properties. + * + * Today, we don't support any notion of hardware capabilities. However, if + * future NIC hardware or other changes to the system cause it to make sense for + * us to emulate logical groups, then we should do that. However, we still do + * implement a capab function so that we can identify ourselves as an overlay + * device to the broader MAC framework. This is done mostly so that a device + * created on top of us can have fanout rings as we don't try to lie about a + * speed for our device. + * + * The other question is what should be done for a device's MTU and margin. We + * set our minimum supported MTU to be the minimum value that an IP network may + * be set to 576 -- which mimics what an etherstub does. On the flip side, we + * have our upper bound set to 8900. This value comes from the fact that a lot + * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 + * bytes, which isn't exactly the most accurate number, but it'll be good enough + * for now. Because of that, our default MTU off of these devices is 1400, as + * the default MTU for everything is usually 1500 or whatever the underlying + * device is at; however, this is a bit simpler than asking the netstack what + * are all the IP interfaces at. It also calls into question how PMTU and PMTU + * discovery should work here. 
The challenge, especially for + * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's + * not clear that if you have a single bad entry that the overall MTU should be + * lowered. Instead, we should figure out a better way of determining these + * kinds of PMTU errors and appropriately alerting the administrator via FMA. + * + * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether + * or not the underlying encapsulation device supports VLAN tags. If it does, + * then we'll set the margin to allow for it, otherwise, we will not. + */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; + 
mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t 
*odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { + mutex_exit(&odd->odd_lock); + return (EAGAIN); + } + mutex_exit(&odd->odd_lock); + + ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, + &family, &prot, (struct sockaddr *)&storage, &slen); + if (ret != 0) + return (ret); + + mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, + (struct sockaddr *)&storage, slen, &ret); + if (mux == NULL) + return (ret); + + overlay_mux_add_dev(mux, odd); + odd->odd_mux = mux; + mutex_enter(&odd->odd_lock); + ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); + odd->odd_flags |= OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); + + return (0); +} + +static void +overlay_m_stop(void *arg) +{ + overlay_dev_t *odd = arg; + + /* + * The MAC Perimeter is held here, so we don't have to worry about + * synchronizing this with respect to metadata operations. + */ + mutex_enter(&odd->odd_lock); + VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); + VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + overlay_mux_close(odd->odd_mux); + odd->odd_mux = NULL; + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + odd->odd_flags &= ~OVERLAY_F_MDDROP; + VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); + mutex_exit(&odd->odd_lock); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_promisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement.
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ep->b_cont = mp; + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. 
+ */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < buflen; i++) { + if (name[i] == '\0') + break; + } + + if (i == 0 || i == buflen) + return (B_FALSE); + actlen = i; + if (strchr(name, '/') != NULL) + return (B_FALSE); + if (u8_validate((char 
*)name, actlen, NULL,
+	    U8_VALIDATE_ENTIRE, &err) < 0)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+/*
+ * OVERLAY_IOC_CREATE handler: build a new overlay device from an
+ * overlay_ioc_create_t request.
+ *
+ * The encapsulation plugin named by oic_encap is looked up and initialized,
+ * the requested virtual network id is range-checked against the plugin's id
+ * size, and a mac registration is prepared. With overlay_dev_lock held, the
+ * link id and the (vnetid, plugin) pair are checked against the existing
+ * devices before the device is registered with mac and dls.
+ *
+ * Returns 0 on success, otherwise EINVAL, ENOENT, EEXIST, or the error from
+ * mac_register() / dls_devnet_create(). Each failure path undoes exactly what
+ * has been set up at that point.
+ */
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int err;
+	uint64_t maxid;
+	overlay_dev_t *odd, *o;
+	mac_register_t *mac;
+	overlay_ioc_create_t *oicp = karg;
+
+	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+		return (EINVAL);
+
+	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+	odd->odd_linkid = oicp->oic_linkid;
+	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+	if (odd->odd_plugin == NULL) {
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (ENOENT);
+	}
+	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+	    &odd->odd_pvoid);
+	if (err != 0) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+
+	/*
+	 * Make sure that our virtual network id is valid for the given plugin
+	 * that we're working with.
+	 */
+	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+	maxid = UINT64_MAX;
+	if (odd->odd_plugin->ovp_id_size != 8)
+		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+	if (oicp->oic_vnetid > maxid) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+	odd->odd_vid = oicp->oic_vnetid;
+
+	/*
+	 * overlay_dev_lock is not taken until the duplicate scan below, so
+	 * this failure path must not release it.
+	 */
+	mac = mac_alloc(MAC_VERSION);
+	if (mac == NULL) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+
+	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	mac->m_driver = odd;
+	mac->m_dip = overlay_dip;
+	mac->m_dst_addr = NULL;
+	mac->m_callbacks = &overlay_m_callbacks;
+	mac->m_pdata = NULL;
+	mac->m_pdata_size = 0;
+
+	mac->m_priv_props = NULL;
+
+	/* Let mac handle this itself. */
+	mac->m_instance = (uint_t)-1;
+
+	/*
+	 * There is no real source address that should be used here, but saying
+	 * that we're not ethernet is going to cause its own problems. At the
+	 * end of the day, this is fine.
+	 */
+	mac->m_src_addr = overlay_macaddr;
+
+	/*
+	 * Start with the default MTU as the max SDU. If the MTU is changed, the
+	 * SDU will be changed to reflect that.
+	 */
+	mac->m_min_sdu = 1;
+	mac->m_max_sdu = OVERLAY_MTU_DEF;
+	mac->m_multicast_sdu = 0;
+
+	/*
+	 * The underlying device doesn't matter, instead this comes from the
+	 * encapsulation protocol and whether or not they allow VLAN tags.
+	 */
+	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+		mac->m_margin = VLAN_TAGSZ;
+	} else {
+		mac->m_margin = 0;
+	}
+
+	/*
+	 * Today, we have no MAC virtualization, it may make sense in the future
+	 * to go ahead and emulate some subset of this, but it doesn't today.
+	 */
+	mac->m_v12n = MAC_VIRT_NONE;
+
+	/*
+	 * Reject a link id, or a (vnetid, plugin) pair, that is already in
+	 * use. The mac registration allocated above has not been handed to
+	 * mac_register() yet, so it has to be freed on these paths as well.
+	 */
+	mutex_enter(&overlay_dev_lock);
+	for (o = list_head(&overlay_dev_list); o != NULL;
+	    o = list_next(&overlay_dev_list, o)) {
+		if (o->odd_linkid == oicp->oic_linkid) {
+			mutex_exit(&overlay_dev_lock);
+			mac_free(mac);
+			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+			overlay_plugin_rele(odd->odd_plugin);
+			kmem_free(odd, sizeof (overlay_dev_t));
+			return (EEXIST);
+		}
+
+		if (o->odd_vid == oicp->oic_vnetid &&
+		    o->odd_plugin == odd->odd_plugin) {
+			mutex_exit(&overlay_dev_lock);
+			mac_free(mac);
+			odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+			overlay_plugin_rele(odd->odd_plugin);
+			kmem_free(odd, sizeof (overlay_dev_t));
+			return (EEXIST);
+		}
+	}
+
+	err = mac_register(mac, &odd->odd_mh);
+	mac_free(mac);
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (err);
+	}
+
+	err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+	    crgetzoneid(cred));
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		(void)
mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 
+ pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ENXIO); + } + + ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); + odd->odd_flags |= OVERLAY_F_ACTIVATED; + + /* + * Now that we've activated ourselves, we should indicate to the world + * that we're up. Note that we may not be able to perform lookups at + * this time, but our notion of being 'up' isn't dependent on that + * ability. + */ + mac_link_update(odd->odd_mh, LINK_STATE_UP); + mutex_exit(&odd->odd_lock); + + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + overlay_ioc_delete_t *oidp = karg; + overlay_dev_t *odd; + datalink_id_t tid; + int ret; + + odd = overlay_hold_by_dlid(oidp->oid_linkid); + if (odd == NULL) { + return (ENOENT); + } + + mutex_enter(&odd->odd_lock); + /* If we're not the only hold, we're busy */ + if (odd->odd_ref != 1) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + if (odd->odd_flags & OVERLAY_F_IN_MUX) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + /* + * To remove this, we need to first remove it from dls and then remove + * it from mac. 
The act of removing it from mac will check if there are + * devices on top of this, eg. vnics. If there are, then that will fail + * and we'll have to go through and recreate the dls entry. Only after + * mac_unregister has succeeded, then we'll go through and actually free + * everything and drop the dev lock. + */ + ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); + if (ret != 0) { + overlay_hold_rele(odd); + return (ret); + } + + ASSERT(oidp->oid_linkid == tid); + ret = mac_disable(odd->odd_mh); + if (ret != 0) { + (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + overlay_hold_rele(odd); + return (ret); + } + + overlay_target_quiesce(odd->odd_target); + + mutex_enter(&overlay_dev_lock); + list_remove(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + cv_destroy(&odd->odd_iowait); + mutex_destroy(&odd->odd_lock); + overlay_target_free(odd); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_nprops_t *on = karg; + + odd = overlay_hold_by_dlid(on->oipn_linkid); + if (odd == NULL) + return (ENOENT); + on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) +{ + overlay_prop_handle_t phdl = arg; + overlay_prop_set_range_str(phdl, opp->ovp_name); + return (0); +} + +static int +overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) +{ + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], name) == 0) { + *id = i; + return (0); + } + } + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { + *id = i + OVERLAY_DEV_NPROPS; + return 
(0); + } + } + + return (ENOENT); +} + +static void +overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) +{ + uint32_t def; + mac_propval_range_t range; + uint_t perm; + + ASSERT(MAC_PERIM_HELD(odd->odd_mh)); + + bzero(&range, sizeof (mac_propval_range_t)); + range.mpr_count = 1; + if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. 
+ */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, 
intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid, mtu;
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	oip->oip_size = OVERLAY_PROP_SIZEMAX;
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	if (oip->oip_id == -1) {
+		int i;
+
+		/* An id of -1 means the property is named by oip_name. */
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the generic device properties, so hand the name
+		 * to the plugin. This test has to follow the loop: inside the
+		 * loop body i is always less than OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, &oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/* Valid plugin indices are 0 .. ovp_nprops - 1. */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    &oip->oip_size);
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/*
+		 * The MTU is always set and retrieved through MAC, to allow for
+		 * MAC to do whatever it wants, as really that property belongs
+		 * to MAC. This is important for things where vnics have hold on
+		 * the MTU.
+		 */
+		mac_sdu_get(odd->odd_mh, NULL, &mtu);
+		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+		oip->oip_size = sizeof (uint_t);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		/*
+		 * While it's read-only while inside of a mux, we're not in a
+		 * context that can guarantee that.
Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint64_t); + break; + case OVERLAY_DEV_P_ENCAP: + oip->oip_size = strlcpy((char *)oip->oip_value, + odd->odd_plugin->ovp_name, oip->oip_size); + break; + case OVERLAY_DEV_P_VARPDID: + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + const uint64_t val = odd->odd_target->ott_id; + bcopy(&val, oip->oip_value, sizeof (uint64_t)); + oip->oip_size = sizeof (uint64_t); + } else { + oip->oip_size = 0; + } + mutex_exit(&odd->odd_lock); + break; + default: + ret = ENOENT; + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); +} + +static void +overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. 
+	 */
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	mutex_enter(&odd->odd_lock);
+	odd->odd_vid = vnetid;
+	mutex_exit(&odd->odd_lock);
+	overlay_mux_add_dev(odd->odd_mux, odd);
+
+	/*
+	 * The device was just put back into the mux, so it stays marked
+	 * OVERLAY_F_IN_MUX; what has to be undone is the drop flag set above.
+	 */
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * OVERLAY_IOC_SETPROP handler. Properties may only be changed before the
+ * device has been activated. A property is named either by oip_id or, when
+ * oip_id is -1, by oip_name; ids at or above OVERLAY_DEV_NPROPS and names that
+ * are not generic device properties are passed to the encapsulation plugin.
+ *
+ * Returns 0 or an errno. Every return taken after the device hold has been
+ * acquired drops both the MAC perimeter and the hold.
+ */
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid = UINT_MAX;
+	mac_perim_handle_t mph;
+	uint64_t maxid, vid;
+
+	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+		return (EINVAL);
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		/* Drop the hold taken by overlay_hold_by_dlid() above. */
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_exit(&odd->odd_lock);
+	if (oip->oip_id == -1) {
+		int i;
+
+		/* An id of -1 means the property is named by oip_name. */
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the generic device properties, so hand the name
+		 * to the plugin. This test has to follow the loop: inside the
+		 * loop body i is always less than OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/* Valid plugin indices are 0 .. ovp_nprops - 1. */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			mac_perim_exit(mph);
+			overlay_hold_rele(odd);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    oip->oip_size);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+		    oip->oip_value, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		if (oip->oip_size != sizeof (uint64_t)) {
+			ret = EINVAL;
+			break;
+		}
+		/*
+		 * Copy the id out of the value buffer instead of reading it
+		 * through a uint64_t pointer; overlay_i_getprop() fills the
+		 * same buffer with bcopy().
+		 */
+		bcopy(oip->oip_value, &vid, sizeof (vid));
+		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+		maxid = UINT64_MAX;
+		if (odd->odd_plugin->ovp_id_size != 8)
+			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+			    1ULL;
+		/*
+		 * maxid is itself a valid id; this is the same bound that
+		 * overlay_i_create() applies to oic_vnetid.
+		 */
+		if (vid > maxid) {
+			ret = EINVAL;
+			break;
+		}
+		overlay_setprop_vnetid(odd, vid);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+	case OVERLAY_DEV_P_VARPDID:
+		ret = EPERM;
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_dev_t *odd;
+	overlay_ioc_status_t *os = karg;
+
+	odd = overlay_hold_by_dlid(os->ois_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+		os->ois_status = OVERLAY_I_DEGRADED;
+		if (odd->odd_fmamsg != NULL) {
+			(void) strlcpy(os->ois_message, odd->odd_fmamsg,
+			    OVERLAY_STATUS_BUFLEN);
+		} else {
+			os->ois_message[0] = '\0';
+		}
+
+	} else {
+		os->ois_status = OVERLAY_I_OK;
+		os->ois_message[0] = '\0';
+	}
+	mutex_exit(&odd->odd_lock);
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+static dld_ioc_info_t overlay_ioc_list[] = {
+	{ OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
+	    overlay_i_create, secpolicy_dl_config },
+	{ OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
+	    overlay_i_activate, secpolicy_dl_config },
+	{ OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
+	    overlay_i_delete, secpolicy_dl_config },
+	{
OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, 
OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct 
modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. +# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsulation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..0701d08e8b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device FMA operations.
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..9f70e8c83e --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,354 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ksynch.h> +#include <sys/ksocket.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> + +#include <sys/overlay_impl.h> + +#include <sys/sdt.h> + +#define OVERLAY_FREEMSG(mp, reason) \ + DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason) + +static list_t overlay_mux_list; +static kmutex_t overlay_mux_lock; + +void +overlay_mux_init(void) +{ + list_create(&overlay_mux_list, sizeof (overlay_mux_t), + offsetof(overlay_mux_t, omux_lnode)); + mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_mux_fini(void) +{ + mutex_destroy(&overlay_mux_lock); + list_destroy(&overlay_mux_list); +} + +static int +overlay_mux_comparator(const void *a, const void *b) +{ + const overlay_dev_t *odl, *odr; + odl = a; + odr = b; + if (odl->odd_vid > odr->odd_vid) + return (1); + else if (odl->odd_vid < odr->odd_vid) + return (-1); + else + return (0); +} + +/* + * This is the central receive data path. We need to decode the packet, if we + * can, and then deliver it to the appropriate overlay. + */ +/* ARGSUSED */ +static boolean_t +overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, + void *arg) +{ + mblk_t *mp, *nmp, *fmp; + overlay_mux_t *mux = arg; + + /* + * We may have a received a chain of messages. Each messsage in the + * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. + * If we aren't getting that, we should probably drop that for the + * moment. 
+ */ + for (mp = mpchain; mp != NULL; mp = nmp) { + struct T_unitdata_ind *tudi; + ovep_encap_info_t infop; + overlay_dev_t od, *odd; + int ret; + + nmp = mp->b_next; + mp->b_next = NULL; + + if (DB_TYPE(mp) != M_PROTO) { + OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); + freemsg(mp); + continue; + } + + if (mp->b_cont == NULL) { + OVERLAY_FREEMSG(mp, "missing a b_cont"); + freemsg(mp); + continue; + } + + tudi = (struct T_unitdata_ind *)mp->b_rptr; + if (tudi->PRIM_type != T_UNITDATA_IND) { + OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); + freemsg(mp); + continue; + } + + /* + * In the future, we'll care about the source information + * for purposes of telling varpd for oob invalidation. But for + * now, just drop that block. + */ + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + freemsg(fmp); + + /* + * Decap and deliver. + */ + bzero(&infop, sizeof (ovep_encap_info_t)); + ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); + if (ret != 0) { + OVERLAY_FREEMSG(mp, "decap failed"); + freemsg(mp); + continue; + } + if (MBLKL(mp) > infop.ovdi_hdr_size) { + mp->b_rptr += infop.ovdi_hdr_size; + } else { + while (infop.ovdi_hdr_size != 0) { + size_t rem, blkl; + + if (mp == NULL) + break; + + blkl = MBLKL(mp); + rem = MIN(infop.ovdi_hdr_size, blkl); + infop.ovdi_hdr_size -= rem; + mp->b_rptr += rem; + if (rem == blkl) { + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + OVERLAY_FREEMSG(mp, + "freed a fmp block"); + freemsg(fmp); + } + } + if (mp == NULL) { + OVERLAY_FREEMSG(mp, "freed it all..."); + continue; + } + } + + + od.odd_vid = infop.ovdi_id; + mutex_enter(&mux->omux_lock); + odd = avl_find(&mux->omux_devices, &od, NULL); + if (odd == NULL) { + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "no matching vid"); + freemsg(mp); + continue; + } + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + 
OVERLAY_FREEMSG(mp, "dev dropped"); + freemsg(mp); + continue; + } + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + } + + return (B_TRUE); +} + +/* + * Register a given device with a socket backend. If no such device socket + * exists, create a new one. + */ +overlay_mux_t * +overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, + struct sockaddr *addr, socklen_t len, int *errp) +{ + int err; + overlay_mux_t *mux; + ksocket_t ksock; + + if (errp == NULL) + errp = &err; + + mutex_enter(&overlay_mux_lock); + for (mux = list_head(&overlay_mux_list); mux != NULL; + mux = list_next(&overlay_mux_list, mux)) { + if (domain == mux->omux_domain && + family == mux->omux_family && + protocol == mux->omux_protocol && + len == mux->omux_alen && + bcmp(addr, mux->omux_addr, len) == 0) { + + if (opp != mux->omux_plugin) { + *errp = EEXIST; + return (NULL); + } + + mutex_enter(&mux->omux_lock); + mux->omux_count++; + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + *errp = 0; + return (mux); + } + } + + /* + * Today we aren't zone-aware and only exist in the global zone. When we + * allow for things to exist in the non-global zone, we'll want to use a + * credential that's actually specific to the zone. + */ + *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, + kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + return (NULL); + } + + *errp = ksocket_bind(ksock, addr, len, kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + /* + * Ask our lower layer to optionally toggle anything they need on this + * socket. Because a socket is owned by a single type of plugin, we can + * then ask it to perform any additional socket set up it'd like to do. 
+ */ + if (opp->ovp_ops->ovpo_sockopt != NULL && + (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); + list_link_init(&mux->omux_lnode); + mux->omux_ksock = ksock; + mux->omux_plugin = opp; + mux->omux_domain = domain; + mux->omux_family = family; + mux->omux_protocol = protocol; + mux->omux_addr = kmem_alloc(len, KM_SLEEP); + bcopy(addr, mux->omux_addr, len); + mux->omux_alen = len; + mux->omux_count = 1; + avl_create(&mux->omux_devices, overlay_mux_comparator, + sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); + mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); + + + /* Once this is called, we need to expect to rx data */ + *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); + if (*errp != 0) { + ksocket_close(ksock, kcred); + mutex_destroy(&mux->omux_lock); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, len); + kmem_free(mux, sizeof (overlay_mux_t)); + return (NULL); + } + + list_insert_tail(&overlay_mux_list, mux); + mutex_exit(&overlay_mux_lock); + + *errp = 0; + return (mux); +} + +void +overlay_mux_close(overlay_mux_t *mux) +{ + mutex_enter(&overlay_mux_lock); + mutex_enter(&mux->omux_lock); + mux->omux_count--; + if (mux->omux_count != 0) { + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + return; + } + list_remove(&overlay_mux_list, mux); + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + + ksocket_close(mux->omux_ksock, kcred); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, mux->omux_alen); + kmem_free(mux, sizeof (overlay_mux_t)); +} + +void +overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_add(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +void +overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + 
avl_remove(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +int +overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) +{ + int ret; + + /* + * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, + * that isn't actually supported by UDP at this time. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); + if (ret != 0) + freemsg(mp); + + return (ret); +} diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c new file mode 100644 index 0000000000..348ddb92a2 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_plugin.c @@ -0,0 +1,281 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. 
+ */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if 
(ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + opp->ovp_active = 0; + opp->ovp_name = ovrp->ovep_name; + opp->ovp_ops = ovrp->ovep_ops; + opp->ovp_props = ovrp->ovep_props; + opp->ovp_id_size = ovrp->ovep_id_size; + opp->ovp_flags = ovrp->ovep_flags; + opp->ovp_dest = ovrp->ovep_dest; + + opp->ovp_nprops = 0; + if (ovrp->ovep_props != NULL) { + while (ovrp->ovep_props[opp->ovp_nprops] != NULL) { + if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >= + OVERLAY_PROP_NAMELEN) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EINVAL); + } + opp->ovp_nprops++; + } + } + + mutex_enter(&overlay_plugin_lock); + for (ipp = list_head(&overlay_plugin_list); ipp != NULL; + ipp = list_next(&overlay_plugin_list, ipp)) { + if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EEXIST); + } + } + list_insert_tail(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + return (0); +} + +int +overlay_plugin_unregister(const char *name) +{ + overlay_plugin_t *opp; + + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(opp->ovp_name, name) == 0) + break; + } + + if (opp == NULL) { + mutex_exit(&overlay_plugin_lock); + return (ENOENT); + } + + mutex_enter(&opp->ovp_mutex); + if (opp->ovp_active > 0) { + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (EBUSY); + } + mutex_exit(&opp->ovp_mutex); + + list_remove(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + 
kmem_cache_free(overlay_plugin_cache, opp); + return (0); +} + +overlay_plugin_t * +overlay_plugin_lookup(const char *name) +{ + overlay_plugin_t *opp; + boolean_t trymodload = B_FALSE; + + for (;;) { + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(name, opp->ovp_name) == 0) { + mutex_enter(&opp->ovp_mutex); + opp->ovp_active++; + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (opp); + } + } + mutex_exit(&overlay_plugin_lock); + + if (trymodload == B_TRUE) + return (NULL); + + /* + * If we didn't find it, it may still exist, but just not have + * been a loaded module. In that case, we'll do one attempt to + * load it. + */ + if (modload(OVERLAY_MODDIR, (char *)name) == -1) + return (NULL); + trymodload = B_TRUE; + } + +} + +void +overlay_plugin_rele(overlay_plugin_t *opp) +{ + mutex_enter(&opp->ovp_mutex); + ASSERT(opp->ovp_active > 0); + opp->ovp_active--; + mutex_exit(&opp->ovp_mutex); +} + +void +overlay_plugin_walk(overlay_plugin_walk_f func, void *arg) +{ + overlay_plugin_t *opp; + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (func(opp, arg) != 0) { + mutex_exit(&overlay_plugin_lock); + return; + } + } + mutex_exit(&overlay_plugin_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c new file mode 100644 index 0000000000..ba1ea2a629 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_prop.c @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t 
*)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..f4147b56d1 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1651 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +/* + * Overlay device target cache management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> +#include <sys/vlan.h> +#include <sys/crc32.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/overlay_impl.h> +#include <sys/sdt.h> + +/* + * This is total straw man, but at least it's a prime number. Here we're + * going to have to go through and do a lot of evaluation and understanding as + * to how these target caches should grow and shrink, as well as, memory + * pressure and evictions. This just gives us a starting point that'll be 'good + * enough', until it's not. + */ +#define OVERLAY_HSIZE 823 + +/* + * We use this data structure to keep track of what requests have been actively + * allocated to a given instance so we know what to put back on the pending + * list. + */ +typedef struct overlay_target_hdl { + minor_t oth_minor; /* RO */ + zoneid_t oth_zoneid; /* RO */ + int oth_oflags; /* RO */ + list_node_t oth_link; /* overlay_target_lock */ + kmutex_t oth_lock; + list_t oth_outstanding; /* oth_lock */ +} overlay_target_hdl_t; + +typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); +typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); +typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); + +typedef struct overaly_target_ioctl { + int oti_cmd; /* ioctl id */ + boolean_t oti_write; /* ioctl requires FWRITE */ + boolean_t oti_ncopyout; /* copyout data? 
*/ + overlay_target_copyin_f oti_copyin; /* copyin func */ + overlay_target_ioctl_f oti_func; /* function to call */ + overlay_target_copyout_f oti_copyout; /* copyin func */ + size_t oti_size; /* size of user level structure */ +} overlay_target_ioctl_t; + +static kmem_cache_t *overlay_target_cache; +static kmem_cache_t *overlay_entry_cache; +static id_space_t *overlay_thdl_idspace; +static void *overlay_thdl_state; + +/* + * When we support overlay devices in the NGZ, then all of these need to become + * zone aware, by plugging into the netstack engine and becoming per-netstack + * data. + */ +static list_t overlay_thdl_list; +static kmutex_t overlay_target_lock; +static kcondvar_t overlay_target_condvar; +static list_t overlay_target_list; +static boolean_t overlay_target_excl; + +/* + * Outstanding data per hash table entry. + */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + 
+/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + 
ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we should just remove it from the tree, but then + * delete the entries when we remove them from the hash table + * (which happens through the refhash dtor). + */ + while ((ote = avl_first(ap)) != NULL) + avl_remove(ap, ote); + + avl_destroy(ap); + for (ote = refhash_first(rp); ote != NULL; + ote = refhash_next(rp, ote)) { + refhash_remove(rp, ote); + } + refhash_destroy(rp); + } + + ASSERT(odd->odd_target->ott_ocount == 0); + kmem_cache_free(overlay_target_cache, odd->odd_target); +} + +int +overlay_target_busy() +{ + int ret; + + mutex_enter(&overlay_target_lock); + ret = !list_is_empty(&overlay_thdl_list); + mutex_exit(&overlay_target_lock); + + return (ret); +} + +static void +overlay_target_queue(overlay_target_entry_t *entry) +{ + mutex_enter(&overlay_target_lock); + mutex_enter(&entry->ote_ott->ott_lock); + if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&entry->ote_ott->ott_lock); + mutex_exit(&overlay_target_lock); + return; + } + entry->ote_ott->ott_ocount++; + mutex_exit(&entry->ote_ott->ott_lock); + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&overlay_target_lock); +} + +void +overlay_target_quiesce(overlay_target_t *ott) +{ + if (ott == NULL) + return; + mutex_enter(&ott->ott_lock); + ott->ott_flags |= OVERLAY_T_TEARDOWN; + while (ott->ott_ocount != 0) + cv_wait(&ott->ott_cond, &ott->ott_lock); + mutex_exit(&ott->ott_lock); +} + +/* + * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * OVERLAY_PLUGIN_D_PORT. 
As we don't have an implementation of anything else at + * this time, say for NVGRE, we drop all packets that mcuh this. + */ +int +overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, + socklen_t *slenp) +{ + int ret; + struct sockaddr_in6 *v6; + overlay_target_t *ott; + mac_header_info_t mhi; + overlay_target_entry_t *entry; + + ASSERT(odd->odd_target != NULL); + + /* + * At this point, the overlay device is in a mux which means that it's + * been activated. At this point, parts of the target, such as the mode + * and the destination are now read-only and we don't have to worry + * about synchronization for them. + */ + ott = odd->odd_target; + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (OVERLAY_TARGET_DROP); + + v6 = (struct sockaddr_in6 *)sock; + bzero(v6, sizeof (struct sockaddr_in6)); + v6->sin6_family = AF_INET6; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + mutex_enter(&ott->ott_lock); + bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(ott->ott_u.ott_point.otp_port); + mutex_exit(&ott->ott_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (OVERLAY_TARGET_OK); + } + + ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); + + /* + * Note we only want the MAC address here, therefore we won't bother + * using mac_vlan_header_info(). If any caller needs the vlan info at + * this point, this should change to a call to mac_vlan_header_info(). 
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ 
+static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) { + if (ota->ota_point.otp_port == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + } + + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); + ott->ott_flags = 0; + ott->ott_ocount = 0; + ott->ott_mode = ota->ota_mode; + ott->ott_dest = ota->ota_provides; + ott->ott_id = ota->ota_id; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + bcopy(&ota->ota_point, &ott->ott_u.ott_point, + sizeof 
(overlay_target_point_t)); + } else { + ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + overlay_mac_hash, overlay_mac_cmp, + overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); + } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (EEXIST); + } + + odd->odd_flags |= OVERLAY_F_VARPD; + odd->odd_target = ott; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + + + return (0); +} + + +/* ARGSUSED */ +static int +overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_degrade_t *otd = arg; + + odd = overlay_hold_by_dlid(otd->otd_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_degrade(odd, otd->otd_buf); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_restore(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_restore(odd); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_VARPD; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + return (0); + +} + +static int +overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_lookup_t *otl = arg; + overlay_target_entry_t *entry; + clock_t 
ret, timeout; + mac_header_info_t mhi; + + timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC); +again: + mutex_enter(&overlay_target_lock); + while (list_is_empty(&overlay_target_list)) { + ret = cv_timedwait(&overlay_target_condvar, + &overlay_target_lock, timeout); + if (ret == -1) { + mutex_exit(&overlay_target_lock); + return (ETIME); + } + } + entry = list_remove_head(&overlay_target_list); + mutex_exit(&overlay_target_lock); + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + ASSERT(entry->ote_chead == NULL); + mutex_exit(&entry->ote_lock); + goto again; + } + ASSERT(entry->ote_chead != NULL); + + /* + * If we have a bogon that doesn't have a valid mac header, drop it and + * try again. + */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t 
*otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. + */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != 
NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + 
overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + mutex_enter(&entry->ote_lock); + mutex_exit(&thdl->oth_lock); + mp = entry->ote_chead; + /* Protect against a rogue varpd */ + if (mp == NULL) { + mutex_exit(&entry->ote_lock); + return (EINVAL); + } + mlen = MIN(msgsize(mp), pkt->otp_size); + pkt->otp_size = mlen; + boff = 0; + while (mlen > 0) { + size_t wlen = MIN(MBLKL(mp), mlen); + if (ddi_copyout(mp->b_rptr, + (void *)((uintptr_t)pkt->otp_buf + boff), + wlen, 0) != 0) { + mutex_exit(&entry->ote_lock); + return (EFAULT); + } + mlen -= wlen; + boff += wlen; + mp = mp->b_cont; + } + mutex_exit(&entry->ote_lock); + return (0); +} + +static int +overlay_target_inject(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mutex_enter(&odd->odd_lock); + 
overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + return (0); +} + +static int +overlay_target_resend(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mp = overlay_m_tx(odd, mp); + freemsgchain(mp); + + return (0); +} + +typedef struct overlay_targ_list_int { + boolean_t otli_count; + uint32_t otli_cur; + uint32_t otli_nents; + uint32_t otli_ents[]; +} overlay_targ_list_int_t; + +static int +overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_list_t n; + overlay_targ_list_int_t *otl; + + if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + /* + */ + if (n.otl_nents >= INT32_MAX / sizeof (uint32_t)) + return (EINVAL); + *bsize = sizeof (overlay_targ_list_int_t) + + sizeof (uint32_t) * n.otl_nents; + otl = kmem_zalloc(*bsize, KM_SLEEP); + otl->otli_cur = 0; + otl->otli_nents = n.otl_nents; + if (otl->otli_nents != 0) { + otl->otli_count 
= B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; 
+ bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } 
else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if 
(!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if (base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + 
mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we now it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order and the first node will be the mac 00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff. 
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, 
sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return 
(EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void 
*)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..8b4e4ecb42 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,372 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine on + * what address we should be listening on. While we do not have a default + * address to listen upon, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Should we enable UDP source port hashing for fanout. 
+ */ +boolean_t vxlan_fanout = B_TRUE; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * providers can use. + */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + /* + * This allocation could get hot. 
We may want to have a good way to + * cache and handle this allocation the same way that IP does with + * keeping around a message block per entry, or basically treating this + * as an immutable message block in the system. Basically freemsg() will + * be a nop, but we'll do the right thing with respect to the rest of + * the chain. + */ + ob = allocb(VXLAN_HDR_LEN, 0); + if (ob == NULL) + return (ENOMEM); + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = htonl(VXLAN_F_VDI); /* host to network order */ + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + ob->b_wptr += VXLAN_HDR_LEN; + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = 
arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, + vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + 
"VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp == NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c index 39d5003b02..c48fecd133 100644 --- a/usr/src/uts/common/io/physmem.c +++ b/usr/src/uts/common/io/physmem.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp) int ret; static int msg_printed = 0; + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EACCES); + if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) { return (EINVAL); } diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. +# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c index f83b0abf39..0ae06f88cc 100644 --- a/usr/src/uts/common/io/pseudonex.c +++ b/usr/src/uts/common/io/pseudonex.c @@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t); static int pseudonex_open(dev_t *, int, int, cred_t *); static int pseudonex_close(dev_t, int, int, cred_t *); static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int, + ddi_iblock_cookie_t *); static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); @@ -90,6 +92,8 @@ static void *pseudonex_state; typedef struct pseudonex_state { dev_info_t *pnx_devi; + int pnx_fmcap; + ddi_iblock_cookie_t pnx_fm_ibc; } pseudonex_state_t; static struct bus_ops pseudonex_bus_ops = { @@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = { NULL, /* bus_intr_ctl */ NULL, /* bus_config */ NULL, /* bus_unconfig */ - NULL, /* bus_fm_init */ + pseudonex_fm_init, /* bus_fm_init */ NULL, /* bus_fm_fini */ NULL, /* bus_fm_access_enter */ NULL, /* bus_fm_access_exit */ @@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pnx_state = ddi_get_soft_state(pseudonex_state, instance); pnx_state->pnx_devi = devi; + pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE; + ddi_fm_init(devi, &pnx_state->pnx_fmcap, 
&pnx_state->pnx_fm_ibc); + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, DDI_NT_NEXUS, 0) != DDI_SUCCESS) { ddi_remove_minor_node(devi, NULL); @@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) if (cmd == DDI_SUSPEND) return (DDI_SUCCESS); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ddi_fm_fini(devi); ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(pseudonex_state, instance); return (DDI_SUCCESS); @@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child) } static int +pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, + ddi_iblock_cookie_t *ibc) +{ + pseudonex_state_t *pnx_state; + + pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip)); + ASSERT(pnx_state != NULL); + ASSERT(ibc != NULL); + *ibc = pnx_state->pnx_fm_ibc; + return (pnx_state->pnx_fmcap & cap); +} + +static int pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, void *arg, void *result) { diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index 400e9ffd10..07ffddc123 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
*/ @@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c index 72c8800f3e..dc5e8eafc9 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright 2014 OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2014, Tegile Systems Inc. All rights reserved. 
*/ @@ -72,6 +72,7 @@ #include <sys/file.h> #include <sys/policy.h> #include <sys/model.h> +#include <sys/refhash.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dr.h> @@ -99,7 +100,6 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index ae1e7e0fc3..dc5dc22e37 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -3503,9 +3503,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4450,18 +4454,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. 
+ * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (chk_vidlen > 0 && *p == ' ') { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (chk_pidlen > 0 && *p == ' ') { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. 
+ */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c index 32f8f85f7a..4ab4f36d4e 100644 --- a/usr/src/uts/common/io/signalfd.c +++ b/usr/src/uts/common/io/signalfd.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* @@ -19,97 +19,73 @@ * * As described on the signalfd(3C) man page, the general idea behind these * file descriptors is that they can be used to synchronously consume signals - * via the read(2) syscall. That capability already exists with the - * sigwaitinfo(3C) function but the key advantage of signalfd is that, because - * it is file descriptor based, poll(2) can be used to determine when signals - * are available to be consumed. + * via the read(2) syscall. While that capability already exists with the + * sigwaitinfo(3C) function, signalfd holds an advantage since it is file + * descriptor based: It is able to use the event facilities (poll(2), /dev/poll, + * event ports) to notify interested parties when consumable signals arrive. * - * The general implementation uses signalfd_state to hold both the signal set - * and poll head for an open file descriptor. 
Because a process can be using - * different sigfds with different signal sets, each signalfd_state poll head - * can be thought of as an independent signal stream and the thread(s) waiting - * on that stream will get poll notification when any signal in the - * corresponding set is received. + * The signalfd lifecycle begins when a process opens /dev/signalfd. A minor + * will be allocated for it along with an associated signalfd_state_t struct. + * It is there where the mask of desired signals resides. * - * The sigfd_proc_state_t struct lives on the proc_t and maintains per-proc - * state for function callbacks and data when the proc needs to do work during - * signal delivery for pollwakeup. + * Reading from the signalfd is straightforward and mimics the kernel behavior + * for sigtimedwait(). Signals continue to live on either the proc's p_sig, or + * thread's t_sig, member. During a read operation, those which match the mask + * are consumed so they are no longer pending. * - * The read side of the implementation is straightforward and mimics the - * kernel behavior for sigtimedwait(). Signals continue to live on either - * the proc's p_sig, or thread's t_sig, member. Read consumes the signal so - * that it is no longer pending. + * The poll side is more complex. Every time a signal is delivered, all of the + * signalfds on the process need to be examined in order to pollwake threads + * waiting for signal arrival. * - * The poll side is more complex since all of the sigfds on the process need - * to be examined every time a signal is delivered to the process in order to - * pollwake any thread waiting in poll for that signal. + * When a thread polling on a signalfd requires a pollhead, several steps must + * be taken to safely ensure the proper result. A sigfd_proc_state_t is + * created for the calling process if it does not yet exist. 
It is there where + * a list of sigfd_poll_waiter_t structures reside which associate pollheads to + * signalfd_state_t entries. The sigfd_proc_state_t list is walked to find a + * sigfd_poll_waiter_t matching the signalfd_state_t which corresponds to the + * polled resource. If one is found, it is reused. Otherwise a new one is + * created, incrementing the refcount on the signalfd_state_t, and it is added + * to the sigfd_poll_waiter_t list. * - * Because it is likely that a process will only be using one, or a few, sigfds, - * but many total file descriptors, we maintain a list of sigfds which need - * pollwakeup. The list lives on the proc's p_sigfd struct. In this way only - * zero, or a few, of the state structs will need to be examined every time a - * signal is delivered to the process, instead of having to examine all of the - * file descriptors to find the state structs. When a state struct with a - * matching signal set is found then pollwakeup is called. + * The complications imposed by fork(2) are why the pollhead is stored in the + * associated sigfd_poll_waiter_t instead of directly in the signalfd_state_t. + * More than one process can hold a reference to the signalfd at a time but + * arriving signals should wake only process-local pollers. Additionally, + * signalfd_close is called only when the last referencing fd is closed, hiding + * occurrences of preceding threads which released their references. This + * necessitates reference counting on the signalfd_state_t so it is able to + * persist after close until all poll references have been cleansed. Doing so + * ensures that blocked pollers which hold references to the signalfd_state_t + * will be able to do clean-up after the descriptor itself has been closed. * - * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list - * will clear out on its own. There is an exit helper (signalfd_exit_helper) - * which cleans up any remaining per-proc state when the process exits. 
+ * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb + * is called via the pointer in sigfd_proc_state_t. It will walk over the + * sigfd_poll_waiter_t entries present in the list, searching for any + * associated with a signalfd_state_t with a matching signal mask. The + * approach of keeping the poller list in p_sigfd was chosen because a process + * is likely to use few signalfds relative to its total file descriptors. It + * reduces the work required for each received signal. * - * The main complexity with signalfd is the interaction of forking and polling. - * This interaction is complex because now two processes have a fd that - * references the same dev_t (and its associated signalfd_state), but signals - * go to only one of those processes. Also, we don't know when one of the - * processes closes its fd because our 'close' entry point is only called when - * the last fd is closed (which could be by either process). + * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list + * will clear out on its own. Any per-process state which remains + * will be cleaned up by the exit helper (signalfd_exit_helper). * - * Because the state struct is referenced by both file descriptors, and the - * state struct represents a signal stream needing a pollwakeup, if both - * processes were polling then both processes would get a pollwakeup when a - * signal arrives for either process (that is, the pollhead is associated with - * our dev_t so when a signal arrives the pollwakeup wakes up all waiters). + * The structures associated with signalfd state are designed to operate + * correctly across fork, but there is one caveat that applies. Using + * fork-shared signalfd descriptors in conjunction with fork-shared caching poll + * descriptors (such as /dev/poll or event ports) will result in missed poll + * wake-ups. 
This is caused by the pollhead identity of signalfd descriptors + * being dependent on the process they are polled from. Because it has a + * thread-local cache, poll(2) is unaffected by this limitation. * - * Fortunately this is not a common problem in practice, but the implementation - * attempts to mitigate unexpected behavior. The typical behavior is that the - * parent has been polling the signalfd (which is why it was open in the first - * place) and the parent might have a pending signalfd_state (with the - * pollhead) on its per-process sigfd_list. After the fork the child will - * simply close that fd (among others) as part of the typical fork/close/exec - * pattern. Because the child will never poll that fd, it will never get any - * state onto its own sigfd_list (the child starts with a null list). The - * intention is that the child sees no pollwakeup activity for signals unless - * it explicitly reinvokes poll on the sigfd. + * Lock ordering: * - * As background, there are two primary polling cases to consider when the - * parent process forks: - * 1) If any thread is blocked in poll(2) then both the parent and child will - * return from the poll syscall with EINTR. This means that if either - * process wants to re-poll on a sigfd then it needs to re-run poll and - * would come back in to the signalfd_poll entry point. The parent would - * already have the dev_t's state on its sigfd_list and the child would not - * have anything there unless it called poll again on its fd. - * 2) If the process is using /dev/poll(7D) then the polling info is being - * cached by the poll device and the process might not currently be blocked - * on anything polling related. A subsequent DP_POLL ioctl will not invoke - * our signalfd_poll entry point again. Because the parent still has its - * sigfd_list setup, an incoming signal will hit our signalfd_pollwake_cb - * entry point, which in turn calls pollwake, and /dev/poll will do the - * right thing on DP_POLL. 
The child will not have a sigfd_list yet so the - * signal will not cause a pollwakeup. The dp code does its own handling for - * cleaning up its cache. + * 1. signalfd_lock + * 2. signalfd_state_t`sfd_lock * - * This leaves only one odd corner case. If the parent and child both use - * the dup-ed sigfd to poll then when a signal is delivered to either process - * there is no way to determine which one should get the pollwakeup (since - * both processes will be queued on the same signal stream poll head). What - * happens in this case is that both processes will return from poll, but only - * one of them will actually have a signal to read. The other will return - * from read with EAGAIN, or block. This case is actually similar to the - * situation within a single process which got two different sigfd's with the - * same mask (or poll on two fd's that are dup-ed). Both would return from poll - * when a signal arrives but only one read would consume the signal and the - * other read would fail or block. Applications which poll on shared fd's - * cannot assume that a subsequent read will actually obtain data. + * 1. proc_t`p_lock (to walk p_sigfd) + * 2. signalfd_state_t`sfd_lock + * 2a. 
signalfd_lock (after sfd_lock is dropped, when sfd_count falls to 0) */ #include <sys/ddi.h> @@ -123,118 +99,150 @@ #include <sys/schedctl.h> #include <sys/id_space.h> #include <sys/sdt.h> +#include <sys/brand.h> typedef struct signalfd_state signalfd_state_t; struct signalfd_state { - kmutex_t sfd_lock; /* lock protecting state */ - pollhead_t sfd_pollhd; /* poll head */ - k_sigset_t sfd_set; /* signals for this fd */ - signalfd_state_t *sfd_next; /* next state on global list */ + list_node_t sfd_list; /* node in global list */ + kmutex_t sfd_lock; /* protects fields below */ + uint_t sfd_count; /* ref count */ + boolean_t sfd_valid; /* valid while open */ + k_sigset_t sfd_set; /* signals for this fd */ }; +typedef struct sigfd_poll_waiter { + list_node_t spw_list; + signalfd_state_t *spw_state; + pollhead_t spw_pollhd; +} sigfd_poll_waiter_t; + /* - * Internal global variables. + * Protects global state in signalfd_devi, signalfd_minor, signalfd_softstate, + * and signalfd_state (including sfd_list field of members) */ -static kmutex_t signalfd_lock; /* lock protecting state */ +static kmutex_t signalfd_lock; static dev_info_t *signalfd_devi; /* device info */ static id_space_t *signalfd_minor; /* minor number arena */ static void *signalfd_softstate; /* softstate pointer */ -static signalfd_state_t *signalfd_state; /* global list of state */ +static list_t signalfd_state; /* global list of state */ + -/* - * If we don't already have an entry in the proc's list for this state, add one. 
- */ static void -signalfd_wake_list_add(signalfd_state_t *state) +signalfd_state_enter(signalfd_state_t *state) { - proc_t *p = curproc; - list_t *lst; - sigfd_wake_list_t *wlp; - - ASSERT(MUTEX_HELD(&p->p_lock)); - ASSERT(p->p_sigfd != NULL); + ASSERT(MUTEX_HELD(&state->sfd_lock)); + ASSERT(state->sfd_count > 0); + VERIFY(state->sfd_valid == B_TRUE); - lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; - for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) { - if (wlp->sigfd_wl_state == state) - break; - } - - if (wlp == NULL) { - wlp = kmem_zalloc(sizeof (sigfd_wake_list_t), KM_SLEEP); - wlp->sigfd_wl_state = state; - list_insert_head(lst, wlp); - } + state->sfd_count++; } static void -signalfd_wake_rm(list_t *lst, sigfd_wake_list_t *wlp) +signalfd_state_release(signalfd_state_t *state, boolean_t locked) { - list_remove(lst, wlp); - kmem_free(wlp, sizeof (sigfd_wake_list_t)); + ASSERT(MUTEX_HELD(&state->sfd_lock)); + ASSERT(state->sfd_count > 0); + + if (state->sfd_count == 1) { + VERIFY(state->sfd_valid == B_FALSE); + mutex_exit(&state->sfd_lock); + if (locked) { + ASSERT(MUTEX_HELD(&signalfd_lock)); + list_remove(&signalfd_state, state); + } else { + ASSERT(MUTEX_NOT_HELD(&signalfd_lock)); + mutex_enter(&signalfd_lock); + list_remove(&signalfd_state, state); + mutex_exit(&signalfd_lock); + } + kmem_free(state, sizeof (*state)); + return; + } + state->sfd_count--; + mutex_exit(&state->sfd_lock); } -static void -signalfd_wake_list_rm(proc_t *p, signalfd_state_t *state) +static sigfd_poll_waiter_t * +signalfd_wake_list_add(sigfd_proc_state_t *pstate, signalfd_state_t *state) { - sigfd_wake_list_t *wlp; - list_t *lst; + list_t *lst = &pstate->sigfd_list; + sigfd_poll_waiter_t *pw; - ASSERT(MUTEX_HELD(&p->p_lock)); + for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) { + if (pw->spw_state == state) + break; + } - if (p->p_sigfd == NULL) - return; + if (pw == NULL) { + pw = kmem_zalloc(sizeof (*pw), KM_SLEEP); + + 
mutex_enter(&state->sfd_lock); + signalfd_state_enter(state); + pw->spw_state = state; + mutex_exit(&state->sfd_lock); + list_insert_head(lst, pw); + } + return (pw); +} + +static sigfd_poll_waiter_t * +signalfd_wake_list_rm(sigfd_proc_state_t *pstate, signalfd_state_t *state) +{ + list_t *lst = &pstate->sigfd_list; + sigfd_poll_waiter_t *pw; - lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; - for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) { - if (wlp->sigfd_wl_state == state) { - signalfd_wake_rm(lst, wlp); + for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) { + if (pw->spw_state == state) { break; } } - if (list_is_empty(lst)) { - ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL; - list_destroy(lst); - kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t)); - p->p_sigfd = NULL; + if (pw != NULL) { + list_remove(lst, pw); + mutex_enter(&state->sfd_lock); + signalfd_state_release(state, B_FALSE); + pw->spw_state = NULL; } + + return (pw); } static void signalfd_wake_list_cleanup(proc_t *p) { - sigfd_wake_list_t *wlp; + sigfd_proc_state_t *pstate = p->p_sigfd; + sigfd_poll_waiter_t *pw; list_t *lst; ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(pstate != NULL); + + lst = &pstate->sigfd_list; + while ((pw = list_remove_head(lst)) != NULL) { + signalfd_state_t *state = pw->spw_state; - ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL; + pw->spw_state = NULL; + mutex_enter(&state->sfd_lock); + signalfd_state_release(state, B_FALSE); - lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; - while (!list_is_empty(lst)) { - wlp = (sigfd_wake_list_t *)list_remove_head(lst); - kmem_free(wlp, sizeof (sigfd_wake_list_t)); + pollwakeup(&pw->spw_pollhd, POLLERR); + pollhead_clean(&pw->spw_pollhd); + kmem_free(pw, sizeof (*pw)); } + list_destroy(lst); + + p->p_sigfd = NULL; + kmem_free(pstate, sizeof (*pstate)); } static void signalfd_exit_helper(void) { proc_t *p = curproc; - list_t *lst; - - /* This being non-null is the 
only way we can get here */ - ASSERT(p->p_sigfd != NULL); mutex_enter(&p->p_lock); - lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; - signalfd_wake_list_cleanup(p); - list_destroy(lst); - kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t)); - p->p_sigfd = NULL; mutex_exit(&p->p_lock); } @@ -254,35 +262,40 @@ static void signalfd_pollwake_cb(void *arg0, int sig) { proc_t *p = (proc_t *)arg0; + sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd; list_t *lst; - sigfd_wake_list_t *wlp; + sigfd_poll_waiter_t *pw; ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(pstate != NULL); - if (p->p_sigfd == NULL) - return; - - lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; - wlp = list_head(lst); - while (wlp != NULL) { - signalfd_state_t *state = wlp->sigfd_wl_state; + lst = &pstate->sigfd_list; + pw = list_head(lst); + while (pw != NULL) { + signalfd_state_t *state = pw->spw_state; + sigfd_poll_waiter_t *next; + short pollev; mutex_enter(&state->sfd_lock); - - if (sigismember(&state->sfd_set, sig) && - state->sfd_pollhd.ph_list != NULL) { - sigfd_wake_list_t *tmp = wlp; - - /* remove it from the list */ - wlp = list_next(lst, wlp); - signalfd_wake_rm(lst, tmp); - - mutex_exit(&state->sfd_lock); - pollwakeup(&state->sfd_pollhd, POLLRDNORM | POLLIN); + if (!state->sfd_valid) { + pollev = POLLERR; + } else if (sigismember(&state->sfd_set, sig)) { + pollev = POLLRDNORM | POLLIN; } else { mutex_exit(&state->sfd_lock); - wlp = list_next(lst, wlp); + pw = list_next(lst, pw); + continue; } + + signalfd_state_release(state, B_FALSE); + pw->spw_state = NULL; + pollwakeup(&pw->spw_pollhd, pollev); + pollhead_clean(&pw->spw_pollhd); + + next = list_next(lst, pw); + list_remove(lst, pw); + kmem_free(pw, sizeof (*pw)); + pw = next; } } @@ -290,7 +303,7 @@ _NOTE(ARGSUSED(1)) static int signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) { - signalfd_state_t *state; + signalfd_state_t *state, **sstate; major_t major = getemajor(*devp); minor_t minor = 
getminor(*devp); @@ -300,18 +313,20 @@ signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) mutex_enter(&signalfd_lock); minor = (minor_t)id_allocff(signalfd_minor); - if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) { id_free(signalfd_minor, minor); mutex_exit(&signalfd_lock); return (ENODEV); } - state = ddi_get_soft_state(signalfd_softstate, minor); - *devp = makedevice(major, minor); + state = kmem_zalloc(sizeof (*state), KM_SLEEP); + state->sfd_valid = B_TRUE; + state->sfd_count = 1; + list_insert_head(&signalfd_state, (void *)state); - state->sfd_next = signalfd_state; - signalfd_state = state; + sstate = ddi_get_soft_state(signalfd_softstate, minor); + *sstate = state; + *devp = makedevice(major, minor); mutex_exit(&signalfd_lock); @@ -405,6 +420,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) lwp->lwp_extsig = 0; mutex_exit(&p->p_lock); + if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate) + BROP(p)->b_sigfd_translate(infop); + /* Convert k_siginfo into external, datamodel independent, struct. 
*/ bzero(ssp, sizeof (*ssp)); ssp->ssi_signo = infop->si_signo; @@ -439,7 +457,7 @@ _NOTE(ARGSUSED(2)) static int signalfd_read(dev_t dev, uio_t *uio, cred_t *cr) { - signalfd_state_t *state; + signalfd_state_t *state, **sstate; minor_t minor = getminor(dev); boolean_t block = B_TRUE; k_sigset_t set; @@ -449,7 +467,8 @@ signalfd_read(dev_t dev, uio_t *uio, cred_t *cr) if (uio->uio_resid < sizeof (signalfd_siginfo_t)) return (EINVAL); - state = ddi_get_soft_state(signalfd_softstate, minor); + sstate = ddi_get_soft_state(signalfd_softstate, minor); + state = *sstate; if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) block = B_FALSE; @@ -462,15 +481,26 @@ signalfd_read(dev_t dev, uio_t *uio, cred_t *cr) return (set_errno(EINVAL)); do { - res = consume_signal(state->sfd_set, uio, block); - if (res == 0) - got_one = B_TRUE; + res = consume_signal(set, uio, block); - /* - * After consuming one signal we won't block trying to consume - * further signals. - */ - block = B_FALSE; + if (res == 0) { + /* + * After consuming one signal, do not block while + * trying to consume more. + */ + got_one = B_TRUE; + block = B_FALSE; + + /* + * Refresh the matching signal set in case it was + * updated during the wait. 
+ */ + mutex_enter(&state->sfd_lock); + set = state->sfd_set; + mutex_exit(&state->sfd_lock); + if (sigisempty(&set)) + break; + } } while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t)); if (got_one) @@ -499,13 +529,14 @@ static int signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { - signalfd_state_t *state; + signalfd_state_t *state, **sstate; minor_t minor = getminor(dev); kthread_t *t = curthread; proc_t *p = ttoproc(t); short revents = 0; - state = ddi_get_soft_state(signalfd_softstate, minor); + sstate = ddi_get_soft_state(signalfd_softstate, minor); + state = *sstate; mutex_enter(&state->sfd_lock); @@ -515,39 +546,36 @@ signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp, mutex_exit(&state->sfd_lock); if (!(*reventsp = revents & events) && !anyyet) { - *phpp = &state->sfd_pollhd; + sigfd_proc_state_t *pstate; + sigfd_poll_waiter_t *pw; /* * Enable pollwakeup handling. */ - if (p->p_sigfd == NULL) { - sigfd_proc_state_t *pstate; + mutex_enter(&p->p_lock); + if ((pstate = (sigfd_proc_state_t *)p->p_sigfd) == NULL) { - pstate = kmem_zalloc(sizeof (sigfd_proc_state_t), - KM_SLEEP); + mutex_exit(&p->p_lock); + pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP); list_create(&pstate->sigfd_list, - sizeof (sigfd_wake_list_t), - offsetof(sigfd_wake_list_t, sigfd_wl_lst)); + sizeof (sigfd_poll_waiter_t), + offsetof(sigfd_poll_waiter_t, spw_list)); + pstate->sigfd_pollwake_cb = signalfd_pollwake_cb; + /* Check again, after blocking for the alloc. 
*/ mutex_enter(&p->p_lock); - /* check again now that we're locked */ if (p->p_sigfd == NULL) { p->p_sigfd = pstate; } else { /* someone beat us to it */ list_destroy(&pstate->sigfd_list); - kmem_free(pstate, sizeof (sigfd_proc_state_t)); + kmem_free(pstate, sizeof (*pstate)); + pstate = p->p_sigfd; } - mutex_exit(&p->p_lock); } - mutex_enter(&p->p_lock); - if (((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb == - NULL) { - ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = - signalfd_pollwake_cb; - } - signalfd_wake_list_add(state); + pw = signalfd_wake_list_add(pstate, state); + *phpp = &pw->spw_pollhd; mutex_exit(&p->p_lock); } @@ -558,11 +586,12 @@ _NOTE(ARGSUSED(4)) static int signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { - signalfd_state_t *state; + signalfd_state_t *state, **sstate; minor_t minor = getminor(dev); sigset_t mask; - state = ddi_get_soft_state(signalfd_softstate, minor); + sstate = ddi_get_soft_state(signalfd_softstate, minor); + state = *sstate; switch (cmd) { case SIGNALFDIOC_MASK: @@ -587,33 +616,42 @@ _NOTE(ARGSUSED(1)) static int signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) { - signalfd_state_t *state, **sp; + signalfd_state_t *state, **sstate; + sigfd_poll_waiter_t *pw = NULL; minor_t minor = getminor(dev); proc_t *p = curproc; - state = ddi_get_soft_state(signalfd_softstate, minor); - - if (state->sfd_pollhd.ph_list != NULL) { - pollwakeup(&state->sfd_pollhd, POLLERR); - pollhead_clean(&state->sfd_pollhd); - } + sstate = ddi_get_soft_state(signalfd_softstate, minor); + state = *sstate; - /* Make sure our state is removed from our proc's pollwake list. */ + /* Make sure state is removed from this proc's pollwake list. 
*/ mutex_enter(&p->p_lock); - signalfd_wake_list_rm(p, state); - mutex_exit(&p->p_lock); + if (p->p_sigfd != NULL) { + sigfd_proc_state_t *pstate = p->p_sigfd; - mutex_enter(&signalfd_lock); + pw = signalfd_wake_list_rm(pstate, state); + if (list_is_empty(&pstate->sigfd_list)) { + signalfd_wake_list_cleanup(p); + } + } + mutex_exit(&p->p_lock); - /* Remove our state from our global list. */ - for (sp = &signalfd_state; *sp != state; sp = &((*sp)->sfd_next)) - VERIFY(*sp != NULL); + if (pw != NULL) { + pollwakeup(&pw->spw_pollhd, POLLERR); + pollhead_clean(&pw->spw_pollhd); + kmem_free(pw, sizeof (*pw)); + } - *sp = (*sp)->sfd_next; + mutex_enter(&signalfd_lock); + *sstate = NULL; ddi_soft_state_free(signalfd_softstate, minor); id_free(signalfd_minor, minor); + mutex_enter(&state->sfd_lock); + state->sfd_valid = B_FALSE; + signalfd_state_release(state, B_TRUE); + mutex_exit(&signalfd_lock); return (0); @@ -635,7 +673,7 @@ signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) } if (ddi_soft_state_init(&signalfd_softstate, - sizeof (signalfd_state_t), 0) != 0) { + sizeof (signalfd_state_t *), 0) != 0) { cmn_err(CE_WARN, "signalfd failed to create soft state"); id_space_destroy(signalfd_minor); mutex_exit(&signalfd_lock); @@ -656,6 +694,9 @@ signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) sigfd_exit_helper = signalfd_exit_helper; + list_create(&signalfd_state, sizeof (signalfd_state_t), + offsetof(signalfd_state_t, sfd_list)); + mutex_exit(&signalfd_lock); return (DDI_SUCCESS); @@ -673,10 +714,19 @@ signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - /* list should be empty */ - VERIFY(signalfd_state == NULL); - mutex_enter(&signalfd_lock); + + if (!list_is_empty(&signalfd_state)) { + /* + * There are dangling poll waiters holding signalfd_state_t + * entries on the global list. Detach is not possible until + * they purge themselves. 
+ */ + mutex_exit(&signalfd_lock); + return (DDI_FAILURE); + } + list_destroy(&signalfd_state); + id_space_destroy(signalfd_minor); ddi_remove_minor_node(signalfd_devi, NULL); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index e9af19ca18..994ca8baa8 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1451,6 +1451,16 @@ copyb(mblk_t *bp) ndp = nbp->b_datap; /* + * Copy the various checksum information that came in + * originally. + */ + ndp->db_cksumstart = dp->db_cksumstart; + ndp->db_cksumend = dp->db_cksumend; + ndp->db_cksumstuff = dp->db_cksumstuff; + bcopy(dp->db_struioun.data, ndp->db_struioun.data, + sizeof (dp->db_struioun.data)); + + /* * Well, here is a potential issue. If we are trying to * trace a flow, and we copy the message, we might lose * information about where this message might have been. diff --git a/usr/src/uts/common/io/udmf/dm9601reg.h b/usr/src/uts/common/io/udmf/dm9601reg.h new file mode 100644 index 0000000000..a36f2b0fc8 --- /dev/null +++ b/usr/src/uts/common/io/udmf/dm9601reg.h @@ -0,0 +1,348 @@ +/* + * %W% %E% + * Macro definitions for Davicom DM9601 USB to fast ethernet controller + * based on Davicom DM9601E data sheet + * This file is public domain. 
Coded by M.Murayama (KHF04453@nifty.com) + */ + +#ifndef __DM9601_H__ +#define __DM9601_H__ + +/* + * offset of registers + */ +#define NCR 0x00U /* network control register */ +#define NSR 0x01U /* network status register */ +#define TCR 0x02U /* tx control register */ +#define TSR1 0x03U /* tx status register 1 */ +#define TSR2 0x04U /* tx status register 2 */ +#define RCR 0x05U /* rx control register */ +#define RSR 0x06U /* rx status register */ +#define ROCR 0x07U /* rx overflow counter register */ +#define BPTR 0x08U /* back pressure threshold register */ +#define FCTR 0x09U /* flow control threshold register */ +#define FCR 0x0aU /* rx/tx flow control register */ +#define EPCR 0x0bU /* eeprom & phy control register */ +#define EPAR 0x0cU /* eeprom & phy address register */ +#define EPDR 0x0dU /* eeprom & phy data register (2byte) */ +#define WCR 0x0fU /* wake up control register */ +#define PAR 0x10U /* physical address register (6byte) */ +#define MAR 0x16U /* multicast address register (8byte) */ +#define GPCR 0x1eU /* general purpose control register */ +#define GPR 0x1fU /* general purpose register */ +#define VID 0x28U /* vendor ID (2byte) */ +#define PID 0x2aU /* product ID (2byte) */ +#define CHIPR 0x2cU /* chip revision */ +#define USBDA 0xf0U /* usb device address register */ +#define RXC 0xf1U /* received packet counter register */ +#define TUSC 0xf2U /* tx packet counter/usb status register */ +#define USBC 0xf4U /* usb control register */ + +/* + * register definitions + */ +/* network control register */ +#define NCR_EXT_PHY 0x80U /* 1: select external phy */ +#define NCR_WAKEEN 0x40U /* 1: wake up event enable */ +#define NCR_FCOL 0x10U /* force collision mode for test */ +#define NCR_FDX 0x08U /* 1: full duplex mode (for external phy) */ +#define NCR_LBK 0x06U +#define NCR_LBK_SHIFT 1 +#define NCR_LBK_NORMAL (0U << NCR_LBK_SHIFT) +#define NCR_LBK_MAC (1U << NCR_LBK_SHIFT) +#define NCR_LBK_PHY_D (2U << NCR_LBK_SHIFT) +#define NCR_LBK_PHY_A (3U 
<< NCR_LBK_SHIFT) +#define NCR_RST 0x01U /* 1: reset, auto clear */ + +#define NCR_BITS \ + "\020" \ + "\010EXT_PHY" \ + "\007WAKEEN" \ + "\005FCOL" \ + "\004FDX" \ + "\001RST" + +/* network status register */ +#define NSR_SPEED 0x80U /* 1:10M 0:100M */ +#define NSR_LINKST 0x40U /* 1:ok 0:fail */ +#define NSR_WAKEST 0x20U /* 1:enabled */ +#define NSR_TXFULL 0x10U /* 1:tx fifo full */ +#define NSR_TX2END 0x08U /* tx packet2 complete status */ +#define NSR_TX1END 0x04U /* tx packet1 complete status */ +#define NSR_RXOV 0x02U /* rx fifo overflow */ +#define NSR_RXRDY 0x01U /* rx packet ready */ + +#define NSR_BITS \ + "\020" \ + "\010SPEED_10" \ + "\007LINKST_UP" \ + "\006WAKEST" \ + "\005TXFULL" \ + "\004TX2END" \ + "\003TX1END" \ + "\002RXOV" \ + "\001RXRDY" + +/* tx control register */ +#define TCR_TJDIS 0x40U /* tx jitter control */ +#define TCR_EXCEDM 0x20U /* excessive collision mode */ +#define TCR_PAD_DIS2 0x10U /* PAD appends disable for pkt2 */ +#define TCR_CRC_DIS2 0x08U /* CRC appends disable for pkt2 */ +#define TCR_PAD_DIS1 0x04U /* PAD appends disable for pkt1 */ +#define TCR_CRC_DIS1 0x02U /* CRC appends disable for pkt1 */ + +#define TCR_BITS \ + "\020" \ + "\007TJDIS" \ + "\006EXCEDM" \ + "\005PAD_DIS2" \ + "\004CRC_DIS2" \ + "\003PAD_DIS1" \ + "\002CRC_DIS1" + +/* tx status register (ro) */ +#define TSR_TJTO 0x80U /* tx jabber time out */ +#define TSR_LC 0x40U /* loss of carrier */ +#define TSR_NC 0x20U /* no carrier */ +#define TSR_LATEC 0x10U /* late collision */ +#define TSR_COL 0x08U /* late collision */ +#define TSR_EL 0x04U /* excessive collision */ + +#define TSR_BITS \ + "\020" \ + "\010TJTO" \ + "\007LC" \ + "\006NC" \ + "\005LATEC" \ + "\004COL" \ + "\003EL" + +/* rx control register */ +#define RCR_WTDIS 0x40U /* watch dog timer disable */ +#define RCR_DIS_LONG 0x20U /* discard longer packets than 1522 */ +#define RCR_DIS_CRC 0x10U /* discard crc error packets */ +#define RCR_ALL 0x08U /* pass all multicast */ +#define RCR_RUNT 0x04U /* 
pass runt packets */ +#define RCR_PRMSC 0x02U /* promiscuous mode */ +#define RCR_RXEN 0x01U /* rx enable */ + +#define RCR_BITS \ + "\020" \ + "\007WTDIS" \ + "\006DIS_LONG" \ + "\005DIS_CRC" \ + "\004ALL" \ + "\003RUNT" \ + "\002PRMSC" \ + "\001RXEN" + +/* rx status register */ +#define RSR_RF 0x80U /* runt frame */ +#define RSR_MF 0x40U /* multicast frame */ +#define RSR_LCS 0x20U /* late collision seen */ +#define RSR_RWTO 0x10U /* receive watchdog timeout */ +#define RSR_PLE 0x08U /* physical layer error */ +#define RSR_AE 0x04U /* alignment error */ +#define RSR_CE 0x02U /* crc error */ +#define RSR_FOE 0x01U /* fifo overflow error */ + +#define RSR_BITS \ + "\020" \ + "\010RF" \ + "\007MF" \ + "\006LCS" \ + "\005RWTO" \ + "\004PLE" \ + "\003AE" \ + "\002CE" \ + "\001FOE" + +/* receive overflow counter register */ +#define ROCR_RXFU 0x80U /* receive overflow counter overflow */ +#define ROCR_ROC 0x7fU /* receive overflow counter */ + +#define ROCR_BITS \ + "\020" \ + "\010RXFU" + +/* back pressure threshold register */ +#define BPTR_BPHW 0xf0U /* high water overflow threshold */ +#define BPTR_BPHW_SHIFT 4 +#define BPTR_BPHW_UNIT 1024U +#define BPTR_BPHW_DEFAULT (3 << BPTR_BPHW_SHIFT) /* 3k */ +#define BPTR_JPT 0x0fU /* jam pattern time */ +#define BPTR_JPT_SHIFT 0 +#define BPTR_JPT_5us (0U << BPTR_JPT_SHIFT) +#define BPTR_JPT_10us (1U << BPTR_JPT_SHIFT) +#define BPTR_JPT_15us (2U << BPTR_JPT_SHIFT) +#define BPTR_JPT_25us (3U << BPTR_JPT_SHIFT) +#define BPTR_JPT_50us (4U << BPTR_JPT_SHIFT) +#define BPTR_JPT_100us (5U << BPTR_JPT_SHIFT) +#define BPTR_JPT_150us (6U << BPTR_JPT_SHIFT) +#define BPTR_JPT_200us (7U << BPTR_JPT_SHIFT) +#define BPTR_JPT_250us (8U << BPTR_JPT_SHIFT) +#define BPTR_JPT_300us (9U << BPTR_JPT_SHIFT) +#define BPTR_JPT_350us (10U << BPTR_JPT_SHIFT) +#define BPTR_JPT_400us (11U << BPTR_JPT_SHIFT) +#define BPTR_JPT_450us (12U << BPTR_JPT_SHIFT) +#define BPTR_JPT_500us (13U << BPTR_JPT_SHIFT) +#define BPTR_JPT_550us (14U << BPTR_JPT_SHIFT) 
+#define BPTR_JPT_600us (15U << BPTR_JPT_SHIFT) + +/* flow control threshold register */ +#define FCTR_HWOT 0xf0U /* rx fifo high water overflow threshold */ +#define FCTR_HWOT_SHIFT 4 +#define FCTR_HWOT_UNIT 1024U +#define FCTR_LWOT 0x0fU /* rx fifo low water overflow threshold */ +#define FCTR_LWOT_SHIFT 0 +#define FCTR_LWOT_UNIT 1024U + +/* rx/tx flow control register */ +#define FCR_TXPO 0x80U /* tx pause packet */ +#define FCR_TXPF 0x40U /* tx pause packet */ +#define FCR_TXPEN 0x20U /* tx pause packet */ +#define FCR_BKPA 0x10U /* back pressure mode */ +#define FCR_BKPM 0x08U /* back pressure mode */ +#define FCR_BKPS 0x04U /* rx pause packet current status (r/c) */ +#define FCR_RXPCS 0x02U /* rx pause packet current status (ro) */ +#define FCR_FLCE 0x01U /* flow control enable */ + +#define FCR_BITS \ + "\020" \ + "\010TXPO" \ + "\007TXPF" \ + "\006TXPEN" \ + "\005BKPA" \ + "\004BKPM" \ + "\003BKPS" \ + "\002RXPCS" \ + "\001FLCE" + +/* EEPROM & PHY control register (0x0b) */ +#define EPCR_REEP 0x20U /* reload eeprom */ +#define EPCR_WEP 0x10U /* write eeprom enable */ +#define EPCR_EPOS 0x08U /* select device, 0:eeprom, 1:phy */ +#define EPCR_ERPRR 0x04U /* read command */ +#define EPCR_ERPRW 0x02U /* write command */ +#define EPCR_ERRE 0x01U /* eeprom/phy access in progress (ro) */ + +#define EPCR_BITS \ + "\020" \ + "\006REEP" \ + "\005WEP" \ + "\004EPOS" \ + "\003ERPRR" \ + "\002ERPRW" \ + "\001ERRE" + +/* EEPROM & PHY access register (0x0c) */ +#define EPAR_PHYADR 0xc0U /* phy address, internal phy(1) or external */ +#define EPAR_PHYADR_SHIFT 6 +#define EPAR_EROA 0x3fU /* eeprom word addr or phy register addr */ +#define EPAR_EROA_SHIFT 0 + +/* EEPROM & PHY data register (0x0d(low)-0x0e(hi)) */ + +/* wake up control register (0x0f) */ +#define WCR_LINKEN 0x20U /* enable link status event */ +#define WCR_SAMPLEEN 0x10U /* enable sample frame event */ +#define WCR_MAGICEN 0x08U /* enable magic pkt event */ +#define WCR_LINKST 0x04U /* link status change 
occur ro */ +#define WCR_SAMPLEST 0x02U /* sample frame rx occur ro */ +#define WCR_MAGICST 0x01U /* magic pkt rx occur ro */ + +#define WCR_BITS \ + "\020" \ + "\006LINKEN" \ + "\005SAMPLEEN" \ + "\004MAGICEN" \ + "\003LINKST" \ + "\002SAMPLEST" \ + "\001MAGICST" + +/* physical address register (0x10-0x15) */ +/* multicast address register (0x16-0x1d) */ +/* general purpose control register (0x1e) */ +#define GPCR_GEPCTRL 0x7f +#define GPCR_OUT(n) (1U << (n)) + +#define GPCR_BITS \ + "\020" \ + "\006OUT5" \ + "\005OUT4" \ + "\004OUT3" \ + "\003OUT2" \ + "\002OUT1" \ + "\001OUT0" + +/* general purpose register (0x1f) */ +#define GPR_GEPIO5 0x20U +#define GPR_GEPIO4 0x10U +#define GPR_GEPIO3 0x08U +#define GPR_GEPIO2 0x04U +#define GPR_GEPIO1 0x02U +#define GPR_GEPIO0 0x01U + +#define GPR_BITS \ + "\020" \ + "\006GEPIO5" \ + "\005GEPIO4" \ + "\004GEPIO3" \ + "\003GEPIO2" \ + "\002GEPIO1" \ + "\001GEPIO0" + +/* vendor id register (0x28-0x29) */ +/* product id register (0x2a-0x2b) */ +/* chip revision register (0x2c) */ + +/* usb device address register (0xf0) */ +#define USBDA_USBFA 0x3fU /* usb device address */ +#define USBDA_USBFA_SHIFT 0 + +/* receive packet counter register (0xf1) */ + +/* transmit packet counter/usb status register (0xf2) */ +#define TUSR_RXFAULT 0x80U /* indicate rx has unexpected condition */ +#define TUSR_SUSFLAG 0x40U /* indicate device has suspended condition */ +#define TUSR_EP1RDY 0x20U /* ready for read from ep1 pipe */ +#define TUSR_SRAM 0x18U /* sram size 0:32K, 1:48K, 2:16K, 3:64K */ +#define TUSR_SRAM_SHIFT 3 +#define TUSR_SRAM_32K (0U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_48K (1U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_16K (2U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_64K (3U << TUSR_SRAM_SHIFT) +#define TUSR_TXC2 0x04U /* two or more packets in tx buffer */ +#define TUSR_TXC1 0x02U /* one packet in tx buffer */ +#define TUSR_TXC0 0x01U /* no packet in tx buffer */ + +#define TUSR_BITS \ + "\020" \ + "\010RXFAULT" \ + "\007SUSFLAG" \ + 
"\006EP1RDY" \ + "\003TXC2" \ + "\002TXC1" \ + "\001TXC0" + +/* usb control register (0xf4) */ +#define USBC_EP3ACK 0x20U /* ep3 will alway return 8byte data if NAK=0*/ +#define USBC_EP3NACK 0x10U /* ep3 will alway return NAK */ +#define USBC_MEMTST 0x01U + +/* bulk message format */ +#define TX_HEADER_SIZE 2 +#define RX_HEADER_SIZE 3 + +/* interrupt msg format */ +struct intr_msg { + uint8_t im_nsr; + uint8_t im_tsr1; + uint8_t im_tsr2; + uint8_t im_rsr; + uint8_t im_rocr; + uint8_t im_rxc; + uint8_t im_txc; + uint8_t im_gpr; +}; +#endif /* __DM9601_H__ */ diff --git a/usr/src/uts/common/io/udmf/udmf_usbgem.c b/usr/src/uts/common/io/udmf/udmf_usbgem.c new file mode 100644 index 0000000000..0637de054b --- /dev/null +++ b/usr/src/uts/common/io/udmf/udmf_usbgem.c @@ -0,0 +1,1036 @@ +/* + * udmfE_usbgem.c : Davicom DM9601E USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2009-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "dm9601reg.h" + +char ident[] = "dm9601 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label +#define LE16P(p) ((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0]) + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int udmf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (udmf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for dm9601 + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device definitions + */ +struct udmf_dev { + /* + * Misc HW information + */ + uint8_t rcr; + uint8_t last_nsr; + uint8_t mac_addr[ETHERADDRL]; +}; + +/* + * private functions + */ + +/* 
mii operations */ +static uint16_t udmf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void udmf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int udmf_reset_chip(struct usbgem_dev *); +static int udmf_init_chip(struct usbgem_dev *); +static int udmf_start_chip(struct usbgem_dev *); +static int udmf_stop_chip(struct usbgem_dev *); +static int udmf_set_media(struct usbgem_dev *); +static int udmf_set_rx_filter(struct usbgem_dev *); +static int udmf_get_stats(struct usbgem_dev *); +static void udmf_interrupt(struct usbgem_dev *, mblk_t *); + +/* packet operations */ +static mblk_t *udmf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *udmf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUT(dp, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 1, \ + /* wValue */ 0, \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define OUTB(dp, ix, val, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 3, \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ 0, \ + /* value */ NULL, \ + /* size */ 0)) != USB_SUCCESS) goto label + +#define IN(dp, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 0, \ + /* wValue */ 0, \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* 
=============================================================== */ +/* + * Hardware manupilation + */ +/* =============================================================== */ +static void +udmf_enable_phy(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + /* de-assert reset signal to phy */ + OUTB(dp, GPCR, GPCR_OUT(0), &err, usberr); + OUTB(dp, GPR, 0, &err, usberr); +usberr: + ; +} + +static int +udmf_reset_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + OUTB(dp, NCR, NCR_LBK_NORMAL | NCR_RST, &err, usberr); + drv_usecwait(100); +usberr: + return (err); +} + +/* + * Setup dm9601 + */ +static int +udmf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err = USB_SUCCESS; + uint16_t reg; + uint8_t buf[2]; + struct udmf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + OUTB(dp, NCR, NCR_LBK_NORMAL, &err, usberr); + + /* tx control regiser: enable padding and crc generation */ + OUTB(dp, TCR, 0, &err, usberr); + + /* rx control register: will be set later by udmf_set_rx_filer() */ + lp->rcr = RCR_RUNT; + + /* back pressure threshold: */ + OUTB(dp, BPTR, (2 << BPTR_BPHW_SHIFT) | BPTR_JPT_200us, + &err, usberr); + + /* flow control threshold: same as default */ + OUTB(dp, FCTR, (3 << FCTR_HWOT_SHIFT) | (8 << FCTR_LWOT_SHIFT), + &err, usberr); + + /* usb control register */ + OUTB(dp, USBC, USBC_EP3ACK | 0x06, &err, usberr); + + /* flow control: will be set later by udmf_set_media() */ + + /* wake up control register: */ + OUTB(dp, WCR, 0, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +static int +udmf_start_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + + /* enable Rx */ + lp->rcr |= RCR_RXEN; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_stop_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + + /* disable rx */ + lp->rcr &= ~RCR_RXEN; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_get_stats(struct usbgem_dev *dp) +{ + /* EMPTY */ + return (USB_SUCCESS); +} + +static uint_t +udmf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_le(addr) & 0x3f); +} + +static int +udmf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint8_t rcr; + uint8_t mode; + uint8_t mhash[8]; + uint8_t *mac; + uint_t h; + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + static uint8_t invalid_mac[ETHERADDRL] = {0, 0, 0, 0, 0, 0}; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, dp->rxmode)); + + if (lp->rcr & RCR_RXEN) { + /* set promiscuous mode before changing rx filter mode */ + OUTB(dp, RCR, lp->rcr | RCR_PRMSC, &err, usberr); + } + + lp->rcr &= ~(RCR_ALL | RCR_PRMSC); + mode = 0; + bzero(mhash, sizeof (mhash)); + mac = dp->cur_addr.ether_addr_octet; + + if ((dp->rxmode & RXMODE_ENABLE) == 0) { + mac = invalid_mac; + } else if (dp->rxmode & RXMODE_PROMISC) { + /* promiscious mode implies all multicast and all physical */ + mode |= RCR_PRMSC; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) { + /* accept all multicast packets */ + mode |= RCR_ALL; + } else if (dp->mc_count > 0) { + /* + * make hash table to select 
interresting + * multicast address only. + */ + for (i = 0; i < dp->mc_count; i++) { + /* hash table is 64 = 2^6 bit width */ + h = dp->mc_list[i].hash; + mhash[h / 8] |= 1 << (h % 8); + } + } + + /* set node address */ + if (bcmp(mac, lp->mac_addr, ETHERADDRL) != 0) { + OUT(dp, PAR, ETHERADDRL, dp->cur_addr.ether_addr_octet, + &err, usberr); + bcopy(mac, lp->mac_addr, ETHERADDRL); + } + + /* set multicast hash table */ + OUT(dp, MAR, sizeof (mhash), &mhash[0], &err, usberr); + + /* update rcr */ + lp->rcr |= mode; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +#if DEBUG_LEVEL > 1 + /* verify rcr */ + IN(dp, RCR, 1, &rcr, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, rcr, RCR_BITS); +#endif +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_set_media(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + uint8_t fcr; + struct udmf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* setup flow control */ + fcr = 0; + if (dp->full_duplex) { + /* select flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_RX_PAUSE: + fcr |= FCR_FLCE; + break; + + case FLOW_CONTROL_TX_PAUSE: + fcr |= FCR_TXPEN; + break; + + case FLOW_CONTROL_SYMMETRIC: + fcr |= FCR_FLCE | FCR_TXPEN; + break; + } + } + + /* update flow control register */ + OUTB(dp, FCR, fcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +/* + * send/receive packet check + */ +static mblk_t * +udmf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int n; + size_t pkt_size; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + uint_t align_mask; + + pkt_size = msgdsize(mp); + align_mask = 63; + + /* + * re-allocate the mp + */ + + /* minimum ethernet packet size of ETHERMIN */ + pkt_size = max(pkt_size, ETHERMIN); + +#if 0 /* CONFIG_ADD_TX_DELIMITOR_ALWAYS */ + pkt_size += TX_HEADER_SIZE; +#endif + if (((pkt_size + TX_HEADER_SIZE) & align_mask) == 0) { + /* padding is required in usb communication */ + pkt_size += TX_HEADER_SIZE; + } + + if ((new = allocb(TX_HEADER_SIZE + pkt_size, 0)) == NULL) { + return (NULL); + } + new->b_wptr = new->b_rptr + TX_HEADER_SIZE + pkt_size; + + /* add a header */ + bp = new->b_rptr; + bp[0] = (uint8_t)pkt_size; + bp[1] = (uint8_t)(pkt_size >> 8); + bp += TX_HEADER_SIZE; + + /* copy contents of the buffer */ + for (tp = mp; tp; tp = tp->b_cont) { + n = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, n); + bp += n; + } + + /* clear the rest including the next zero length header */ + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + return (new); +} + +static void +udmf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +udmf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int len; + uint8_t rx_stat; + + len = mp->b_wptr - mp->b_rptr; + + if (len <= RX_HEADER_SIZE) { + /* + * the usb bulk-in frame doesn't include a valid + * ethernet packet. 
+ */ + return (NULL); + } + + /* remove rx header */ + rx_stat = mp->b_rptr[0]; + if (rx_stat & (RSR_RF | RSR_LCS | RSR_RWTO | + RSR_PLE | RSR_AE | RSR_CE | RSR_FOE)) { + if (rx_stat & RSR_RF) { + dp->stats.runt++; + } + if (rx_stat & RSR_LCS) { + /* late collision */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_RWTO) { + /* rx timeout */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_PLE) { + /* physical layer error */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_AE) { + /* alignment error */ + dp->stats.frame++; + } + if (rx_stat & RSR_CE) { + /* crc error */ + dp->stats.crc++; + } + if (rx_stat & RSR_FOE) { + /* fifo overflow error */ + dp->stats.overflow++; + } + dp->stats.errrcv++; + } + len = LE16P(&mp->b_rptr[1]); + if (len >= ETHERFCSL) { + len -= ETHERFCSL; + } + mp->b_rptr += RX_HEADER_SIZE; + mp->b_wptr = mp->b_rptr + len; + + return (mp); +} + +/* + * MII Interfaces + */ +static uint16_t +udmf_ep_read(struct usbgem_dev *dp, uint_t which, uint_t addr, int *errp) +{ + int i; + uint8_t epcr; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, addr)); + + OUTB(dp, EPAR, addr, errp, usberr); + OUTB(dp, EPCR, which | EPCR_ERPRR, errp, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, EPCR, sizeof (epcr), &epcr, errp, usberr); + if ((epcr & EPCR_ERRE) == 0) { + /* done */ + IN(dp, EPDR, sizeof (val), &val, errp, usberr); + val = LE_16(val); + goto done; + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__); + val = 0; +done: + OUTB(dp, EPCR, 0, errp, usberr); + return (val); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + *errp, *errp == USB_SUCCESS ? 
"success" : "error")); + return (0); +} + +static void +udmf_ep_write(struct usbgem_dev *dp, uint_t which, uint_t addr, + uint16_t val, int *errp) +{ + int i; + uint8_t epcr; + + DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__)); + + val = LE_16(val); + OUT(dp, EPDR, sizeof (val), &val, errp, usberr); + + OUTB(dp, EPAR, addr, errp, usberr); + + OUTB(dp, EPCR, which | EPCR_WEP | EPCR_ERPRW, errp, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, EPCR, 1, &epcr, errp, usberr); + if ((epcr & EPCR_ERRE) == 0) { + /* done */ + goto done; + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__); +done: + OUTB(dp, EPCR, 0, errp, usberr); + return; + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + *errp, *errp == USB_SUCCESS ? "success" : "error")); +} + +static uint16_t +udmf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint16_t val; + + val = udmf_ep_read(dp, EPCR_EPOS, + (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index, errp); + + return (val); +} + +static void +udmf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + udmf_ep_write(dp, EPCR_EPOS, + (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index, val, errp); +} + +static void +udmf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + struct intr_msg *imp; + struct udmf_dev *lp = dp->private; + + imp = (struct intr_msg *)&mp->b_rptr[0]; + + DPRINTF(4, (CE_CONT, + "!%s: %s: size:%d, nsr:%b tsr1:%b tsr2:%b" + " rsr:%b rocr:%b rxc:%02x txc:%b gpr:%b", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + imp->im_nsr, NSR_BITS, + imp->im_tsr1, TSR_BITS, + imp->im_tsr2, TSR_BITS, + imp->im_rsr, RSR_BITS, + imp->im_rocr, ROCR_BITS, + imp->im_rxc, + imp->im_txc, TUSR_BITS, + imp->im_gpr, GPR_BITS)); + + if ((lp->last_nsr ^ imp->im_nsr) & NSR_LINKST) { + usbgem_mii_update_link(dp); + } + + lp->last_nsr = imp->im_nsr; +} + +/* ======================================================== */ +/* + * OS depend 
(device driver DKI) routine + */ +/* ======================================================== */ +static uint16_t +udmf_eeprom_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint16_t val; + + val = udmf_ep_read(dp, 0, index, errp); + + return (val); +} + +#ifdef DEBUG_LEVEL +static void +udmf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint16_t w0, w1, w2, w3; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + + err = USB_SUCCESS; + + for (i = 0; i < size; i += 4) { + w0 = udmf_eeprom_read(dp, i + 0, &err); + w1 = udmf_eeprom_read(dp, i + 1, &err); + w2 = udmf_eeprom_read(dp, i + 2, &err); + w3 = udmf_eeprom_read(dp, i + 3, &err); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i, w0, w1, w2, w3); + } +usberr: + ; +} +#endif + +static int +udmf_attach_chip(struct usbgem_dev *dp) +{ + int i; + uint_t val; + uint8_t *m; + int err; + struct udmf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__)); + + /* + * get mac address from EEPROM + */ + m = dp->dev_addr.ether_addr_octet; + for (i = 0; i < ETHERADDRL; i += 2) { + val = udmf_eeprom_read(dp, i/2, &err); + m[i + 0] = (uint8_t)val; + m[i + 1] = (uint8_t)(val >> 8); + } + + /* invalidate a private cache for mac addr */ + bzero(lp->mac_addr, sizeof (lp->mac_addr)); +#ifdef CONFIG_VLAN + dp->misc_flag = USBGEM_VLAN; +#endif +#if DEBUG_LEVEL > 0 + udmf_eeprom_dump(dp, /* 0x3f + 1 */ 128); +#endif +{ + static uint8_t bcst[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + DPRINTF(0, (CE_CONT, "!%s: %s: hash of bcast:%x", + dp->name, __func__, usbgem_ether_crc_be(bcst))); +} + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: %s: usb error detected (%d)", + dp->name, __func__, err); + return (USB_FAILURE); +} + +static int +udmf_mii_probe(struct usbgem_dev *dp) +{ + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + udmf_enable_phy(dp); + return (usbgem_mii_probe_default(dp)); +} + +static int +udmf_mii_init(struct 
usbgem_dev *dp) +{ + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + udmf_enable_phy(dp); + return (USB_SUCCESS); +} + +static int +udmfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int revid; + int unit; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct udmf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + /* + * softmac requires that ppa is the instance number + * of the device, otherwise it hangs in seaching the device. + */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 64; + + ugcp->usbgc_rx_header_len = RX_HEADER_SIZE; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; +#if 1 + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; +#else + /* + * XXX - flow control caused link down frequently under + * heavy traffic + */ + ugcp->usbgc_flow_control = FLOW_CONTROL_NONE; +#endif + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = + USBGEM_LINK_WATCH_INTERVAL; + ugcp->usbgc_mii_an_watch_interval = + USBGEM_LINK_WATCH_INTERVAL/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = (25*ONESEC)/10; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + 
ugcp->usbgc_mii_dont_reset = B_FALSE; + ugcp->usbgc_mii_hw_link_detection = B_TRUE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &udmf_attach_chip; + ugcp->usbgc_reset_chip = &udmf_reset_chip; + ugcp->usbgc_init_chip = &udmf_init_chip; + ugcp->usbgc_start_chip = &udmf_start_chip; + ugcp->usbgc_stop_chip = &udmf_stop_chip; + ugcp->usbgc_multicast_hash = &udmf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &udmf_set_rx_filter; + ugcp->usbgc_set_media = &udmf_set_media; + ugcp->usbgc_get_stats = &udmf_get_stats; + ugcp->usbgc_interrupt = &udmf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &udmf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &udmf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &udmf_mii_probe; + ugcp->usbgc_mii_init = &udmf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &udmf_mii_read; + ugcp->usbgc_mii_write = &udmf_mii_write; + ugcp->usbgc_mii_addr_min = 1; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct udmf_dev), KM_SLEEP); + /* no NSR value seen yet; kmem_zalloc() has already zeroed it */ + lp->last_nsr = 0; + + ddi_set_driver_private(dip, NULL); + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct udmf_dev)); + + /* the configuration template is not referenced after this point */ + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct udmf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + + if (cmd == DDI_RESUME) { + return (usbgem_resume(dip)); + } + + return (DDI_FAILURE); +} + +static int +udmfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable 
streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(udmf_ops, udmfattach, udmfdetach); +#else +static struct module_info udmfminfo = { + 0, /* mi_idnum */ + "udmf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + ETHERMTU*128, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit udmfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &udmfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit udmfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &udmfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab udmf_info = { + &udmfrinit, /* st_rdinit */ + &udmfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_udmf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &udmf_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + +static struct dev_ops udmf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + udmfattach, /* devo_attach */ + udmfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_udmf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif +}; +#endif + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &udmf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!udmf: _init: called")); + + status = usbgem_mod_init(&udmf_ops, "udmf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&udmf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!udmf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&udmf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/upf/adm8511reg.h b/usr/src/uts/common/io/upf/adm8511reg.h new file mode 100644 index 0000000000..68a2207bb5 --- /dev/null +++ b/usr/src/uts/common/io/upf/adm8511reg.h @@ -0,0 +1,205 @@ +/* + * @(#)adm8511reg.h 1.1 09/06/20 + * Register dehinitsions of ADMtek ADM8511 Fast Ethernet to USB controller. + * Codeded by Masayuki Murayama(KHF04453@nifty.ne.jp) + * This file is public domain. 
+ */ + +#define EC0 0x00 /* B */ +#define EC1 0x01 /* B */ +#define EC2 0x02 /* B */ +#define MA 0x08 /* 8byte array */ +#define EID 0x10 /* B */ +#define PAUSETIMER 0x18 /* B pause timer */ +#define RPNBFC 0x1a /* B */ +#define ORFBFC 0x1b /* B */ +#define EP1C 0x1c /* B */ +#define RXFC 0x1d /* B */ +#define BIST 0x1e /* B */ +#define EEOFFSET 0x20 /* B */ +#define EEDATA 0x21 /* W */ +#define EECTRL 0x23 /* B */ +#define PHYA 0x25 /* B */ +#define PHYD 0x26 /* W */ +#define PHYAC 0x28 /* B */ +#define USBSTAT 0x2a /* B */ +#define ETHTXSTAT 0x2b /* W */ +#define ETHRXSTAT 0x2d /* B */ +#define LOSTCNT 0x2e /* W */ +#define WF0MASK 0x30 /* 16byte array */ +#define WF0OFFSET 0x40 /* W */ +#define WF0CRC 0x41 /* W */ +#define WF1MASK 0x48 /* 16byte array */ +#define WF1OFFSET 0x58 /* W */ +#define WF1CRC 0x59 /* W */ +#define WF2MASK 0x60 /* 16byte array */ +#define WF2OFFSET 0x70 /* W */ +#define WF2CRC 0x71 /* W */ +#define WCTRL 0x78 /* B */ +#define WSTAT 0x7a /* B */ +#define IPHYC 0x7b /* B */ +#define GPIO54 0x7c /* B */ +#define GPIO10 0x7e /* B */ +#define GPIO32 0x7f /* B */ +#define TEST 0x80 /* B */ +#define TM 0x81 /* B */ +#define RPN 0x82 /* B */ + +/* Ethernet control register 0: offset 0 */ +#define EC0_TXE 0x80U +#define EC0_RXE 0x40U +#define EC0_RXFCE 0x20U +#define EC0_WOE 0x10U +#define EC0_RXSA 0x08U +#define EC0_SBO 0x04U +#define EC0_RXMA 0x02U +#define EC0_RXCS 0x01U + +#define EC0_BITS \ + "\020" \ + "\010TXE" \ + "\007RXE" \ + "\006RXFCE" \ + "\005WOE" \ + "\004RXSA" \ + "\003SBO" \ + "\002RXMA" \ + "\001RXCS" + +/* Ethernet control register 1: offset 1 */ +#define EC1_FD 0x20U +#define EC1_100M 0x10U /* 0:10Mbps 1:100Mbps */ +#define EC1_RM 0x08U /* reset mac */ + +#define EC1_BITS \ + "\020" \ + "\006FD" \ + "\005100M" \ + "\004RM" + +/* Ethernet control register 2: offset 2 */ +#define EC2_MEPL 0x80U /* 8515: MTU 0:1528, 1:1638 */ +#define EC2_RPNC 0x40U +#define EC2_LEEPRS 0x20U +#define EC2_EEPRW 0x10U +#define EC2_LB 0x08U +#define 
EC2_PROM 0x04U +#define EC2_RXBP 0x02U +#define EC2_EP3RC 0x01U + +#define EC2_BITS \ + "\020" \ + "\010MEPS" \ + "\007RPNC" \ + "\006LEEPRS" \ + "\005EEPRW" \ + "\004LB" \ + "\003PROM" \ + "\002RXBP" \ + "\001EP3RC" + +/* Receive Packet number based Flow Control register: offset 0x1a */ +#define RPNBFC_PN 0x7eU /* packet number field, bits 6:1 (a field mask, not a register offset) */ +#define RPNBFC_PN_SHIFT 1 +#define RPNBFC_FCP 0x01U /* enable rx flow control */ + +/* Occupied Receive FIFO based Flow Control register: offset 0x1b */ +#define ORFBFC_RXS 0x7eU /* occupied rx fifo size field, bits 6:1 */ +#define ORFBFC_RXS_SHIFT 1 +#define ORFBFC_RXS_UNIT 1024U /* presumably bytes per RXS count - TODO confirm */ +#define ORFBFC_FCRXS 0x01U /* enable rx flow control */ + +/* EP1 control register: offset 0x1c */ +#define EP1C_EP1S0E 0x80U /* send 0 enable */ +#define EP1C_ITMA 0x60U /* internal test mode A */ +#define EP1C_ITMB 0x1fU /* internal test mode B */ + +#define EP1C_BITS \ + "\020" \ + "\010EP1S0E" + +/* Rx FIFO Control register: offset 0x1d */ +#define RXFC_EXT_SRAM 0x02 /* enable external 32k sram */ +#define RXFC_RX32PKT 0x01 /* max 32 packet */ + +/* EEPROM offset register: offset 0x20 */ +#define EEOFFSET_MASK 0x3f /* eeprom offset address in word */ + +/* EEPROM access control register: offset 0x23 */ +#define EECTRL_DONE 0x04 +#define EECTRL_RD 0x02 +#define EECTRL_WR 0x01 + +#define EECTRL_BITS \ + "\020" \ + "\003DONE" \ + "\002RD" \ + "\001WR" + +/* PHY access control register (PHYAC): offset 0x28 */ +#define PHYAC_DO 0x80U /* Done */ +#define PHYAC_RDPHY 0x40U /* read phy */ +#define PHYAC_WRPHY 0x20U /* write phy */ +#define PHYAC_PHYRA 0x1fU /* PHY register address */ + +#define PHYCTRL_BITS \ + "\020" \ + "\010DO" \ + "\007RDPHY" \ + "\006WRPHY" + +/* Internal PHY control register: offset 0x7b */ +#define IPHYC_EPHY 0x02 +#define IPHYC_PHYR 0x01 + +#define IPHYC_BITS \ + "\020" \ + "\002EPHY" \ + "\001PHYR" + +/* GPIO54 register: offset 0x7c */ +#define GPIO54_5OE 0x20 +#define GPIO54_5O 0x10 +#define GPIO54_5I 0x08 +#define GPIO54_4OE 0x04 +#define GPIO54_4O 0x02 +#define GPIO54_4I 0x01 + +/* GPIO01 register: offset 
7e */ +#define GPIO10_1OE 0x20 +#define GPIO10_1O 0x10 +#define GPIO10_1I 0x08 +#define GPIO10_0OE 0x04 +#define GPIO10_0O 0x02 +#define GPIO10_0I 0x01 + +/* GPIO23 register: offset 7f */ +#define GPIO32_3OE 0x20 +#define GPIO32_3O 0x10 +#define GPIO32_3I 0x08 +#define GPIO32_2OE 0x04 +#define GPIO32_2O 0x02 +#define GPIO32_2I 0x01 + +/* rx status at the end of received packets */ +/* byte 0 and 1 is packet length in little endian */ +/* byte 2 is receive status */ +#define RSR_DRIBBLE 0x10 +#define RSR_CRC 0x08 +#define RSR_RUNT 0x04 +#define RSR_LONG 0x02 +#define RSR_MULTI 0x01 + +#define RSR_ERRORS \ + (RSR_DRIBBLE | RSR_CRC | RSR_RUNT | RSR_LONG | RSR_MULTI) + +#define RSR_BITS \ + "\020" \ + "\005DRIBBLE" \ + "\004CRC" \ + "\003RUNT" \ + "\002LONG" \ + "\001MULTI" +/* byte 3 is reserved */ + +/* TEST register: offset 80 */ diff --git a/usr/src/uts/common/io/upf/upf_usbgem.c b/usr/src/uts/common/io/upf/upf_usbgem.c new file mode 100644 index 0000000000..5614803158 --- /dev/null +++ b/usr/src/uts/common/io/upf/upf_usbgem.c @@ -0,0 +1,1213 @@ +/* + * upf_usbgem.c : ADMtek an986/adm8511/adm8513/adm8515 USB to + * Fast Ethernet Driver for Solaris + */ + +/* + * Copyright (c) 2004-2011 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "adm8511reg.h" + +char ident[] = "pegasus usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(val, label) \ + if ((val) != USB_SUCCESS) { goto label; } + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int upf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (upf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for ADMtek Pegasus/PegasusII + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device 
definitions + */ +struct upf_dev { + /* + * Misc HW information + */ + uint8_t ec[3]; + uint8_t mac_addr[ETHERADDRL]; + int chip_type; +#define CHIP_AN986 1 /* avoid 0 */ +#define CHIP_ADM8511 2 /* including adm8515 */ +#define CHIP_ADM8513 3 + boolean_t phy_init_done; + uint8_t last_link_state; + + uint16_t vid; /* vendor id */ + uint16_t pid; /* product id */ +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t upf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void upf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int upf_attach_chip(struct usbgem_dev *); +static int upf_reset_chip(struct usbgem_dev *); +static int upf_init_chip(struct usbgem_dev *); +static int upf_start_chip(struct usbgem_dev *); +static int upf_stop_chip(struct usbgem_dev *); +static int upf_set_media(struct usbgem_dev *); +static int upf_set_rx_filter(struct usbgem_dev *); +static int upf_get_stats(struct usbgem_dev *); + +/* packet operations */ +static mblk_t *upf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *upf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* interrupt handler */ +static void upf_interrupt(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define UPF_REQ_GET_REGISTER 0xf0 +#define UPF_REQ_SET_REGISTER 0xf1 +#define OUTB(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ (v), \ + /* wIndex */ (p), \ + /* wLength */ 1, \ + /* buf */ NULL, \ + /* size */ 0)) != USB_SUCCESS) goto label; + +#define OUTW(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | 
USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 2, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ (len), \ + /* buf */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define INB(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 1, \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INW(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 2, \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ (len), \ + /* buf */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * Hardware manupilation + */ +/* =============================================================== */ +static int +upf_reset_chip(struct usbgem_dev *dp) +{ + int i; + uint8_t val; + int err; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + bzero(lp->mac_addr, sizeof 
(lp->mac_addr)); + + lp->ec[1] = 0; + OUTB(dp, EC1, EC1_RM, &err, usberr); + + for (i = 0; i < 1000; i++) { + INB(dp, EC1, &val, &err, usberr); + if ((val & EC1_RM) == 0) { + lp->ec[1] = val; + return (USB_SUCCESS); + } + drv_usecwait(10); + } + + /* time out */ + cmn_err(CE_WARN, "!%s: failed to reset: timeout", dp->name); + return (USB_FAILURE); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * Setup an986/adm8511/adm8513/adm8515 + */ +static int +upf_init_chip(struct usbgem_dev *dp) +{ + uint64_t zero64 = 0; + int err = USB_SUCCESS; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* ethernet control register 0 */ + lp->ec[0] |= EC0_RXSA | EC0_RXCS; + OUTB(dp, EC0, lp->ec[0], &err, usberr); + + /* ethernet control reg1: will be set later in set_rx_filter() */ + + /* ethernet control register 2: will be set later in set_rx_filter() */ + INB(dp, EC2, &lp->ec[2], &err, usberr); + lp->ec[2] |= EC2_RXBP | EC2_EP3RC; +#ifdef CONFIG_VLAN + if (dp->misc_flag & USBGEM_VLAN) { + lp->ec[2] |= EC2_MEPL; + } +#endif + OUTB(dp, EC2, lp->ec[2], &err, usberr); + + /* Multicast address hash: clear */ + OUTS(dp, MA, &zero64, 8, &err, usberr); + + /* Ethernet ID : will be set later in upf_set_rx_filter() */ + + /* PAUSE timer */ + OUTB(dp, PAUSETIMER, 0x1f, &err, usberr); + + /* receive packet number based pause control:set in upf_set_media() */ + + /* occupied receive FIFO based pause control:set in upf_set_media() */ + + /* EP1 control: default */ + + /* Rx FIFO control */ + if (lp->chip_type != CHIP_AN986) { + /* use 24K internal sram, 16pkts in fifo */ + OUTB(dp, RXFC, 0, &err, usberr); + } + + /* BIST contror: do nothing */ + err = upf_set_media(dp); + CHECK_AND_JUMP(err, usberr); + + DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__)); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr(%d) detected", + dp->name, 
__func__, err); + return (err); +} + +static int +upf_start_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* enable RX and TX */ + lp->ec[0] |= EC0_TXE | EC0_RXE; + OUTB(dp, EC0, lp->ec[0], &err, usberr); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected", + dp->name, __func__, err); + return (err); +} + +static int +upf_stop_chip(struct usbgem_dev *dp) +{ + int err; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* disable RX and TX */ + lp->ec[0] &= ~(EC0_TXE | EC0_RXE); + OUTB(dp, EC0, lp->ec[0], &err, usberr); + + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected", + dp->name, __func__, err); + return (err); +} + +static int +upf_get_stats(struct usbgem_dev *dp) +{ + /* do nothing */ + return (USB_SUCCESS); +} + +static uint_t +upf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + /* hash table is 64 = 2^6 bit width */ + return (usbgem_ether_crc_le(addr) & 0x3f); +} + +static int +upf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + int err; +#ifdef DEBUG_LEVEL + uint8_t reg0; + uint8_t reg1; + uint8_t reg2; +#endif + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called, rxmode:%b", + dp->name, __func__, dp->rxmode, RXMODE_BITS)); + + /* reset rx mode */ + lp->ec[0] &= ~EC0_RXMA; + lp->ec[2] &= ~EC2_PROM; + + if (dp->rxmode & RXMODE_PROMISC) { + /* promiscious mode implies all multicast and all physical */ + lp->ec[0] |= EC0_RXMA; + lp->ec[2] |= EC2_PROM; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 0) { + /* XXX - multicast hash table didin't work */ + /* accept all multicast packets */ + lp->ec[0] |= EC0_RXMA; + } + + if (bcmp(dp->cur_addr.ether_addr_octet, + lp->mac_addr, ETHERADDRL) != 0) { + + /* need to update mac address */ + bcopy(dp->cur_addr.ether_addr_octet, + 
lp->mac_addr, ETHERADDRL); + OUTS(dp, EID, + lp->mac_addr, ETHERADDRL, &err, usberr); + } + + /* update rx mode */ + OUTS(dp, EC0, lp->ec, 3, &err, usberr); + +#if DEBUG_LEVEL > 0 + INB(dp, EC0, &reg0, &err, usberr); + INB(dp, EC1, &reg1, &err, usberr); + INB(dp, EC2, &reg2, &err, usberr); + + cmn_err(CE_CONT, "!%s: %s: returned, ec:%b %b %b", + dp->name, __func__, + reg0, EC0_BITS, reg1, EC1_BITS, reg2, EC2_BITS); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (err); +} + +static int +upf_set_media(struct usbgem_dev *dp) +{ + int err; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + lp->ec[1] &= ~(EC1_FD | EC1_100M); + + /* select duplex */ + if (dp->full_duplex) { + lp->ec[1] |= EC1_FD; + } + + /* select speed */ + if (dp->speed == USBGEM_SPD_100) { + lp->ec[1] |= EC1_100M; + } + + /* rx flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_RX_PAUSE: + lp->ec[0] |= EC0_RXFCE; + break; + + default: + lp->ec[0] &= ~EC0_RXFCE; + break; + } + + /* tx flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_TX_PAUSE: + if (lp->chip_type != CHIP_AN986) { + /* pegasus II has internal 24k fifo */ + OUTB(dp, ORFBFC, + (12 << ORFBFC_RXS_SHIFT) | ORFBFC_FCRXS, + &err, usberr); + + /* 16 packts can be stored in rx fifo */ + OUTB(dp, RPNBFC_PN, + (8 << RPNBFC_PN_SHIFT) | RPNBFC_FCP, + &err, usberr); + } else { + /* an986 has external 32k fifo */ + OUTB(dp, ORFBFC, + (16 << ORFBFC_RXS_SHIFT) | ORFBFC_FCRXS, + &err, usberr); + + /* AN986 fails to link up when RPNBFC is enabled */ + OUTB(dp, RPNBFC, 0, &err, usberr); + } + break; + + default: + OUTB(dp, ORFBFC, 0, &err, usberr); + OUTB(dp, RPNBFC, 0, &err, usberr); + break; + } + + /* update ether control registers */ + OUTS(dp, EC0, lp->ec, 2, &err, usberr); + DPRINTF(0, (CE_CONT, "!%s: %s: returned, ec0:%b, ec1:%b", + 
dp->name, __func__, lp->ec[0], EC0_BITS, lp->ec[1], EC1_BITS)); + + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: %s: failed to write ec1", dp->name, __func__); + return (err); +} + +/* + * send/receive packet check + */ +static mblk_t * +upf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + size_t len; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + int msglen; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + len = msgdsize(mp); + if (len < ETHERMIN) { + len = ETHERMIN; + } + + /* allocate msg block */ + msglen = len + sizeof (uint16_t); + + /* avoid usb controller bug */ + if ((msglen & 0x3f) == 0) { + /* add a header for additional 0-length usb message */ + msglen += sizeof (uint16_t); + } + + if ((new = allocb(msglen, 0)) == NULL) { + return (NULL); + } + + /* copy contents of the buffer */ + new->b_wptr = new->b_rptr + msglen; + bp = new->b_rptr; + + /* the nic requires a two byte header of the packet size */ + bp[0] = (uint8_t)len; + bp[1] = (uint8_t)(len >> 8); + bp += sizeof (uint16_t); + + /* copy the payload */ + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + if (len > 0) { + bcopy(tp->b_rptr, bp, len); + bp += len; + } + } + + /* clear ethernet pads and additional usb header if we have */ + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + return (new); +} + +static void +upf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +upf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *p; + uint16_t rxhd; + uint_t len; + uint8_t rsr; + struct upf_dev *lp = dp->private; + + ASSERT(mp != NULL); + +#ifdef DEBUG_LEVEL + len = msgdsize(mp); + DPRINTF(2, (CE_CONT, "!%s: time:%d %s: cont:%p", + dp->name, ddi_get_lbolt(), __func__, len, mp->b_cont)); + 
+ if (upf_debug > 3) { + upf_dump_packet(dp, mp->b_rptr, max(6, len)); + } +#endif + /* get the length of Rx packet */ + p = mp->b_wptr - 4; + rsr = p[3]; + if (lp->chip_type == CHIP_ADM8513) { + /* As Rx packets from ADM8513 have two byte header, remove it */ + p = mp->b_rptr; + len = ((p[1] << 8) | p[0]) & 0x0fff; + mp->b_rptr += 2; + } else { + len = (((p[1] << 8) | p[0]) & 0x0fff) - ETHERFCSL - 4; + } + + DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d", + dp->name, __func__, rsr, RSR_BITS, len)); + + /* check if error happen */ + if (rsr & RSR_ERRORS) { + DPRINTF(0, (CE_CONT, "!%s: rsr:%b", dp->name, rsr, RSR_BITS)); + if (rsr & (RSR_CRC | RSR_DRIBBLE)) { + dp->stats.frame++; + } + if (rsr & RSR_LONG) { + dp->stats.frame_too_long++; + } + if (rsr & RSR_RUNT) { + dp->stats.runt++; + } + + dp->stats.errrcv++; + return (NULL); + } +#ifndef CONFIG_VLAN + /* check packet size */ + if (len > ETHERMAX) { + /* too long */ + dp->stats.frame_too_long++; + dp->stats.errrcv++; + return (NULL); + } else if (len < ETHERMIN) { + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } +#endif + /* remove tailing crc and rx status fields */ + mp->b_wptr = mp->b_rptr + len; + ASSERT(mp->b_next == NULL); + return (mp); +} + +/* + * Device depend interrupt handler + */ +static void +upf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *bp; + struct upf_dev *lp = dp->private; + + bp = mp->b_rptr; + + DPRINTF(2, (CE_CONT, + "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x %02x %02x", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7])); + + if ((lp->last_link_state ^ bp[5]) & 1) { + DPRINTF(1, (CE_CONT, "!%s:%s link status changed:", + dp->name, __func__)); + usbgem_mii_update_link(dp); + } + + lp->last_link_state = bp[5] & 1; +} + +/* + * MII Interfaces + */ +static uint16_t +upf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint8_t phyctrl; + uint16_t val; + int i; + + DPRINTF(4, (CE_CONT, "!%s: %s: 
called, ix:%d", + dp->name, __func__, index)); + ASSERT(index >= 0 && index < 32); + + *errp = USB_SUCCESS; + + /* set PHYADDR */ + OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr); + + /* Initiate MII read transaction */ + OUTB(dp, PHYAC, index | PHYAC_RDPHY, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, PHYAC, &phyctrl, errp, usberr); + if (phyctrl & PHYAC_DO) { + /* done */ + INW(dp, PHYD, &val, errp, usberr); + DPRINTF(4, (CE_CONT, "!%s: %s: return %04x", + dp->name, __func__, val)); + return (val); + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout detected", dp->name, __func__); + *errp = USB_FAILURE; + return (0); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +upf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + int i; + uint8_t phyctrl; + + DPRINTF(4, (CE_CONT, "!%s: %s called index:%d val:0x%04x", + dp->name, __func__, index, val)); + ASSERT(index >= 0 && index < 32); + + *errp = USB_SUCCESS; + + OUTW(dp, PHYD, val, errp, usberr); + OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr); + OUTB(dp, PHYAC, index | PHYAC_WRPHY, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, PHYAC, &phyctrl, errp, usberr); + if (phyctrl & PHYAC_DO) { + /* done */ + return; + } + drv_usecwait(10); + } + + /* time out */ + cmn_err(CE_WARN, "!%s: %s: timeout detected", dp->name, __func__); + *errp = USB_FAILURE; + return; + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); +} + + +static int +upf_enable_phy(struct usbgem_dev *dp) +{ + uint8_t val; + int err; + struct upf_dev *lp = dp->private; + + /* + * first, try to enable internal phy + */ + INB(dp, IPHYC, &val, &err, usberr); + val = (val | IPHYC_EPHY) & ~IPHYC_PHYR; + OUTB(dp, IPHYC, val, &err, usberr); + + INB(dp, IPHYC, &val, &err, usberr); + DPRINTF(0, (CE_CONT, "!%s: %s: IPHYC: %b", + dp->name, __func__, val, IPHYC_BITS)); + if (val) { 
+ /* reset internal phy */ + OUTB(dp, IPHYC, val | IPHYC_PHYR, &err, usberr); + OUTB(dp, IPHYC, val, &err, usberr); + delay(drv_usectohz(10000)); + + /* identify the chip generation */ + OUTB(dp, 0x83, 0xa5, &err, usberr); + INB(dp, 0x83, &val, &err, usberr); + if (val == 0xa5) { + lp->chip_type = CHIP_ADM8513; + } else { + /* adm8511 or adm8515 */ + lp->chip_type = CHIP_ADM8511; + } + dp->ugc.usbgc_mii_hw_link_detection = B_TRUE; + } else { + /* + * It should be AN986 which doesn't have an internal PHY. + * We need to setup gpio ports in AN986, which are + * connected to external PHY control pins. + */ + lp->chip_type = CHIP_AN986; + + /* reset external phy */ + /* output port#0 L, port#1 L */ + OUTB(dp, GPIO10, GPIO10_0O | GPIO10_0OE, &err, usberr); + + /* output port#0 H, port#1 L */ + OUTB(dp, GPIO10, + GPIO10_0O | GPIO10_0OE | GPIO10_1OE, &err, usberr); + + /* hw link detection doesn't work correctly */ + dp->ugc.usbgc_mii_hw_link_detection = B_FALSE; + } + + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +upf_mii_probe(struct usbgem_dev *dp) +{ + int err; + uint16_t val; + struct upf_dev *lp = dp->private; + + if (!lp->phy_init_done) { + upf_enable_phy(dp); + lp->phy_init_done = B_TRUE; + } + + return (usbgem_mii_probe_default(dp)); +} + +static int +upf_mii_init(struct usbgem_dev *dp) +{ + uint16_t val; + int err = USB_SUCCESS; + struct upf_dev *lp = dp->private; + + if (!lp->phy_init_done) { + upf_enable_phy(dp); + } + lp->phy_init_done = B_FALSE; + + if (lp->chip_type == CHIP_AN986 && + (lp->vid == 0x0db7 /* elecom */ || + lp->vid == 0x066b /* linksys */ || + lp->vid == 0x077b /* linksys */ || + lp->vid == 0x2001 /* dlink */)) { + /* special treatment for Linksys products */ + val = upf_mii_read(dp, 0x1b, &err) | 0x4; + upf_mii_write(dp, 0x1b, val, &err); + } + return (err); +} + +/* ======================================================== */ +/* + * OS depend 
(device driver DKI) routine + */ +/* ======================================================== */ +static uint16_t +upf_read_eeprom(struct usbgem_dev *dp, int index, int *errp) +{ + int i; + uint8_t eectrl; + uint16_t data; + + *errp = USB_SUCCESS; + + OUTB(dp, EECTRL, 0, errp, usberr); + + OUTB(dp, EEOFFSET, index, errp, usberr); + OUTB(dp, EECTRL, EECTRL_RD, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, EECTRL, &eectrl, errp, usberr); + if (eectrl & EECTRL_DONE) { + INW(dp, EEDATA, &data, errp, usberr); + return (data); + } + drv_usecwait(10); + } + + /* time out */ + *errp = USB_FAILURE; + return (0); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +upf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + + cmn_err(CE_CONT, "!%s: %s dump:", dp->name, __func__); + + for (i = 0; i < size; i += 4) { + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i*2, + upf_read_eeprom(dp, i + 0, &err), + upf_read_eeprom(dp, i + 1, &err), + upf_read_eeprom(dp, i + 2, &err), + upf_read_eeprom(dp, i + 3, &err)); + } +} + +static int +upf_attach_chip(struct usbgem_dev *dp) +{ + int i; + int err; + uint16_t val; + uint8_t *mac; + struct upf_dev *lp = dp->private; + + /* + * Read mac address from EEPROM + */ + mac = dp->dev_addr.ether_addr_octet; + for (i = 0; i < 3; i++) { + val = upf_read_eeprom(dp, i, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + mac[i*2+0] = (uint8_t)val; + mac[i*2+1] = (uint8_t)(val >> 8); + } + + DPRINTF(0, (CE_CONT, + "%s: %s: mac: %02x:%02x:%02x:%02x:%02x:%02x", + dp->name, __func__, + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5])); + + dp->misc_flag = 0; +#ifdef CONFIG_VLAN + dp->misc_flag |= USBGEM_VLAN; +#endif +#if DEBUG_LEVEL > 3 + upf_eeprom_dump(dp, 0x80); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "!%s: %s: usb error detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +upfattach(dev_info_t 
*dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int unit; + uint32_t tcr; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct upf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 16; + + ugcp->usbgc_rx_header_len = 4; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; + + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_NONE; + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = MII_AN_TIMEOUT/2; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + ugcp->usbgc_mii_an_delay = ONESEC/10; + + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RESET; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &upf_attach_chip; + ugcp->usbgc_reset_chip = &upf_reset_chip; + ugcp->usbgc_init_chip = &upf_init_chip; + ugcp->usbgc_start_chip = &upf_start_chip; + ugcp->usbgc_stop_chip = &upf_stop_chip; + ugcp->usbgc_multicast_hash = &upf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &upf_set_rx_filter; + ugcp->usbgc_set_media = &upf_set_media; + 
ugcp->usbgc_get_stats = &upf_get_stats; + ugcp->usbgc_interrupt = &upf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &upf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &upf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &upf_mii_probe; + ugcp->usbgc_mii_init = &upf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &upf_mii_read; + ugcp->usbgc_mii_write = &upf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct upf_dev), KM_SLEEP); + + lp->vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "usb-vendor-id", -1); + lp->pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "usb-product-id", -1); + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct upf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct upf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + if (cmd == DDI_RESUME) { + dp = USBGEM_GET_DEV(dip); + lp = dp->private; + lp->phy_init_done = B_FALSE; + + return (usbgem_resume(dip)); + } + return (DDI_FAILURE); +} + +static int +upfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(upf_ops, upfattach, upfdetach); +#else +static struct module_info upfminfo = { + 0, /* mi_idnum */ + "upf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + 32*1024, /* mi_hiwat 
*/ + 1, /* mi_lowat */ +}; + +static struct qinit upfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &upfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit upfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &upfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab upf_info = { + &upfrinit, /* st_rdinit */ + &upfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_upf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &upf_info, /* cb_stream */ + D_MP /* cb_flag */ +}; + +static struct dev_ops upf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + upfattach, /* devo_attach */ + upfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_upf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif + +}; +#endif +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &upf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!upf: _init: called")); + + status = usbgem_mod_init(&upf_ops, "upf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&upf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!upf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&upf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/urf/rtl8150reg.h b/usr/src/uts/common/io/urf/rtl8150reg.h new file mode 100644 index 0000000000..7cba53356e --- /dev/null +++ b/usr/src/uts/common/io/urf/rtl8150reg.h @@ -0,0 +1,218 @@ +/* + * @(#)rtl8150reg.h 1.1 04/09/16 + * Macro definitions for Realtek 8150 USB to fast ethernet controller + * based on Realtek RTL8150 data sheet + * This file is public domain. 
Coded by M.Murayama (KHF04453@nifty.com) + */ + +/* + * Register offset + */ +#define IDR 0x0120 /* Base of ID registers */ +#define MAR 0x0126 /* Base of multicast registers */ +#define CR 0x012e /* Command register */ +#define TCR 0x012f /* Transmit Configuration register */ +#define RCR 0x0130 /* Receive Configuration register */ +#define TSR 0x0132 /* Transmit Status register */ +#define RSR 0x0133 /* Receive Status register */ +#define CON0 0x0135 /* Configuration register 0 */ +#define CON1 0x0136 /* Configuration register 1 */ +#define MSR 0x0137 /* Media Status register */ +#define PHYADD 0x0138 /* PHY address register */ +#define PHYDAT 0x0139 /* PHY data register */ +#define PHYCNT 0x013b /* PHY control register */ +#define GPPC 0x013d /* General purpose pin control */ +#define WAKECNT 0x013e /* Wake up event control */ +#define BMCR 0x0140 /* Basic Mode Control register */ +#define BMSR 0x0142 /* Basic Mode Status register */ +#define ANAR 0x0144 /* Auto Negotiation Advertisement register */ +#define ANLP 0x0146 /* Auto Negotiation Link Partner register */ +#define ANER 0x0148 /* Auto Negotiation Expansion register */ +#define NWAYT 0x014a /* Nway test register */ +#define CSCR 0x014c /* CS configuration register */ +#define CRC0 0x014e /* Power management register for wakeup frame0 */ +#define CRC1 0x0150 /* Power management register for wakeup frame1 */ +#define CRC2 0x0152 /* Power management register for wakeup frame2 */ +#define CRC3 0x0154 /* Power management register for wakeup frame3 */ +#define CRC4 0x0156 /* Power management register for wakeup frame4 */ +#define BYTEMASK0 0x0158 /* Power management wakeup frame0 bytemask */ +#define BYTEMASK1 0x0160 /* Power management wakeup frame1 bytemask */ +#define BYTEMASK2 0x0168 /* Power management wakeup frame2 bytemask */ +#define BYTEMASK3 0x0170 /* Power management wakeup frame3 bytemask */ +#define BYTEMASK4 0x0178 /* Power management wakeup frame4 bytemask */ +#define PHY1 0x0180 /* PHY parameter 
1 */ +#define PHY2 0x0184 /* PHY parameter 2 */ +#define TW1 0x0186 /* Twister parameter 1 */ + +/* + * Bit field definitions + */ +/* CR : Command register (uint8_t) */ +#define CR_WEPROM 0x20 /* EEPROM write enable */ +#define CR_SOFT_RST 0x10 /* Reset */ +#define CR_RE 0x08 /* Ethernet receive enable */ +#define CR_TE 0x04 /* Ethernet transmit enable */ +#define CR_EP3CLREN 0x02 /* clear performance counter after EP3 */ +#define CR_AUTOLOAD 0x01 /* autoload contents of 93c46 */ + +#define CR_BITS "\020\006WEPROM\005SOFT_RST\004RE\003TE\002EP3CLREN\001AUTOLOAD" + +/* TCR: Transmit Configuration register */ +#define TCR_TXRR 0xc0 /* Tx retry count */ +#define TCR_TXRR_SHIFT 6 +#define TCR_IFG 0x18 /* Interframe Gap */ +#define TCR_IFG_SHIFT 3 +#define TCR_IFG_802_3 (3 << TCR_IFG_SHIFT) /* 802.3 standard */ +#define TCR_NOCRC 0x01 /* Inhibit Appending CRC */ + +#define TCR_BITS "\020\001NOCRC" + +/* Receive Configuration register */ +#define RCR_TAIL 0x0080 /* Rx header forward to host in CRC field */ +#define RCR_AER 0x0040 /* Accept Error packet */ +#define RCR_AR 0x0020 /* Accept runt */ +#define RCR_AM 0x0010 /* Accept multicast */ +#define RCR_AB 0x0008 /* Accept broadcast */ +#define RCR_AD 0x0004 /* Accept physical match */ +#define RCR_AAM 0x0002 /* Accept all Multicast */ +#define RCR_AAP 0x0001 /* Accept all physical */ + +#define RCR_ACCEPT_MODE \ + (RCR_AER | RCR_AR | RCR_AM | RCR_AB | RCR_AD | RCR_AAM | RCR_AAP) + +#define RCR_BITS \ + "\020\010TAIL\007AER\006AR\005AM\004AB\003AD\002AAM\001AAP" + +/* Transmit Status register */ + +#define TSR_ECOL 0x20 /* excessive collision indication */ +#define TSR_LCOL 0x10 /* late collision indication */ +#define TSR_LOSS_CRS 0x08 /* lost of carrier indication */ +#define TSR_JBR 0x04 /* jabber time out indication */ +#define TSR_BUF_EMPTY 0x02 /* Tx buffer is empty */ +#define TSR_BUF_FULL 0x01 /* Tx buffer is full */ + +#define TSR_BITS \ + "\020" \ + "\006ECOL" \ + "\005LCOL" \ + "\004LOSS_CRS" \ + "\003JBR" \ 
+ "\002BUF_EMPTY" \ + "\001BUF_FULL" + +/* Receive status register in Rx packet field */ +#define RSR_WEVENT 0x80 /* Wakeup event indication */ +#define RSR_RX_BUF_FULL 0x40 /* Receive buffer full indication */ +#define RSR_LKCHG 0x20 /* Link change indication */ +#define RSR_RUNT 0x10 /* short packet indication */ +#define RSR_LONG 0x08 /* Long packet indication*/ +#define RSR_CRC 0x04 /* CRC error indication*/ +#define RSR_FAE 0x02 /* Frame alignment error */ +#define RSR_ROK 0x01 /* Receive OK indication */ + +#define RSR_ERRS (RSR_RUNT | RSR_LONG | RSR_CRC | RSR_FAE) +#define RSR_BITS \ + "\020" \ + "\010WEVENT" \ + "\007RX_BUF_FULL" \ + "\006LKCHG" \ + "\005RUNT" \ + "\004LONG" \ + "\003CRC" \ + "\002FAE" \ + "\001ROK" + +/* Config 0 */ + +#define CON0_SUSLED 0x80 +#define CON0_PARM_EN 0x40 /* parameter enable */ +#define CON0_LDPS 0x08 +#define CON0_MSEL 0x04 /* media select 1:MII, 0:auto */ +#define CON0_LEDS 0x03 /* LED pattern */ + +/* Config 1 */ +#define CON0_BWF 0x40 /* Broadcast wakeup function 1:on 0:off */ +#define CON0_MWF 0x20 /* Multicast wakeup function 1:on 0:off */ +#define CON0_UWF 0x10 /* Unicast wakeup function 1:on 0:off */ +#define CON0_LONGWF1 0x02 /* */ +#define CON0_LONGWF0 0x01 /* */ + + +/* MSR : Media Status register */ +#define MSR_TXFCE 0x80 /* Tx Flow control enable */ +#define MSR_RXFCE 0x40 /* Rx Flow control enable */ +#define MSR_DUPLEX 0x10 /* full duplex */ +#define MSR_SPEED_100 0x08 /* 100Mbps mode */ +#define MSR_LINK 0x04 /* link status */ +#define MSR_TXPF 0x02 /* 8150 sends pause packet */ +#define MSR_RXPF 0x01 /* 8150 is in backoff state*/ + +#define MSR_BITS \ + "\020" \ + "\010TXFCE" \ + "\007RXFCE" \ + "\005DUPLEX" \ + "\004SPEED_100" \ + "\003LINK" \ + "\002TXPF" \ + "\001RXPF" + +/* MII PHY Address */ +#define PHYADD_MASK 0x1f + +/* MII PHY Data */ +#define PHYCNT_OWN 0x40 /* 8150 owns:1 not owns:0 */ +#define PHYCNT_RWCR 0x20 /* write:1 read:0 */ +#define PHYCNT_PHYOFF 0x1f + +/* BMCR (almost same with 
MII_CONTROL register) */ +#define BMCR_RESET 0x8000 /* PHY reset */ +#define BMCR_Spd_Set 0x2000 /* 100Mbps */ +#define BMCR_ANE 0x1000 /* auto negotiation enable */ +#define BMCR_RSA 0x0200 /* restart auto negotiation */ +#define BMCR_duplex 0x0100 /* full duplex */ + +/* Basic mode status register */ +/* Auto-negotiation Advertisement register */ +/* Auto-negotiation Link Partner Ability register */ +/* Auto-negotiation Expansion register */ + +/* Nway test register */ +#define NWAYT_NWLPBK 0x0080 +#define NWAYT_ENNWLE 0x0008 +#define NWAYT_FLAGABD 0x0004 +#define NWAYT_FLAGPDF 0x0002 +#define NWAYT_FLAGLSC 0x0001 + +/* CS configuration register */ +#define CS_TESTFUN 0x8000 /* */ +#define CS_LD 0x0200 /* */ +#define CS_HEARTBEAT 0x0100 /* */ +#define CS_JBEN 0x0080 /* */ +#define CS_F_LINK100 0x0040 /* */ +#define CS_F_CONNECT 0x0020 /* */ +#define CS_CON_STATUS 0x0008 /* */ +#define CS_CON_STATUS_EN 0x0004 /* */ +#define CS_PASS_SCR 0x0001 /* bypass scramble function */ + +/* + * header format of rx packet + */ +#define RXHD_MULT 0x8000 /* multicast packet */ +#define RXHD_PHYS 0x4000 /* physical match packet */ +#define RXHD_RUNT 0x2000 /* too short */ +#define RXHD_VALID 0x1000 /* packet is ok */ +#define RXHD_BYTECNT 0x0fff /* rx byte count */ + +#define RXHD_BITS \ + "\020" \ + "\020MULT" \ + "\017PHYS" \ + "\016RUNT" \ + "\015VALID" +/* + * Offset to EPROM contents + */ +#define URF_EEPROM_BASE 0x1200 +#define EPROM_EthernetID 0x0002 diff --git a/usr/src/uts/common/io/urf/urf_usbgem.c b/usr/src/uts/common/io/urf/urf_usbgem.c new file mode 100644 index 0000000000..f61c8e3502 --- /dev/null +++ b/usr/src/uts/common/io/urf/urf_usbgem.c @@ -0,0 +1,1039 @@ +/* + * urf_usbgem.c : Realtek RTL8150 USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2003-2012 Masayuki Murayama. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" +#include "usbgem_mii.h" +#include "rtl8150reg.h" + +char ident[] = "rtl8150 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define ROUNDUP2(x, y) (((x)+(y)-1) & ~((y)-1)) +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int urf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (urf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for rtl8150 + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device definitions + */ +struct chip_info { + int flags; + char *name; + int type; +}; + +#define CHIPTABLESIZE (sizeof (chiptbl_8150) / sizeof (struct chip_info)) + +struct urf_dev { + /* + * Misc HW information + */ + struct chip_info *chip; + uint8_t cr; + uint8_t tsr; + uint16_t rcr; + uint8_t txok_cnt; +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t urf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void urf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int urf_attach_chip(struct usbgem_dev *); +static int urf_reset_chip(struct usbgem_dev *); +static int urf_init_chip(struct usbgem_dev *); +static int urf_start_chip(struct usbgem_dev *); +static int urf_stop_chip(struct 
usbgem_dev *); +static int urf_set_media(struct usbgem_dev *); +static int urf_set_rx_filter(struct usbgem_dev *); +static int urf_get_stats(struct usbgem_dev *); + +/* packet operations */ +static mblk_t *urf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *urf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUTB(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ 1, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTW(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ 2, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define IN(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ sizeof ((*vp)), \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ 
USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * variables + */ +/* =============================================================== */ +static int urf_ppa = 0; + +/* =============================================================== */ +/* + * Hardware manupilation + */ +/* =============================================================== */ +static int +urf_reset_chip(struct usbgem_dev *dp) +{ + int i; + int err; + uint8_t reg; + struct urf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + lp->cr = 0; + OUTB(dp, CR, lp->cr | CR_SOFT_RST, &err, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, CR, ®, &err, usberr); + if ((reg & CR_SOFT_RST) == 0) { + return (USB_SUCCESS); + } + } + /* time out */ + cmn_err(CE_WARN, "%s: failed to reset: timeout", dp->name); + return (USB_FAILURE); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * Setup rtl8150 + */ +static int +urf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err; + struct urf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* ID registers: set later by urf_set_rx_filter */ + + /* Multicast registers: set later by urf_set_rx_filter */ + + /* Command register : Enable Tx and Rx before writing TCR and RCR */ + lp->cr |= CR_RE | CR_TE; + OUTB(dp, CR, lp->cr, &err, usberr); + + /* Transmit configration register : */ + OUTB(dp, TCR, TCR_IFG_802_3, &err, usberr); + + /* Receive configuration register : disable rx filter */ + lp->rcr = RCR_TAIL | RCR_AER | RCR_AR; + OUTW(dp, RCR, lp->rcr, &err, usberr); +#ifdef notdef + /* Media status register */ + err = 
urf_set_media(dp); + CHECK_AND_JUMP(err, usberr); +#endif + /* Configuration register 0: no need to change */ + + DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__)); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +urf_start_chip(struct usbgem_dev *dp) +{ + struct urf_dev *lp = dp->private; + + /* do nothing */ + return (USB_SUCCESS); +} + +static int +urf_stop_chip(struct usbgem_dev *dp) +{ + return (urf_reset_chip(dp)); +} + +static int +urf_get_stats(struct usbgem_dev *dp) +{ + /* do nothing */ + return (USB_SUCCESS); +} + +static uint_t +urf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_be(addr)); +} + +static int +urf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint16_t mode; + uint8_t mhash[8]; + int err; + int16_t rcr; + struct urf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, dp->rxmode)); + + if (lp->rcr & (RCR_AB | RCR_AD | RCR_AAM | RCR_AAP | RCR_AM)) { +#ifdef notdef + /* disable rx filter before changing it. */ + lp->rcr &= ~(RCR_AB | RCR_AD | RCR_AAM | RCR_AAP | RCR_AM); + OUTW(dp, RCR, lp->rcr, &err, usberr); +#else + /* receive all packets while we change rx filter*/ + OUTW(dp, RCR, lp->rcr | RCR_AAM | RCR_AAP, &err, usberr); +#endif + } + + mode = RCR_AB /* accept broadcast */ + | RCR_AD; /* accept physical match */ + bzero(mhash, sizeof (mhash)); + + if (dp->rxmode & RXMODE_PROMISC) { + /* promiscious mode implies all multicast and all physical */ + mode |= RCR_AAM | RCR_AAP; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 64/2) { + /* accept all multicast packets */ + mode |= RCR_AAM; + } else if (dp->mc_count > 0) { + /* + * make hash table to select interresting + * multicast address only. 
+ */ + mode |= RCR_AM; + for (i = 0; i < dp->mc_count; i++) { + uint_t h; + /* hash table is 64 = 2^6 bit width */ + h = dp->mc_list[i].hash >> (32 - 6); + mhash[h / 8] |= 1 << (h % 8); + } + } + lp->rcr |= mode; + + /* set mac address */ + OUTS(dp, IDR, dp->cur_addr.ether_addr_octet, ETHERADDRL, &err, usberr); + + /* set multicast hash table */ + if (mode & RCR_AM) { + /* need to set up multicast hash table */ + OUTS(dp, MAR, mhash, sizeof (mhash), &err, usberr); + } + + OUTW(dp, RCR, lp->rcr, &err, usberr); + +#if DEBUG_LEVEL > 2 + IN(dp, RCR, &rcr, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, rcr, RCR_BITS); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +urf_set_media(struct usbgem_dev *dp) +{ + uint8_t new; + uint8_t old; + int err; + struct urf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* select duplex: do nothing */ + + /* select speed: do nothing */ + + /* flow control */ + IN(dp, MSR, &old, &err, usberr); + + + /* setup flow control */ + new = old & ~(MSR_TXFCE | MSR_RXFCE); + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + new |= MSR_TXFCE | MSR_RXFCE; + break; + + case FLOW_CONTROL_TX_PAUSE: + new |= MSR_TXFCE; + break; + + case FLOW_CONTROL_RX_PAUSE: + new |= MSR_RXFCE; + break; + + case FLOW_CONTROL_NONE: + default: + break; + } + + if (new != old) { + OUTB(dp, MSR, new, &err, usberr); + } + DPRINTF(2, (CE_CONT, "!%s: %s: returned", dp->name, __func__)); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * send/receive packet check + */ +static mblk_t * +urf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + size_t len; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + + len = msgdsize(mp); + + if (len < ETHERMIN || mp->b_cont != NULL || (len 
& 0x3f) == 0) { + /* + * re-allocate mp + */ + len = max(len, ETHERMIN); + + if ((len & 0x3f) == 0) { + /* workaround for buggy USB hba */ + len++; + } + + if ((new = allocb(len, 0)) == NULL) { + return (NULL); + } + + /* copy contents of the buffer */ + new->b_wptr = new->b_rptr + len; + bp = new->b_rptr; + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, len); + bp += len; + } + + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + mp = new; + } + + return (mp); +} + +static void +urf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +urf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *p; + uint16_t rxhd; + uint_t len; + + ASSERT(mp != NULL); + len = msgdsize(mp); +#ifdef DEBUG_LEVEL + DPRINTF(2, (CE_CONT, "!%s: time:%d %s: len:%d cont:%p", + dp->name, ddi_get_lbolt(), __func__, len, mp->b_cont)); + + if (urf_debug > 2) { + urf_dump_packet(dp, mp->b_rptr, max(6, len)); + } +#endif + if (len < ETHERMIN + ETHERFCSL) { + /* Too short */ + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } + + /* get Rx header which is placed at tail of the packet. 
*/ + p = mp->b_wptr - 4; + rxhd = (p[1] << 8) | p[0]; + len = rxhd & RXHD_BYTECNT; + + DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d", + dp->name, __func__, rxhd, RXHD_BITS, len)); + + /* check if error happen */ + if ((rxhd & (RXHD_VALID)) == 0) { + DPRINTF(-1, (CE_CONT, "!%s: %s: rxhd:%b", + dp->name, __func__, rxhd, RXHD_BITS)); + if (rxhd & RXHD_RUNT) { + dp->stats.runt++; + } + + dp->stats.errrcv++; + return (NULL); + } +#ifdef notdef + /* check packet size */ + if (len > ETHERMAX + ETHERFCSL) { + /* too long */ + dp->stats.frame_too_long++; + dp->stats.errrcv++; + return (NULL); + } else if (len < ETHERMIN + ETHERFCSL) { + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } +#endif + /* remove tailing crc field */ + mp->b_wptr -= ETHERFCSL; + return (mp); +} + +/* + * MII Interfaces + */ +static uint16_t +urf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + int reg; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, index)); + + *errp = USB_SUCCESS; + + switch (index) { + case MII_CONTROL: + reg = BMCR; + break; + + case MII_STATUS: + reg = BMSR; + break; + + case MII_AN_ADVERT: + reg = ANAR; + break; + + case MII_AN_LPABLE: + reg = ANLP; + break; + + case MII_AN_EXPANSION: + reg = ANER; + break; + + default: + return (0); + } + + IN(dp, reg, &val, errp, usberr); + + if (index == MII_STATUS) { + uint8_t msr; + /* + * Fix MII status register as it does't have LINKUP and + * MFPRMBLSUPR bits. 
+ */ + IN(dp, MSR, &msr, errp, usberr); + + val |= (MII_STATUS_MFPRMBLSUPR | MII_STATUS_LINKUP); + if ((msr & MSR_LINK) == 0) { + val &= ~MII_STATUS_LINKUP; + } + } + + return (val); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + + return (0); +} + +static void +urf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + int reg; + + DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__)); + + *errp = USB_SUCCESS; + + switch (index) { + case MII_CONTROL: + reg = BMCR; + break; + + case MII_STATUS: + reg = BMSR; + break; + + case MII_AN_ADVERT: + reg = ANAR; + break; + + case MII_AN_LPABLE: + reg = ANLP; + break; + + case MII_AN_EXPANSION: + reg = ANER; + break; + + default: + return; + } + + OUTW(dp, reg, val, errp, usberr); +usberr: + ; +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +static void +urf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint16_t w0, w1, w2, w3; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + for (i = URF_EEPROM_BASE; i < size + URF_EEPROM_BASE; i += 8) { + IN(dp, i + 0, &w0, &err, usberr); + IN(dp, i + 2, &w1, &err, usberr); + IN(dp, i + 4, &w2, &err, usberr); + IN(dp, i + 6, &w3, &err, usberr); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i - URF_EEPROM_BASE, w0, w1, w2, w3); + } +usberr: + ; +} + +static int +urf_attach_chip(struct usbgem_dev *dp) +{ + int i; + uint8_t old; + uint_t new; + uint8_t reg; + int err; + struct urf_dev *lp = dp->private; + + /* + * setup flow control bit in eeprom + */ + IN(dp, URF_EEPROM_BASE + 9, &old, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: eeprom offset 9: %02x", dp->name, old)); + + if (dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE) { + /* enable PAUSE bit */ + new = old | 0x04; + } else { + /* clear PAUSE bit */ + new = old & ~0x04; + } + if 
(new != old) { + /* make eeprom writable */ + OUTB(dp, CR, lp->cr | CR_WEPROM, &err, usberr); + + /* eerom allows only word access for writing */ + IN(dp, URF_EEPROM_BASE + 8, ®, &err, usberr); + new = (new << 8) | reg; + + OUTW(dp, URF_EEPROM_BASE + 8, new, &err, usberr); + + /* make eeprom non-writable */ + OUTB(dp, CR, lp->cr, &err, usberr); + } + + /* + * load EEPROM contents into nic + */ + OUTB(dp, CR, lp->cr | CR_AUTOLOAD, &err, usberr); + CHECK_AND_JUMP(err, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, CR, ®, &err, usberr); + if ((reg & CR_AUTOLOAD) == 0) { + goto autoload_done; + } + } + /* timeout */ + cmn_err(CE_WARN, "%s: %s: failed to autoload: timeout", + dp->name, __func__); + goto usberr; + +autoload_done: + /* + * mac address in EEPROM has loaded to ID registers. + */ + INS(dp, IDR, dp->dev_addr.ether_addr_octet, ETHERADDRL, &err, usberr); + + /* no need to scan phy */ + dp->mii_phy_addr = -1; + +#if DEBUG_LEVEL > 2 + urf_eeprom_dump(dp, 0x80); +#endif + +#ifdef CONFIG_VLAN + dp->misc_flag = USBGEM_VLAN; +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: urf_attach_chip: usb error detected", dp->name); + return (USB_FAILURE); +} + +static int +urfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int unit; + struct chip_info *p; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct urf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, __func__, unit, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * Check if the chip is supported. 
+ */ + + /* + * Check the chip if it is really realtek rtl8150 + */ + + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + sprintf(ugcp->usbgc_name, + "%s%d(ppa=%d)", drv_name, unit, urf_ppa); +#ifdef USBGEM_CONFIG_GLDv3 + ugcp->usbgc_ppa = urf_ppa; +#else + ugcp->usbgc_ppa = unit; +#endif + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 16; + + /* the rx status partially replaces FCS */ + ugcp->usbgc_rx_header_len = 0; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = ONESEC; + + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = (25*ONESEC)/10; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &urf_attach_chip; + ugcp->usbgc_reset_chip = &urf_reset_chip; + ugcp->usbgc_init_chip = &urf_init_chip; + ugcp->usbgc_start_chip = &urf_start_chip; + ugcp->usbgc_stop_chip = &urf_stop_chip; + ugcp->usbgc_multicast_hash = &urf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &urf_set_rx_filter; + ugcp->usbgc_set_media = &urf_set_media; + ugcp->usbgc_get_stats = &urf_get_stats; +#ifdef notdef + ugcp->usbgc_interrupt = &urf_interrupt; +#else + ugcp->usbgc_interrupt = NULL; +#endif + /* packet operation */ + ugcp->usbgc_tx_make_packet = &urf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &urf_rx_make_packet; + + /* mii operations */ + 
ugcp->usbgc_mii_probe = &usbgem_mii_probe_default; + ugcp->usbgc_mii_init = &usbgem_mii_init_default; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &urf_mii_read; + ugcp->usbgc_mii_write = &urf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct urf_dev), KM_SLEEP); + lp->chip = p; + + ddi_set_driver_private(dip, NULL); + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct urf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + urf_ppa++; + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct urf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + if (cmd == DDI_RESUME) { + return (usbgem_resume(dip)); + } + return (DDI_FAILURE); +} + +static int +urfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + urf_ppa--; + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(urf_ops, urfattach, urfdetach); +#else +static struct module_info urfminfo = { + 0, /* mi_idnum */ + "urf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + ETHERMTU*128, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit urfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &urfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit urfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* 
qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &urfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab urf_info = { + &urfrinit, /* st_rdinit */ + &urfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_urf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &urf_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + +static struct dev_ops urf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + urfattach, /* devo_attach */ + urfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_urf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif + +}; +#endif + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &urf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!urf: _init: called")); + + status = usbgem_mod_init(&urf_ops, "urf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&urf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!urf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&urf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/usbgem/usbgem.c b/usr/src/uts/common/io/usbgem/usbgem.c new file mode 100644 index 0000000000..a42f7119ef --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem.c @@ -0,0 +1,6389 @@ +/* + * usbgem.c: General USB to Fast Ethernet mac driver framework + * + * Copyright (c) 2002-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "@(#)usbgem.c 1.6 12/02/09" + +/* + * Change log + */ + +/* + * TODO: + * implement DELAYED_START + */ + +/* + * System Header files. 
+ */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/vtrace.h> +#include <sys/ethernet.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#ifndef USBGEM_CONFIG_GLDv3 +#include <sys/dlpi.h> +#include <sys/strsubr.h> +#endif +#include <sys/stream.h> /* required for MBLK* */ +#include <sys/strsun.h> /* required for mionack() */ +#include <sys/byteorder.h> + +#include <sys/usb/usba.h> +#ifdef USBGEM_CONFIG_GLDv3 +#include <inet/common.h> +#include <inet/led.h> +#include <inet/mi.h> +#include <inet/nd.h> +#endif + +/* supplement definitions */ +extern const char *usb_str_cr(usb_cr_t); + +#ifndef USBGEM_CONFIG_GLDv3 +#pragma weak gld_linkstate +#endif +#include <sys/note.h> + +#include "usbgem_mii.h" +#include "usbgem.h" + +#ifdef MODULE +char ident[] = "usb general ethernet mac driver v" VERSION; +#else +extern char ident[]; +#endif + +/* Debugging support */ +#ifdef USBGEM_DEBUG_LEVEL +static int usbgem_debug = USBGEM_DEBUG_LEVEL; +#define DPRINTF(n, args) if (usbgem_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Useful macros and typedefs + */ +#define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define DEFAULT_PIPE(dp) ((dp)->reg_data->dev_default_ph) +#define VTAG_SIZE 4 +#define BOOLEAN(x) ((x) != 0) +/* + * configuration parameters + */ +#define USBDRV_MAJOR_VER 2 +#define USBDRV_MINOR_VER 0 + +#define ETHERHEADERL (sizeof (struct ether_header)) +#define MAXPKTLEN(dp) ((dp)->mtu + ETHERHEADERL) +#define MAXPKTBUF(dp) ((dp)->mtu + ETHERHEADERL + ETHERFCSL) + +#define WATCH_INTERVAL_FAST drv_usectohz(100*1000) + +#define STOP_GRACEFUL B_TRUE + +/* + * Private functions + */ +static int usbgem_open_pipes(struct usbgem_dev *dp); +static int usbgem_close_pipes(struct usbgem_dev *dp); +static void usbgem_intr_cb(usb_pipe_handle_t, usb_intr_req_t *); +static void usbgem_bulkin_cb(usb_pipe_handle_t, usb_bulk_req_t *); +static void 
usbgem_bulkout_cb(usb_pipe_handle_t, usb_bulk_req_t *); + +static int usbgem_mii_start(struct usbgem_dev *); +static void usbgem_mii_stop(struct usbgem_dev *); + +/* local buffer management */ +static int usbgem_init_rx_buf(struct usbgem_dev *); + +/* internal mac interfaces */ +static void usbgem_tx_timeout(struct usbgem_dev *); +static void usbgem_mii_link_watcher(struct usbgem_dev *); +static int usbgem_mac_init(struct usbgem_dev *); +static int usbgem_mac_start(struct usbgem_dev *); +static int usbgem_mac_stop(struct usbgem_dev *, int, boolean_t); +static void usbgem_mac_ioctl(struct usbgem_dev *, queue_t *, mblk_t *); + +int usbgem_speed_value[] = {10, 100, 1000}; + +static int usbgem_ctrl_retry = 5; + +/* usb event support */ +static int usbgem_disconnect_cb(dev_info_t *dip); +static int usbgem_reconnect_cb(dev_info_t *dip); +int usbgem_suspend(dev_info_t *dip); +int usbgem_resume(dev_info_t *dip); + +static uint8_t usbgem_bcastaddr[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +#ifdef MODULE +extern struct mod_ops mod_miscops; + +static struct modlmisc modlmisc = { + &mod_miscops, + "usbgem v" VERSION, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlmisc, NULL +}; + +/* + * _init : done + */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!usbgem: _init: called")); + status = mod_install(&modlinkage); + + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!usbgem: _fini: called")); + status = mod_remove(&modlinkage); + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} +#endif /* MODULE */ + +/* ============================================================== */ +/* + * Ether CRC calculation utilities + */ +/* ============================================================== */ +/* + * Ether CRC calculation according to 21143 data sheet + */ +#define CRC32_POLY_LE 0xedb88320 +uint32_t +usbgem_ether_crc_le(const 
uint8_t *addr)
+{
+	int	idx;
+	int	bit;
+	uint_t	data;
+	uint32_t	crc = 0xffffffff;
+
+	for (idx = 0; idx < ETHERADDRL; idx++) {
+		for (data = *addr++, bit = 0; bit < 8; bit++, data >>= 1) {
+			crc = (crc >> 1) ^
+			    (((crc ^ data) & 1) ? CRC32_POLY_LE : 0);
+		}
+	}
+	return (crc);
+}
+
+#define	CRC32_POLY_BE	0x04c11db7
+/*
+ * usbgem_ether_crc_be: bitwise CRC-32 over the ETHERADDRL bytes at addr.
+ *
+ * The register starts at 0xffffffff and shifts left; each address byte is
+ * fed least-significant bit first and CRC32_POLY_BE is folded in whenever
+ * the register's top bit differs from the incoming data bit.  The value is
+ * returned without a final inversion.
+ */
+uint32_t
+usbgem_ether_crc_be(const uint8_t *addr)
+{
+	int	idx;
+	int	bit;
+	uint_t	data;
+	uint32_t	crc;
+
+	crc = 0xffffffff;
+	for (idx = 0; idx < ETHERADDRL; idx++) {
+		for (data = *addr++, bit = 0; bit < 8; bit++, data >>= 1) {
+			crc = (crc << 1) ^
+			    ((((crc >> 31) ^ data) & 1) ? CRC32_POLY_BE : 0);
+		}
+	}
+	return (crc);
+}
+
+/*
+ * usbgem_prop_get_int: look up an integer property of the device.
+ *
+ * The property name is built by formatting prop_template with dp->name.
+ * def_val is handed to ddi_prop_get_int() as the default value.
+ */
+int
+usbgem_prop_get_int(struct usbgem_dev *dp, char *prop_template, int def_val)
+{
+	char	propname[32];
+
+	/* bounded: an over-long name is truncated, not written past the buffer */
+	(void) snprintf(propname, sizeof (propname), prop_template, dp->name);
+
+	return (ddi_prop_get_int(DDI_DEV_T_ANY, dp->dip,
+	    DDI_PROP_DONTPASS, propname, def_val));
+}
+
+/*
+ * usbgem_population: return the number of bits set in x.
+ */
+static int
+usbgem_population(uint32_t x)
+{
+	int	i;
+	int	cnt;
+
+	cnt = 0;
+	for (i = 0; i < 32; i++) {
+		/* unsigned constant: shifting a signed 1 by 31 is undefined */
+		if (x & (1U << i)) {
+			cnt++;
+		}
+	}
+	return (cnt);
+}
+
+static clock_t
+usbgem_timestamp_nz(void)
+{
+	clock_t	now;
+	now = ddi_get_lbolt();
+	return (now ?
now : (clock_t)1); +} + +#ifdef USBGEM_DEBUG_LEVEL +#ifdef USBGEM_DEBUG_VLAN +#ifdef notdef +#include <netinet/in.h> +#endif +static void +usbgem_dump_packet(struct usbgem_dev *dp, char *title, mblk_t *mp, + boolean_t check_cksum) +{ + char msg[180]; + uint8_t buf[18+20+20]; + uint8_t *p; + size_t offset; + uint_t ethertype; + uint_t proto; + uint_t ipproto = 0; + uint_t iplen; + uint_t iphlen; + uint_t tcplen; + uint_t udplen; + uint_t cksum; + int rest; + int len; + char *bp; + mblk_t *tp; + extern uint_t ip_cksum(mblk_t *, int, uint32_t); + + msg[0] = 0; + bp = msg; + + rest = sizeof (buf); + offset = 0; + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + len = min(rest, len); + bcopy(tp->b_rptr, &buf[offset], len); + rest -= len; + offset += len; + if (rest == 0) { + break; + } + } + + offset = 0; + p = &buf[offset]; + + /* ethernet address */ + sprintf(bp, + "ether: %02x:%02x:%02x:%02x:%02x:%02x" + " -> %02x:%02x:%02x:%02x:%02x:%02x", + p[6], p[7], p[8], p[9], p[10], p[11], + p[0], p[1], p[2], p[3], p[4], p[5]); + bp = &msg[strlen(msg)]; + + /* vlag tag and etherrtype */ + ethertype = GET_ETHERTYPE(p); + if (ethertype == VTAG_TPID) { + sprintf(bp, " vtag:0x%04x", GET_NET16(&p[14])); + bp = &msg[strlen(msg)]; + + offset += VTAG_SIZE; + p = &buf[offset]; + ethertype = GET_ETHERTYPE(p); + } + sprintf(bp, " type:%04x", ethertype); + bp = &msg[strlen(msg)]; + + /* ethernet packet length */ + sprintf(bp, " mblklen:%d", msgdsize(mp)); + bp = &msg[strlen(msg)]; + if (mp->b_cont) { + sprintf(bp, "("); + bp = &msg[strlen(msg)]; + for (tp = mp; tp; tp = tp->b_cont) { + if (tp == mp) { + sprintf(bp, "%d", tp->b_wptr - tp->b_rptr); + } else { + sprintf(bp, "+%d", tp->b_wptr - tp->b_rptr); + } + bp = &msg[strlen(msg)]; + } + sprintf(bp, ")"); + bp = &msg[strlen(msg)]; + } + + if (ethertype != ETHERTYPE_IP) { + goto x; + } + + /* ip address */ + offset += sizeof (struct ether_header); + p = &buf[offset]; + ipproto = p[9]; + iplen = GET_NET16(&p[2]); + 
sprintf(bp, ", ip: %d.%d.%d.%d -> %d.%d.%d.%d proto:%d iplen:%d", + p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], + ipproto, iplen); + bp = (void *)&msg[strlen(msg)]; + + iphlen = (p[0] & 0xf) * 4; + + /* cksum for psuedo header */ + cksum = *(uint16_t *)&p[12]; + cksum += *(uint16_t *)&p[14]; + cksum += *(uint16_t *)&p[16]; + cksum += *(uint16_t *)&p[18]; + cksum += BE_16(ipproto); + + /* tcp or udp protocol header */ + offset += iphlen; + p = &buf[offset]; + if (ipproto == IPPROTO_TCP) { + tcplen = iplen - iphlen; + sprintf(bp, ", tcp: len:%d cksum:%x", + tcplen, GET_NET16(&p[16])); + bp = (void *)&msg[strlen(msg)]; + + if (check_cksum) { + cksum += BE_16(tcplen); + cksum = (uint16_t)ip_cksum(mp, offset, cksum); + sprintf(bp, " (%s)", + (cksum == 0 || cksum == 0xffff) ? "ok" : "ng"); + bp = (void *)&msg[strlen(msg)]; + } + } else if (ipproto == IPPROTO_UDP) { + udplen = GET_NET16(&p[4]); + sprintf(bp, ", udp: len:%d cksum:%x", + udplen, GET_NET16(&p[6])); + bp = (void *)&msg[strlen(msg)]; + + if (GET_NET16(&p[6]) && check_cksum) { + cksum += *(uint16_t *)&p[4]; + cksum = (uint16_t)ip_cksum(mp, offset, cksum); + sprintf(bp, " (%s)", + (cksum == 0 || cksum == 0xffff) ? 
"ok" : "ng"); + bp = (void *)&msg[strlen(msg)]; + } + } +x: + cmn_err(CE_CONT, "!%s: %s: %s", dp->name, title, msg); +} +#endif /* USBGEM_DEBUG_VLAN */ +#endif /* USBGEM_DEBUG_LEVEL */ + +#ifdef GEM_GCC_RUNTIME +/* + * gcc3 runtime routines + */ +#pragma weak memcmp +int +memcmp(const void *s1, const void *s2, size_t n) +{ + int i; + int ret; + + ret = 0; + for (i = 0; i < n; i++) { + ret = (int)((uint8_t *)s1)[i] - (int)((uint8_t *)s2)[i]; + if (ret) { + return (ret); + } + } + return (0); +} + +#pragma weak memset +void * +memset(void *s, int c, size_t n) +{ + if ((c & 0xff) == 0) { + bzero(s, n); + } else { + while (n--) { + ((uint8_t *)s)[n] = c; + } + } + return (s); +} + +#pragma weak _memcpy = memcpy +#pragma weak memcpy +void * +memcpy(void *s1, const void *s2, size_t n) +{ + bcopy(s2, s1, n); + return (s1); +} +#endif /* GEM_GCC_RUNTIME */ +/* ============================================================== */ +/* + * hardware operations + */ +/* ============================================================== */ +static int +usbgem_hal_reset_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_reset_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_init_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_init_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_attach_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_attach_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_set_rx_filter(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_set_rx_filter)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_set_media(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_set_media)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + 
+static int +usbgem_hal_start_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_start_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_stop_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_stop_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_get_stats(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_get_stats)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + + +/* ============================================================== */ +/* + * USB pipe management + */ +/* ============================================================== */ +static boolean_t +usbgem_rx_start_unit(struct usbgem_dev *dp, usb_bulk_req_t *req) +{ + mblk_t *mp; + int err; + usb_flags_t flags; + + ASSERT(req); + + mp = allocb(dp->rx_buf_len, BPRI_MED); + if (mp == NULL) { + cmn_err(CE_WARN, "!%s: %s: failed to allocate mblk", + dp->name, __func__); + goto err; + } + + req->bulk_len = dp->rx_buf_len; + req->bulk_data = mp; + req->bulk_client_private = (usb_opaque_t)dp; + req->bulk_timeout = 0; + req->bulk_attributes = USB_ATTRS_SHORT_XFER_OK; + req->bulk_cb = usbgem_bulkin_cb; + req->bulk_exc_cb = usbgem_bulkin_cb; + req->bulk_completion_reason = 0; + req->bulk_cb_flags = 0; + + flags = 0; + err = usb_pipe_bulk_xfer(dp->bulkin_pipe, req, flags); + + if (err != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: failed to bulk_xfer for rx, err:%d", + dp->name, err); + + /* free req and mp */ + usb_free_bulk_req(req); + goto err; + } + return (B_TRUE); +err: + return (B_FALSE); +} + +/* ============================================================== */ +/* + * Rx/Tx buffer management + */ +/* ============================================================== */ +static int +usbgem_init_rx_buf(struct usbgem_dev *dp) +{ + int i; + usb_bulk_req_t *req; + + ASSERT(dp->mac_state == MAC_STATE_ONLINE); + + for (i = 0; 
i < dp->ugc.usbgc_rx_list_max; i++) { + req = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP); + if (req == NULL) { + cmn_err(CE_WARN, + "!%s: %s: failed to allocate bulkreq for rx", + dp->name, __func__); + return (USB_FAILURE); + } + if (!usbgem_rx_start_unit(dp, req)) { + return (USB_FAILURE); + } + mutex_enter(&dp->rxlock); + dp->rx_busy_cnt++; + mutex_exit(&dp->rxlock); + } + return (USB_SUCCESS); +} + +/* ============================================================== */ +/* + * memory resource management + */ +/* ============================================================== */ +static int +usbgem_free_memory(struct usbgem_dev *dp) +{ + usb_bulk_req_t *req; + + /* free all tx requst structure */ + while ((req = dp->tx_free_list) != NULL) { + dp->tx_free_list = + (usb_bulk_req_t *)req->bulk_client_private; + req->bulk_data = NULL; + usb_free_bulk_req(req); + } + return (USB_SUCCESS); +} + +static int +usbgem_alloc_memory(struct usbgem_dev *dp) +{ + int i; + usb_bulk_req_t *req; + + /* allocate tx requests */ + dp->tx_free_list = NULL; + for (i = 0; i < dp->ugc.usbgc_tx_list_max; i++) { + req = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP); + if (req == NULL) { + cmn_err(CE_WARN, + "%s:%s failed to allocate tx requests", + dp->name, __func__); + + /* free partially allocated tx requests */ + (void) usbgem_free_memory(dp); + return (USB_FAILURE); + } + + /* add the new one allocated into tx free list */ + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + } + + return (USB_SUCCESS); +} + +/* ========================================================== */ +/* + * Start transmission. + * Return zero on success, + */ +/* ========================================================== */ + +#ifdef TXTIMEOUT_TEST +static int usbgem_send_cnt = 0; +#endif + +/* + * usbgem_send is used only to send data packet into ethernet line. 
+ */ +static mblk_t * +usbgem_send_common(struct usbgem_dev *dp, mblk_t *mp, uint32_t flags) +{ + int err; + mblk_t *new; + usb_bulk_req_t *req; + int mcast; + int bcast; + int len; + boolean_t intr; + usb_flags_t usb_flags = 0; +#ifdef USBGEM_DEBUG_LEVEL + usb_pipe_state_t p_state; +#endif + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + intr = (flags & 1) != 0; + len = msgdsize(mp); + bcast = 0; + mcast = 0; + if (mp->b_rptr[0] & 1) { + if (bcmp(mp->b_rptr, &usbgem_bcastaddr, ETHERADDRL) == 0) { + bcast = 1; + } else { + mcast = 1; + } + } + new = (*dp->ugc.usbgc_tx_make_packet)(dp, mp); + if (new == NULL) { + /* + * no memory resource. we don't stop downstream, + * we just discard the packet. + */ + DPRINTF(0, (CE_CONT, "!%s: %s: no memory", + dp->name, __func__)); + freemsg(mp); + + mutex_enter(&dp->txlock); + dp->stats.noxmtbuf++; + dp->stats.errxmt++; + mutex_exit(&dp->txlock); + + return (NULL); + } + + ASSERT(new->b_cont == NULL); + + mutex_enter(&dp->txlock); + if (dp->tx_free_list == NULL) { + /* + * no tx free slot + */ + ASSERT(dp->tx_busy_cnt == dp->ugc.usbgc_tx_list_max); + mutex_exit(&dp->txlock); + + DPRINTF(4, (CE_CONT, "!%s: %s: no free slot", + dp->name, __func__)); + if (new && new != mp) { + /* free reallocated message */ + freemsg(new); + } + return (mp); + } + req = dp->tx_free_list; + dp->tx_free_list = (usb_bulk_req_t *)req->bulk_client_private; + dp->tx_busy_cnt++; + + if (dp->tx_free_list == NULL) { + intr = B_TRUE; + } + if (intr) { + dp->tx_intr_pended++; + } + DB_TCI(new) = intr; +#ifdef USBGEM_DEBUG_LEVEL + new->b_datap->db_cksum32 = dp->tx_seq_num; + dp->tx_seq_num++; +#endif + dp->stats.obytes += len; + dp->stats.opackets++; + if (bcast | mcast) { + dp->stats.obcast += bcast; + dp->stats.omcast += mcast; + } + mutex_exit(&dp->txlock); + + DPRINTF(2, (CE_CONT, "!%s: %s: sending", dp->name, __func__)); + + req->bulk_len = (long)new->b_wptr - (long)new->b_rptr; + req->bulk_data = new; + req->bulk_client_private = 
(usb_opaque_t)dp; + req->bulk_timeout = dp->bulkout_timeout; /* in second */ + req->bulk_attributes = 0; + req->bulk_cb = usbgem_bulkout_cb; + req->bulk_exc_cb = usbgem_bulkout_cb; + req->bulk_completion_reason = 0; + req->bulk_cb_flags = 0; + + if (intr) { + usb_flags = USB_FLAGS_SLEEP; + } + if ((err = usb_pipe_bulk_xfer(dp->bulkout_pipe, req, usb_flags)) + != USB_SUCCESS) { + + /* failed to transfer the packet, discard it. */ + freemsg(new); + req->bulk_data = NULL; + + /* recycle the request block */ + mutex_enter(&dp->txlock); + dp->tx_busy_cnt--; + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + mutex_exit(&dp->txlock); + + cmn_err(CE_NOTE, + "%s: %s: usb_pipe_bulk_xfer: failed: err:%d", + dp->name, __func__, err); + + /* we use another flag to indicate error state. */ + if (dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + } else { + /* record the start time */ + dp->tx_start_time = ddi_get_lbolt(); + } + + if (err == USB_SUCCESS && (usb_flags & USB_FLAGS_SLEEP)) { + usbgem_bulkout_cb(dp->bulkout_pipe, req); + } + + if (new != mp) { + freemsg(mp); + } + return (NULL); +} + +int +usbgem_restart_nic(struct usbgem_dev *dp) +{ + int ret; + int flags = 0; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + ASSERT(dp->mac_state != MAC_STATE_DISCONNECTED); + + /* + * ensure to stop the nic + */ + if (dp->mac_state == MAC_STATE_ONLINE) { + (void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL); + } + + /* now the nic become quiescent, reset the chip */ + if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: %s: failed to reset chip", + dp->name, __func__); + goto err; + } + + /* + * restore the nic state step by step + */ + if (dp->nic_state < NIC_STATE_INITIALIZED) { + goto done; + } + + if (usbgem_mac_init(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: %s: failed to initialize chip", + dp->name, __func__); + goto err; + } + + /* setup mac address and 
enable rx filter */ + sema_p(&dp->rxfilter_lock); + dp->rxmode |= RXMODE_ENABLE; + ret = usbgem_hal_set_rx_filter(dp); + sema_v(&dp->rxfilter_lock); + if (ret != USB_SUCCESS) { + goto err; + } + + /* + * update the link state asynchronously + */ + cv_signal(&dp->link_watcher_wait_cv); + + /* + * XXX - a panic happened because of linkdown. + * We must check mii_state here, because the link can be down just + * before the restart event happen. If the link is down now, + * gem_mac_start() will be called from gem_mii_link_check() when + * the link become up later. + */ + if (dp->mii_state == MII_STATE_LINKUP) { + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto err; + } + if (dp->nic_state < NIC_STATE_ONLINE) { + goto done; + } + + (void) usbgem_mac_start(dp); + + } +done: + return (USB_SUCCESS); +err: +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + return (USB_FAILURE); +} + +static void +usbgem_tx_timeout(struct usbgem_dev *dp) +{ + int ret; + uint_t rwlock; + clock_t now; + + for (; ; ) { + mutex_enter(&dp->tx_watcher_lock); + ret = cv_timedwait(&dp->tx_watcher_cv, &dp->tx_watcher_lock, + dp->tx_watcher_interval + ddi_get_lbolt()); + mutex_exit(&dp->tx_watcher_lock); + + if (dp->tx_watcher_stop) { + break; + } + + now = ddi_get_lbolt(); + + rwlock = RW_READER; +again: + rw_enter(&dp->dev_state_lock, rwlock); + + if ((dp->mac_state != MAC_STATE_DISCONNECTED && + dp->fatal_error && + now - dp->fatal_error >= dp->ugc.usbgc_tx_timeout) || + (dp->mac_state == MAC_STATE_ONLINE && + dp->mii_state == MII_STATE_LINKUP && + dp->tx_busy_cnt != 0 && + now - dp->tx_start_time >= dp->ugc.usbgc_tx_timeout)) { + if (rwlock == RW_READER) { + /* + * Upgrade dev_state_lock from shared mode + * to exclusive mode to restart nic + */ + rwlock = RW_WRITER; + rw_exit(&dp->dev_state_lock); + goto again; + } + cmn_err(CE_WARN, "%s: %s: restarting the nic:" + " fatal_error:%ld nic_state:%d" + " mac_state:%d starttime:%ld", + dp->name, __func__, + 
dp->fatal_error ? now - dp->fatal_error: 0, + dp->nic_state, dp->mac_state, + dp->tx_busy_cnt ? now - dp->tx_start_time : 0); + + (void) usbgem_restart_nic(dp); + } + + rw_exit(&dp->dev_state_lock); + } +} + +static int +usbgem_tx_watcher_start(struct usbgem_dev *dp) +{ + int err; + kthread_t *wdth; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* make a first call of uwgem_lw_link_check() */ + dp->tx_watcher_stop = 0; + dp->tx_watcher_interval = drv_usectohz(1000*1000); + + wdth = thread_create(NULL, 0, usbgem_tx_timeout, dp, 0, &p0, + TS_RUN, minclsyspri); + if (wdth == NULL) { + cmn_err(CE_WARN, + "!%s: %s: failed to create a tx_watcher thread", + dp->name, __func__); + return (USB_FAILURE); + } + dp->tx_watcher_did = wdth->t_did; + + return (USB_SUCCESS); +} + +static void +usbgem_tx_watcher_stop(struct usbgem_dev *dp) +{ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + if (dp->tx_watcher_did) { + /* Ensure timer routine stopped */ + dp->tx_watcher_stop = 1; + cv_signal(&dp->tx_watcher_cv); + thread_join(dp->tx_watcher_did); + dp->tx_watcher_did = NULL; + } +} + +/* ================================================================== */ +/* + * Callback handlers + */ +/* ================================================================== */ +static void +usbgem_bulkin_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req) +{ + mblk_t *newmp; + mblk_t *mp; + mblk_t *tp; + int len = 0; + int pkts = 0; + int bcast = 0; + int mcast = 0; + boolean_t busy; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)req->bulk_client_private; + mp = req->bulk_data; + req->bulk_data = NULL; + + DPRINTF(2, (CE_CONT, "!%s: %s: mp:%p, cr:%s(%d)", + dp->name, __func__, mp, + usb_str_cr(req->bulk_completion_reason), + req->bulk_completion_reason)); + + /* + * we cannot acquire dev_state_lock because the routine + * must be executed during usbgem_mac_stop() to avoid + * dead lock. + * we use a simle membar operation to get the state correctly. 
+ */ + membar_consumer(); + + if (req->bulk_completion_reason == USB_CR_OK && + dp->nic_state == NIC_STATE_ONLINE) { + newmp = (*dp->ugc.usbgc_rx_make_packet)(dp, mp); + + if (newmp != mp) { + /* the message has been reallocated, free old one */ + freemsg(mp); + } + + /* the message may includes one or more ethernet packets */ + for (tp = newmp; tp; tp = tp->b_next) { + len += tp->b_wptr - tp->b_rptr; + pkts++; + if (tp->b_rptr[0] & 1) { + if (bcmp(tp->b_rptr, &usbgem_bcastaddr, + ETHERADDRL) == 0) { + bcast++; + } else { + mcast++; + } + } + } + + /* send up if it is a valid packet */ +#ifdef USBGEM_CONFIG_GLDv3 + mac_rx(dp->mh, NULL, newmp); +#else + while (newmp) { + tp = newmp; + newmp = newmp->b_next; + tp->b_next = NULL; + gld_recv(dp->macinfo, tp); + } +#endif + } else { + freemsg(mp); + len = 0; + } + + mutex_enter(&dp->rxlock); + /* update rx_active */ + if (dp->rx_active) { + dp->rx_active = dp->mac_state == MAC_STATE_ONLINE; + } + + dp->stats.rbytes += len; + dp->stats.rpackets += pkts; + if (bcast | mcast) { + dp->stats.rbcast += bcast; + dp->stats.rmcast += mcast; + } + mutex_exit(&dp->rxlock); + + if (dp->rx_active) { + /* prepare to receive the next packets */ + if (usbgem_rx_start_unit(dp, req)) { + /* we successed */ + goto done; + } + cmn_err(CE_WARN, + "!%s: %s: failed to fill next rx packet", + dp->name, __func__); + /* + * we use another flag to indicate error state. + * if we acquire dev_state_lock for RW_WRITER here, + * usbgem_mac_stop() may hang. 
+ */ + if (dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + } else { + /* no need to prepare the next packets */ + usb_free_bulk_req(req); + } + + mutex_enter(&dp->rxlock); + dp->rx_active = B_FALSE; + dp->rx_busy_cnt--; + if (dp->rx_busy_cnt == 0) { + /* wake up someone waits for me */ + cv_broadcast(&dp->rx_drain_cv); + } + mutex_exit(&dp->rxlock); +done: + ; +} + +static void +usbgem_bulkout_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req) +{ + boolean_t intr; + boolean_t tx_sched; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)req->bulk_client_private; + tx_sched = B_FALSE; + + DPRINTF(2, (CE_CONT, + "!%s: %s: cr:%s(%d) cb_flags:0x%x head:%d tail:%d", + dp->name, __func__, + usb_str_cr(req->bulk_completion_reason), + req->bulk_completion_reason, + req->bulk_cb_flags, + dp->tx_busy_cnt)); + + /* we have finished to transfer the packet into tx fifo */ + intr = DB_TCI(req->bulk_data); + freemsg(req->bulk_data); + + if (req->bulk_completion_reason != USB_CR_OK && + dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + + mutex_enter(&dp->txlock); + + if (intr) { + ASSERT(dp->tx_intr_pended > 0); + /* find the last interrupt we have scheduled */ + if (--(dp->tx_intr_pended) == 0) { + tx_sched = B_TRUE; + } + } + + ASSERT(dp->tx_busy_cnt > 0); + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + dp->tx_busy_cnt--; + +#ifdef CONFIG_TX_LIMITER + if (tx_sched) { + dp->tx_max_packets = + min(dp->tx_max_packets + 1, dp->ugc.usbgc_tx_list_max); + } +#endif + if (dp->mac_state != MAC_STATE_ONLINE && dp->tx_busy_cnt == 0) { + cv_broadcast(&dp->tx_drain_cv); + } + + mutex_exit(&dp->txlock); + + if (tx_sched) { +#ifdef USBGEM_CONFIG_GLDv3 + mac_tx_update(dp->mh); +#else + gld_sched(dp->macinfo); +#endif + } +} + +static void +usbgem_intr_cb(usb_pipe_handle_t ph, usb_intr_req_t *req) +{ + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)req->intr_client_private; + 
dp->stats.intr++; + + if (req->intr_completion_reason == USB_CR_OK) { + (*dp->ugc.usbgc_interrupt)(dp, req->intr_data); + } + + /* free the request and data */ + usb_free_intr_req(req); +} + +/* ======================================================================== */ +/* + * MII support routines + */ +/* ======================================================================== */ +static void +usbgem_choose_forcedmode(struct usbgem_dev *dp) +{ + /* choose media mode */ + if (dp->anadv_1000fdx || dp->anadv_1000hdx) { + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = dp->anadv_1000fdx; + } else if (dp->anadv_100fdx || dp->anadv_100t4) { + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if (dp->anadv_100hdx) { + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_FALSE; + } else { + dp->speed = USBGEM_SPD_10; + dp->full_duplex = dp->anadv_10fdx; + } +} + +static uint16_t +usbgem_mii_read(struct usbgem_dev *dp, uint_t reg, int *errp) +{ + uint16_t val; + + sema_p(&dp->hal_op_lock); + val = (*dp->ugc.usbgc_mii_read)(dp, reg, errp); + sema_v(&dp->hal_op_lock); + + return (val); +} + +static void +usbgem_mii_write(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp) +{ + sema_p(&dp->hal_op_lock); + (*dp->ugc.usbgc_mii_write)(dp, reg, val, errp); + sema_v(&dp->hal_op_lock); +} + +static int +usbgem_mii_probe(struct usbgem_dev *dp) +{ + int err; + + err = (*dp->ugc.usbgc_mii_probe)(dp); + return (err); +} + +static int +usbgem_mii_init(struct usbgem_dev *dp) +{ + int err; + + err = (*dp->ugc.usbgc_mii_init)(dp); + return (err); +} + +#define fc_cap_decode(x) \ + ((((x) & MII_ABILITY_PAUSE) != 0 ? 1 : 0) | \ + (((x) & MII_ABILITY_ASM_DIR) != 0 ? 
2 : 0)) + +int +usbgem_mii_config_default(struct usbgem_dev *dp, int *errp) +{ + uint16_t mii_stat; + uint16_t val; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Configure bits in advertisement register + */ + mii_stat = dp->mii_status; + + DPRINTF(1, (CE_CONT, "!%s: %s: MII_STATUS reg:%b", + dp->name, __func__, mii_stat, MII_STATUS_BITS)); + + if ((mii_stat & MII_STATUS_ABILITY_TECH) == 0) { + /* it's funny */ + cmn_err(CE_WARN, "!%s: wrong ability bits: mii_status:%b", + dp->name, mii_stat, MII_STATUS_BITS); + return (USB_FAILURE); + } + + /* Do not change the rest of ability bits in advert reg */ + val = usbgem_mii_read(dp, MII_AN_ADVERT, errp) & ~MII_ABILITY_ALL; + if (*errp != USB_SUCCESS) { + goto usberr; + } + + DPRINTF(0, (CE_CONT, + "!%s: %s: 100T4:%d 100F:%d 100H:%d 10F:%d 10H:%d", + dp->name, __func__, + dp->anadv_100t4, dp->anadv_100fdx, dp->anadv_100hdx, + dp->anadv_10fdx, dp->anadv_10hdx)); + + /* set technology bits */ + if (dp->anadv_100t4) { + val |= MII_ABILITY_100BASE_T4; + } + if (dp->anadv_100fdx) { + val |= MII_ABILITY_100BASE_TX_FD; + } + if (dp->anadv_100hdx) { + val |= MII_ABILITY_100BASE_TX; + } + if (dp->anadv_10fdx) { + val |= MII_ABILITY_10BASE_T_FD; + } + if (dp->anadv_10hdx) { + val |= MII_ABILITY_10BASE_T; + } + + /* set flow control capabilities */ + if (dp->anadv_pause) { + val |= MII_ABILITY_PAUSE; + } + if (dp->anadv_asmpause) { + val |= MII_ABILITY_ASM_DIR; + } + + DPRINTF(0, (CE_CONT, + "!%s: %s: setting MII_AN_ADVERT reg:%b, pause:%d, asmpause:%d", + dp->name, __func__, val, MII_ABILITY_BITS, + dp->anadv_pause, dp->anadv_asmpause)); + + usbgem_mii_write(dp, MII_AN_ADVERT, val, errp); + if (*errp != USB_SUCCESS) { + goto usberr; + } + + if (dp->mii_status & MII_STATUS_XSTATUS) { + /* + * 1000Base-T GMII support + */ + if (!dp->anadv_autoneg) { + /* enable manual configuration */ + val = MII_1000TC_CFG_EN; + if (dp->anadv_1000t_ms == 2) { + val |= MII_1000TC_CFG_VAL; + } + } else { + val = 0; + if 
(dp->anadv_1000fdx) { + val |= MII_1000TC_ADV_FULL; + } + if (dp->anadv_1000hdx) { + val |= MII_1000TC_ADV_HALF; + } + switch (dp->anadv_1000t_ms) { + case 1: + /* slave */ + val |= MII_1000TC_CFG_EN; + break; + + case 2: + /* master */ + val |= MII_1000TC_CFG_EN | MII_1000TC_CFG_VAL; + break; + + default: + /* auto: do nothing */ + break; + } + } + DPRINTF(0, (CE_CONT, + "!%s: %s: setting MII_1000TC reg:%b", + dp->name, __func__, val, MII_1000TC_BITS)); + + usbgem_mii_write(dp, MII_1000TC, val, errp); + if (*errp != USB_SUCCESS) { + goto usberr; + } + } + return (USB_SUCCESS); + +usberr: + return (*errp); +} + +static char *usbgem_fc_type[] = { + "without", + "with symmetric", + "with tx", + "with rx", +}; + +#ifdef USBGEM_CONFIG_GLDv3 +#define USBGEM_LINKUP(dp) mac_link_update((dp)->mh, LINK_STATE_UP) +#define USBGEM_LINKDOWN(dp) mac_link_update((dp)->mh, LINK_STATE_DOWN) +#else +#define USBGEM_LINKUP(dp) \ + if (gld_linkstate) { \ + gld_linkstate((dp)->macinfo, GLD_LINKSTATE_UP); \ + } +#define USBGEM_LINKDOWN(dp) \ + if (gld_linkstate) { \ + gld_linkstate((dp)->macinfo, GLD_LINKSTATE_DOWN); \ + } +#endif + +static uint8_t usbgem_fc_result[4 /* my cap */][4 /* lp cap */] = { +/* none symm tx rx/symm */ +/* none */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE}, +/* sym */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC, + FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC}, +/* tx */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_TX_PAUSE}, +/* rx/symm */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC, + FLOW_CONTROL_RX_PAUSE, + FLOW_CONTROL_SYMMETRIC}, +}; + +static boolean_t +usbgem_mii_link_check(struct usbgem_dev *dp, int *oldstatep, int *newstatep) +{ + boolean_t tx_sched = B_FALSE; + uint16_t status; + uint16_t advert; + uint16_t lpable; + uint16_t exp; + uint16_t ctl1000; + uint16_t stat1000; + uint16_t val; + clock_t now; + clock_t diff; + int linkdown_action; + boolean_t fix_phy = B_FALSE; + 
int err; + uint_t rwlock; + + DPRINTF(4, (CE_CONT, "!%s: %s: time:%d state:%d", + dp->name, __func__, ddi_get_lbolt(), dp->mii_state)); + + if (dp->mii_state != MII_STATE_LINKUP) { + rwlock = RW_WRITER; + } else { + rwlock = RW_READER; + } +again: + rw_enter(&dp->dev_state_lock, rwlock); + + /* save old mii state */ + *oldstatep = dp->mii_state; + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* stop periodic execution of the link watcher */ + dp->mii_interval = 0; + tx_sched = B_FALSE; + goto next; + } + + now = ddi_get_lbolt(); + diff = now - dp->mii_last_check; + dp->mii_last_check = now; + + /* + * For NWAM, don't show linkdown state right + * when the device is attached. + */ + if (dp->linkup_delay > 0) { + if (dp->linkup_delay > diff) { + dp->linkup_delay -= diff; + } else { + /* link up timeout */ + dp->linkup_delay = -1; + } + } + +next_nowait: + switch (dp->mii_state) { + case MII_STATE_UNKNOWN: + goto reset_phy; + + case MII_STATE_RESETTING: + dp->mii_timer -= diff; + if (dp->mii_timer > 0) { + /* don't read phy registers in resetting */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (val & MII_CONTROL_RESET) { + cmn_err(CE_NOTE, + "!%s: time:%ld resetting phy not complete." 
+ " mii_control:0x%b", + dp->name, ddi_get_lbolt(), + val, MII_CONTROL_BITS); + } + + /* ensure neither isolated nor pwrdown nor auto-nego mode */ + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + goto usberr; + } +#if USBGEM_DEBUG_LEVEL > 10 + val = usbgem_mii_read(dp, MII_CONTROL, &err); + cmn_err(CE_CONT, "!%s: readback control %b", + dp->name, val, MII_CONTROL_BITS); +#endif + /* As resetting PHY has completed, configure PHY registers */ + if ((*dp->ugc.usbgc_mii_config)(dp, &err) != USB_SUCCESS) { + /* we failed to configure PHY */ + goto usberr; + } + + /* prepare for forced mode */ + usbgem_choose_forcedmode(dp); + + dp->mii_lpable = 0; + dp->mii_advert = 0; + dp->mii_exp = 0; + dp->mii_ctl1000 = 0; + dp->mii_stat1000 = 0; + + dp->flow_control = FLOW_CONTROL_NONE; + + if (!dp->anadv_autoneg) { + /* skip auto-negotiation phase */ + dp->mii_state = MII_STATE_MEDIA_SETUP; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + goto next_nowait; + } + + /* issue an auto-negotiation command */ + goto autonego; + + case MII_STATE_AUTONEGOTIATING: + /* + * Autonegotiation in progress + */ + dp->mii_timer -= diff; + if (dp->mii_timer - + (dp->ugc.usbgc_mii_an_timeout - dp->ugc.usbgc_mii_an_wait) + > 0) { + /* wait for minimum time (2.3 - 2.5 sec) */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + /* read PHY status */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + DPRINTF(4, (CE_CONT, + "!%s: %s: called: mii_state:%d MII_STATUS reg:%b", + dp->name, __func__, dp->mii_state, + status, MII_STATUS_BITS)); + + if (status & MII_STATUS_REMFAULT) { + /* + * The link parnert told me something wrong happend. + * What do we do ? + */ + cmn_err(CE_CONT, + "!%s: auto-negotiation failed: remote fault", + dp->name); + goto autonego; + } + + if ((status & MII_STATUS_ANDONE) == 0) { + if (dp->mii_timer <= 0) { + /* + * Auto-negotiation has been timed out, + * Reset PHY and try again. 
+ */ + if (!dp->mii_supress_msg) { + cmn_err(CE_WARN, + "!%s: auto-negotiation failed:" + " timeout", + dp->name); + dp->mii_supress_msg = B_TRUE; + } + goto autonego; + } + /* + * Auto-negotiation is in progress. Wait for a while. + */ + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + } + + /* + * Auto-negotiation has been completed. Let's go to AN_DONE. + */ + dp->mii_state = MII_STATE_AN_DONE; + dp->mii_supress_msg = B_FALSE; + DPRINTF(0, (CE_CONT, + "!%s: auto-negotiation completed, MII_STATUS:%b", + dp->name, status, MII_STATUS_BITS)); + + if (dp->ugc.usbgc_mii_an_delay > 0) { + dp->mii_timer = dp->ugc.usbgc_mii_an_delay; + dp->mii_interval = drv_usectohz(20*1000); + goto next; + } + + dp->mii_timer = 0; + diff = 0; + goto next_nowait; + + case MII_STATE_AN_DONE: + /* + * Auto-negotiation has done. Now we can set up media. + */ + dp->mii_timer -= diff; + if (dp->mii_timer > 0) { + /* wait for a while */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + /* + * Setup speed and duplex mode according with + * the result of auto negotiation. + */ + + /* + * Read registers required to determin current + * duplex mode and media speed. 
+ */ + if (dp->ugc.usbgc_mii_an_delay > 0) { + /* the 'status' variable is not initialized yet */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + advert = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + lpable = usbgem_mii_read(dp, MII_AN_LPABLE, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + exp = usbgem_mii_read(dp, MII_AN_EXPANSION, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (exp == 0xffff) { + /* some phys don't have exp register */ + exp = 0; + } + + ctl1000 = 0; + stat1000 = 0; + if (dp->mii_status & MII_STATUS_XSTATUS) { + ctl1000 = usbgem_mii_read(dp, MII_1000TC, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + stat1000 = usbgem_mii_read(dp, MII_1000TS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + dp->mii_lpable = lpable; + dp->mii_advert = advert; + dp->mii_exp = exp; + dp->mii_ctl1000 = ctl1000; + dp->mii_stat1000 = stat1000; + + cmn_err(CE_CONT, + "!%s: auto-negotiation done: " + "status:%b, advert:%b, lpable:%b, exp:%b", + dp->name, + status, MII_STATUS_BITS, + advert, MII_ABILITY_BITS, + lpable, MII_ABILITY_BITS, + exp, MII_AN_EXP_BITS); + + DPRINTF(0, (CE_CONT, "!%s: MII_STATUS:%b", + dp->name, status, MII_STATUS_BITS)); + + if (dp->mii_status & MII_STATUS_XSTATUS) { + cmn_err(CE_CONT, + "! MII_1000TC reg:%b, MII_1000TS reg:%b", + ctl1000, MII_1000TC_BITS, + stat1000, MII_1000TS_BITS); + } + + if (usbgem_population(lpable) <= 1 && + (exp & MII_AN_EXP_LPCANAN) == 0) { + if ((advert & MII_ABILITY_TECH) != lpable) { + cmn_err(CE_WARN, + "!%s: but the link partner doesn't seem" + " to have auto-negotiation capability." + " please check the link configuration.", + dp->name); + } + /* + * it should be a result of pararell detection, + * which cannot detect duplex mode. 
+ */ + if ((advert & lpable) == 0 && + lpable & MII_ABILITY_10BASE_T) { + /* no common technology, try 10M half mode */ + lpable |= advert & MII_ABILITY_10BASE_T; + fix_phy = B_TRUE; + } + } else if (lpable == 0) { + cmn_err(CE_WARN, "!%s: wrong lpable.", dp->name); + goto reset_phy; + } + /* + * configure current link mode according to AN priority. + */ + val = advert & lpable; + if ((ctl1000 & MII_1000TC_ADV_FULL) && + (stat1000 & MII_1000TS_LP_FULL)) { + /* 1000BaseT & full duplex */ + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = B_TRUE; + } else if ((ctl1000 & MII_1000TC_ADV_HALF) && + (stat1000 & MII_1000TS_LP_HALF)) { + /* 1000BaseT & half duplex */ + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = B_FALSE; + } else if ((val & MII_ABILITY_100BASE_TX_FD)) { + /* 100BaseTx & fullduplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_100BASE_T4)) { + /* 100BaseTx & fullduplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_100BASE_TX)) { + /* 100BaseTx & half duplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_FALSE; + } else if ((val & MII_ABILITY_10BASE_T_FD)) { + /* 10BaseT & full duplex */ + dp->speed = USBGEM_SPD_10; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_10BASE_T)) { + /* 10BaseT & half duplex */ + dp->speed = USBGEM_SPD_10; + dp->full_duplex = B_FALSE; + } else { + /* + * the link partner doesn't seem to have + * auto-negotiation capability and our PHY + * could not report current mode correctly. + * We guess current mode by mii_control register. + */ + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + /* select 100m half or 10m half */ + dp->speed = (val & MII_CONTROL_100MB) ? 
+ USBGEM_SPD_100 : USBGEM_SPD_10; + dp->full_duplex = B_FALSE; + fix_phy = B_TRUE; + + cmn_err(CE_NOTE, + "!%s: auto-negotiation done but " + "common ability not found.\n" + "PHY state: control:%b advert:%b lpable:%b\n" + "guessing %d Mbps %s duplex mode", + dp->name, + val, MII_CONTROL_BITS, + advert, MII_ABILITY_BITS, + lpable, MII_ABILITY_BITS, + usbgem_speed_value[dp->speed], + dp->full_duplex ? "full" : "half"); + } + + if (dp->full_duplex) { + dp->flow_control = + usbgem_fc_result[fc_cap_decode(advert)] + [fc_cap_decode(lpable)]; + } else { + dp->flow_control = FLOW_CONTROL_NONE; + } + dp->mii_state = MII_STATE_MEDIA_SETUP; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + goto next_nowait; + + case MII_STATE_MEDIA_SETUP: + DPRINTF(2, (CE_CONT, "!%s: setup midia mode", dp->name)); + + /* assume the link state is down */ + dp->mii_state = MII_STATE_LINKDOWN; + dp->mii_supress_msg = B_FALSE; + + /* use short interval */ + dp->mii_interval = WATCH_INTERVAL_FAST; + + if ((!dp->anadv_autoneg) || + dp->ugc.usbgc_mii_an_oneshot || fix_phy) { + + /* + * write the result of auto negotiation back. 
+ */ + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + val &= ~(MII_CONTROL_SPEED | MII_CONTROL_FDUPLEX | + MII_CONTROL_ANE | MII_CONTROL_RSAN); + + if (dp->full_duplex) { + val |= MII_CONTROL_FDUPLEX; + } + + switch (dp->speed) { + case USBGEM_SPD_1000: + val |= MII_CONTROL_1000MB; + break; + + case USBGEM_SPD_100: + val |= MII_CONTROL_100MB; + break; + + default: + cmn_err(CE_WARN, "%s: unknown speed:%d", + dp->name, dp->speed); + /* FALLTHROUGH */ + + case USBGEM_SPD_10: + /* for USBGEM_SPD_10, do nothing */ + break; + } + + if (dp->mii_status & MII_STATUS_XSTATUS) { + usbgem_mii_write(dp, + MII_1000TC, MII_1000TC_CFG_EN, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + usbgem_mii_write(dp, MII_CONTROL, val, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + /* + * XXX -- nic state should be one of + * NIC_STATE_DISCONNECTED + * NIC_STATE_STOPPED + * NIC_STATE_INITIALIZED + * NIC_STATE_ONLINE + */ + if (dp->nic_state >= NIC_STATE_INITIALIZED) { + /* notify the result of autonegotiation to mac */ + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto usberr; + } + } + goto next_nowait; + + case MII_STATE_LINKDOWN: + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (status & MII_STATUS_LINKUP) { + /* + * Link is going up + */ + dp->mii_state = MII_STATE_LINKUP; + dp->mii_supress_msg = B_FALSE; + + DPRINTF(0, (CE_CONT, + "!%s: link up detected: status:%b", + dp->name, status, MII_STATUS_BITS)); + + /* + * MII_CONTROL_100MB and MII_CONTROL_FDUPLEX are + * ignored when MII_CONTROL_ANE is set. + */ + cmn_err(CE_CONT, + "!%s: Link up: %d Mbps %s duplex %s flow control", + dp->name, + usbgem_speed_value[dp->speed], + dp->full_duplex ? 
"full" : "half", + usbgem_fc_type[dp->flow_control]); + + dp->mii_interval = + dp->ugc.usbgc_mii_link_watch_interval; + + if (dp->ugc.usbgc_mii_hw_link_detection && + dp->nic_state == NIC_STATE_ONLINE) { + dp->mii_interval = 0; + } + + if (dp->nic_state == NIC_STATE_ONLINE) { + if (dp->mac_state == MAC_STATE_INITIALIZED) { + (void) usbgem_mac_start(dp); + } + tx_sched = B_TRUE; + } + + goto next; + } + + dp->mii_supress_msg = B_TRUE; + if (dp->anadv_autoneg) { + dp->mii_timer -= diff; + if (dp->mii_timer <= 0) { + /* + * the link down timer expired. + * need to restart auto-negotiation. + */ + linkdown_action = + dp->ugc.usbgc_mii_linkdown_timeout_action; + goto restart_autonego; + } + } + /* don't change mii_state */ + goto next; + + case MII_STATE_LINKUP: + if (rwlock == RW_READER) { + /* first pass, read mii status */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + if ((status & MII_STATUS_LINKUP) == 0) { + /* + * Link is going down + */ + cmn_err(CE_NOTE, + "!%s: link down detected: status:%b", + dp->name, status, MII_STATUS_BITS); + /* + * Acquire exclusive lock to change mii_state + */ + if (rwlock == RW_READER) { + rwlock = RW_WRITER; + rw_exit(&dp->dev_state_lock); + goto again; + } + + dp->mii_state = MII_STATE_LINKDOWN; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + + /* + * As we may change the state of the device, + * let us acquire exclusive lock for the state. + */ + if (dp->nic_state == NIC_STATE_ONLINE && + dp->mac_state == MAC_STATE_ONLINE && + dp->ugc.usbgc_mii_stop_mac_on_linkdown) { + (void) usbgem_restart_nic(dp); + /* drain tx */ + tx_sched = B_TRUE; + } + + if (dp->anadv_autoneg) { + /* need to restart auto-negotiation */ + linkdown_action = + dp->ugc.usbgc_mii_linkdown_action; + goto restart_autonego; + } + /* + * don't use hw link down detection until the link + * status become stable for a while. 
+ */ + dp->mii_interval = + dp->ugc.usbgc_mii_link_watch_interval; + + goto next; + } + + /* + * still link up, no need to change mii_state + */ + if (dp->ugc.usbgc_mii_hw_link_detection && + dp->nic_state == NIC_STATE_ONLINE) { + /* + * no need to check link status periodicly + * if nic can generate interrupts when link go down. + */ + dp->mii_interval = 0; + } + goto next; + } + /* NOTREACHED */ + cmn_err(CE_PANIC, "!%s: %s: not reached", dp->name, __func__); + + /* + * Actions for new state. + */ +restart_autonego: + switch (linkdown_action) { + case MII_ACTION_RESET: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: resetting PHY", dp->name); + } + dp->mii_supress_msg = B_TRUE; + goto reset_phy; + + case MII_ACTION_NONE: + dp->mii_supress_msg = B_TRUE; + if (dp->ugc.usbgc_mii_an_oneshot) { + goto autonego; + } + /* PHY will restart autonego automatically */ + dp->mii_state = MII_STATE_AUTONEGOTIATING; + dp->mii_timer = dp->ugc.usbgc_mii_an_timeout; + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + + case MII_ACTION_RSA: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: restarting auto-negotiation", + dp->name); + } + dp->mii_supress_msg = B_TRUE; + goto autonego; + + default: + cmn_err(CE_PANIC, "!%s: unknowm linkdown action: %d", + dp->name, dp->ugc.usbgc_mii_linkdown_action); + dp->mii_supress_msg = B_TRUE; + } + /* NOTREACHED */ + +reset_phy: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: resetting PHY", dp->name); + } + dp->mii_state = MII_STATE_RESETTING; + dp->mii_timer = dp->ugc.usbgc_mii_reset_timeout; + if (!dp->ugc.usbgc_mii_dont_reset) { + usbgem_mii_write(dp, MII_CONTROL, MII_CONTROL_RESET, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + +autonego: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: auto-negotiation started", dp->name); + } + dp->mii_state = MII_STATE_AUTONEGOTIATING; + dp->mii_timer = dp->ugc.usbgc_mii_an_timeout; + + /* 
start/restart autoneg */ + val = usbgem_mii_read(dp, MII_CONTROL, &err) & + ~(MII_CONTROL_ISOLATE | MII_CONTROL_PWRDN | MII_CONTROL_RESET); + if (err != USB_SUCCESS) { + goto usberr; + } + if (val & MII_CONTROL_ANE) { + val |= MII_CONTROL_RSAN; + } + usbgem_mii_write(dp, MII_CONTROL, + val | dp->ugc.usbgc_mii_an_cmd | MII_CONTROL_ANE, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + +usberr: + dp->mii_state = MII_STATE_UNKNOWN; + dp->mii_interval = dp->ugc.usbgc_mii_link_watch_interval; + tx_sched = B_TRUE; + +next: + *newstatep = dp->mii_state; + rw_exit(&dp->dev_state_lock); + return (tx_sched); +} + +static void +usbgem_mii_link_watcher(struct usbgem_dev *dp) +{ + int old_mii_state; + int new_mii_state; + boolean_t tx_sched; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + for (; ; ) { + + mutex_enter(&dp->link_watcher_lock); + if (dp->mii_interval) { + (void) cv_timedwait(&dp->link_watcher_wait_cv, + &dp->link_watcher_lock, + dp->mii_interval + ddi_get_lbolt()); + } else { + cv_wait(&dp->link_watcher_wait_cv, + &dp->link_watcher_lock); + } + mutex_exit(&dp->link_watcher_lock); + + if (dp->link_watcher_stop) { + break; + } + + /* we block callbacks from disconnect/suspend and restart */ + tx_sched = usbgem_mii_link_check(dp, + &old_mii_state, &new_mii_state); + + /* + * gld v2 notifier functions are not able to + * be called with any locks in this layer. 
+ */ + if (tx_sched) { + /* kick potentially stopped downstream */ +#ifdef USBGEM_CONFIG_GLDv3 + mac_tx_update(dp->mh); +#else + gld_sched(dp->macinfo); +#endif + } + + if (old_mii_state != new_mii_state) { + /* notify new mii link state */ + if (new_mii_state == MII_STATE_LINKUP) { + dp->linkup_delay = 0; + USBGEM_LINKUP(dp); + } else if (dp->linkup_delay <= 0) { + USBGEM_LINKDOWN(dp); + } + } else if (dp->linkup_delay < 0) { + /* first linkup timeout */ + dp->linkup_delay = 0; + USBGEM_LINKDOWN(dp); + } + } + + thread_exit(); +} + +void +usbgem_mii_update_link(struct usbgem_dev *dp) +{ + cv_signal(&dp->link_watcher_wait_cv); +} + +int +usbgem_mii_probe_default(struct usbgem_dev *dp) +{ + int phy; + uint16_t status; + uint16_t xstatus; + int err; + uint16_t adv; + uint16_t adv_org; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Scan PHY + */ + dp->mii_status = 0; + + /* Try default phy first */ + if (dp->mii_phy_addr) { + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (status != 0xffff && status != 0x0000) { + goto PHY_found; + } + + if (dp->mii_phy_addr < 0) { + cmn_err(CE_NOTE, + "!%s: failed to probe default internal and/or non-MII PHY", + dp->name); + return (USB_FAILURE); + } + + cmn_err(CE_NOTE, + "!%s: failed to probe default MII PHY at %d", + dp->name, dp->mii_phy_addr); + } + + /* Try all possible address */ + for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; phy++) { + dp->mii_phy_addr = phy; + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(status) failed", + dp->name, __func__)); + goto usberr; + } + + if (status != 0xffff && status != 0x0000) { + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_write(control) failed", + dp->name, __func__)); + goto usberr; + } + goto PHY_found; + } + } + for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; 
phy++) { + dp->mii_phy_addr = phy; + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_write(control) failed", + dp->name, __func__)); + goto usberr; + } + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(status) failed", + dp->name, __func__)); + goto usberr; + } + + if (status != 0xffff && status != 0) { + goto PHY_found; + } + } + + cmn_err(CE_NOTE, "!%s: no MII PHY found", dp->name); + return (USB_FAILURE); + +PHY_found: + dp->mii_status = status; + dp->mii_status_ro = ~status; + dp->mii_phy_id = usbgem_mii_read(dp, MII_PHYIDH, &err) << 16; + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(PHYIDH) failed", + dp->name, __func__)); + goto usberr; + } + dp->mii_phy_id |= usbgem_mii_read(dp, MII_PHYIDL, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(PHYIDL) failed", + dp->name, __func__)); + goto usberr; + } + + if (dp->mii_phy_addr < 0) { + cmn_err(CE_CONT, "!%s: using internal/non-MII PHY(0x%08x)", + dp->name, dp->mii_phy_id); + } else { + cmn_err(CE_CONT, "!%s: MII PHY (0x%08x) found at %d", + dp->name, dp->mii_phy_id, dp->mii_phy_addr); + } + + cmn_err(CE_CONT, + "!%s: PHY control:%b, status:%b, advert:%b, lpar:%b, exp:%b", + dp->name, + usbgem_mii_read(dp, MII_CONTROL, &err), MII_CONTROL_BITS, + status, MII_STATUS_BITS, + usbgem_mii_read(dp, MII_AN_ADVERT, &err), MII_ABILITY_BITS, + usbgem_mii_read(dp, MII_AN_LPABLE, &err), MII_ABILITY_BITS, + usbgem_mii_read(dp, MII_AN_EXPANSION, &err), MII_AN_EXP_BITS); + + dp->mii_xstatus = 0; + if (status & MII_STATUS_XSTATUS) { + dp->mii_xstatus = usbgem_mii_read(dp, MII_XSTATUS, &err); + + cmn_err(CE_CONT, "!%s: xstatus:%b", + dp->name, dp->mii_xstatus, MII_XSTATUS_BITS); + } + dp->mii_xstatus_ro = ~dp->mii_xstatus; + + /* check if the phy can advertize pause abilities */ + adv_org = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != 
USB_SUCCESS) { + goto usberr; + } + + usbgem_mii_write(dp, MII_AN_ADVERT, + MII_ABILITY_PAUSE | MII_ABILITY_ASM_DIR, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + adv = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + if ((adv & MII_ABILITY_PAUSE) == 0) { + dp->ugc.usbgc_flow_control &= ~1; + } + + if ((adv & MII_ABILITY_ASM_DIR) == 0) { + dp->ugc.usbgc_flow_control &= ~2; + } + + usbgem_mii_write(dp, MII_AN_ADVERT, adv_org, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + return (USB_SUCCESS); + +usberr: + return (USB_FAILURE); +} + +int +usbgem_mii_init_default(struct usbgem_dev *dp) +{ + /* ENPTY */ + return (USB_SUCCESS); +} + +static int +usbgem_mii_start(struct usbgem_dev *dp) +{ + int err; + kthread_t *lwth; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* make a first call of usbgem_mii_link_check() */ + dp->link_watcher_stop = 0; + dp->mii_state = MII_STATE_UNKNOWN; + dp->mii_interval = drv_usectohz(1000*1000); /* 1sec */ + dp->mii_last_check = ddi_get_lbolt(); + dp->linkup_delay = 600 * drv_usectohz(1000*1000); /* 10 minutes */ + + lwth = thread_create(NULL, 0, usbgem_mii_link_watcher, dp, 0, &p0, + TS_RUN, minclsyspri); + if (lwth == NULL) { + cmn_err(CE_WARN, + "!%s: %s: failed to create a link watcher thread", + dp->name, __func__); + return (USB_FAILURE); + } + dp->link_watcher_did = lwth->t_did; + + return (USB_SUCCESS); +} + +static void +usbgem_mii_stop(struct usbgem_dev *dp) +{ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* Ensure timer routine stopped */ + dp->link_watcher_stop = 1; + cv_signal(&dp->link_watcher_wait_cv); + thread_join(dp->link_watcher_did); +} + +/* ============================================================== */ +/* + * internal mac register operation interface + */ +/* ============================================================== */ +/* + * usbgem_mac_init: cold start + */ +static int +usbgem_mac_init(struct 
usbgem_dev *dp) +{ + int err; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* pretend we succeeded */ + return (USB_SUCCESS); + } + + ASSERT(dp->mac_state == MAC_STATE_STOPPED); + + /* reset fatal error timestamp */ + dp->fatal_error = (clock_t)0; + + /* reset tx side state */ + mutex_enter(&dp->txlock); + dp->tx_busy_cnt = 0; + dp->tx_max_packets = dp->ugc.usbgc_tx_list_max; + mutex_exit(&dp->txlock); + + /* reset rx side state */ + mutex_enter(&dp->rxlock); + dp->rx_busy_cnt = 0; + mutex_exit(&dp->rxlock); + + err = usbgem_hal_init_chip(dp); + if (err == USB_SUCCESS) { + dp->mac_state = MAC_STATE_INITIALIZED; + } + + return (err); +} + +/* + * usbgem_mac_start: warm start + */ +static int +usbgem_mac_start(struct usbgem_dev *dp) +{ + int err; + int i; + usb_flags_t flags = 0; + usb_intr_req_t *req; +#ifdef USBGEM_DEBUG_LEVEL + usb_pipe_state_t p_state; +#endif + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* do nothing but don't return failure */ + return (USB_SUCCESS); + } + + if (dp->mac_state != MAC_STATE_INITIALIZED) { + /* don't return failer */ + DPRINTF(0, (CE_CONT, + "!%s: %s: mac_state(%d) is not MAC_STATE_INITIALIZED", + dp->name, __func__, dp->mac_state)); + goto x; + } + + dp->mac_state = MAC_STATE_ONLINE; + + if (usbgem_hal_start_chip(dp) != USB_SUCCESS) { + cmn_err(CE_NOTE, + "!%s: %s: usb error was detected during start_chip", + dp->name, __func__); + goto x; + } + +#ifdef USBGEM_DEBUG_LEVEL + usb_pipe_get_state(dp->intr_pipe, &p_state, 0); + ASSERT(p_state == USB_PIPE_STATE_IDLE); +#endif /* USBGEM_DEBUG_LEVEL */ + + if (dp->ugc.usbgc_interrupt && dp->intr_pipe) { + + /* make a request for interrupt */ + + req = usb_alloc_intr_req(dp->dip, 0, USB_FLAGS_SLEEP); + if (req == NULL) { + cmn_err(CE_WARN, "!%s: %s: failed to allocate intreq", + dp->name, __func__); + goto x; + } + req->intr_data = NULL; + 
req->intr_client_private = (usb_opaque_t)dp; + req->intr_timeout = 0; + req->intr_attributes = + USB_ATTRS_SHORT_XFER_OK | USB_ATTRS_AUTOCLEARING; + req->intr_len = dp->ep_intr->wMaxPacketSize; + req->intr_cb = usbgem_intr_cb; + req->intr_exc_cb = usbgem_intr_cb; + req->intr_completion_reason = 0; + req->intr_cb_flags = 0; + + err = usb_pipe_intr_xfer(dp->intr_pipe, req, flags); + if (err != USB_SUCCESS) { + cmn_err(CE_WARN, + "%s: err:%d failed to start polling of intr pipe", + dp->name, err); + goto x; + } + } + + /* kick to receive the first packet */ + if (usbgem_init_rx_buf(dp) != USB_SUCCESS) { + goto err_stop_intr; + } + dp->rx_active = B_TRUE; + + return (USB_SUCCESS); + +err_stop_intr: + /* stop the interrupt pipe */ + DPRINTF(0, (CE_CONT, "!%s: %s: FAULURE", dp->name, __func__)); + if (dp->ugc.usbgc_interrupt && dp->intr_pipe) { + usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP); + } +x: + ASSERT(dp->mac_state == MAC_STATE_ONLINE); + /* we use another flag to indicate error state. 
*/ + if (dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + return (USB_FAILURE); +} + +static int +usbgem_mac_stop(struct usbgem_dev *dp, int new_state, boolean_t graceful) +{ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * we must have writer lock for dev_state_lock + */ + ASSERT(new_state == MAC_STATE_STOPPED + || new_state == MAC_STATE_DISCONNECTED); + + /* stop polling interrupt pipe */ + if (dp->ugc.usbgc_interrupt && dp->intr_pipe) { + usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP); + } + + if (new_state == MAC_STATE_STOPPED || graceful) { + /* stop the nic hardware completely */ + if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) { + (void) usbgem_hal_reset_chip(dp); + } + } + + /* stop preparing new rx packets and sending new packets */ + dp->mac_state = new_state; + + /* other processors must get mac_state correctly after here */ + membar_producer(); + + /* cancel all requests we have sent */ + usb_pipe_reset(dp->dip, dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0); + usb_pipe_reset(dp->dip, dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0); + + DPRINTF(0, (CE_CONT, + "!%s: %s: rx_busy_cnt:%d tx_busy_cnt:%d", + dp->name, __func__, dp->rx_busy_cnt, dp->tx_busy_cnt)); + + /* + * Here all rx packets has been cancelled and their call back + * function has been exeuted, because we called usb_pipe_reset + * synchronously. + * So actually we just ensure rx_busy_cnt == 0. 
+	 */
+	mutex_enter(&dp->rxlock);
+	while (dp->rx_busy_cnt > 0) {
+		cv_wait(&dp->rx_drain_cv, &dp->rxlock);
+	}
+	mutex_exit(&dp->rxlock);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: rx_busy_cnt is %d now",
+	    dp->name, __func__, dp->rx_busy_cnt));
+
+	mutex_enter(&dp->txlock);
+	while (dp->tx_busy_cnt > 0) {
+		cv_wait(&dp->tx_drain_cv, &dp->txlock);
+	}
+	mutex_exit(&dp->txlock);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: tx_busy_cnt is %d now",
+	    dp->name, __func__, dp->tx_busy_cnt));
+
+	return (USB_SUCCESS);
+}
+
+/*
+ * usbgem_add_multicast: register one multicast address
+ *
+ * The address `ep' (ETHERADDRL bytes) is appended to dp->mc_list as long
+ * as fewer than USBGEM_MAXMC addresses have been requested so far; the
+ * request counter dp->mc_count_req is incremented in either case.  When
+ * the number of requested addresses differs from the number actually
+ * stored, RXMODE_MULTI_OVF is set in dp->rxmode, otherwise it is cleared.
+ * Unless the device is disconnected, the new filter is then handed to
+ * usbgem_hal_set_rx_filter().  All of this runs under dp->rxfilter_lock.
+ *
+ * Returns the result of usbgem_hal_set_rx_filter(), or USB_SUCCESS when
+ * the device is disconnected and the hardware is therefore not touched.
+ */
+static int
+usbgem_add_multicast(struct usbgem_dev *dp, const uint8_t *ep)
+{
+	int	cnt;
+	/*
+	 * Must be initialized: it is only assigned below when the device
+	 * is still connected, yet it is returned unconditionally.
+	 */
+	int	err = USB_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+	if (dp->mc_count_req++ < USBGEM_MAXMC) {
+		/* append the new address at the end of the mclist */
+		cnt = dp->mc_count;
+		bcopy(ep, dp->mc_list[cnt].addr.ether_addr_octet,
+		    ETHERADDRL);
+		if (dp->ugc.usbgc_multicast_hash) {
+			/* chip specific hash value, cached with the entry */
+			dp->mc_list[cnt].hash =
+			    (*dp->ugc.usbgc_multicast_hash)(dp, ep);
+		}
+		dp->mc_count = cnt + 1;
+	}
+
+	if (dp->mc_count_req != dp->mc_count) {
+		/* multicast address list overflow */
+		dp->rxmode |= RXMODE_MULTI_OVF;
+	} else {
+		dp->rxmode &= ~RXMODE_MULTI_OVF;
+	}
+
+	if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+		/* tell new multicast list to the hardware */
+		err = usbgem_hal_set_rx_filter(dp);
+	}
+	sema_v(&dp->rxfilter_lock);
+
+	return (err);
+}
+
+/*
+ * usbgem_remove_multicast: unregister one multicast address
+ *
+ * Decrements the request counter, removes the first entry of dp->mc_list
+ * that matches `ep' (if any) by moving the following entries forward,
+ * recomputes RXMODE_MULTI_OVF and, unless the device is disconnected,
+ * reloads the hardware rx filter.
+ *
+ * Returns the result of usbgem_hal_set_rx_filter(), or USB_SUCCESS when
+ * the device is disconnected and the hardware is therefore not touched.
+ */
+static int
+usbgem_remove_multicast(struct usbgem_dev *dp, const uint8_t *ep)
+{
+	size_t	len;
+	int	i;
+	int	cnt;
+	/* initialized because the disconnected path never assigns it */
+	int	err = USB_SUCCESS;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	sema_p(&dp->rxfilter_lock);
+	dp->mc_count_req--;
+	cnt = dp->mc_count;
+	for (i = 0; i < cnt; i++) {
+		if (bcmp(ep, &dp->mc_list[i].addr, ETHERADDRL)) {
+			continue;
+		}
+		/* shrink the mclist by copying forward */
+		len = (cnt - (i + 1)) * sizeof (*dp->mc_list);
+		if (len > 0) {
+			bcopy(&dp->mc_list[i+1], &dp->mc_list[i], len);
+		}
+		dp->mc_count--;
+		break;
+	}
+
+	if (dp->mc_count_req != dp->mc_count)
{ + /* multicast address list overflow */ + dp->rxmode |= RXMODE_MULTI_OVF; + } else { + dp->rxmode &= ~RXMODE_MULTI_OVF; + } + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + err = usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + + return (err); +} + + +/* ============================================================== */ +/* + * ioctl + */ +/* ============================================================== */ +enum ioc_reply { + IOC_INVAL = -1, /* bad, NAK with EINVAL */ + IOC_DONE, /* OK, reply sent */ + IOC_ACK, /* OK, just send ACK */ + IOC_REPLY, /* OK, just send reply */ + IOC_RESTART_ACK, /* OK, restart & ACK */ + IOC_RESTART_REPLY /* OK, restart & reply */ +}; + + +#ifdef USBGEM_CONFIG_MAC_PROP +static int +usbgem_get_def_val(struct usbgem_dev *dp, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + link_flowctrl_t fl; + int err = 0; + + ASSERT(pr_valsize > 0); + switch (pr_num) { + case MAC_PROP_AUTONEG: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + break; + + case MAC_PROP_FLOWCTRL: + if (pr_valsize < sizeof (link_flowctrl_t)) { + return (EINVAL); + } + switch (dp->ugc.usbgc_flow_control) { + case FLOW_CONTROL_NONE: + fl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_SYMMETRIC: + fl = LINK_FLOWCTRL_BI; + break; + case FLOW_CONTROL_TX_PAUSE: + fl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_RX_PAUSE: + fl = LINK_FLOWCTRL_RX; + break; + } + bcopy(&fl, pr_val, sizeof (fl)); + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_EN_1000FDX_CAP: + *(uint8_t *)pr_val = + (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_EN_1000HDX_CAP: + *(uint8_t *)pr_val = + (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case MAC_PROP_ADV_100T4_CAP: + case MAC_PROP_EN_100T4_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & 
MII_STATUS_100_BASE_T4); + break; + + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_EN_100FDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_EN_100HDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_EN_10FDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_EN_10HDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + default: + err = ENOTSUP; + break; + } + return (err); +} + +#ifdef MAC_VERSION_V1 +static void +usbgem_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + struct usbgem_dev *dp = arg; + link_flowctrl_t fl; + + /* + * By default permissions are read/write unless specified + * otherwise by the driver. + */ + + switch (pr_num) { + case MAC_PROP_DUPLEX: + case MAC_PROP_SPEED: + case MAC_PROP_STATUS: + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_ADV_100T4_CAP: + case MAC_PROP_EN_100T4_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + break; + + case MAC_PROP_EN_1000FDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASET_FD)); + } else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD) + == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_1000HDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & 
MII_XSTATUS_1000BASET)); + } else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASEX)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_100FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_100HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_10FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_10_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_10HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_10)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_AUTONEG: + if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_FLOWCTRL: + switch (dp->ugc.usbgc_flow_control) { + case FLOW_CONTROL_NONE: + fl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_SYMMETRIC: + fl = LINK_FLOWCTRL_BI; + break; + case FLOW_CONTROL_TX_PAUSE: + fl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_RX_PAUSE: + fl = LINK_FLOWCTRL_RX; + break; + } + mac_prop_info_set_default_link_flowctrl(prh, fl); + break; + + case MAC_PROP_MTU: + mac_prop_info_set_range_uint32(prh, + dp->ugc.usbgc_min_mtu, 
dp->ugc.usbgc_max_mtu); + break; + + case MAC_PROP_PRIVATE: + break; + } +} +#endif + +static int +usbgem_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + struct usbgem_dev *dp = arg; + int err = 0; + boolean_t update = B_FALSE; + link_flowctrl_t flowctrl; + uint32_t cur_mtu, new_mtu; + + rw_enter(&dp->dev_state_lock, RW_WRITER); + switch (pr_num) { + case MAC_PROP_EN_1000FDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0 || + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD) == 0) { + if (dp->anadv_1000fdx != *(uint8_t *)pr_val) { + dp->anadv_1000fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_1000HDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0 || + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) { + if (dp->anadv_1000hdx != *(uint8_t *)pr_val) { + dp->anadv_1000hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_100FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) { + if (dp->anadv_100fdx != *(uint8_t *)pr_val) { + dp->anadv_100fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_100HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) { + if (dp->anadv_100hdx != *(uint8_t *)pr_val) { + dp->anadv_100hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_10FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + if (dp->anadv_10fdx != *(uint8_t *)pr_val) { + dp->anadv_10fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_10HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + if (dp->anadv_10hdx != *(uint8_t *)pr_val) { + dp->anadv_10hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case 
MAC_PROP_AUTONEG: + if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) { + if (dp->anadv_autoneg != *(uint8_t *)pr_val) { + dp->anadv_autoneg = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_FLOWCTRL: + bcopy(pr_val, &flowctrl, sizeof (flowctrl)); + + switch (flowctrl) { + default: + err = EINVAL; + break; + + case LINK_FLOWCTRL_NONE: + if (dp->flow_control != FLOW_CONTROL_NONE) { + dp->flow_control = FLOW_CONTROL_NONE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_RX: + if (dp->flow_control != FLOW_CONTROL_RX_PAUSE) { + dp->flow_control = FLOW_CONTROL_RX_PAUSE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_TX: + if (dp->flow_control != FLOW_CONTROL_TX_PAUSE) { + dp->flow_control = FLOW_CONTROL_TX_PAUSE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_BI: + if (dp->flow_control != FLOW_CONTROL_SYMMETRIC) { + dp->flow_control = FLOW_CONTROL_SYMMETRIC; + update = B_TRUE; + } + break; + } + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_STATUS: + case MAC_PROP_SPEED: + case MAC_PROP_DUPLEX: + err = ENOTSUP; /* read-only prop. Can't set this. 
*/ + break; + + case MAC_PROP_MTU: + bcopy(pr_val, &new_mtu, sizeof (new_mtu)); + if (new_mtu != dp->mtu) { + err = EINVAL; + } + break; + + case MAC_PROP_PRIVATE: + err = ENOTSUP; + break; + + default: + err = ENOTSUP; + break; + } + + if (update) { + /* sync with PHY */ + usbgem_choose_forcedmode(dp); + dp->mii_state = MII_STATE_UNKNOWN; + cv_signal(&dp->link_watcher_wait_cv); + } + rw_exit(&dp->dev_state_lock); + return (err); +} + +static int +#ifdef MAC_VERSION_V1 +usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +#else +usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm) +#endif +{ + struct usbgem_dev *dp = arg; + int err = 0; + link_flowctrl_t flowctrl; + uint64_t tmp = 0; + + if (pr_valsize == 0) { + return (EINVAL); + } +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_RW; +#endif + bzero(pr_val, pr_valsize); +#ifndef MAC_VERSION_V1 + if ((pr_flags & MAC_PROP_DEFAULT) && (pr_num != MAC_PROP_PRIVATE)) { + return (usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val)); + } +#endif + rw_enter(&dp->dev_state_lock, RW_READER); + switch (pr_num) { + case MAC_PROP_DUPLEX: +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_READ; +#endif + if (pr_valsize >= sizeof (link_duplex_t)) { + if (dp->mii_state != MII_STATE_LINKUP) { + *(link_duplex_t *)pr_val = LINK_DUPLEX_UNKNOWN; + } else if (dp->full_duplex) { + *(link_duplex_t *)pr_val = LINK_DUPLEX_FULL; + } else { + *(link_duplex_t *)pr_val = LINK_DUPLEX_HALF; + } + } else { + err = EINVAL; + } + break; + case MAC_PROP_SPEED: +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_READ; +#endif + if (pr_valsize >= sizeof (uint64_t)) { + switch (dp->speed) { + case USBGEM_SPD_1000: + tmp = 1000000000; + break; + case USBGEM_SPD_100: + tmp = 100000000; + break; + case USBGEM_SPD_10: + tmp = 10000000; + break; + default: + tmp = 0; + } + bcopy(&tmp, pr_val, sizeof (tmp)); + } else { + err = 
EINVAL; + } + break; + + case MAC_PROP_AUTONEG: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_CANAUTONEG) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_autoneg; + break; + + case MAC_PROP_FLOWCTRL: + if (pr_valsize >= sizeof (link_flowctrl_t)) { + switch (dp->flow_control) { + case FLOW_CONTROL_NONE: + flowctrl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_RX_PAUSE: + flowctrl = LINK_FLOWCTRL_RX; + break; + case FLOW_CONTROL_TX_PAUSE: + flowctrl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_SYMMETRIC: + flowctrl = LINK_FLOWCTRL_BI; + break; + } + bcopy(&flowctrl, pr_val, sizeof (flowctrl)); + } else { + err = EINVAL; + } + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_ADV_100T4_CAP: + usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val); + break; + + case MAC_PROP_EN_1000FDX_CAP: +#ifndef MAC_VERSION_V1 + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) && + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD)) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_1000fdx; + break; + + case MAC_PROP_EN_1000HDX_CAP: +#ifndef MAC_VERSION_V1 + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) && + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX)) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_1000hdx; + break; + + case MAC_PROP_EN_100FDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASEX_FD) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100fdx; + break; + + case MAC_PROP_EN_100HDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASEX) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100hdx; + break; + + case MAC_PROP_EN_10FDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_10_FD) { 
+ *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_10fdx; + break; + + case MAC_PROP_EN_10HDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_10) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_10hdx; + break; + + case MAC_PROP_EN_100T4_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASE_T4) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100t4; + break; + + case MAC_PROP_PRIVATE: + err = ENOTSUP; + break; + +#ifndef MAC_VERSION_V1 + case MAC_PROP_MTU: { + mac_propval_range_t range; + if (!(pr_flags & MAC_PROP_POSSIBLE)) { + err = ENOTSUP; + break; + } + if (pr_valsize < sizeof (mac_propval_range_t)) { + err = EINVAL; + break; + } + range.mpr_count = 1; + range.mpr_type = MAC_PROPVAL_UINT32; + range.range_uint32[0].mpur_min = ETHERMTU; + range.range_uint32[0].mpur_max = dp->mtu; + bcopy(&range, pr_val, sizeof (range)); + break; + } +#endif + default: + err = ENOTSUP; + break; + } + + rw_exit(&dp->dev_state_lock); + return (err); +} +#endif /* USBGEM_CONFIG_MAC_PROP */ + +#ifdef USBGEM_CONFIG_ND +/* ============================================================== */ +/* + * ND interface + */ +/* ============================================================== */ +enum { + PARAM_AUTONEG_CAP, + PARAM_PAUSE_CAP, + PARAM_ASYM_PAUSE_CAP, + PARAM_1000FDX_CAP, + PARAM_1000HDX_CAP, + PARAM_100T4_CAP, + PARAM_100FDX_CAP, + PARAM_100HDX_CAP, + PARAM_10FDX_CAP, + PARAM_10HDX_CAP, + + PARAM_ADV_AUTONEG_CAP, + PARAM_ADV_PAUSE_CAP, + PARAM_ADV_ASYM_PAUSE_CAP, + PARAM_ADV_1000FDX_CAP, + PARAM_ADV_1000HDX_CAP, + PARAM_ADV_100T4_CAP, + PARAM_ADV_100FDX_CAP, + PARAM_ADV_100HDX_CAP, + PARAM_ADV_10FDX_CAP, + PARAM_ADV_10HDX_CAP, + PARAM_ADV_1000T_MS, + + PARAM_LP_AUTONEG_CAP, + PARAM_LP_PAUSE_CAP, + PARAM_LP_ASYM_PAUSE_CAP, + PARAM_LP_1000FDX_CAP, + PARAM_LP_1000HDX_CAP, + PARAM_LP_100T4_CAP, + PARAM_LP_100FDX_CAP, + PARAM_LP_100HDX_CAP, + PARAM_LP_10FDX_CAP, + 
PARAM_LP_10HDX_CAP, + + PARAM_LINK_STATUS, + PARAM_LINK_SPEED, + PARAM_LINK_DUPLEX, + + PARAM_LINK_AUTONEG, + PARAM_LINK_RX_PAUSE, + PARAM_LINK_TX_PAUSE, + + PARAM_LOOP_MODE, + PARAM_MSI_CNT, +#ifdef DEBUG_RESUME + PARAM_RESUME_TEST, +#endif + + PARAM_COUNT +}; + +struct usbgem_nd_arg { + struct usbgem_dev *dp; + int item; +}; + +static int +usbgem_param_get(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *credp) +{ + struct usbgem_dev *dp = ((struct usbgem_nd_arg *)(void *)arg)->dp; + int item = ((struct usbgem_nd_arg *)(void *)arg)->item; + long val; + + DPRINTF(1, (CE_CONT, "!%s: %s: called, item:%d", + dp->name, __func__, item)); + + switch (item) { + case PARAM_AUTONEG_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + DPRINTF(1, (CE_CONT, "autoneg_cap:%d", val)); + break; + + case PARAM_PAUSE_CAP: + val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE; + break; + + case PARAM_ASYM_PAUSE_CAP: + val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC; + break; + + case PARAM_1000FDX_CAP: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case PARAM_1000HDX_CAP: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case PARAM_100T4_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + break; + + case PARAM_100FDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case PARAM_100HDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case PARAM_10FDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case PARAM_10HDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + case PARAM_ADV_AUTONEG_CAP: + val = dp->anadv_autoneg; + break; + + case PARAM_ADV_PAUSE_CAP: + val = dp->anadv_pause; + break; + + case PARAM_ADV_ASYM_PAUSE_CAP: + val = dp->anadv_asmpause; + break; + + case PARAM_ADV_1000FDX_CAP: + val = dp->anadv_1000fdx; + break; + + case 
PARAM_ADV_1000HDX_CAP: + val = dp->anadv_1000hdx; + break; + + case PARAM_ADV_100T4_CAP: + val = dp->anadv_100t4; + break; + + case PARAM_ADV_100FDX_CAP: + val = dp->anadv_100fdx; + break; + + case PARAM_ADV_100HDX_CAP: + val = dp->anadv_100hdx; + break; + + case PARAM_ADV_10FDX_CAP: + val = dp->anadv_10fdx; + break; + + case PARAM_ADV_10HDX_CAP: + val = dp->anadv_10hdx; + break; + + case PARAM_ADV_1000T_MS: + val = dp->anadv_1000t_ms; + break; + + case PARAM_LP_AUTONEG_CAP: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case PARAM_LP_PAUSE_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE); + break; + + case PARAM_LP_ASYM_PAUSE_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR); + break; + + case PARAM_LP_1000FDX_CAP: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + break; + + case PARAM_LP_1000HDX_CAP: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + break; + + case PARAM_LP_100T4_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4); + break; + + case PARAM_LP_100FDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + break; + + case PARAM_LP_100HDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + break; + + case PARAM_LP_10FDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + break; + + case PARAM_LP_10HDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + break; + + case PARAM_LINK_STATUS: + val = (dp->mii_state == MII_STATE_LINKUP); + break; + + case PARAM_LINK_SPEED: + val = usbgem_speed_value[dp->speed]; + break; + + case PARAM_LINK_DUPLEX: + val = 0; + if (dp->mii_state == MII_STATE_LINKUP) { + val = dp->full_duplex ? 
2 : 1; + } + break; + + case PARAM_LINK_AUTONEG: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case PARAM_LINK_RX_PAUSE: + val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) || + (dp->flow_control == FLOW_CONTROL_RX_PAUSE); + break; + + case PARAM_LINK_TX_PAUSE: + val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) || + (dp->flow_control == FLOW_CONTROL_TX_PAUSE); + break; + +#ifdef DEBUG_RESUME + case PARAM_RESUME_TEST: + val = 0; + break; +#endif + default: + cmn_err(CE_WARN, "%s: unimplemented ndd control (%d)", + dp->name, item); + break; + } + + (void) mi_mpprintf(mp, "%ld", val); + + return (0); +} + +static int +usbgem_param_set(queue_t *q, + mblk_t *mp, char *value, caddr_t arg, cred_t *credp) +{ + struct usbgem_dev *dp = ((struct usbgem_nd_arg *)(void *)arg)->dp; + int item = ((struct usbgem_nd_arg *)(void *)arg)->item; + long val; + char *end; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + if (ddi_strtol(value, &end, 10, &val)) { + return (EINVAL); + } + if (end == value) { + return (EINVAL); + } + + switch (item) { + case PARAM_ADV_AUTONEG_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_CANAUTONEG) == 0) { + goto err; + } + dp->anadv_autoneg = (int)val; + break; + + case PARAM_ADV_PAUSE_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && dp->ugc.usbgc_flow_control == FLOW_CONTROL_NONE) { + goto err; + } + dp->anadv_pause = (int)val; + break; + + case PARAM_ADV_ASYM_PAUSE_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && + dp->ugc.usbgc_flow_control <= FLOW_CONTROL_SYMMETRIC) { + goto err; + } + dp->anadv_asmpause = (int)val; + break; + + case PARAM_ADV_1000FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | + MII_XSTATUS_1000BASEX_FD)) == 0) { + goto err; + } + dp->anadv_1000fdx = (int)val; + break; + + case PARAM_ADV_1000HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if 
(val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASEX)) == 0) { + goto err; + } + dp->anadv_1000hdx = (int)val; + break; + + case PARAM_ADV_100T4_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASE_T4) == 0) { + goto err; + } + dp->anadv_100t4 = (int)val; + break; + + case PARAM_ADV_100FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASEX_FD) == 0) { + goto err; + } + dp->anadv_100fdx = (int)val; + break; + + case PARAM_ADV_100HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASEX) == 0) { + goto err; + } + dp->anadv_100hdx = (int)val; + break; + + case PARAM_ADV_10FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_10_FD) == 0) { + goto err; + } + dp->anadv_10fdx = (int)val; + break; + + case PARAM_ADV_10HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_10) == 0) { + goto err; + } + dp->anadv_10hdx = (int)val; + break; + + case PARAM_ADV_1000T_MS: + if (val != 0 && val != 1 && val != 2) { + goto err; + } + if (val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASET_FD)) == 0) { + goto err; + } + dp->anadv_1000t_ms = (int)val; + break; + +#ifdef DEBUG_RESUME + case PARAM_RESUME_TEST: + mutex_exit(&dp->xmitlock); + mutex_exit(&dp->intrlock); + gem_suspend(dp->dip); + gem_resume(dp->dip); + mutex_enter(&dp->intrlock); + mutex_enter(&dp->xmitlock); + break; +#endif + } + + /* sync with PHY */ + usbgem_choose_forcedmode(dp); + + dp->mii_state = MII_STATE_UNKNOWN; + if (dp->ugc.usbgc_mii_hw_link_detection) { + /* wake up link watcher possiblely sleeps */ + cv_signal(&dp->link_watcher_wait_cv); + } + + return (0); +err: + return (EINVAL); +} + +static void +usbgem_nd_load(struct usbgem_dev *dp, + char *name, ndgetf_t gf, ndsetf_t sf, int item) +{ + struct usbgem_nd_arg *arg; + + 
ASSERT(item >= 0); + ASSERT(item < PARAM_COUNT); + + arg = &((struct usbgem_nd_arg *)(void *)dp->nd_arg_p)[item]; + arg->dp = dp; + arg->item = item; + + DPRINTF(2, (CE_CONT, "!%s: %s: name:%s, item:%d", + dp->name, __func__, name, item)); + (void) nd_load(&dp->nd_data_p, name, gf, sf, (caddr_t)arg); +} + +static void +usbgem_nd_setup(struct usbgem_dev *dp) +{ + DPRINTF(1, (CE_CONT, "!%s: %s: called, mii_status:0x%b", + dp->name, __func__, dp->mii_status, MII_STATUS_BITS)); + + ASSERT(dp->nd_arg_p == NULL); + + dp->nd_arg_p = + kmem_zalloc(sizeof (struct usbgem_nd_arg) * PARAM_COUNT, KM_SLEEP); + +#define SETFUNC(x) ((x) ? usbgem_param_set : NULL) + + usbgem_nd_load(dp, "autoneg_cap", + usbgem_param_get, NULL, PARAM_AUTONEG_CAP); + usbgem_nd_load(dp, "pause_cap", + usbgem_param_get, NULL, PARAM_PAUSE_CAP); + usbgem_nd_load(dp, "asym_pause_cap", + usbgem_param_get, NULL, PARAM_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "1000fdx_cap", + usbgem_param_get, NULL, PARAM_1000FDX_CAP); + usbgem_nd_load(dp, "1000hdx_cap", + usbgem_param_get, NULL, PARAM_1000HDX_CAP); + usbgem_nd_load(dp, "100T4_cap", + usbgem_param_get, NULL, PARAM_100T4_CAP); + usbgem_nd_load(dp, "100fdx_cap", + usbgem_param_get, NULL, PARAM_100FDX_CAP); + usbgem_nd_load(dp, "100hdx_cap", + usbgem_param_get, NULL, PARAM_100HDX_CAP); + usbgem_nd_load(dp, "10fdx_cap", + usbgem_param_get, NULL, PARAM_10FDX_CAP); + usbgem_nd_load(dp, "10hdx_cap", + usbgem_param_get, NULL, PARAM_10HDX_CAP); + + /* Our advertised capabilities */ + usbgem_nd_load(dp, "adv_autoneg_cap", usbgem_param_get, + SETFUNC(dp->mii_status & MII_STATUS_CANAUTONEG), + PARAM_ADV_AUTONEG_CAP); + usbgem_nd_load(dp, "adv_pause_cap", usbgem_param_get, + SETFUNC(dp->ugc.usbgc_flow_control & 1), + PARAM_ADV_PAUSE_CAP); + usbgem_nd_load(dp, "adv_asym_pause_cap", usbgem_param_get, + SETFUNC(dp->ugc.usbgc_flow_control & 2), + PARAM_ADV_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "adv_1000fdx_cap", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + 
(MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD)), + PARAM_ADV_1000FDX_CAP); + usbgem_nd_load(dp, "adv_1000hdx_cap", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET)), + PARAM_ADV_1000HDX_CAP); + usbgem_nd_load(dp, "adv_100T4_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASE_T4) && + !dp->mii_advert_ro), + PARAM_ADV_100T4_CAP); + usbgem_nd_load(dp, "adv_100fdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASEX_FD) && + !dp->mii_advert_ro), + PARAM_ADV_100FDX_CAP); + usbgem_nd_load(dp, "adv_100hdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASEX) && + !dp->mii_advert_ro), + PARAM_ADV_100HDX_CAP); + usbgem_nd_load(dp, "adv_10fdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_10_FD) && + !dp->mii_advert_ro), + PARAM_ADV_10FDX_CAP); + usbgem_nd_load(dp, "adv_10hdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_10) && + !dp->mii_advert_ro), + PARAM_ADV_10HDX_CAP); + usbgem_nd_load(dp, "adv_1000t_ms", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | MII_XSTATUS_1000BASET)), + PARAM_ADV_1000T_MS); + + + /* Partner's advertised capabilities */ + usbgem_nd_load(dp, "lp_autoneg_cap", + usbgem_param_get, NULL, PARAM_LP_AUTONEG_CAP); + usbgem_nd_load(dp, "lp_pause_cap", + usbgem_param_get, NULL, PARAM_LP_PAUSE_CAP); + usbgem_nd_load(dp, "lp_asym_pause_cap", + usbgem_param_get, NULL, PARAM_LP_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "lp_1000fdx_cap", + usbgem_param_get, NULL, PARAM_LP_1000FDX_CAP); + usbgem_nd_load(dp, "lp_1000hdx_cap", + usbgem_param_get, NULL, PARAM_LP_1000HDX_CAP); + usbgem_nd_load(dp, "lp_100T4_cap", + usbgem_param_get, NULL, PARAM_LP_100T4_CAP); + usbgem_nd_load(dp, "lp_100fdx_cap", + usbgem_param_get, NULL, PARAM_LP_100FDX_CAP); + usbgem_nd_load(dp, "lp_100hdx_cap", + usbgem_param_get, NULL, PARAM_LP_100HDX_CAP); + usbgem_nd_load(dp, "lp_10fdx_cap", + usbgem_param_get, NULL, 
PARAM_LP_10FDX_CAP); + usbgem_nd_load(dp, "lp_10hdx_cap", + usbgem_param_get, NULL, PARAM_LP_10HDX_CAP); + + /* Current operating modes */ + usbgem_nd_load(dp, "link_status", + usbgem_param_get, NULL, PARAM_LINK_STATUS); + usbgem_nd_load(dp, "link_speed", + usbgem_param_get, NULL, PARAM_LINK_SPEED); + usbgem_nd_load(dp, "link_duplex", + usbgem_param_get, NULL, PARAM_LINK_DUPLEX); + usbgem_nd_load(dp, "link_autoneg", + usbgem_param_get, NULL, PARAM_LINK_AUTONEG); + usbgem_nd_load(dp, "link_rx_pause", + usbgem_param_get, NULL, PARAM_LINK_RX_PAUSE); + usbgem_nd_load(dp, "link_tx_pause", + usbgem_param_get, NULL, PARAM_LINK_TX_PAUSE); +#ifdef DEBUG_RESUME + usbgem_nd_load(dp, "resume_test", + usbgem_param_get, usbgem_param_set, PARAM_RESUME_TEST); +#endif +#undef SETFUNC +} + +static +enum ioc_reply +usbgem_nd_ioctl(struct usbgem_dev *dp, + queue_t *wq, mblk_t *mp, struct iocblk *iocp) +{ + boolean_t ok; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + switch (iocp->ioc_cmd) { + case ND_GET: + ok = nd_getset(wq, dp->nd_data_p, mp); + DPRINTF(1, (CE_CONT, + "%s: get %s", dp->name, ok ? "OK" : "FAIL")); + return (ok ? IOC_REPLY : IOC_INVAL); + + case ND_SET: + ok = nd_getset(wq, dp->nd_data_p, mp); + + DPRINTF(1, (CE_CONT, "%s: set %s err %d", + dp->name, ok ? 
"OK" : "FAIL", iocp->ioc_error)); + + if (!ok) { + return (IOC_INVAL); + } + + if (iocp->ioc_error) { + return (IOC_REPLY); + } + + return (IOC_RESTART_REPLY); + } + + cmn_err(CE_WARN, "%s: invalid cmd 0x%x", dp->name, iocp->ioc_cmd); + + return (IOC_INVAL); +} + +static void +usbgem_nd_cleanup(struct usbgem_dev *dp) +{ + ASSERT(dp->nd_data_p != NULL); + ASSERT(dp->nd_arg_p != NULL); + + nd_free(&dp->nd_data_p); + + kmem_free(dp->nd_arg_p, sizeof (struct usbgem_nd_arg) * PARAM_COUNT); + dp->nd_arg_p = NULL; +} +#endif /* USBGEM_CONFIG_ND */ + +static void +usbgem_mac_ioctl(struct usbgem_dev *dp, queue_t *wq, mblk_t *mp) +{ + struct iocblk *iocp; + enum ioc_reply status; + int cmd; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Validate the command before bothering with the mutex ... + */ + iocp = (void *)mp->b_rptr; + iocp->ioc_error = 0; + cmd = iocp->ioc_cmd; + + DPRINTF(1, (CE_CONT, "%s: %s cmd:0x%x", dp->name, __func__, cmd)); + +#ifdef USBGEM_CONFIG_ND + switch (cmd) { + default: + _NOTE(NOTREACHED) + status = IOC_INVAL; + break; + + case ND_GET: + case ND_SET: + status = usbgem_nd_ioctl(dp, wq, mp, iocp); + break; + } + + /* + * Finally, decide how to reply + */ + switch (status) { + default: + case IOC_INVAL: + /* + * Error, reply with a NAK and EINVAL or the specified error + */ + miocnak(wq, mp, 0, iocp->ioc_error == 0 ? + EINVAL : iocp->ioc_error); + break; + + case IOC_DONE: + /* + * OK, reply already sent + */ + break; + + case IOC_RESTART_ACK: + case IOC_ACK: + /* + * OK, reply with an ACK + */ + miocack(wq, mp, 0, 0); + break; + + case IOC_RESTART_REPLY: + case IOC_REPLY: + /* + * OK, send prepared reply as ACK or NAK + */ + mp->b_datap->db_type = + iocp->ioc_error == 0 ? 
M_IOCACK : M_IOCNAK; + qreply(wq, mp); + break; + } +#else + miocnak(wq, mp, 0, EINVAL); + return; +#endif /* USBGEM_CONFIG_GLDv3 */ +} + +#ifndef SYS_MAC_H +#define XCVR_UNDEFINED 0 +#define XCVR_NONE 1 +#define XCVR_10 2 +#define XCVR_100T4 3 +#define XCVR_100X 4 +#define XCVR_100T2 5 +#define XCVR_1000X 6 +#define XCVR_1000T 7 +#endif +static int +usbgem_mac_xcvr_inuse(struct usbgem_dev *dp) +{ + int val = XCVR_UNDEFINED; + + if ((dp->mii_status & MII_STATUS_XSTATUS) == 0) { + if (dp->mii_status & MII_STATUS_100_BASE_T4) { + val = XCVR_100T4; + } else if (dp->mii_status & + (MII_STATUS_100_BASEX_FD | + MII_STATUS_100_BASEX)) { + val = XCVR_100X; + } else if (dp->mii_status & + (MII_STATUS_100_BASE_T2_FD | + MII_STATUS_100_BASE_T2)) { + val = XCVR_100T2; + } else if (dp->mii_status & + (MII_STATUS_10_FD | MII_STATUS_10)) { + val = XCVR_10; + } + } else if (dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | MII_XSTATUS_1000BASET)) { + val = XCVR_1000T; + } else if (dp->mii_xstatus & + (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASEX)) { + val = XCVR_1000X; + } + + return (val); +} + +#ifdef USBGEM_CONFIG_GLDv3 +/* ============================================================== */ +/* + * GLDv3 interface + */ +/* ============================================================== */ +static int usbgem_m_getstat(void *, uint_t, uint64_t *); +static int usbgem_m_start(void *); +static void usbgem_m_stop(void *); +static int usbgem_m_setpromisc(void *, boolean_t); +static int usbgem_m_multicst(void *, boolean_t, const uint8_t *); +static int usbgem_m_unicst(void *, const uint8_t *); +static mblk_t *usbgem_m_tx(void *, mblk_t *); +static void usbgem_m_ioctl(void *, queue_t *, mblk_t *); +#ifdef GEM_CONFIG_MAC_PROP +static int usbgem_m_setprop(void *, const char *, mac_prop_id_t, + uint_t, const void *); +#ifdef MAC_VERSION_V1 +static int usbgem_m_getprop(void *, const char *, mac_prop_id_t, + uint_t, void *); +#else +static int usbgem_m_getprop(void *, const char *, 
mac_prop_id_t, + uint_t, uint_t, void *, uint_t *); +#endif +#endif + +#ifdef _SYS_MAC_PROVIDER_H +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL) +#else +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL) +#endif + +static mac_callbacks_t gem_m_callbacks = { +#ifdef USBGEM_CONFIG_MAC_PROP +#ifdef MAC_VERSION_V1 + GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP | MC_PROPINFO, +#else + GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP, +#endif +#else + GEM_M_CALLBACK_FLAGS, +#endif + usbgem_m_getstat, + usbgem_m_start, + usbgem_m_stop, + usbgem_m_setpromisc, + usbgem_m_multicst, + usbgem_m_unicst, + usbgem_m_tx, +#ifdef _SYS_MAC_PROVIDER_H +#ifdef MAC_VERSION_V1 + NULL, +#endif +#else + NULL, /* m_resources */ +#endif + usbgem_m_ioctl, + NULL, /* m_getcapab */ +#ifdef USBGEM_CONFIG_MAC_PROP + NULL, + NULL, + usbgem_m_setprop, + usbgem_m_getprop, +#endif +#ifdef MAC_VERSION_V1 + usbgem_m_propinfo, +#endif +}; + +static int +usbgem_m_start(void *arg) +{ + int ret; + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + err = EIO; + + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_ONLINE; + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + err = 0; + goto x; + } + if (usbgem_mac_init(dp) != USB_SUCCESS) { + goto x; + } + + /* initialize rx filter state */ + sema_p(&dp->rxfilter_lock); + dp->mc_count = 0; + dp->mc_count_req = 0; + + bcopy(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + ret = usbgem_hal_set_rx_filter(dp); + sema_v(&dp->rxfilter_lock); + + if (ret != USB_SUCCESS) { + goto x; + } + + if (dp->mii_state == MII_STATE_LINKUP) { + /* setup media mode if the link have been up */ + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto x; + } + if (usbgem_mac_start(dp) != USB_SUCCESS) { + goto x; + } + } + + err = 0; +x: + rw_exit(&dp->dev_state_lock); + return (err); +} + +static void +usbgem_m_stop(void *arg) +{ + struct usbgem_dev *dp = arg; 
+ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* stop rx gracefully */ + rw_enter(&dp->dev_state_lock, RW_READER); + sema_p(&dp->rxfilter_lock); + dp->rxmode &= ~RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + + /* make the nic state inactive */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_STOPPED; + + /* stop mac completely */ + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL); + } + rw_exit(&dp->dev_state_lock); +} + +static int +usbgem_m_multicst(void *arg, boolean_t add, const uint8_t *ep) +{ + int err; + int ret; + struct usbgem_dev *dp = arg; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (add) { + ret = usbgem_add_multicast(dp, ep); + } else { + ret = usbgem_remove_multicast(dp, ep); + } + rw_exit(&dp->dev_state_lock); + + err = 0; + if (ret != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + err = EIO; + } + + return (err); +} + +static int +usbgem_m_setpromisc(void *arg, boolean_t on) +{ + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + + sema_p(&dp->rxfilter_lock); + if (on) { + dp->rxmode |= RXMODE_PROMISC; + } else { + dp->rxmode &= ~RXMODE_PROMISC; + } + + err = 0; + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) { + err = EIO; + } + } + sema_v(&dp->rxfilter_lock); + + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (err != 0) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + } +#endif + return (err); +} + +int +usbgem_m_getstat(void *arg, uint_t stat, uint64_t *valp) +{ + int ret; + uint64_t val; + struct usbgem_dev *dp 
= arg; + struct usbgem_stats *gstp = &dp->stats; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + rw_exit(&dp->dev_state_lock); + return (0); + } + ret = usbgem_hal_get_stats(dp); + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (ret != USB_SUCCESS) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + return (EIO); + } +#endif + + switch (stat) { + case MAC_STAT_IFSPEED: + val = usbgem_speed_value[dp->speed] *1000000ull; + break; + + case MAC_STAT_MULTIRCV: + val = gstp->rmcast; + break; + + case MAC_STAT_BRDCSTRCV: + val = gstp->rbcast; + break; + + case MAC_STAT_MULTIXMT: + val = gstp->omcast; + break; + + case MAC_STAT_BRDCSTXMT: + val = gstp->obcast; + break; + + case MAC_STAT_NORCVBUF: + val = gstp->norcvbuf + gstp->missed; + break; + + case MAC_STAT_IERRORS: + val = gstp->errrcv; + break; + + case MAC_STAT_NOXMTBUF: + val = gstp->noxmtbuf; + break; + + case MAC_STAT_OERRORS: + val = gstp->errxmt; + break; + + case MAC_STAT_COLLISIONS: + val = gstp->collisions; + break; + + case MAC_STAT_RBYTES: + val = gstp->rbytes; + break; + + case MAC_STAT_IPACKETS: + val = gstp->rpackets; + break; + + case MAC_STAT_OBYTES: + val = gstp->obytes; + break; + + case MAC_STAT_OPACKETS: + val = gstp->opackets; + break; + + case MAC_STAT_UNDERFLOWS: + val = gstp->underflow; + break; + + case MAC_STAT_OVERFLOWS: + val = gstp->overflow; + break; + + case ETHER_STAT_ALIGN_ERRORS: + val = gstp->frame; + break; + + case ETHER_STAT_FCS_ERRORS: + val = gstp->crc; + break; + + case ETHER_STAT_FIRST_COLLISIONS: + val = gstp->first_coll; + break; + + case ETHER_STAT_MULTI_COLLISIONS: + val = gstp->multi_coll; + break; + + case ETHER_STAT_SQE_ERRORS: + val = gstp->sqe; + break; + + case ETHER_STAT_DEFER_XMTS: + val = gstp->defer; + break; + + case ETHER_STAT_TX_LATE_COLLISIONS: + val = gstp->xmtlatecoll; + break; + + case ETHER_STAT_EX_COLLISIONS: + val = 
gstp->excoll; + break; + + case ETHER_STAT_MACXMT_ERRORS: + val = gstp->xmit_internal_err; + break; + + case ETHER_STAT_CARRIER_ERRORS: + val = gstp->nocarrier; + break; + + case ETHER_STAT_TOOLONG_ERRORS: + val = gstp->frame_too_long; + break; + + case ETHER_STAT_MACRCV_ERRORS: + val = gstp->rcv_internal_err; + break; + + case ETHER_STAT_XCVR_ADDR: + val = dp->mii_phy_addr; + break; + + case ETHER_STAT_XCVR_ID: + val = dp->mii_phy_id; + break; + + case ETHER_STAT_XCVR_INUSE: + val = usbgem_mac_xcvr_inuse(dp); + break; + + case ETHER_STAT_CAP_1000FDX: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case ETHER_STAT_CAP_1000HDX: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case ETHER_STAT_CAP_100FDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case ETHER_STAT_CAP_100HDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case ETHER_STAT_CAP_10FDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case ETHER_STAT_CAP_10HDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + case ETHER_STAT_CAP_ASMPAUSE: + val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC; + break; + + case ETHER_STAT_CAP_PAUSE: + val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE; + break; + + case ETHER_STAT_CAP_AUTONEG: + val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + break; + + case ETHER_STAT_ADV_CAP_1000FDX: + val = dp->anadv_1000fdx; + break; + + case ETHER_STAT_ADV_CAP_1000HDX: + val = dp->anadv_1000hdx; + break; + + case ETHER_STAT_ADV_CAP_100FDX: + val = dp->anadv_100fdx; + break; + + case ETHER_STAT_ADV_CAP_100HDX: + val = dp->anadv_100hdx; + break; + + case ETHER_STAT_ADV_CAP_10FDX: + val = dp->anadv_10fdx; + break; + + case ETHER_STAT_ADV_CAP_10HDX: + val = dp->anadv_10hdx; + break; + + case ETHER_STAT_ADV_CAP_ASMPAUSE: + val = dp->anadv_asmpause; + break; + + case 
ETHER_STAT_ADV_CAP_PAUSE: + val = dp->anadv_pause; + break; + + case ETHER_STAT_ADV_CAP_AUTONEG: + val = dp->anadv_autoneg; + break; + + case ETHER_STAT_LP_CAP_1000FDX: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + break; + + case ETHER_STAT_LP_CAP_1000HDX: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + break; + + case ETHER_STAT_LP_CAP_100FDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + break; + + case ETHER_STAT_LP_CAP_100HDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + break; + + case ETHER_STAT_LP_CAP_10FDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + break; + + case ETHER_STAT_LP_CAP_10HDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + break; + + case ETHER_STAT_LP_CAP_ASMPAUSE: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR); + break; + + case ETHER_STAT_LP_CAP_PAUSE: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE); + break; + + case ETHER_STAT_LP_CAP_AUTONEG: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case ETHER_STAT_LINK_ASMPAUSE: + val = BOOLEAN(dp->flow_control & 2); + break; + + case ETHER_STAT_LINK_PAUSE: + val = BOOLEAN(dp->flow_control & 1); + break; + + case ETHER_STAT_LINK_AUTONEG: + val = dp->anadv_autoneg && + BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case ETHER_STAT_LINK_DUPLEX: + val = (dp->mii_state == MII_STATE_LINKUP) ? + (dp->full_duplex ? 
2 : 1) : 0; + break; + + case ETHER_STAT_TOOSHORT_ERRORS: + val = gstp->runt; + break; +#ifdef NEVER /* it doesn't make sense */ + case ETHER_STAT_CAP_REMFAULT: + val = B_TRUE; + break; + + case ETHER_STAT_ADV_REMFAULT: + val = dp->anadv_remfault; + break; +#endif + case ETHER_STAT_LP_REMFAULT: + val = BOOLEAN(dp->mii_lpable & MII_AN_ADVERT_REMFAULT); + break; + + case ETHER_STAT_JABBER_ERRORS: + val = gstp->jabber; + break; + + case ETHER_STAT_CAP_100T4: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + break; + + case ETHER_STAT_ADV_CAP_100T4: + val = dp->anadv_100t4; + break; + + case ETHER_STAT_LP_CAP_100T4: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4); + break; + + default: +#if GEM_DEBUG_LEVEL > 2 + cmn_err(CE_WARN, + "%s: unrecognized parameter value = %d", + __func__, stat); +#endif + *valp = 0; + return (ENOTSUP); + } + + *valp = val; + + return (0); +} + +static int +usbgem_m_unicst(void *arg, const uint8_t *mac) +{ + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + + sema_p(&dp->rxfilter_lock); + bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + err = 0; + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) { + err = EIO; + } + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (err != 0) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + } +#endif + return (err); +} + +/* + * usbgem_m_tx is used only for sending data packets into ethernet wire. 
+ */ +static mblk_t * +usbgem_m_tx(void *arg, mblk_t *mp_head) +{ + int limit; + mblk_t *mp; + mblk_t *nmp; + uint32_t flags; + struct usbgem_dev *dp = arg; + + DPRINTF(4, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + mp = mp_head; + flags = 0; + + rw_enter(&dp->dev_state_lock, RW_READER); + + if (dp->mii_state != MII_STATE_LINKUP || + dp->mac_state != MAC_STATE_ONLINE) { + /* some nics hate to send packets during the link is down */ + for (; mp; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + } + goto x; + } + + ASSERT(dp->nic_state == NIC_STATE_ONLINE); + + limit = dp->tx_max_packets; + for (; limit-- && mp; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + if (usbgem_send_common(dp, mp, + (limit == 0 && nmp) ? 1 : 0)) { + mp->b_next = nmp; + break; + } + } +#ifdef CONFIG_TX_LIMITER + if (mp == mp_head) { + /* no packets were sent, descrease allocation limit */ + mutex_enter(&dp->txlock); + dp->tx_max_packets = max(dp->tx_max_packets - 1, 1); + mutex_exit(&dp->txlock); + } +#endif +x: + rw_exit(&dp->dev_state_lock); + + return (mp); +} + +static void +usbgem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) +{ + struct usbgem_dev *dp = arg; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", + ((struct usbgem_dev *)arg)->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + usbgem_mac_ioctl((struct usbgem_dev *)arg, wq, mp); + rw_exit(&dp->dev_state_lock); +} + +static void +usbgem_gld3_init(struct usbgem_dev *dp, mac_register_t *macp) +{ + macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + macp->m_driver = dp; + macp->m_dip = dp->dip; + macp->m_src_addr = dp->dev_addr.ether_addr_octet; + macp->m_callbacks = &gem_m_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = dp->mtu; + + if (dp->misc_flag & USBGEM_VLAN) { + macp->m_margin = VTAG_SIZE; + } +} +#else +/* ============================================================== */ +/* + * GLDv2 interface + */ +/* ============================================================== */ +static 
/*
 * usbgem_gld_reset: GLDv2 reset callback.
 *
 * Re-initializes the MAC under the writer lock.  On success the nic state
 * becomes NIC_STATE_INITIALIZED and, when the link is already up and the
 * device is still connected, the media mode is programmed again.
 * Returns GLD_SUCCESS, or GLD_FAILURE when usbgem_mac_init() fails.
 */
static int
usbgem_gld_reset(gld_mac_info_t *macinfo)
{
	struct usbgem_dev	*dp;
	int			rc = GLD_FAILURE;

	dp = (struct usbgem_dev *)macinfo->gldm_private;

	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));

	rw_enter(&dp->dev_state_lock, RW_WRITER);

	if (usbgem_mac_init(dp) == USB_SUCCESS) {
		rc = GLD_SUCCESS;
		dp->nic_state = NIC_STATE_INITIALIZED;

		/* set up the media mode if the link has already come up */
		if (dp->mii_state == MII_STATE_LINKUP &&
		    dp->mac_state != MAC_STATE_DISCONNECTED) {
			(void) usbgem_hal_set_media(dp);
		}
	}

	rw_exit(&dp->dev_state_lock);

	return (rc);
}
+ */ + err = GLD_SUCCESS; +x: + rw_exit(&dp->dev_state_lock); + + return (err); +} + +static int +usbgem_gld_stop(gld_mac_info_t *macinfo) +{ + int err = GLD_SUCCESS; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* try to stop rx gracefully */ + rw_enter(&dp->dev_state_lock, RW_READER); + sema_p(&dp->rxfilter_lock); + dp->rxmode &= ~RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + + /* make the nic state inactive */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_STOPPED; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL) + != USB_SUCCESS) { + err = GLD_FAILURE; + } + } + rw_exit(&dp->dev_state_lock); + + return (err); +} + +static int +usbgem_gld_set_multicast(gld_mac_info_t *macinfo, uint8_t *ep, int flag) +{ + int err; + int ret; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (flag == GLD_MULTI_ENABLE) { + ret = usbgem_add_multicast(dp, ep); + } else { + ret = usbgem_remove_multicast(dp, ep); + } + rw_exit(&dp->dev_state_lock); + + err = GLD_SUCCESS; + if (ret != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + err = GLD_FAILURE; + } + return (err); +} + +static int +usbgem_gld_set_promiscuous(gld_mac_info_t *macinfo, int flag) +{ + boolean_t need_to_change = B_TRUE; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + if (flag == GLD_MAC_PROMISC_NONE) { + dp->rxmode &= ~(RXMODE_PROMISC | RXMODE_ALLMULTI_REQ); + } else if (flag == 
GLD_MAC_PROMISC_MULTI) { + dp->rxmode |= RXMODE_ALLMULTI_REQ; + } else if (flag == GLD_MAC_PROMISC_PHYS) { + dp->rxmode |= RXMODE_PROMISC; + } else { + /* mode unchanged */ + need_to_change = B_FALSE; + } + + if (need_to_change) { + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + } + sema_v(&dp->rxfilter_lock); + + return (GLD_SUCCESS); +} + +static int +usbgem_gld_set_mac_address(gld_mac_info_t *macinfo, uint8_t *mac) +{ + struct usbgem_dev *dp; + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + + return (GLD_SUCCESS); +} + +static int +usbgem_gld_get_stats(gld_mac_info_t *macinfo, struct gld_stats *gs) +{ + struct usbgem_dev *dp; + struct usbgem_stats *vs; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + if ((*dp->ugc.usbgc_get_stats)(dp) != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + return (USB_FAILURE); + } + + vs = &dp->stats; + + gs->glds_errxmt = vs->errxmt; + gs->glds_errrcv = vs->errrcv; + gs->glds_collisions = vs->collisions; + + gs->glds_excoll = vs->excoll; + gs->glds_defer = vs->defer; + gs->glds_frame = vs->frame; + gs->glds_crc = vs->crc; + + gs->glds_overflow = vs->overflow; /* fifo err,underrun,rbufovf */ + gs->glds_underflow = vs->underflow; + gs->glds_short = vs->runt; + gs->glds_missed = vs->missed; /* missed pkts while rbuf ovf */ + gs->glds_xmtlatecoll = vs->xmtlatecoll; + gs->glds_nocarrier = vs->nocarrier; + gs->glds_norcvbuf = vs->norcvbuf; /* OS resource exaust */ + gs->glds_intr = vs->intr; + + /* all before here must be kept in place for v0 compatibility */ + gs->glds_speed = usbgem_speed_value[dp->speed] * 1000000; + 
gs->glds_media = GLDM_PHYMII; + gs->glds_duplex = dp->full_duplex ? GLD_DUPLEX_FULL : GLD_DUPLEX_HALF; + + /* gs->glds_media_specific */ + gs->glds_dot3_first_coll = vs->first_coll; + gs->glds_dot3_multi_coll = vs->multi_coll; + gs->glds_dot3_sqe_error = 0; + gs->glds_dot3_mac_xmt_error = 0; + gs->glds_dot3_mac_rcv_error = 0; + gs->glds_dot3_frame_too_long = vs->frame_too_long; + + return (GLD_SUCCESS); +} + +static int +usbgem_gld_ioctl(gld_mac_info_t *macinfo, queue_t *wq, mblk_t *mp) +{ + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + usbgem_mac_ioctl(dp, wq, mp); + + return (GLD_SUCCESS); +} + +/* + * gem_gld_send is used only for sending data packets into ethernet wire. + */ +static int +usbgem_gld_send(gld_mac_info_t *macinfo, mblk_t *mp) +{ + int ret; + uint32_t flags = 0; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + /* nic state must be online of suspended */ + rw_enter(&dp->dev_state_lock, RW_READER); + + ASSERT(dp->nic_state == NIC_STATE_ONLINE); + ASSERT(mp->b_next == NULL); + + if (dp->mii_state != MII_STATE_LINKUP) { + /* Some nics hate to send packets while the link is down. */ + /* we discard the untransmitted packets silently */ + rw_exit(&dp->dev_state_lock); + + freemsg(mp); +#ifdef GEM_CONFIG_FMA + /* FIXME - should we ignore the error? */ + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + return (GLD_SUCCESS); + } + + ret = (usbgem_send_common(dp, mp, flags) == NULL) + ? GLD_SUCCESS : GLD_NORESOURCES; + rw_exit(&dp->dev_state_lock); + + return (ret); +} + +/* + * usbgem_gld_send is used only for sending data packets into ethernet wire. + */ +static int +usbgem_gld_send_tagged(gld_mac_info_t *macinfo, mblk_t *mp, uint32_t vtag) +{ + uint32_t flags; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + /* + * Some nics hate to send packets while the link is down. 
+ */ + if (dp->mii_state != MII_STATE_LINKUP) { + /* we dicard the untransmitted packets silently */ + freemsg(mp); +#ifdef GEM_CONFIG_FMA + /* FIXME - should we ignore the error? */ + ddi_fm_service_impact(dp->dip, DDI_SERVICE_UNAFFECTED); +#endif + return (GLD_SUCCESS); + } +#ifdef notyet + flags = GLD_VTAG_TCI(vtag) << GEM_SEND_VTAG_SHIFT; +#endif + return ((usbgem_send_common(dp, mp, 0) == NULL) ? + GLD_SUCCESS : GLD_NORESOURCES); +} + +static void +usbgem_gld_init(struct usbgem_dev *dp, gld_mac_info_t *macinfo, char *ident) +{ + /* + * configure GLD + */ + macinfo->gldm_devinfo = dp->dip; + macinfo->gldm_private = (caddr_t)dp; + + macinfo->gldm_reset = usbgem_gld_reset; + macinfo->gldm_start = usbgem_gld_start; + macinfo->gldm_stop = usbgem_gld_stop; + macinfo->gldm_set_mac_addr = usbgem_gld_set_mac_address; + macinfo->gldm_send = usbgem_gld_send; + macinfo->gldm_set_promiscuous = usbgem_gld_set_promiscuous; + macinfo->gldm_get_stats = usbgem_gld_get_stats; + macinfo->gldm_ioctl = usbgem_gld_ioctl; + macinfo->gldm_set_multicast = usbgem_gld_set_multicast; + macinfo->gldm_intr = NULL; + macinfo->gldm_mctl = NULL; + + macinfo->gldm_ident = ident; + macinfo->gldm_type = DL_ETHER; + macinfo->gldm_minpkt = 0; + macinfo->gldm_maxpkt = dp->mtu; + macinfo->gldm_addrlen = ETHERADDRL; + macinfo->gldm_saplen = -2; + macinfo->gldm_ppa = ddi_get_instance(dp->dip); +#ifdef GLD_CAP_LINKSTATE + macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; +#endif + macinfo->gldm_vendor_addr = dp->dev_addr.ether_addr_octet; + macinfo->gldm_broadcast_addr = usbgem_bcastaddr; +} +#endif /* USBGEM_CONFIG_GLDv3 */ + + +/* ======================================================================== */ +/* + * .conf interface + */ +/* ======================================================================== */ +void +usbgem_generate_macaddr(struct usbgem_dev *dp, uint8_t *mac) +{ + extern char hw_serial[]; + char *hw_serial_p; + int i; + uint64_t val; + uint64_t key; + + cmn_err(CE_NOTE, + "!%s: using 
temp ether address," + " do not use this for long time", + dp->name); + + /* prefer a fixed address for DHCP */ + hw_serial_p = &hw_serial[0]; + val = stoi(&hw_serial_p); + + key = 0; + for (i = 0; i < USBGEM_NAME_LEN; i++) { + if (dp->name[i] == 0) { + break; + } + key ^= dp->name[i]; + } + key ^= ddi_get_instance(dp->dip); + val ^= key << 32; + + /* generate a local address */ + mac[0] = 0x02; + mac[1] = (uint8_t)(val >> 32); + mac[2] = (uint8_t)(val >> 24); + mac[3] = (uint8_t)(val >> 16); + mac[4] = (uint8_t)(val >> 8); + mac[5] = (uint8_t)val; +} + +boolean_t +usbgem_get_mac_addr_conf(struct usbgem_dev *dp) +{ + char propname[32]; + char *valstr; + uint8_t mac[ETHERADDRL]; + char *cp; + int c; + int i; + int j; + uint8_t v; + uint8_t d; + uint8_t ored; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + /* + * Get ethernet address from .conf file + */ + (void) sprintf(propname, "mac-addr"); + if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, dp->dip, + DDI_PROP_DONTPASS, propname, &valstr)) != DDI_PROP_SUCCESS) { + return (B_FALSE); + } + + if (strlen(valstr) != ETHERADDRL*3-1) { + goto syntax_err; + } + + cp = valstr; + j = 0; + ored = 0; + for (;;) { + v = 0; + for (i = 0; i < 2; i++) { + c = *cp++; + + if (c >= 'a' && c <= 'f') { + d = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + d = c - 'A' + 10; + } else if (c >= '0' && c <= '9') { + d = c - '0'; + } else { + goto syntax_err; + } + v = (v << 4) | d; + } + + mac[j++] = v; + ored |= v; + if (j == ETHERADDRL) { + /* done */ + break; + } + + c = *cp++; + if (c != ':') { + goto syntax_err; + } + } + + if (ored == 0) { + usbgem_generate_macaddr(dp, mac); + } + for (i = 0; i < ETHERADDRL; i++) { + dp->dev_addr.ether_addr_octet[i] = mac[i]; + } + ddi_prop_free(valstr); + return (B_TRUE); + +syntax_err: + cmn_err(CE_CONT, + "!%s: read mac addr: trying .conf: syntax err %s", + dp->name, valstr); + ddi_prop_free(valstr); + + return (B_FALSE); +} + +static void +usbgem_read_conf(struct usbgem_dev 
*dp) +{ + int val; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Get media mode infomation from .conf file + */ + dp->anadv_autoneg = usbgem_prop_get_int(dp, "adv_autoneg_cap", 1) != 0; + dp->anadv_1000fdx = usbgem_prop_get_int(dp, "adv_1000fdx_cap", 1) != 0; + dp->anadv_1000hdx = usbgem_prop_get_int(dp, "adv_1000hdx_cap", 1) != 0; + dp->anadv_100t4 = usbgem_prop_get_int(dp, "adv_100T4_cap", 1) != 0; + dp->anadv_100fdx = usbgem_prop_get_int(dp, "adv_100fdx_cap", 1) != 0; + dp->anadv_100hdx = usbgem_prop_get_int(dp, "adv_100hdx_cap", 1) != 0; + dp->anadv_10fdx = usbgem_prop_get_int(dp, "adv_10fdx_cap", 1) != 0; + dp->anadv_10hdx = usbgem_prop_get_int(dp, "adv_10hdx_cap", 1) != 0; + dp->anadv_1000t_ms = usbgem_prop_get_int(dp, "adv_1000t_ms", 0); + + if ((ddi_prop_exists(DDI_DEV_T_ANY, dp->dip, + DDI_PROP_DONTPASS, "full-duplex"))) { + dp->full_duplex = + usbgem_prop_get_int(dp, "full-duplex", 1) != 0; + dp->anadv_autoneg = B_FALSE; + if (dp->full_duplex) { + dp->anadv_1000hdx = B_FALSE; + dp->anadv_100hdx = B_FALSE; + dp->anadv_10hdx = B_FALSE; + } else { + dp->anadv_1000fdx = B_FALSE; + dp->anadv_100fdx = B_FALSE; + dp->anadv_10fdx = B_FALSE; + } + } + + if ((val = usbgem_prop_get_int(dp, "speed", 0)) > 0) { + dp->anadv_autoneg = B_FALSE; + switch (val) { + case 1000: + dp->speed = USBGEM_SPD_1000; + dp->anadv_100t4 = B_FALSE; + dp->anadv_100fdx = B_FALSE; + dp->anadv_100hdx = B_FALSE; + dp->anadv_10fdx = B_FALSE; + dp->anadv_10hdx = B_FALSE; + break; + case 100: + dp->speed = USBGEM_SPD_100; + dp->anadv_1000fdx = B_FALSE; + dp->anadv_1000hdx = B_FALSE; + dp->anadv_10fdx = B_FALSE; + dp->anadv_10hdx = B_FALSE; + break; + case 10: + dp->speed = USBGEM_SPD_10; + dp->anadv_1000fdx = B_FALSE; + dp->anadv_1000hdx = B_FALSE; + dp->anadv_100t4 = B_FALSE; + dp->anadv_100fdx = B_FALSE; + dp->anadv_100hdx = B_FALSE; + break; + default: + cmn_err(CE_WARN, + "!%s: property %s: illegal value:%d", + dp->name, "speed", val); + dp->anadv_autoneg = 
B_TRUE; + break; + } + } + val = usbgem_prop_get_int(dp, + "adv_pause", dp->ugc.usbgc_flow_control & 1); + val |= usbgem_prop_get_int(dp, + "adv_asmpause", BOOLEAN(dp->ugc.usbgc_flow_control & 2)) << 1; + if (val > FLOW_CONTROL_RX_PAUSE || val < FLOW_CONTROL_NONE) { + cmn_err(CE_WARN, + "!%s: property %s: illegal value:%d", + dp->name, "flow-control", val); + } else { + val = min(val, dp->ugc.usbgc_flow_control); + } + dp->anadv_pause = BOOLEAN(val & 1); + dp->anadv_asmpause = BOOLEAN(val & 2); + + dp->mtu = usbgem_prop_get_int(dp, "mtu", dp->mtu); + dp->txthr = usbgem_prop_get_int(dp, "txthr", dp->txthr); + dp->rxthr = usbgem_prop_get_int(dp, "rxthr", dp->rxthr); + dp->txmaxdma = usbgem_prop_get_int(dp, "txmaxdma", dp->txmaxdma); + dp->rxmaxdma = usbgem_prop_get_int(dp, "rxmaxdma", dp->rxmaxdma); +#ifdef GEM_CONFIG_POLLING + dp->poll_pkt_delay = + usbgem_prop_get_int(dp, "pkt_delay", dp->poll_pkt_delay); + + dp->max_poll_interval[GEM_SPD_10] = + usbgem_prop_get_int(dp, "max_poll_interval_10", + dp->max_poll_interval[GEM_SPD_10]); + dp->max_poll_interval[GEM_SPD_100] = + usbgem_prop_get_int(dp, "max_poll_interval_100", + dp->max_poll_interval[GEM_SPD_100]); + dp->max_poll_interval[GEM_SPD_1000] = + usbgem_prop_get_int(dp, "max_poll_interval_1000", + dp->max_poll_interval[GEM_SPD_1000]); + + dp->min_poll_interval[GEM_SPD_10] = + usbgem_prop_get_int(dp, "min_poll_interval_10", + dp->min_poll_interval[GEM_SPD_10]); + dp->min_poll_interval[GEM_SPD_100] = + usbgem_prop_get_int(dp, "min_poll_interval_100", + dp->min_poll_interval[GEM_SPD_100]); + dp->min_poll_interval[GEM_SPD_1000] = + usbgem_prop_get_int(dp, "min_poll_interval_1000", + dp->min_poll_interval[GEM_SPD_1000]); +#endif +} + +/* + * usbem kstat support + */ +#ifndef GEM_CONFIG_GLDv3 +/* kstat items based from dmfe driver */ + +struct usbgem_kstat_named { + struct kstat_named ks_xcvr_addr; + struct kstat_named ks_xcvr_id; + struct kstat_named ks_xcvr_inuse; + struct kstat_named ks_link_up; + struct kstat_named 
ks_link_duplex; /* 0:unknwon, 1:half, 2:full */ + struct kstat_named ks_cap_1000fdx; + struct kstat_named ks_cap_1000hdx; + struct kstat_named ks_cap_100fdx; + struct kstat_named ks_cap_100hdx; + struct kstat_named ks_cap_10fdx; + struct kstat_named ks_cap_10hdx; +#ifdef NEVER + struct kstat_named ks_cap_remfault; +#endif + struct kstat_named ks_cap_autoneg; + + struct kstat_named ks_adv_cap_1000fdx; + struct kstat_named ks_adv_cap_1000hdx; + struct kstat_named ks_adv_cap_100fdx; + struct kstat_named ks_adv_cap_100hdx; + struct kstat_named ks_adv_cap_10fdx; + struct kstat_named ks_adv_cap_10hdx; +#ifdef NEVER + struct kstat_named ks_adv_cap_remfault; +#endif + struct kstat_named ks_adv_cap_autoneg; + struct kstat_named ks_lp_cap_1000fdx; + struct kstat_named ks_lp_cap_1000hdx; + struct kstat_named ks_lp_cap_100fdx; + struct kstat_named ks_lp_cap_100hdx; + struct kstat_named ks_lp_cap_10fdx; + struct kstat_named ks_lp_cap_10hdx; + struct kstat_named ks_lp_cap_remfault; + struct kstat_named ks_lp_cap_autoneg; +}; + +static int +usbgem_kstat_update(kstat_t *ksp, int rw) +{ + struct usbgem_kstat_named *knp; + struct usbgem_dev *dp = (struct usbgem_dev *)ksp->ks_private; + + if (rw != KSTAT_READ) { + return (0); + } + + knp = (struct usbgem_kstat_named *)ksp->ks_data; + + knp->ks_xcvr_addr.value.ul = dp->mii_phy_addr; + knp->ks_xcvr_id.value.ul = dp->mii_phy_id; + knp->ks_xcvr_inuse.value.ul = usbgem_mac_xcvr_inuse(dp); + knp->ks_link_up.value.ul = dp->mii_state == MII_STATE_LINKUP; + knp->ks_link_duplex.value.ul = + (dp->mii_state == MII_STATE_LINKUP) ? + (dp->full_duplex ? 
2 : 1) : 0; + + knp->ks_cap_1000fdx.value.ul = + (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + knp->ks_cap_1000hdx.value.ul = + (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + knp->ks_cap_100fdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + knp->ks_cap_100hdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + knp->ks_cap_10fdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + knp->ks_cap_10hdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_10); +#ifdef NEVER + knp->ks_cap_remfault.value.ul = B_TRUE; +#endif + knp->ks_cap_autoneg.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + + knp->ks_adv_cap_1000fdx.value.ul = dp->anadv_1000fdx; + knp->ks_adv_cap_1000hdx.value.ul = dp->anadv_1000hdx; + knp->ks_adv_cap_100fdx.value.ul = dp->anadv_100fdx; + knp->ks_adv_cap_100hdx.value.ul = dp->anadv_100hdx; + knp->ks_adv_cap_10fdx.value.ul = dp->anadv_10fdx; + knp->ks_adv_cap_10hdx.value.ul = dp->anadv_10hdx; +#ifdef NEVER + knp->ks_adv_cap_remfault.value.ul = 0; +#endif + knp->ks_adv_cap_autoneg.value.ul = dp->anadv_autoneg; + + knp->ks_lp_cap_1000fdx.value.ul = + BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + knp->ks_lp_cap_1000hdx.value.ul = + BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + knp->ks_lp_cap_100fdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + knp->ks_lp_cap_100hdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + knp->ks_lp_cap_10fdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + knp->ks_lp_cap_10hdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + knp->ks_lp_cap_remfault.value.ul = + BOOLEAN(dp->mii_exp & MII_AN_EXP_PARFAULT); + knp->ks_lp_cap_autoneg.value.ul = + BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + + return (0); +} + + +static int +usbgem_kstat_init(struct usbgem_dev *dp) +{ + int i; + kstat_t *ksp; + struct usbgem_kstat_named 
*knp; + + ksp = kstat_create( + (char *)ddi_driver_name(dp->dip), ddi_get_instance(dp->dip), + "mii", "net", KSTAT_TYPE_NAMED, + sizeof (*knp) / sizeof (knp->ks_xcvr_addr), 0); + + if (ksp == NULL) { + cmn_err(CE_WARN, "%s: %s() for mii failed", + dp->name, __func__); + return (USB_FAILURE); + } + + knp = (struct usbgem_kstat_named *)ksp->ks_data; + + kstat_named_init(&knp->ks_xcvr_addr, "xcvr_addr", + KSTAT_DATA_INT32); + kstat_named_init(&knp->ks_xcvr_id, "xcvr_id", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_xcvr_inuse, "xcvr_inuse", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_link_up, "link_up", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_1000fdx, "cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_1000hdx, "cap_1000hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_100fdx, "cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_100hdx, "cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_10fdx, "cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_10hdx, "cap_10hdx", + KSTAT_DATA_UINT32); +#ifdef NEVER + kstat_named_init(&knp->ks_cap_remfault, "cap_rem_fault", + KSTAT_DATA_UINT32); +#endif + kstat_named_init(&knp->ks_cap_autoneg, "cap_autoneg", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_1000fdx, "adv_cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_1000hdx, "adv_cap_1000hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_100fdx, "adv_cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_100hdx, "adv_cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_10fdx, "adv_cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_10hdx, "adv_cap_10hdx", + KSTAT_DATA_UINT32); +#ifdef NEVER + kstat_named_init(&knp->ks_adv_cap_remfault, "adv_rem_fault", + KSTAT_DATA_UINT32); +#endif + 
kstat_named_init(&knp->ks_adv_cap_autoneg, "adv_cap_autoneg", + KSTAT_DATA_UINT32); + + kstat_named_init(&knp->ks_lp_cap_1000fdx, "lp_cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_1000hdx, "lp_cap_1000hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_100fdx, "lp_cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_100hdx, "lp_cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_10fdx, "lp_cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_10hdx, "lp_cap_10hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_remfault, "lp_cap_rem_fault", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_autoneg, "lp_cap_autoneg", + KSTAT_DATA_UINT32); + + ksp->ks_private = (void *) dp; + ksp->ks_update = usbgem_kstat_update; + dp->ksp = ksp; + + kstat_install(ksp); + + return (USB_SUCCESS); +} +#endif /* GEM_CONFIG_GLDv3 */ +/* ======================================================================== */ +/* + * attach/detatch/usb support + */ +/* ======================================================================== */ +int +usbgem_ctrl_out(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size) +{ + mblk_t *data; + usb_ctrl_setup_t setup; + usb_cr_t completion_reason; + usb_cb_flags_t cb_flags; + usb_flags_t flags; + int i; + int ret; + + DPRINTF(4, (CE_CONT, "!%s: %s " + "reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x " + "bp:0x%p nic_state:%d", + dp->name, __func__, reqt, req, val, ix, len, bp, dp->nic_state)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + return (USB_PIPE_ERROR); + } + + data = NULL; + if (size > 0) { + if ((data = allocb(size, 0)) == NULL) { + return (USB_FAILURE); + } + + bcopy(bp, data->b_rptr, size); + data->b_wptr = data->b_rptr + size; + } + + setup.bmRequestType = reqt; + setup.bRequest = req; + setup.wValue = val; + setup.wIndex = ix; + setup.wLength = len; + setup.attrs = 0; 
/* attributes */ + + for (i = usbgem_ctrl_retry; i > 0; i--) { + completion_reason = 0; + cb_flags = 0; + + ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp), + &setup, &data, &completion_reason, &cb_flags, 0); + + if (ret == USB_SUCCESS) { + break; + } + if (i == 1) { + cmn_err(CE_WARN, + "!%s: %s failed: " + "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x " + "ret:%d cr:%s(%d), cb_flags:0x%x %s", + dp->name, __func__, reqt, req, val, ix, len, + ret, usb_str_cr(completion_reason), + completion_reason, + cb_flags, + (i > 1) ? "retrying..." : "fatal"); + } + } + + if (data != NULL) { + freemsg(data); + } + + return (ret); +} + +int +usbgem_ctrl_in(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size) +{ + mblk_t *data; + usb_ctrl_setup_t setup; + usb_cr_t completion_reason; + usb_cb_flags_t cb_flags; + int i; + int ret; + int reclen; + + DPRINTF(4, (CE_CONT, + "!%s: %s:" + " reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x" + " bp:x%p mac_state:%d", + dp->name, __func__, reqt, req, val, ix, len, bp, dp->mac_state)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + return (USB_PIPE_ERROR); + } + + data = NULL; + + setup.bmRequestType = reqt; + setup.bRequest = req; + setup.wValue = val; + setup.wIndex = ix; + setup.wLength = len; + setup.attrs = USB_ATTRS_AUTOCLEARING; /* XXX */ + + for (i = usbgem_ctrl_retry; i > 0; i--) { + completion_reason = 0; + cb_flags = 0; + ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp), &setup, &data, + &completion_reason, &cb_flags, 0); + + if (ret == USB_SUCCESS) { + reclen = msgdsize(data); + bcopy(data->b_rptr, bp, min(reclen, size)); + break; + } + if (i == 1) { + cmn_err(CE_WARN, + "!%s: %s failed: " + "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x " + "ret:%d cr:%s(%d) cb_flags:0x%x %s", + dp->name, __func__, + reqt, req, val, ix, len, + ret, usb_str_cr(completion_reason), + completion_reason, + cb_flags, + (i > 1) ? "retrying..." 
: "fatal"); + } + } + + if (data) { + freemsg(data); + } + + return (ret); +} + +int +usbgem_ctrl_out_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + uint32_t v) +{ + uint8_t buf[4]; + + /* convert to little endian from native byte order */ + switch (len) { + case 4: + buf[3] = v >> 24; + buf[2] = v >> 16; + /* fall thru */ + case 2: + buf[1] = v >> 8; + /* fall thru */ + case 1: + buf[0] = v; + } + + return (usbgem_ctrl_out(dp, reqt, req, val, ix, len, buf, len)); +} + +int +usbgem_ctrl_in_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *valp) +{ + uint8_t buf[4]; + uint_t v; + int err; + +#ifdef SANITY + bzero(buf, sizeof (buf)); +#endif + err = usbgem_ctrl_in(dp, reqt, req, val, ix, len, buf, len); + if (err == USB_SUCCESS) { + v = 0; + switch (len) { + case 4: + v |= buf[3] << 24; + v |= buf[2] << 16; + /* FALLTHROUGH */ + case 2: + v |= buf[1] << 8; + /* FALLTHROUGH */ + case 1: + v |= buf[0]; + } + + switch (len) { + case 4: + *(uint32_t *)valp = v; + break; + case 2: + *(uint16_t *)valp = v; + break; + case 1: + *(uint8_t *)valp = v; + break; + } + } + return (err); +} + +/* + * Attach / detach / disconnect / reconnect management + */ +static int +usbgem_open_pipes(struct usbgem_dev *dp) +{ + int i; + int ret; + int ifnum; + int alt; + usb_client_dev_data_t *reg_data; + usb_ep_data_t *ep_tree_node; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + ifnum = dp->ugc.usbgc_ifnum; + alt = dp->ugc.usbgc_alt; + + ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt, + 0, USB_EP_ATTR_BULK, USB_EP_DIR_IN); + if (ep_tree_node == NULL) { + cmn_err(CE_WARN, "!%s: %s: ep_bulkin is NULL", + dp->name, __func__); + goto err; + } + dp->ep_bulkin = &ep_tree_node->ep_descr; + + ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt, + 0, USB_EP_ATTR_BULK, USB_EP_DIR_OUT); + if (ep_tree_node == NULL) { + cmn_err(CE_WARN, 
"!%s: %s: ep_bulkout is NULL", + dp->name, __func__); + goto err; + } + dp->ep_bulkout = &ep_tree_node->ep_descr; + + ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt, + 0, USB_EP_ATTR_INTR, USB_EP_DIR_IN); + if (ep_tree_node) { + dp->ep_intr = &ep_tree_node->ep_descr; + } else { + /* don't care */ + DPRINTF(1, (CE_CONT, "!%s: %s: ep_intr is NULL", + dp->name, __func__)); + dp->ep_intr = NULL; + } + + /* XXX -- no need to open default pipe */ + + /* open bulk out pipe */ + bzero(&dp->policy_bulkout, sizeof (usb_pipe_policy_t)); + dp->policy_bulkout.pp_max_async_reqs = 1; + + if ((ret = usb_pipe_open(dp->dip, + dp->ep_bulkout, &dp->policy_bulkout, USB_FLAGS_SLEEP, + &dp->bulkout_pipe)) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: err:%x: failed to open bulk-out pipe", + dp->name, __func__, ret); + dp->bulkout_pipe = NULL; + goto err; + } + DPRINTF(1, (CE_CONT, "!%s: %s: bulkout_pipe opened successfully", + dp->name, __func__)); + + /* open bulk in pipe */ + bzero(&dp->policy_bulkin, sizeof (usb_pipe_policy_t)); + dp->policy_bulkin.pp_max_async_reqs = 1; + if ((ret = usb_pipe_open(dp->dip, + dp->ep_bulkin, &dp->policy_bulkin, USB_FLAGS_SLEEP, + &dp->bulkin_pipe)) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: ret:%x failed to open bulk-in pipe", + dp->name, __func__, ret); + dp->bulkin_pipe = NULL; + goto err; + } + DPRINTF(1, (CE_CONT, "!%s: %s: bulkin_pipe opened successfully", + dp->name, __func__)); + + if (dp->ep_intr) { + /* open interrupt pipe */ + bzero(&dp->policy_interrupt, sizeof (usb_pipe_policy_t)); + dp->policy_interrupt.pp_max_async_reqs = 1; + if ((ret = usb_pipe_open(dp->dip, dp->ep_intr, + &dp->policy_interrupt, USB_FLAGS_SLEEP, + &dp->intr_pipe)) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: ret:%x failed to open interrupt pipe", + dp->name, __func__, ret); + dp->intr_pipe = NULL; + goto err; + } + } + DPRINTF(1, (CE_CONT, "!%s: %s: intr_pipe opened successfully", + dp->name, __func__)); + + return (USB_SUCCESS); + +err: 
+ if (dp->bulkin_pipe) { + usb_pipe_close(dp->dip, + dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->bulkin_pipe = NULL; + } + if (dp->bulkout_pipe) { + usb_pipe_close(dp->dip, + dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->bulkout_pipe = NULL; + } + if (dp->intr_pipe) { + usb_pipe_close(dp->dip, + dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->intr_pipe = NULL; + } + + return (USB_FAILURE); +} + +static int +usbgem_close_pipes(struct usbgem_dev *dp) +{ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (dp->intr_pipe) { + usb_pipe_close(dp->dip, + dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->intr_pipe = NULL; + } + DPRINTF(1, (CE_CONT, "!%s: %s: 1", dp->name, __func__)); + + ASSERT(dp->bulkin_pipe); + usb_pipe_close(dp->dip, dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->bulkin_pipe = NULL; + DPRINTF(1, (CE_CONT, "!%s: %s: 2", dp->name, __func__)); + + ASSERT(dp->bulkout_pipe); + usb_pipe_close(dp->dip, dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0); + dp->bulkout_pipe = NULL; + DPRINTF(1, (CE_CONT, "!%s: %s: 3", dp->name, __func__)); + + return (USB_SUCCESS); +} + +#define FREEZE_GRACEFUL (B_TRUE) +#define FREEZE_NO_GRACEFUL (B_FALSE) +static int +usbgem_freeze_device(struct usbgem_dev *dp, boolean_t graceful) +{ + DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__)); + + /* stop nic activity */ + (void) usbgem_mac_stop(dp, MAC_STATE_DISCONNECTED, graceful); + + /* + * Here we free all memory resource allocated, because it will + * cause to panic the system that we free usb_bulk_req objects + * during the usb device is disconnected. 
+ */ + (void) usbgem_free_memory(dp); + + return (USB_SUCCESS); +} + +static int +usbgem_disconnect_cb(dev_info_t *dip) +{ + int ret; + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + + cmn_err(CE_NOTE, "!%s: the usb device was disconnected (dp=%p)", + dp->name, dp); + + /* start serialize */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + + ret = usbgem_freeze_device(dp, 0); + + /* end of serialize */ + rw_exit(&dp->dev_state_lock); + + return (ret); +} + +static int +usbgem_recover_device(struct usbgem_dev *dp) +{ + int err; + + DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__)); + + err = USB_SUCCESS; + + /* reinitialize the usb connection */ + usbgem_close_pipes(dp); + if ((err = usbgem_open_pipes(dp)) != USB_SUCCESS) { + goto x; + } + + /* initialize nic state */ + dp->mac_state = MAC_STATE_STOPPED; + dp->mii_state = MII_STATE_UNKNOWN; + + /* allocate memory resources again */ + if ((err = usbgem_alloc_memory(dp)) != USB_SUCCESS) { + goto x; + } + + /* restart nic and recover state */ + (void) usbgem_restart_nic(dp); + + usbgem_mii_init(dp); + + /* kick potentially stopped house keeping thread */ + cv_signal(&dp->link_watcher_wait_cv); +x: + return (err); +} + +static int +usbgem_reconnect_cb(dev_info_t *dip) +{ + int err = USB_SUCCESS; + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + DPRINTF(0, (CE_CONT, "!%s: dp=%p", ddi_get_name(dip), dp)); +#ifdef notdef + /* check device changes after disconnect */ + if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1, + USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) { + cmn_err(CE_CONT, + "!%s: no or different device installed", dp->name); + return (DDI_SUCCESS); + } +#endif + cmn_err(CE_NOTE, "%s: the usb device was reconnected", dp->name); + + /* start serialize */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + err = usbgem_recover_device(dp); + } + + /* end of serialize */ + rw_exit(&dp->dev_state_lock); + + return (err == USB_SUCCESS ? 
DDI_SUCCESS : DDI_FAILURE); +} + +int +usbgem_suspend(dev_info_t *dip) +{ + int err = USB_SUCCESS; + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + + DPRINTF(0, (CE_CONT, "!%s: %s: callded", dp->name, __func__)); + + /* start serialize */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + err = usbgem_freeze_device(dp, STOP_GRACEFUL); + } + + /* end of serialize */ + rw_exit(&dp->dev_state_lock); + + return (err == USB_SUCCESS ? DDI_SUCCESS : DDI_FAILURE); +} + +int +usbgem_resume(dev_info_t *dip) +{ + int err = USB_SUCCESS; + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + + DPRINTF(0, (CE_CONT, "!%s: %s: callded", dp->name, __func__)); +#ifdef notdef + /* check device changes after disconnect */ + if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1, + USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) { + cmn_err(CE_CONT, + "!%s: no or different device installed", dp->name); + return (DDI_SUCCESS); + } +#endif + /* start serialize */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + err = usbgem_recover_device(dp); + } + + /* end of serialize */ + rw_exit(&dp->dev_state_lock); + + return (err == USB_SUCCESS ? 
DDI_SUCCESS : DDI_FAILURE); +} + +#define USBGEM_LOCAL_DATA_SIZE(gc) \ + (sizeof (struct usbgem_dev) + USBGEM_MCALLOC) + +struct usbgem_dev * +usbgem_do_attach(dev_info_t *dip, + struct usbgem_conf *gc, void *lp, int lmsize) +{ + struct usbgem_dev *dp; + int i; +#ifdef USBGEM_CONFIG_GLDv3 + mac_register_t *macp = NULL; +#else + gld_mac_info_t *macinfo; + void *tmp; +#endif + int ret; + int unit; + int err; + + unit = ddi_get_instance(dip); + + DPRINTF(2, (CE_CONT, "!usbgem%d: %s: called", unit, __func__)); + + /* + * Allocate soft data structure + */ + dp = kmem_zalloc(USBGEM_LOCAL_DATA_SIZE(gc), KM_SLEEP); + if (dp == NULL) { +#ifndef USBGEM_CONFIG_GLDv3 + gld_mac_free(macinfo); +#endif + return (NULL); + } +#ifdef USBGEM_CONFIG_GLDv3 + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + cmn_err(CE_WARN, "!gem%d: %s: mac_alloc failed", + unit, __func__); + return (NULL); + } +#else + macinfo = gld_mac_alloc(dip); + dp->macinfo = macinfo; +#endif + + /* link to private area */ + dp->private = lp; + dp->priv_size = lmsize; + dp->mc_list = (struct mcast_addr *)&dp[1]; + + dp->dip = dip; + bcopy(gc->usbgc_name, dp->name, USBGEM_NAME_LEN); + + /* + * register with usb service + */ + if (usb_client_attach(dip, USBDRV_VERSION, 0) != USB_SUCCESS) { + cmn_err(CE_WARN, + "%s: %s: usb_client_attach failed", + dp->name, __func__); + goto err_free_private; + } + + if (usb_get_dev_data(dip, &dp->reg_data, + USB_PARSE_LVL_ALL, 0) != USB_SUCCESS) { + dp->reg_data = NULL; + goto err_unregister_client; + } +#ifdef USBGEM_DEBUG_LEVEL + usb_print_descr_tree(dp->dip, dp->reg_data); +#endif + + if (usbgem_open_pipes(dp) != USB_SUCCESS) { + /* failed to open pipes */ + cmn_err(CE_WARN, "!%s: %s: failed to open pipes", + dp->name, __func__); + goto err_unregister_client; + } + + /* + * Initialize mutexs and condition variables + */ + mutex_init(&dp->rxlock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&dp->txlock, NULL, MUTEX_DRIVER, NULL); + cv_init(&dp->rx_drain_cv, NULL, CV_DRIVER, NULL); + 
cv_init(&dp->tx_drain_cv, NULL, CV_DRIVER, NULL); + rw_init(&dp->dev_state_lock, NULL, RW_DRIVER, NULL); + mutex_init(&dp->link_watcher_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&dp->link_watcher_wait_cv, NULL, CV_DRIVER, NULL); + sema_init(&dp->hal_op_lock, 1, NULL, SEMA_DRIVER, NULL); + sema_init(&dp->rxfilter_lock, 1, NULL, SEMA_DRIVER, NULL); + + /* + * Initialize configuration + */ + dp->ugc = *gc; + + dp->mtu = ETHERMTU; + dp->rxmode = 0; + dp->speed = USBGEM_SPD_10; /* default is 10Mbps */ + dp->full_duplex = B_FALSE; /* default is half */ + dp->flow_control = FLOW_CONTROL_NONE; + + dp->nic_state = NIC_STATE_STOPPED; + dp->mac_state = MAC_STATE_STOPPED; + dp->mii_state = MII_STATE_UNKNOWN; + + /* performance tuning parameters */ + dp->txthr = ETHERMAX; /* tx fifo threshoold */ + dp->txmaxdma = 16*4; /* tx max dma burst size */ + dp->rxthr = 128; /* rx fifo threshoold */ + dp->rxmaxdma = 16*4; /* rx max dma burst size */ + + /* + * Get media mode infomation from .conf file + */ + usbgem_read_conf(dp); + + /* rx_buf_len depend on MTU */ + dp->rx_buf_len = MAXPKTBUF(dp) + dp->ugc.usbgc_rx_header_len; + + /* + * Reset the chip + */ + if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: failed to reset the usb device", + dp->name, __func__); + goto err_destroy_locks; + } + + /* + * HW dependant paremeter initialization + */ + if (usbgem_hal_attach_chip(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: failed to attach the usb device", + dp->name, __func__); + goto err_destroy_locks; + } + + /* allocate resources */ + if (usbgem_alloc_memory(dp) != USB_SUCCESS) { + goto err_destroy_locks; + } + + DPRINTF(0, (CE_CONT, + "!%s: %02x:%02x:%02x:%02x:%02x:%02x", + dp->name, + dp->dev_addr.ether_addr_octet[0], + dp->dev_addr.ether_addr_octet[1], + dp->dev_addr.ether_addr_octet[2], + dp->dev_addr.ether_addr_octet[3], + dp->dev_addr.ether_addr_octet[4], + dp->dev_addr.ether_addr_octet[5])); + + /* copy mac address */ + dp->cur_addr = 
dp->dev_addr; + + /* pre-calculated tx timeout in second for performance */ + dp->bulkout_timeout = + dp->ugc.usbgc_tx_timeout / drv_usectohz(1000*1000); + +#ifdef USBGEM_CONFIG_GLDv3 + usbgem_gld3_init(dp, macp); +#else + usbgem_gld_init(dp, macinfo, ident); +#endif + + /* Probe MII phy (scan phy) */ + dp->mii_lpable = 0; + dp->mii_advert = 0; + dp->mii_exp = 0; + dp->mii_ctl1000 = 0; + dp->mii_stat1000 = 0; + + dp->mii_status_ro = 0; + dp->mii_xstatus_ro = 0; + + if (usbgem_mii_probe(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "!%s: %s: mii_probe failed", + dp->name, __func__); + goto err_free_memory; + } + + /* mask unsupported abilities */ + dp->anadv_autoneg &= BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + dp->anadv_1000fdx &= + BOOLEAN(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD)); + dp->anadv_1000hdx &= + BOOLEAN(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET)); + dp->anadv_100t4 &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + dp->anadv_100fdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + dp->anadv_100hdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + dp->anadv_10fdx &= BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + dp->anadv_10hdx &= BOOLEAN(dp->mii_status & MII_STATUS_10); + + if (usbgem_mii_init(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "!%s: %s: mii_init failed", + dp->name, __func__); + goto err_free_memory; + } + + /* + * initialize kstats including mii statistics + */ +#ifdef USBGEM_CONFIG_GLDv3 +#ifdef USBGEM_CONFIG_ND + usbgem_nd_setup(dp); +#endif +#else + if (usbgem_kstat_init(dp) != USB_SUCCESS) { + goto err_free_memory; + } +#endif + + /* + * Add interrupt to system. 
+ */ +#ifdef USBGEM_CONFIG_GLDv3 + if (ret = mac_register(macp, &dp->mh)) { + cmn_err(CE_WARN, "!%s: mac_register failed, error:%d", + dp->name, ret); + goto err_release_stats; + } + mac_free(macp); + macp = NULL; +#else + /* gld_register will corrupts driver_private */ + tmp = ddi_get_driver_private(dip); + if (gld_register(dip, + (char *)ddi_driver_name(dip), macinfo) != DDI_SUCCESS) { + cmn_err(CE_WARN, "!%s: %s: gld_register failed", + dp->name, __func__); + ddi_set_driver_private(dip, tmp); + goto err_release_stats; + } + /* restore driver private */ + ddi_set_driver_private(dip, tmp); +#endif /* USBGEM_CONFIG_GLDv3 */ + if (usb_register_hotplug_cbs(dip, + usbgem_suspend, usbgem_resume) != USB_SUCCESS) { + cmn_err(CE_WARN, + "!%s: %s: failed to register hotplug cbs", + dp->name, __func__); + goto err_unregister_gld; + } + + /* reset mii and start mii link watcher */ + if (usbgem_mii_start(dp) != USB_SUCCESS) { + goto err_unregister_hotplug; + } + + /* start tx watchdow watcher */ + if (usbgem_tx_watcher_start(dp)) { + goto err_usbgem_mii_stop; + } + + ddi_set_driver_private(dip, (caddr_t)dp); + + DPRINTF(2, (CE_CONT, "!%s: %s: return: success", dp->name, __func__)); + + return (dp); + +err_usbgem_mii_stop: + usbgem_mii_stop(dp); + +err_unregister_hotplug: + usb_unregister_hotplug_cbs(dip); + +err_unregister_gld: +#ifdef USBGEM_CONFIG_GLDv3 + mac_unregister(dp->mh); +#else + gld_unregister(macinfo); +#endif + +err_release_stats: +#ifdef USBGEM_CONFIG_GLDv3 +#ifdef USBGEM_CONFIG_ND + /* release NDD resources */ + usbgem_nd_cleanup(dp); +#endif +#else + kstat_delete(dp->ksp); +#endif + +err_free_memory: + usbgem_free_memory(dp); + +err_destroy_locks: + cv_destroy(&dp->tx_drain_cv); + cv_destroy(&dp->rx_drain_cv); + mutex_destroy(&dp->txlock); + mutex_destroy(&dp->rxlock); + rw_destroy(&dp->dev_state_lock); + mutex_destroy(&dp->link_watcher_lock); + cv_destroy(&dp->link_watcher_wait_cv); + sema_destroy(&dp->hal_op_lock); + sema_destroy(&dp->rxfilter_lock); + 
+err_close_pipes: + (void) usbgem_close_pipes(dp); + +err_unregister_client: + usb_client_detach(dp->dip, dp->reg_data); + +err_free_private: +#ifdef USBGEM_CONFIG_GLDv3 + if (macp) { + mac_free(macp); + } +#else + gld_mac_free(macinfo); +#endif + kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(gc)); + + return (NULL); +} + +int +usbgem_do_detach(dev_info_t *dip) +{ + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + +#ifdef USBGEM_CONFIG_GLDv3 + /* unregister with gld v3 */ + if (mac_unregister(dp->mh) != DDI_SUCCESS) { + return (DDI_FAILURE); + } +#else + /* unregister with gld v2 */ + if (gld_unregister(dp->macinfo) != DDI_SUCCESS) { + return (DDI_FAILURE); + } +#endif + /* unregister with hotplug service */ + usb_unregister_hotplug_cbs(dip); + + /* stop tx watchdog watcher*/ + usbgem_tx_watcher_stop(dp); + + /* stop the link manager */ + usbgem_mii_stop(dp); + + /* unregister with usb service */ + (void) usbgem_free_memory(dp); + (void) usbgem_close_pipes(dp); + usb_client_detach(dp->dip, dp->reg_data); + dp->reg_data = NULL; + + /* unregister with kernel statistics */ +#ifdef USBGEM_CONFIG_GLDv3 +#ifdef USBGEM_CONFIG_ND + /* release ndd resources */ + usbgem_nd_cleanup(dp); +#endif +#else + /* destroy kstat objects */ + kstat_delete(dp->ksp); +#endif + + /* release locks and condition variables */ + mutex_destroy(&dp->txlock); + mutex_destroy(&dp->rxlock); + cv_destroy(&dp->tx_drain_cv); + cv_destroy(&dp->rx_drain_cv); + rw_destroy(&dp->dev_state_lock); + mutex_destroy(&dp->link_watcher_lock); + cv_destroy(&dp->link_watcher_wait_cv); + sema_destroy(&dp->hal_op_lock); + sema_destroy(&dp->rxfilter_lock); + + /* release basic memory resources */ +#ifndef USBGEM_CONFIG_GLDv3 + gld_mac_free(dp->macinfo); +#endif + kmem_free((caddr_t)(dp->private), dp->priv_size); + kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(&dp->ugc)); + + DPRINTF(2, (CE_CONT, "!%s: %s: return: success", + ddi_driver_name(dip), __func__)); + + return (DDI_SUCCESS); +} + +int 
+usbgem_mod_init(struct dev_ops *dop, char *name) +{ +#ifdef USBGEM_CONFIG_GLDv3 + major_t major; + major = ddi_name_to_major(name); + if (major == DDI_MAJOR_T_NONE) { + return (DDI_FAILURE); + } + mac_init_ops(dop, name); +#endif + return (DDI_SUCCESS); +} + +void +usbgem_mod_fini(struct dev_ops *dop) +{ +#ifdef USBGEM_CONFIG_GLDv3 + mac_fini_ops(dop); +#endif +} + +int +usbgem_quiesce(dev_info_t *dip) +{ + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + + ASSERT(dp != NULL); + + if (dp->mac_state != MAC_STATE_DISCONNECTED && + dp->mac_state != MAC_STATE_STOPPED) { + if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) { + (void) usbgem_hal_reset_chip(dp); + } + } + + /* devo_quiesce() must return DDI_SUCCESS always */ + return (DDI_SUCCESS); +} diff --git a/usr/src/uts/common/io/usbgem/usbgem.h b/usr/src/uts/common/io/usbgem/usbgem.h new file mode 100644 index 0000000000..80b89a260e --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem.h @@ -0,0 +1,428 @@ +/* + * usbgem.h: General USB to Ethernet MAC driver framework + * @(#)usbgem.h 1.4 12/02/09 + * (C) Copyright 2003-2009 Masayuki Murayama KHF04453@nifty.ne.jp + */ + +#ifndef __USBGEM_H__ +#define __USBGEM_H__ + +#pragma ident "@(#)usbgem.h 1.4 12/02/09" + +#ifdef USBGEM_CONFIG_GLDv3 +#include <sys/mac.h> +#ifndef MAC_VERSION +#include <sys/mac_provider.h> +#endif +#include <sys/mac_ether.h> +#else +#include <sys/gld.h> +#endif /* GLDv3 */ + +/* + * Useful macros and typedefs + */ +#define USBGEM_NAME_LEN 32 + +#define USBGEM_TX_TIMEOUT (drv_usectohz(3*1000000)) +#define USBGEM_TX_TIMEOUT_INTERVAL (drv_usectohz(1*1000000)) +#define USBGEM_LINK_WATCH_INTERVAL (drv_usectohz(1*1000000)) + +/* general return code */ +#define USBGEM_SUCCESS 0 +#define USBGEM_FAILURE 1 + +/* return code of usbgem_tx_done */ +#define INTR_RESTART_TX 0x80000000U + +struct usbgem_stats { + uint32_t intr; + + uint32_t crc; + uint32_t errrcv; + uint32_t overflow; + uint32_t frame; + uint32_t missed; + uint32_t runt; + uint32_t 
frame_too_long; + uint32_t norcvbuf; + uint32_t sqe; + + uint32_t collisions; + uint32_t first_coll; + uint32_t multi_coll; + uint32_t excoll; + uint32_t xmit_internal_err; + uint32_t nocarrier; + uint32_t defer; + uint32_t errxmt; + uint32_t underflow; + uint32_t xmtlatecoll; + uint32_t noxmtbuf; + uint32_t jabber; + + + uint64_t rbytes; + uint64_t obytes; + uint64_t rpackets; + uint64_t opackets; + uint32_t rbcast; + uint32_t obcast; + uint32_t rmcast; + uint32_t omcast; + uint32_t rcv_internal_err; +}; + +struct mcast_addr { + struct ether_addr addr; + uint32_t hash; +}; + +#define USBGEM_MAXMC 64 +#define USBGEM_MCALLOC (sizeof(struct mcast_addr) * USBGEM_MAXMC) + +#define SLOT(dp, n) ((n) % (dp)->ugc.usbgc_tx_list_max) + +/* + * mac soft state + */ +struct usbgem_dev { + dev_info_t *dip; +#ifdef USBGEM_CONFIG_GLDv3 + mac_handle_t mh; +#else + void *macinfo; /* opaque handle for upper layer */ +#endif + char name[USBGEM_NAME_LEN]; + + /* pointer to usb private data */ + usb_client_dev_data_t *reg_data; + + /* usb handles */ + usb_pipe_handle_t default_pipe; + usb_pipe_handle_t bulkin_pipe; + usb_pipe_handle_t bulkout_pipe; + usb_pipe_handle_t intr_pipe; + + /* usb endpoints */ + usb_ep_descr_t *ep_default; + usb_ep_descr_t *ep_bulkin; + usb_ep_descr_t *ep_bulkout; + usb_ep_descr_t *ep_intr; + + /* usb policies */ + usb_pipe_policy_t policy_default; + usb_pipe_policy_t policy_bulkin; + usb_pipe_policy_t policy_bulkout; + usb_pipe_policy_t policy_interrupt; + + /* MAC address information */ + struct ether_addr cur_addr; + struct ether_addr dev_addr; + + /* RX state and resource management */ + kmutex_t rxlock; + int rx_busy_cnt; + boolean_t rx_active; + kcondvar_t rx_drain_cv; + + /* RX buffer management */ + int rx_buf_len; + + /* TX state and resource management */ + kmutex_t txlock; + int tx_busy_cnt; + usb_bulk_req_t *tx_free_list; + kcondvar_t tx_drain_cv; + clock_t tx_start_time; + int bulkout_timeout; /* in second */ + int tx_max_packets; + int tx_seq_num; 
+ int tx_intr_pended; + + /* NIC state from OS view */ + int nic_state; +#define NIC_STATE_UNKNOWN 0 +#define NIC_STATE_STOPPED 1 +#define NIC_STATE_INITIALIZED 2 +#define NIC_STATE_ONLINE 3 + + /* MAC state from hardware view */ + int mac_state; +#define MAC_STATE_DISCONNECTED 0 /* it includes suspended state too */ +#define MAC_STATE_STOPPED 1 /* powered up / buf not initialized */ +#define MAC_STATE_INITIALIZED 2 /* initialized */ +#define MAC_STATE_ONLINE 3 /* working correctly */ +#define MAC_STATE_ERROR 4 /* need to restart nic */ + + clock_t fatal_error; + + /* robustness: timer and watchdog */ + uint_t tx_watcher_stop; + kt_did_t tx_watcher_did; + kcondvar_t tx_watcher_cv; + kmutex_t tx_watcher_lock; + clock_t tx_watcher_timeout; + clock_t tx_watcher_interval; + + /* MII mamagement */ + boolean_t anadv_autoneg:1; + boolean_t anadv_1000fdx:1; + boolean_t anadv_1000hdx:1; + boolean_t anadv_100t4:1; + boolean_t anadv_100fdx:1; + boolean_t anadv_100hdx:1; + boolean_t anadv_10fdx:1; + boolean_t anadv_10hdx:1; + boolean_t anadv_1000t_ms:2; + boolean_t anadv_pause:1; + boolean_t anadv_asmpause:1; + boolean_t mii_advert_ro:1; + + boolean_t full_duplex:1; + int speed:3; +#define USBGEM_SPD_10 0 +#define USBGEM_SPD_100 1 +#define USBGEM_SPD_1000 2 +#define USBGEM_SPD_NUM 3 + unsigned int flow_control:2; +#define FLOW_CONTROL_NONE 0 +#define FLOW_CONTROL_SYMMETRIC 1 +#define FLOW_CONTROL_TX_PAUSE 2 +#define FLOW_CONTROL_RX_PAUSE 3 + + boolean_t mii_supress_msg:1; + + uint32_t mii_phy_id; + uint16_t mii_status; + uint16_t mii_advert; + uint16_t mii_lpable; + uint16_t mii_exp; + uint16_t mii_ctl1000; + uint16_t mii_stat1000; + uint16_t mii_xstatus; + int8_t mii_phy_addr; /* must be signed */ + + uint16_t mii_status_ro; + uint16_t mii_xstatus_ro; + + int mii_state; +#define MII_STATE_UNKNOWN 0 +#define MII_STATE_RESETTING 1 +#define MII_STATE_AUTONEGOTIATING 2 +#define MII_STATE_AN_DONE 3 +#define MII_STATE_MEDIA_SETUP 4 +#define MII_STATE_LINKUP 5 +#define 
MII_STATE_LINKDOWN 6 + + clock_t mii_last_check; /* in tick */ + clock_t mii_timer; /* in tick */ +#define MII_RESET_TIMEOUT drv_usectohz(1000*1000) +#define MII_AN_TIMEOUT drv_usectohz(5000*1000) +#define MII_LINKDOWN_TIMEOUT drv_usectohz(10000*1000) + + clock_t mii_interval; /* in tick */ + clock_t linkup_delay; /* in tick */ + + uint_t link_watcher_stop; + kt_did_t link_watcher_did; + kcondvar_t link_watcher_wait_cv; + kmutex_t link_watcher_lock; + + krwlock_t dev_state_lock; /* mac_state and nic_state */ + ksema_t hal_op_lock; /* serialize hw operations */ + ksema_t drv_op_lock; /* hotplug op lock */ + + /* multcast list */ + ksema_t rxfilter_lock; + int mc_count; + int mc_count_req; + struct mcast_addr *mc_list; + int rxmode; +#define RXMODE_PROMISC 0x01 +#define RXMODE_ALLMULTI_REQ 0x02 +#define RXMODE_MULTI_OVF 0x04 +#define RXMODE_ENABLE 0x08 +#define RXMODE_ALLMULTI (RXMODE_ALLMULTI_REQ | RXMODE_MULTI_OVF) +#define RXMODE_BITS \ + "\020" \ + "\004ENABLE" \ + "\003MULTI_OVF" \ + "\002ALLMULTI_REQ" \ + "\001PROMISC" + + /* statistcs */ + struct usbgem_stats stats; + + /* pointer to local structure */ + void *private; + int priv_size; + + /* configuration */ + struct usbgem_conf { + /* name */ + char usbgc_name[USBGEM_NAME_LEN]; + int usbgc_ppa; + + /* specification on usb */ + int usbgc_ifnum; /* interface number */ + int usbgc_alt; /* alternate */ + + /* specification on tx engine */ + int usbgc_tx_list_max; + + /* specification on rx engine */ + int usbgc_rx_header_len; + int usbgc_rx_list_max; + + /* time out parameters */ + clock_t usbgc_tx_timeout; + clock_t usbgc_tx_timeout_interval; + + /* flow control */ + int usbgc_flow_control; + + /* MII timeout parameters */ + clock_t usbgc_mii_linkdown_timeout; + clock_t usbgc_mii_link_watch_interval; + clock_t usbgc_mii_reset_timeout; + + clock_t usbgc_mii_an_watch_interval; + clock_t usbgc_mii_an_timeout; + clock_t usbgc_mii_an_wait; + clock_t usbgc_mii_an_delay; + + /* MII configuration */ + int 
usbgc_mii_addr_min; + int usbgc_mii_linkdown_action; + int usbgc_mii_linkdown_timeout_action; +#define MII_ACTION_NONE 0 +#define MII_ACTION_RESET 1 +#define MII_ACTION_RSA 2 + boolean_t usbgc_mii_dont_reset:1; + boolean_t usbgc_mii_an_oneshot:1; + boolean_t usbgc_mii_hw_link_detection:1; + boolean_t usbgc_mii_stop_mac_on_linkdown:1; + uint16_t usbgc_mii_an_cmd; + + /* I/O methods */ + + /* mac operation */ + int (*usbgc_attach_chip)(struct usbgem_dev *dp); + int (*usbgc_reset_chip)(struct usbgem_dev *dp); + int (*usbgc_init_chip)(struct usbgem_dev *dp); + int (*usbgc_start_chip)(struct usbgem_dev *dp); + int (*usbgc_stop_chip)(struct usbgem_dev *dp); + uint32_t (*usbgc_multicast_hash)(struct usbgem_dev *dp, + const uint8_t *); + int (*usbgc_set_rx_filter)(struct usbgem_dev *dp); + int (*usbgc_set_media)(struct usbgem_dev *dp); + int (*usbgc_get_stats)(struct usbgem_dev *dp); + void (*usbgc_interrupt)(struct usbgem_dev *dp, mblk_t *mp); + + /* packet manupilation */ + mblk_t *(*usbgc_tx_make_packet)(struct usbgem_dev *dp, mblk_t *mp); + mblk_t *(*usbgc_rx_make_packet)(struct usbgem_dev *dp, mblk_t *mp); + /* mii operations */ + int (*usbgc_mii_probe)(struct usbgem_dev *dp); + int (*usbgc_mii_init)(struct usbgem_dev *dp); + int (*usbgc_mii_config)(struct usbgem_dev *dp, int *errp); + uint16_t (*usbgc_mii_read)(struct usbgem_dev *dp, uint_t reg, int *errp); + void (*usbgc_mii_write)(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp); + + /* jumbo frame */ + int usbgc_max_mtu; + int usbgc_default_mtu; + int usbgc_min_mtu; + } ugc; + + int misc_flag; +#define USBGEM_VLAN 0x0001 + timeout_id_t intr_watcher_id; + + /* buffer size */ + uint_t mtu; + + /* performance tuning parameters */ + uint_t txthr; /* tx fifo threshoold */ + uint_t txmaxdma; /* tx max dma burst size */ + uint_t rxthr; /* rx fifo threshoold */ + uint_t rxmaxdma; /* tx max dma burst size */ + + /* kstat stuff */ + kstat_t *ksp; + + /* ndd stuff */ + caddr_t nd_data_p; + caddr_t nd_arg_p; + 
+#ifdef USBGEM_DEBUG_LEVEL + int tx_cnt; +#endif +}; + +/* + * Exported functions + */ +int usbgem_ctrl_out(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size); + +int usbgem_ctrl_in(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size); + +int usbgem_ctrl_out_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + uint32_t v); + +int usbgem_ctrl_in_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *valp); + +void usbgem_generate_macaddr(struct usbgem_dev *, uint8_t *); +boolean_t usbgem_get_mac_addr_conf(struct usbgem_dev *); +int usbgem_mii_probe_default(struct usbgem_dev *); +int usbgem_mii_init_default(struct usbgem_dev *); +int usbgem_mii_config_default(struct usbgem_dev *, int *errp); +void usbgem_mii_update_link(struct usbgem_dev *); +void usbgem_restart_tx(struct usbgem_dev *); +boolean_t usbgem_tx_done(struct usbgem_dev *, int); +void usbgem_receive(struct usbgem_dev *); +struct usbgem_dev *usbgem_do_attach(dev_info_t *, + struct usbgem_conf *, void *, int); +int usbgem_do_detach(dev_info_t *); + +uint32_t usbgem_ether_crc_le(const uint8_t *addr); +uint32_t usbgem_ether_crc_be(const uint8_t *addr); + +int usbgem_resume(dev_info_t *); +int usbgem_suspend(dev_info_t *); +int usbgem_quiesce(dev_info_t *); + +#ifdef USBGEM_CONFIG_GLDv3 +#if DEVO_REV < 4 +#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \ + DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \ + nodev, NULL, D_MP, NULL) +#else +#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \ + DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \ + nodev, NULL, D_MP, NULL, usbgem_quiesce) +#endif +#else +#define usbgem_getinfo gld_getinfo +#define usbgem_open gld_open +#define usbgem_close gld_close +#define usbgem_wput gld_wput +#define usbgem_wsrv gld_wsrv 
+#define usbgem_rsrv gld_rsrv +#define usbgem_power NULL +#endif +int usbgem_mod_init(struct dev_ops *, char *); +void usbgem_mod_fini(struct dev_ops *); + +#define USBGEM_GET_DEV(dip) \ + ((struct usbgem_dev *)(ddi_get_driver_private(dip))) + +#endif /* __USBGEM_H__ */ diff --git a/usr/src/uts/common/io/usbgem/usbgem_mii.h b/usr/src/uts/common/io/usbgem/usbgem_mii.h new file mode 100644 index 0000000000..2b4176a340 --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem_mii.h @@ -0,0 +1,242 @@ +/* + * gem_mii.h: mii header for gem + * + * Copyright (c) 2002-2007 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +#pragma ident "@(#)gem_mii.h 1.4 07/11/30" + +/* + * gem_mii.h : MII registers + */ +#ifndef _GEM_MII_H_ +#define _GEM_MII_H_ + +#ifdef GEM_CONFIG_GLDv3 +#include <sys/miiregs.h> +#else +#define MII_CONTROL 0 +#define MII_STATUS 1 +#define MII_PHYIDH 2 +#define MII_PHYIDL 3 +#define MII_AN_ADVERT 4 +#define MII_AN_LPABLE 5 +#define MII_AN_EXPANSION 6 +#define MII_AN_NXTPGXMIT 7 +#endif /* GEM_CONFIG_GLDv3 */ + +#define MII_AN_LPANXT 8 +#define MII_MS_CONTROL 9 +#define MII_MS_STATUS 10 +#define MII_XSTATUS 15 + +/* for 1000BaseT support */ +#define MII_1000TC MII_MS_CONTROL +#define MII_1000TS MII_MS_STATUS +#ifndef GEM_CONFIG_GLDv3 +#define MII_CONTROL_RESET 0x8000 +#define MII_CONTROL_LOOPBACK 0x4000 +#define MII_CONTROL_100MB 0x2000 +#define MII_CONTROL_ANE 0x1000 +#define MII_CONTROL_PWRDN 0x0800 +#define MII_CONTROL_ISOLATE 0x0400 +#define MII_CONTROL_RSAN 0x0200 +#define MII_CONTROL_FDUPLEX 0x0100 +#define MII_CONTROL_COLTST 0x0080 +#endif /* !GEM_CONFIG_GLDv3 */ +#define MII_CONTROL_SPEED 0x2040 + +#define MII_CONTROL_10MB 0x0000 +#define MII_CONTROL_1000MB 0x0040 + +#define MII_CONTROL_BITS \ + "\020" \ + "\020RESET" \ + "\017LOOPBACK" \ + "\016100MB" \ + "\015ANE" \ + "\014PWRDN" \ + "\013ISOLATE" \ + "\012RSAN" \ + "\011FDUPLEX" \ + "\010COLTST" \ + "\0071000M" +#ifndef GEM_CONFIG_GLDv3 +#define MII_STATUS_100_BASE_T4 0x8000 +#define MII_STATUS_100_BASEX_FD 0x4000 +#define MII_STATUS_100_BASEX 0x2000 +#define 
MII_STATUS_10_FD 0x1000 +#define MII_STATUS_10 0x0800 +#define MII_STATUS_MFPRMBLSUPR 0x0040 +#define MII_STATUS_ANDONE 0x0020 +#define MII_STATUS_REMFAULT 0x0010 +#define MII_STATUS_CANAUTONEG 0x0008 +#define MII_STATUS_LINKUP 0x0004 +#define MII_STATUS_JABBERING 0x0002 +#define MII_STATUS_EXTENDED 0x0001 +#endif /* !GEM_CONFIG_GLDv3 */ +#define MII_STATUS_XSTATUS 0x0100 +#define MII_STATUS_100_BASE_T2_FD 0x0400 +#define MII_STATUS_100_BASE_T2 0x0200 + +#define MII_STATUS_ABILITY_TECH \ + (MII_STATUS_100_BASE_T4 | \ + MII_STATUS_100_BASEX_FD | \ + MII_STATUS_100_BASEX | \ + MII_STATUS_10 | \ + MII_STATUS_10_FD) + + +#define MII_STATUS_BITS \ + "\020" \ + "\020100_BASE_T4" \ + "\017100_BASEX_FD" \ + "\016100_BASEX" \ + "\01510_BASE_FD" \ + "\01410_BASE" \ + "\013100_BASE_T2_FD" \ + "\012100_BASE_T2" \ + "\011XSTATUS" \ + "\007MFPRMBLSUPR" \ + "\006ANDONE" \ + "\005REMFAULT" \ + "\004CANAUTONEG" \ + "\003LINKUP" \ + "\002JABBERING" \ + "\001EXTENDED" +#ifndef GEM_CONFIG_GLDv3 +#define MII_AN_ADVERT_NP 0x8000 +#define MII_AN_ADVERT_REMFAULT 0x2000 +#define MII_AN_ADVERT_SELECTOR 0x001f +#endif /* !GEM_CONFIG_GLDv3 */ + +#define MII_ABILITY_ASM_DIR 0x0800 /* for annex 28B */ +#ifndef MII_ABILITY_PAUSE +#define MII_ABILITY_PAUSE 0x0400 /* for IEEE 802.3x */ +#endif +#ifndef GEM_CONFIG_GLDv3 +#define MII_ABILITY_100BASE_T4 0x0200 +#define MII_ABILITY_100BASE_TX_FD 0x0100 +#define MII_ABILITY_100BASE_TX 0x0080 +#define MII_ABILITY_10BASE_T_FD 0x0040 +#define MII_ABILITY_10BASE_T 0x0020 +#endif /* !GEM_CONFIG_GLDv3 */ + +#define MII_AN_LPABLE_NP 0x8000 + +#define MII_ABILITY_TECH \ + (MII_ABILITY_100BASE_T4 | \ + MII_ABILITY_100BASE_TX_FD | \ + MII_ABILITY_100BASE_TX | \ + MII_ABILITY_10BASE_T | \ + MII_ABILITY_10BASE_T_FD) + +#define MII_ABILITY_ALL \ + (MII_AN_ADVERT_REMFAULT | \ + MII_ABILITY_ASM_DIR | \ + MII_ABILITY_PAUSE | \ + MII_ABILITY_TECH) + + +#define MII_ABILITY_BITS \ + "\020" \ + "\016REMFAULT" \ + "\014ASM_DIR" \ + "\013PAUSE" \ + "\012100BASE_T4" \ + 
"\011100BASE_TX_FD" \ + "\010100BASE_TX" \ + "\00710BASE_T_FD" \ + "\00610BASE_T" +#ifndef GEM_CONFIG_GLDv3 +#define MII_AN_EXP_PARFAULT 0x0010 +#define MII_AN_EXP_LPCANNXTP 0x0008 +#define MII_AN_EXP_CANNXTPP 0x0004 +#define MII_AN_EXP_PAGERCVD 0x0002 +#define MII_AN_EXP_LPCANAN 0x0001 +#endif /* !GEM_CONFIG_GLDv3 */ + +#define MII_AN_EXP_BITS \ + "\020" \ + "\005PARFAULT" \ + "\004LPCANNXTP" \ + "\003CANNXTPP" \ + "\002PAGERCVD" \ + "\001LPCANAN" + +#define MII_1000TC_TESTMODE 0xe000 +#define MII_1000TC_CFG_EN 0x1000 +#define MII_1000TC_CFG_VAL 0x0800 +#define MII_1000TC_PORTTYPE 0x0400 +#define MII_1000TC_ADV_FULL 0x0200 +#define MII_1000TC_ADV_HALF 0x0100 + +#define MII_1000TC_BITS \ + "\020" \ + "\015CFG_EN" \ + "\014CFG_VAL" \ + "\013PORTTYPE" \ + "\012FULL" \ + "\011HALF" + +#define MII_1000TS_CFG_FAULT 0x8000 +#define MII_1000TS_CFG_MASTER 0x4000 +#define MII_1000TS_LOCALRXOK 0x2000 +#define MII_1000TS_REMOTERXOK 0x1000 +#define MII_1000TS_LP_FULL 0x0800 +#define MII_1000TS_LP_HALF 0x0400 + +#define MII_1000TS_BITS \ + "\020" \ + "\020CFG_FAULT" \ + "\017CFG_MASTER" \ + "\014CFG_LOCALRXOK" \ + "\013CFG_REMOTERXOK" \ + "\012LP_FULL" \ + "\011LP_HALF" + +#define MII_XSTATUS_1000BASEX_FD 0x8000 +#define MII_XSTATUS_1000BASEX 0x4000 +#define MII_XSTATUS_1000BASET_FD 0x2000 +#define MII_XSTATUS_1000BASET 0x1000 + +#define MII_XSTATUS_BITS \ + "\020" \ + "\0201000BASEX_FD" \ + "\0171000BASEX" \ + "\0161000BASET_FD" \ + "\0151000BASET" + +#define MII_READ_CMD(p, r) \ + ((6<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18)) + +#define MII_WRITE_CMD(p, r, v) \ + ((5<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18) | (2 << 16) | (v)) + +#endif /* _GEM_MII_H_ */ diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c index 0d1132febc..27241894aa 100644 --- a/usr/src/uts/common/io/vioif/vioif.c +++ b/usr/src/uts/common/io/vioif/vioif.c @@ -12,6 +12,7 @@ /* * Copyright 2013 Nexenta Inc. All rights reserved. * Copyright (c) 2014, 2015 by Delphix. 
All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Based on the NetBSD virtio driver by Minoura Makoto. */ @@ -285,6 +286,13 @@ struct vioif_softc { unsigned int sc_tx_csum:1; unsigned int sc_tx_tso4:1; + /* + * For debugging, it is useful to know whether the MAC address we + * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or + * was otherwise generated or set from within the guest. + */ + unsigned int sc_mac_from_host:1; + int sc_mtu; uint8_t sc_mac[ETHERADDRL]; /* @@ -312,7 +320,10 @@ struct vioif_softc { /* Copying small packets turns out to be faster then mapping them. */ unsigned long sc_rxcopy_thresh; unsigned long sc_txcopy_thresh; - /* Some statistic coming here */ + + /* + * Statistics visible through mac: + */ uint64_t sc_ipackets; uint64_t sc_opackets; uint64_t sc_rbytes; @@ -325,6 +336,18 @@ struct vioif_softc { uint64_t sc_notxbuf; uint64_t sc_ierrors; uint64_t sc_oerrors; + + /* + * Internal debugging statistics: + */ + uint64_t sc_rxfail_dma_handle; + uint64_t sc_rxfail_dma_buffer; + uint64_t sc_rxfail_dma_bind; + uint64_t sc_rxfail_chain_undersize; + uint64_t sc_rxfail_no_descriptors; + uint64_t sc_txfail_dma_handle; + uint64_t sc_txfail_dma_bind; + uint64_t sc_txfail_indirect_limit; }; #define ETHER_HEADER_LEN sizeof (struct ether_header) @@ -474,8 +497,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags) if (ddi_dma_alloc_handle(sc->sc_dev, &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmah)) { - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate dma handle for rx buffer"); + sc->sc_rxfail_dma_handle++; goto exit_handle; } @@ -483,8 +505,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags) VIOIF_RX_SIZE + sizeof (struct virtio_net_hdr), &vioif_bufattr, DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_buf, &len, &buf->rb_mapping.vbm_acch)) { - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate rx buffer"); + sc->sc_rxfail_dma_buffer++; goto exit_alloc; } ASSERT(len >= 
VIOIF_RX_SIZE); @@ -493,8 +514,7 @@ vioif_rx_construct(void *buffer, void *user_arg, int kmflags) buf->rb_mapping.vbm_buf, len, DDI_DMA_READ | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmac, &buf->rb_mapping.vbm_ncookies)) { - dev_err(sc->sc_dev, CE_WARN, "Can't bind tx buffer"); - + sc->sc_rxfail_dma_bind++; goto exit_bind; } @@ -716,27 +736,24 @@ vioif_add_rx(struct vioif_softc *sc, int kmflag) struct vioif_rx_buf *buf; ve = vq_alloc_entry(sc->sc_rx_vq); - if (!ve) { + if (ve == NULL) { /* * Out of free descriptors - ring already full. - * It would be better to update sc_norxdescavail - * but MAC does not ask for this info, hence we - * update sc_norecvbuf. */ + sc->sc_rxfail_no_descriptors++; sc->sc_norecvbuf++; goto exit_vq; } buf = sc->sc_rxbufs[ve->qe_index]; - if (!buf) { + if (buf == NULL) { /* First run, allocate the buffer. */ buf = kmem_cache_alloc(sc->sc_rxbuf_cache, kmflag); sc->sc_rxbufs[ve->qe_index] = buf; } /* Still nothing? Bye. */ - if (!buf) { - dev_err(sc->sc_dev, CE_WARN, "Can't allocate rx buffer"); + if (buf == NULL) { sc->sc_norecvbuf++; goto exit_buf; } @@ -789,20 +806,19 @@ static int vioif_populate_rx(struct vioif_softc *sc, int kmflag) { int i = 0; - int ret; for (;;) { - ret = vioif_add_rx(sc, kmflag); - if (ret) + if (vioif_add_rx(sc, kmflag) != DDI_SUCCESS) { /* * We could not allocate some memory. Try to work with * what we've got. 
*/ break; + } i++; } - if (i) + if (i != 0) virtio_sync_vq(sc->sc_rx_vq); return (i); @@ -823,8 +839,7 @@ vioif_process_rx(struct vioif_softc *sc) ASSERT(buf); if (len < sizeof (struct virtio_net_hdr)) { - dev_err(sc->sc_dev, CE_WARN, "RX: Cnain too small: %u", - len - (uint32_t)sizeof (struct virtio_net_hdr)); + sc->sc_rxfail_chain_undersize++; sc->sc_ierrors++; virtio_free_chain(ve); continue; @@ -838,7 +853,7 @@ vioif_process_rx(struct vioif_softc *sc) */ if (len < sc->sc_rxcopy_thresh) { mp = allocb(len, 0); - if (!mp) { + if (mp == NULL) { sc->sc_norecvbuf++; sc->sc_ierrors++; @@ -855,7 +870,7 @@ vioif_process_rx(struct vioif_softc *sc) buf->rb_mapping.vbm_buf + sizeof (struct virtio_net_hdr) + VIOIF_IP_ALIGN, len, 0, &buf->rb_frtn); - if (!mp) { + if (mp == NULL) { sc->sc_norecvbuf++; sc->sc_ierrors++; @@ -901,31 +916,32 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) struct vioif_tx_buf *buf; uint32_t len; mblk_t *mp; - int i = 0; + unsigned chains = 0; while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) { /* We don't chain descriptors for tx, so don't expect any. 
*/ - ASSERT(!ve->qe_next); + ASSERT(ve->qe_next == NULL); buf = &sc->sc_txbufs[ve->qe_index]; mp = buf->tb_mp; buf->tb_mp = NULL; - if (mp) { - for (i = 0; i < buf->tb_external_num; i++) + if (mp != NULL) { + for (int i = 0; i < buf->tb_external_num; i++) { (void) ddi_dma_unbind_handle( buf->tb_external_mapping[i].vbm_dmah); + } } virtio_free_chain(ve); /* External mapping used, mp was not freed in vioif_send() */ - if (mp) + if (mp != NULL) freemsg(mp); - i++; + chains++; } - if (sc->sc_tx_stopped && i) { + if (sc->sc_tx_stopped != 0 && chains > 0) { sc->sc_tx_stopped = 0; mac_tx_update(sc->sc_mac_handle); } @@ -962,8 +978,7 @@ vioif_tx_lazy_handle_alloc(struct vioif_softc *sc, struct vioif_tx_buf *buf, &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL, &buf->tb_external_mapping[i].vbm_dmah); if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate dma handle for external tx buffer"); + sc->sc_txfail_dma_handle++; } } @@ -1017,17 +1032,14 @@ vioif_tx_external(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp, DDI_DMA_SLEEP, NULL, &dmac, &ncookies); if (ret != DDI_SUCCESS) { + sc->sc_txfail_dma_bind++; sc->sc_oerrors++; - dev_err(sc->sc_dev, CE_NOTE, - "TX: Failed to bind external handle"); goto exit_bind; } /* Check if we still fit into the indirect table. */ if (virtio_ve_indirect_available(ve) < ncookies) { - dev_err(sc->sc_dev, CE_NOTE, - "TX: Indirect descriptor table limit reached." - " It took %d fragments.", i); + sc->sc_txfail_indirect_limit++; sc->sc_notxbuf++; sc->sc_oerrors++; @@ -1086,7 +1098,7 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp) ve = vq_alloc_entry(sc->sc_tx_vq); - if (!ve) { + if (ve == NULL) { sc->sc_notxbuf++; /* Out of free descriptors - try later. 
*/ return (B_FALSE); @@ -1138,9 +1150,9 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp) /* meanwhile update the statistic */ if (mp->b_rptr[0] & 0x1) { if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0) - sc->sc_multixmt++; - else - sc->sc_brdcstxmt++; + sc->sc_multixmt++; + else + sc->sc_brdcstxmt++; } /* @@ -1202,8 +1214,7 @@ vioif_start(void *arg) { struct vioif_softc *sc = arg; - mac_link_update(sc->sc_mac_handle, - vioif_link_state(sc)); + mac_link_update(sc->sc_mac_handle, vioif_link_state(sc)); virtio_start_vq_intr(sc->sc_rx_vq); @@ -1404,10 +1415,8 @@ vioif_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_PRIVATE: bzero(valstr, sizeof (valstr)); if (strcmp(pr_name, vioif_txcopy_thresh) == 0) { - value = sc->sc_txcopy_thresh; - } else if (strcmp(pr_name, - vioif_rxcopy_thresh) == 0) { + } else if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) { value = sc->sc_rxcopy_thresh; } else { return; @@ -1483,7 +1492,6 @@ vioif_show_features(struct vioif_softc *sc, const char *prefix, bufp += virtio_show_features(features, bufp, bufend - bufp); *bufp = '\0'; - /* Using '!' to only CE_NOTE this to the system log. */ dev_err(sc->sc_dev, CE_NOTE, "!%s Vioif (%b)", buf, features, VIRTIO_NET_FEATURE_BITS); @@ -1512,8 +1520,8 @@ vioif_dev_features(struct vioif_softc *sc) sc->sc_virtio.sc_features); if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) { - dev_err(sc->sc_dev, CE_NOTE, - "Host does not support RING_INDIRECT_DESC, bye."); + dev_err(sc->sc_dev, CE_WARN, + "Host does not support RING_INDIRECT_DESC. Cannot attach."); return (DDI_FAILURE); } @@ -1535,6 +1543,7 @@ vioif_set_mac(struct vioif_softc *sc) virtio_write_device_config_1(&sc->sc_virtio, VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]); } + sc->sc_mac_from_host = 0; } /* Get the mac address out of the hardware, or make up one. 
*/ @@ -1548,8 +1557,7 @@ vioif_get_mac(struct vioif_softc *sc) &sc->sc_virtio, VIRTIO_NET_CONFIG_MAC + i); } - dev_err(sc->sc_dev, CE_NOTE, "Got MAC address from host: %s", - ether_sprintf((struct ether_addr *)sc->sc_mac)); + sc->sc_mac_from_host = 1; } else { /* Get a few random bytes */ (void) random_get_pseudo_bytes(sc->sc_mac, ETHERADDRL); @@ -1561,7 +1569,7 @@ vioif_get_mac(struct vioif_softc *sc) vioif_set_mac(sc); dev_err(sc->sc_dev, CE_NOTE, - "Generated a random MAC address: %s", + "!Generated a random MAC address: %s", ether_sprintf((struct ether_addr *)sc->sc_mac)); } } @@ -1624,7 +1632,7 @@ vioif_check_features(struct vioif_softc *sc) if (!vioif_has_feature(sc, VIRTIO_NET_F_GUEST_CSUM)) { sc->sc_rx_csum = 0; } - cmn_err(CE_NOTE, "Csum enabled."); + dev_err(sc->sc_dev, CE_NOTE, "!Csum enabled."); if (vioif_has_feature(sc, VIRTIO_NET_F_HOST_TSO4)) { @@ -1638,11 +1646,11 @@ vioif_check_features(struct vioif_softc *sc) */ if (!vioif_has_feature(sc, VIRTIO_NET_F_HOST_ECN)) { dev_err(sc->sc_dev, CE_NOTE, - "TSO4 supported, but not ECN. " + "!TSO4 supported, but not ECN. 
" "Not using LSO."); sc->sc_tx_tso4 = 0; } else { - cmn_err(CE_NOTE, "LSO enabled"); + dev_err(sc->sc_dev, CE_NOTE, "!LSO enabled"); } } } @@ -1766,7 +1774,7 @@ vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) vioif_check_features(sc); - if (vioif_alloc_mems(sc)) + if (vioif_alloc_mems(sc) != 0) goto exit_alloc_mems; if ((macp = mac_alloc(MAC_VERSION)) == NULL) { @@ -1854,7 +1862,7 @@ vioif_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - if (sc->sc_rxloan) { + if (sc->sc_rxloan > 0) { dev_err(devinfo, CE_WARN, "!Some rx buffers are still upstream," " not detaching."); return (DDI_FAILURE); diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..e4e700fa12 --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,464 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + +/* + * Frame I/O utility functions + */ + +#include <sys/frameio.h> + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/inttypes.h> + +static kmem_cache_t *frameio_cache; + +int +frameio_init(void) +{ + frameio_cache = kmem_cache_create("frameio_cache", + sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX, + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (frameio_cache == NULL) + return (1); + + return (0); +} + +void +frameio_fini(void) +{ + if (frameio_cache != NULL) + kmem_cache_destroy(frameio_cache); +} + +frameio_t * +frameio_alloc(int kmflags) +{ + return (kmem_cache_alloc(frameio_cache, kmflags)); +} + +void +frameio_free(frameio_t *fio) +{ + return (kmem_cache_free(frameio_cache, fio)); +} + +/* + * Ensure that we don't see any garbage in the framevecs that we're nominally + * supposed to work with. Specifically we want to make sure that the buflen and + * the address are not zero. + */ +static int +frameio_hdr_check_vecs(frameio_t *fio) +{ + int i; + for (i = 0; i < fio->fio_nvecs; i++) + if (fio->fio_vecs[i].fv_buf == NULL || + fio->fio_vecs[i].fv_buflen == 0) + return (EINVAL); + + return (0); +} + +/* + * We have to copy in framevec32_t's. To work around the data model issues and + * trying not to copy memory we first copy in the framevec32_t data into the + * standard fio_vec space. Next we work backwards copying a given framevec32_t + * to a temporary framevec_t and then overwrite the frameio_t's data. Note that + * it is important that we do this in reverse so as to ensure that we don't + * clobber data as the framevec_t is larger than the framevec32_t. 
+ */ +static int +frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr) +{ + framevec32_t *vec32p; + framevec_t fv; + int i; + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + + if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs, + 0) != 0) + return (EFAULT); + + for (i = fio->fio_nvecs - 1; i >= 0; i--) { + fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf; + fv.fv_buflen = vec32p[i].fv_buflen; + fv.fv_actlen = vec32p[i].fv_actlen; + fio->fio_vecs[i].fv_buf = fv.fv_buf; + fio->fio_vecs[i].fv_buflen = fv.fv_buflen; + fio->fio_vecs[i].fv_actlen = fv.fv_actlen; + } + + return (frameio_hdr_check_vecs(fio)); +} + +/* + * Copy in a frame io header into fio with space for up to max_vecs vectors. If + * the frameio contains more vectors than that, EOVERFLOW is returned. mode + * should contain information about the datamodel. + */ +int +frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode) +{ + int model = ddi_model_convert_from(mode & FMODELS); + int cpf = mode & FKIOCTL ? FKIOCTL : 0; + size_t fsize = model == DDI_MODEL_ILP32 ? + sizeof (frameio32_t) : sizeof (frameio_t); + + /* + * The start of the header is the same in all data models for the + * current version. 
+ */ + if (ddi_copyin(addr, fio, fsize, cpf) != 0) + return (EFAULT); + + if (fio->fio_version != FRAMEIO_VERSION_ONE) + return (EINVAL); + + if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0) + return (EINVAL); + + if (fio->fio_nvpf == 0) + return (EINVAL); + + if (fio->fio_nvecs % fio->fio_nvpf != 0) + return (EINVAL); + + if (fio->fio_nvecs > max_vecs) + return (EOVERFLOW); + + addr = (void *)((uintptr_t)addr + fsize); + if (model == DDI_MODEL_ILP32) { + if (cpf != 0) + return (EINVAL); + return (frameio_hdr_copyin_ilp32(fio, addr)); + } + + if (ddi_copyin(addr, &fio->fio_vecs[0], + sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0) + return (EFAULT); + + return (frameio_hdr_check_vecs(fio)); +} + +static mblk_t * +frameio_allocb(size_t sz) +{ + mblk_t *mp; + + mp = allocb(sz, 0); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_DATA; + return (mp); +} + +static int +framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf) +{ + mblk_t *mp; + cpf = cpf != 0 ? FKIOCTL : 0; + + mp = frameio_allocb(fv->fv_buflen); + + if (mp == NULL) { + freemsg(mp); + return (EAGAIN); + } + + if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, + cpf) != 0) { + freemsg(mp); + return (EFAULT); + } + + mp->b_wptr += fv->fv_buflen; + *mpp = mp; + return (0); +} + +/* + * Read a set of frame vectors that make up a single message boundary and return + * that as a single message in *mpp that consists of multiple data parts. + */ +static int +frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf) +{ + int nparts = fio->fio_nvpf; + int part, error; + mblk_t *mp; + + *mpp = NULL; + cpf = cpf != 0 ? 
FKIOCTL : 0; + + /* + * Construct the initial frame + */ + for (part = 0; part < nparts; part++) { + error = framevec_mblk_read(fv, &mp, cpf); + if (error != 0) { + freemsg(*mpp); + return (error); + } + + if (*mpp == NULL) + *mpp = mp; + else + linkb(*mpp, mp); + fv++; + } + + return (0); +} + +/* + * Read data from a series of frameio vectors into a message block chain. A + * given frameio request has a number of discrete messages divided into + * individual vectors based on fio->fio_nvpf. Each discrete message will + * be constructed into a message block chain pointed to by b_next. + * + * If we get an EAGAIN while trying to construct a given message block what we + * return depends on what else we've done so far. If we have successfully + * completed at least one message then we free everything else we've done so + * far and return that. If no messages have been completed we return EAGAIN. If + * instead we encounter a different error, say EFAULT, then all of the fv_actlen + * entries values are undefined. + */ +int +frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf) +{ + int error = ENOTSUP; + int nframes = fio->fio_nvecs / fio->fio_nvpf; + int frame; + framevec_t *fv; + mblk_t *mp, *bmp = NULL; + + /* + * Protect against bogus kernel subsystems. + */ + VERIFY(fio->fio_nvecs > 0); + VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0); + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + fv = &fio->fio_vecs[0]; + for (frame = 0; frame < nframes; frame++, fv += fio->fio_nvpf) { + error = frameio_mblk_read(fio, fv, &mp, cpf); + if (error != 0) + goto failed; + + if (bmp != NULL) + bmp->b_next = mp; + else + *mpp = mp; + bmp = mp; + } + + *nvecs = nframes; + return (0); +failed: + /* + * On EAGAIN we've already taken care of making sure that we have no + * leftover messages, eg. they were never linked in. 
+ */ + if (error == EAGAIN) { + if (frame != 0) + error = 0; + if (nvecs != NULL) + *nvecs = frame; + ASSERT(frame == 0 || *mpp != NULL); + } else { + for (mp = *mpp; mp != NULL; mp = bmp) { + bmp = mp->b_next; + freemsg(mp); + } + if (nvecs != NULL) + *nvecs = 0; + *mpp = NULL; + } + return (error); +} + +size_t +frameio_frame_length(frameio_t *fio, framevec_t *fv) +{ + int i; + size_t len = 0; + + for (i = 0; i < fio->fio_nvpf; i++, fv++) + len += fv->fv_buflen; + + return (len); +} + +/* + * Write a portion of an mblk to the current framevec. + */ +static int +framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff, + size_t foff, int cpf) +{ + ASSERT(len <= MBLKL(mp) - moff); + ASSERT(len <= fv->fv_buflen - fv->fv_actlen); + cpf = cpf != 0 ? FKIOCTL : 0; + + if (ddi_copyout(mp->b_rptr + moff, fv->fv_buf + foff, len, cpf) != 0) + return (EFAULT); + fv->fv_actlen += len; + + return (0); +} + +/* + * Because copying this out to the user might fail we don't want to update the + * b_rptr in case we need to copy it out again. 
+ */ +static int +framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf) +{ + int err; + size_t msize, blksize, len, moff, foff; + + msize = msgsize(mp); + if (msize > frameio_frame_length(fio, fv)) + return (EOVERFLOW); + + moff = 0; + foff = 0; + blksize = MBLKL(mp); + fv->fv_actlen = 0; + while (msize != 0) { + len = MIN(blksize, fv->fv_buflen - fv->fv_actlen); + err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf); + if (err != 0) + return (err); + + msize -= len; + blksize -= len; + moff += len; + foff += len; + + if (blksize == 0 && msize != 0) { + mp = mp->b_cont; + ASSERT(mp != NULL); + moff = 0; + blksize = MBLKL(mp); + } + + if (fv->fv_buflen == fv->fv_actlen && msize != 0) { + fv++; + fv->fv_actlen = 0; + foff = 0; + } + } + + return (0); +} + +int +frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map, + mblk_t *mp, int *nwrite, int cpf) +{ + int mcount = 0; + int ret = 0; + + if (map != MAP_BLK_FRAME) + return (EINVAL); + + while (mp != NULL && mcount < fio->fio_nvecs) { + ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf); + if (ret != 0) + break; + mcount += fio->fio_nvpf; + mp = mp->b_next; + } + + if (ret != 0 && mcount == 0) { + if (nwrite != NULL) + *nwrite = 0; + return (ret); + } + + if (nwrite != NULL) + *nwrite = mcount / fio->fio_nvpf; + + return (0); +} + +/* + * Copy out nframes worth of frameio header data back to userland. 
+ */ +int +frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode) +{ + int i; + int model = ddi_model_convert_from(mode & FMODELS); + framevec32_t *vec32p; + framevec32_t f; + + if (fio->fio_nvecs / fio->fio_nvpf < nframes) + return (EINVAL); + + fio->fio_nvecs = nframes * fio->fio_nvpf; + + if (model == DDI_MODEL_NONE) { + if (ddi_copyout(fio, addr, + sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); + } + + ASSERT(model == DDI_MODEL_ILP32); + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + for (i = 0; i < fio->fio_nvecs; i++) { + f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf; + if (fio->fio_vecs[i].fv_buflen > UINT_MAX || + fio->fio_vecs[i].fv_actlen > UINT_MAX) + return (EOVERFLOW); + f.fv_buflen = fio->fio_vecs[i].fv_buflen; + f.fv_actlen = fio->fio_vecs[i].fv_actlen; + vec32p[i].fv_buf = f.fv_buf; + vec32p[i].fv_buflen = f.fv_buflen; + vec32p[i].fv_actlen = f.fv_actlen; + } + + if (ddi_copyout(fio, addr, + sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +void +frameio_mark_consumed(frameio_t *fio, int nframes) +{ + int i; + + ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes); + for (i = 0; i < nframes * fio->fio_nvpf; i++) + fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen; +} diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c new file mode 100644 index 0000000000..2abb6f9464 --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -0,0 +1,5800 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * vnd - virtual (machine) networking datapath + * + * vnd's purpose is to provide a highly performant data path for Layer 2 network + * traffic and exist side by side an active IP netstack, each servicing + * different datalinks. vnd provides many of the same capabilities as the + * current TCP/IP stack does and some specific to layer two. Specifically: + * + * o Use of the DLD fastpath + * o Packet capture hooks + * o Ability to use hardware capabilities + * o Useful interfaces for handling multiple frames + * + * The following image shows where vnd fits into today's networking stack: + * + * +---------+----------+----------+ + * | libdlpi | libvnd | libsocket| + * +---------+----------+----------+ + * | · · VFS | + * | VFS · VFS +----------+ + * | · | sockfs | + * +---------+----------+----------+ + * | | VND | IP | + * | +----------+----------+ + * | DLD/DLS | + * +-------------------------------+ + * | MAC | + * +-------------------------------+ + * | GLDv3 | + * +-------------------------------+ + * + * ----------------------------------------- + * A Tale of Two Devices - DDI Device Basics + * ----------------------------------------- + * + * vnd presents itself to userland as a character device; however, it also is a + * STREAMS device so that it can interface with dld and the rest of the + * networking stack. Users never interface with the STREAMs devices directly and + * they are purely an implementation detail of vnd. Opening the STREAMS device + * require kcred and as such userland cannot interact with it or push it onto + * the stream head. + * + * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every + * clone gets its own minor number; however, minor nodes are not created in the + * devices tree for these instances. In this state a user may do two different + * things. 
They may issue ioctls that affect global state or they may issue + * ioctls that try to attach it to a given datalink. Once a minor device has + * been attached to a datalink, all operations on it are scoped to that context, + * therefore subsequent global operations are not permitted. + * + * A given device can be linked into the /devices and /dev name space via a link + * ioctl. That ioctl causes a minor node to be created in /devices and then it + * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar + * to, but simpler than, IP's persistence mechanism. + * + * --------------------- + * Binding to a datalink + * --------------------- + * + * Datalinks are backed by the dld (datalink device) and dls (datalink services) + * drivers. These drivers provide a STREAMS device for datalinks on the system + * which are exposed through /dev/net. Userland generally manipulates datalinks + * through libdlpi. When an IP interface is being plumbed up what actually + * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink + * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may + * then negotiate with dld and dls to obtain access to various capabilities + * and fast paths via a series of STREAMS messages. + * + * In vnd, we do the same thing, but we leave our STREAMS module as an + * implementation detail of the system. We don't want users to be able to + * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require + * kcred to manipulate it. Thus, when a user issues a request to attach a + * datalink to a minor instance of the character device, that vnd minor instance + * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. + * vnd does that open using the passed in credentials from the ioctl, not kcred. + * This ensures that users who don't have permission to open the device + * cannot. Once that's been opened, we push on the vnd streams module. 
+ * + * Once the vnd STREAMS instance has been created for this device, e.g. the + * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl + * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. + * This association begins the STREAMS device's initialization. We start up an + * asynchronous state machine that takes care of all the different aspects of + * plumbing up the device with dld and dls and enabling the MAC fast path. We + * need to guarantee to consumers of the character device that by the time their + * ioctl returns, the data path has been fully initialized. + * + * The state progression is fairly linear. There are two general steady states. + * The first is VNS_S_ONLINE, which means that everything is jacked up and good + * to go. The alternative is VNS_S_ZOMBIE, which means that the streams device + * encountered an error or we have finished tearing it down and the character + * device can clean it up. The following is our state progression and the + * meaning of each state: + * + * | + * | + * V + * +---------------+ + * | VNS_S_INITIAL | This is our initial state. Every + * +---------------+ vnd STREAMS device starts here. + * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, e.g. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. 
+ * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DL_ATTACH_REQ or DL_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DL_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of four promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the SAP level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the third promiscuous + * | | mode request. 
+ * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by enabling the final flag + * | | DL_PROMISC_FIXUPS. + * | v + * | +--------------------------+ + * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ enabled FIXUP promiscuous mode. + * | | We specifically do this as we need + * v | to ensure that traffic which is + * | | received by being looped back to us + * | | correctly has checksums fixed. We + * | | leave this state by requesting the + * | | dld/dls capabilities that we can + * v | process. + * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * | | VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. 
+ * | v + * | +---------------------+ + * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and + * | | flushing all data. At this point + * | | any additional data that we receive + * | | will be dropped. We leave this + * v | state by trying to remove multicast + * | | promiscuity. + * | | + * | v + * | +---------------------------------+ + * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------------+ successfully removed multicast + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. We leave this state by trying + * | | to disable SAP level promiscuous + * | | mode. + * | v + * | +---------------------------+ + * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------+ successfully removed SAP level + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. Note that we don't worry + * | | about either of + * | | DL_PROMISC_FIXUPS or + * | | DL_PROMISC_RX_ONLY. If these are + * | | the only two entries left, then we + * | | should have anything that MAC is + * | | doing for us at this point, + * | | therefore it's safe for us to + * | | proceed to unbind, which is how we + * | | leave this state via a + * | v DL_UNBIND_REQ. + * | +-------------------+ + * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind + * | +-------------------+ request went. Regardless of its + * | | success, we always transition to + * | | a zombie state. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finish being + * reaped. Because we have no more + * ways to receive data it should be + * safe to destroy all remaining data + * structures. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. 
A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. + * + * It's quite important that we end up sending the full series of STREAMS + * messages when tearing down. While it's tempting to say that we should just + * rely on the STREAMS device being closed to properly ensure that we have no + * more additional data, that's not sufficient due to our use of direct + * callbacks. DLS does not ensure that by the time we change the direct + * callback (vnd_mac_input) that all callers to it will have been quiesced. + * However, it does guarantee that if we disable promiscuous mode ourselves and + * we turn off the main data path via DL_UNBIND_REQ that it will work. + * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do + * it as part of tearing down the STREAMS device. This ensures that we'll + * quiesce all data before we destroy our data structures and thus we should + * eliminate the race in changing the data function. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * e.g. synchronization primitives are left out. 
+ * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | netstackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. 
In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. + * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. 
Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . 
. . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targeting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restored our flow. 
+ * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . 
* | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in the write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. There is an upper + * bound on the total amount of data and the total number of mblk_t chains. If + * we hit either limit, then we will schedule another drain in the gsqueue and + * go from there. + * + * It's worth taking some time to describe how we interact with gsqueues. vnd + * has a gsqueue_set_t for itself. It's important that it has its own set, as + * the profile of work that vnd does is different from other sub-systems in the + * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. + * Unlike TCP/IP which uses a gsqueue per TCP connection, we end up + * maintaining one for a given device. 
Because of that, we want to use a + * pseudo-random one to try and spread out the load, and picking one at random + * is likely to be just as good as any fancy algorithm we might come up with, + * especially as any two devices could have radically different transmit + * profiles. + * + * While some of the write path may seem complicated, it does allow us to + * maintain an important property. Once we have acknowledged a write(9E) or + * frameio ioctl, we will not drop the packet, excepting something like ipf via + * the firewall hooks. + * + * There is one other source of flow control that can exist in the system which + * is in the form of a barrier. The barrier is an internal mechanism used for + * ensuring that a gsqueue is drained for a given device. We use this as part + * of tearing down. Specifically we disable the write path so nothing new can be + * inserted into the gsqueue and then insert a barrier block. Once the barrier + * block comes out of the gsqueue, then we know nothing else in the gsqueue that + * could refer to the vnd_str_t, being destroyed, exists. + * + * --------------------- + * vnd, zones, netstacks + * --------------------- + * + * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. + * Because of that, vnd is also a netstack module. It registers with the + * netstack sub-system and receives callbacks every time a netstack is created, + * shut down, and destroyed. The netstack callbacks drive the creation and + * destruction of the vnd_pnsd_t structures. + * + * Recall from the earlier architecture diagrams that every vnd device is scoped + * to a netstack and known about by a given vnd_pnsd_t. When that netstack is + * torn down, we also tear down any vnd devices that are hanging around. When + * the netstack is torn down, we know that any zones that are scoped to that + * netstack are being shut down and have no processes remaining. 
This is going + * to be the case whether they are shared or exclusive stack zones. We have to + * perform a careful dance. + * + * There are two different callbacks that happen on tear down, the first is a + * shutdown callback, the second is a destroy callback. When the shutdown + * callback is fired we need to prepare for the netstack to go away and ensure + * that nothing can continue to persist itself. + * + * More specifically, when we get notice of a stack being shutdown we first + * remove the netstack from the global netstack list to ensure that no one new + * can come in and find the netstack and get a reference to it. After that, we + * notify the neti hooks that they're going away. Once that's all done, we get + * to the heart of the matter. + * + * When shutting down there could be any number of outstanding contexts that + * have a reference on the vnd_pnsd_t and on the individual links. However, we + * know that no one new will be able to find the vnd_pnsd_t. To account for + * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with + * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device + * to the netstack's list. If this is set, then they must not append to it. + * Once this is set, we know that the netstack's list of devices can never grow, + * only shrink. + * + * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that + * the container for the device is being destroyed and that we should not allow + * additional references to the device to be created, whether via open, or + * linking. The presence of this bit also allows things like the list ioctl and + * sdev to know not to consider its existence. At the conclusion of this being + * set, we know that no one else should be able to obtain a new reference to the + * device. + * + * Once that has been set for all devices, we go through and remove any existing + * links that have been established in sdev. 
Because doing that may cause the + * final reference for the device to be dropped, which still has a reference to + * the netstack, we have to restart our walk due to dropped locks. We know that + * this walk will eventually complete because the device cannot be relinked and + * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. + * Once that's finished, the shutdown callback returns. + * + * When we reach the destroy callback, we simply wait for references on the + * netstack to disappear. Because the zone has been shut down, all processes in + * it that have open references have been terminated and reaped. Any threads + * that are newly trying to reference it will fail. However, there is one thing + * that can halt this that we have no control over, which is the global zone + * holding open a reference to the device. In this case the zone halt will hang + * in vnd_stack_destroy. Once the last reference is dropped we finish destroying + * the netinfo hooks and free the vnd_pnsd_t. + * + * ---- + * sdev + * ---- + * + * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd + * for both the global and non-global zones. In any given zone we always supply + * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone + * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg. + * if a link was named net0, there would be a /dev/vnd/net0. The global zone can + * also see every link for every zone, ala /dev/net, under + * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device + * named net0, the global zone would have /dev/vnd/turin/net0. + * + * The sdev plugin has three interfaces that it supplies back to sdev. One is to + * validate that a given node is still valid. The next is a callback from sdev + * to say that it is no longer using the node. The third and final one is from + * sdev where it asks us to fill a directory. 
All of the heavy lifting is done + * in directory filling and in validation. We opt not to maintain a reference on + * the device while there is an sdev node present. This makes the removal of + * nodes much simpler and most of the possible failure modes shouldn't cause any + * real problems. For example, the open path has to handle both dev_t's which no + * longer exist and which are no longer linked. + * + * ----- + * hooks + * ----- + * + * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd + * provides these for L3 IPv4 and IPv6 traffic. Each netstack provides these hooks + * in a minimal fashion. While we will allow traffic to be filtered through the + * hooks, we do not provide means for packet injection or additional inspection + * at this time. There are a total of four different events created: + * + * o IPv4 physical in + * o IPv4 physical out + * o IPv6 physical in + * o IPv6 physical out + * + * --------------- + * Synchronization + * --------------- + * + * To make our synchronization simpler, we've put more effort into making the + * metadata/setup paths do more work. That work allows the data paths to make + * assumptions around synchronization that simplify the general case. Each major + * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is + * annotated with the protection that its members receive. The following + * annotations are used: + * + * A Atomics; these values are only modified using atomic operations. + * Currently this only applies to kstat values. + * E Existence; no lock is needed to access this member, it does not + * change while the structure is valid. + * GL Global Lock; these members are protected by the global + * vnd_dev_lock. + * L Locked; access to the member is controlled by a lock that is in + * the structure. + * NSL netstack lock; this member is protected by the containing + * netstack. This only applies to the vnd_dev_t`vdd_nslink. 
+ * X This member is special, and is discussed in this section. + * + * In addition to locking, we also have reference counts on the vnd_dev_t and + * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure. + * With rare exception, once a reference count is decremented, the consumer + * should not assume that the data is valid any more. The only exception to this + * is the case where we're removing an extant reference count from a link into + * /devices or /dev. Reference counts are obtained on these structures as a part + * of looking them up. + * + * # Global Lock Ordering + * ###################### + * + * The following is the order that you must take locks in vnd: + * + * 1) vnd`vnd_dev_lock + * 2) vnd_pnsd_t`vpnd_lock + * 3) vnd_dev_t`vdd_lock + * 4) vnd_str_t`vns_lock + * 5) vnd_data_queue_t`vdq_lock + * + * One must adhere to the following rules: + * + * o You must acquire a lower numbered lock before a higher numbered lock. + * o It is NOT legal to hold two locks of the same level concurrently, eg. you + * can not hold two different vnd_dev_t's vdd_lock at the same time. + * o You may release locks in any order. + * o If you release a lock, you must honor the locking rules before acquiring + * it again. + * o You should not hold any locks when calling any of the rele functions. + * + * # Special Considerations + * ######################## + * + * While most of the locking is what's expected, it's worth going into the + * special nature that a few members hold. Today, only two structures have + * special considerations: the vnd_dev_t and the vnd_str_t. All members with + * special considerations have an additional annotation that describes how you + * should interact with it. + * + * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is + * attached or in the process of attaching. If the code path that goes through + * requires an attached vnd_dev_t, eg. 
the data path and tear down path, then it + * is always legal to dereference that member without a lock held. When they are + * added to the system, they should be done under the vdd_lock and done as part + * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the + * lifetime of the vnd_dev_t. + * + * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it + * always exists as it is a part of the structure. The only time that it's valid + * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag + * set or during tear down. Outside of those paths which are naturally + * serialized, there is no explicit locking around the member. + * + * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not + * initially set as part of creating the structure, but are set as part of + * responding to the association ioctl. Anything in the data path or metadata + * path that requires association may assume that they exist, as we do not kick + * off the state machine until they're set. + * + * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The + * members are designed to be used as part of various operations with the + * gsqueues. A lock isn't needed to use them, but to work with them, the + * appropriate flag in the vnd_str_t`vns_flags must have been set by the current + * thread. Otherwise, it is always fair game to refer to their addresses. Their + * contents are ignored by vnd, but some members are manipulated by the gsqueue + * subsystem. 
+ */ + +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/ddi.h> +#include <sys/ethernet.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/ksynch.h> +#include <sys/taskq_impl.h> +#include <sys/sdt.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/dlpi.h> +#include <sys/cred.h> +#include <sys/id_space.h> +#include <sys/list.h> +#include <sys/ctype.h> +#include <sys/policy.h> +#include <sys/sunldi.h> +#include <sys/cred.h> +#include <sys/strsubr.h> +#include <sys/poll.h> +#include <sys/neti.h> +#include <sys/hook.h> +#include <sys/hook_event.h> +#include <sys/vlan.h> +#include <sys/dld.h> +#include <sys/mac_client.h> +#include <sys/netstack.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/kstat.h> +#include <sys/atomic.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/gsqueue.h> + +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/vnd.h> + +/* + * Globals + */ +static dev_info_t *vnd_dip; +static taskq_t *vnd_taskq; +static kmem_cache_t *vnd_str_cache; +static kmem_cache_t *vnd_dev_cache; +static kmem_cache_t *vnd_pnsd_cache; +static id_space_t *vnd_minors; +static int vnd_list_init = 0; +static sdev_plugin_hdl_t vnd_sdev_hdl; +static gsqueue_set_t *vnd_sqset; + +static kmutex_t vnd_dev_lock; +static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ +static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ + +/* + * STREAMs ioctls + * + * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such + * they aren't a part of the header file. + */ +#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) + +/* + * Private ioctl to associate a given streams instance with a minor instance of + * the character device. 
+ */ +#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) + +typedef struct vnd_strioc_associate { + minor_t vsa_minor; /* minor device node */ + netstackid_t vsa_nsid; /* netstack id */ + vnd_errno_t vsa_errno; /* vnd_errno_t result of the association */ +} vnd_strioc_associate_t; + +typedef enum vnd_strioc_state { + VSS_UNKNOWN = 0, + VSS_COPYIN = 1, + VSS_COPYOUT = 2, +} vnd_strioc_state_t; + +typedef struct vnd_strioc { + vnd_strioc_state_t vs_state; + caddr_t vs_addr; +} vnd_strioc_t; + +/* + * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extant tags. Though + * really, overlap is, at the end of the day, inevitable. + * + * NOTE(review): VND_SQUEUE_TAG_ND_FRAMEIO_WRITE looks like a typo for + * VND_SQUEUE_TAG_VND_FRAMEIO_WRITE; confirm against its users before renaming. + */ +#define VND_SQUEUE_TAG_TX_DRAIN 0x42 +#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 +#define VND_SQUEUE_TAG_VND_WRITE 0x44 +#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 +#define VND_SQUEUE_TAG_STRBARRIER 0x46 + +/* + * vnd reserved names. These are names which are reserved by vnd and thus + * shouldn't be used by some external program. + */ +static char *vnd_reserved_names[] = { + "ctl", + "zone", + NULL +}; + +/* + * vnd's DTrace probe macros + * + * DTRACE_VND* are all for a stable provider. We also have an unstable internal + * set of probes for reference count manipulation. 
+ */ +#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +/* + * Unstable reference count probes. Each takes a vnd_dev_t pointer and reports + * the pointer along with its current vdd_ref. The argument is parenthesized + * before it is dereferenced so that any pointer expression may be passed. + * + * NOTE(review): every expansion in this group ends in ';', so a use followed + * by its own ';' yields an extra empty statement. That is left as-is here + * because the callers are not visible from this section; confirm before + * removing. + */ +#define DTRACE_VND_REFINC(vdp) \ + DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, (vdp)->vdd_ref); +#define DTRACE_VND_REFDEC(vdp) \ + DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, (vdp)->vdd_ref); + + +/* + * Tunables + */ +size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ +size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ + +/* + * These numbers are designed as per-device tunables that are applied when a new + * vnd device is attached. They're a rough stab at what may be a reasonable + * amount of work to do in one burst in an squeue. 
+ */ +size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ +size_t vnd_flush_nburst = 10; /* 10 frames */ + +/* + * Constants related to our sdev plugins + */ +#define VND_SDEV_NAME "vnd" +#define VND_SDEV_ROOT "/dev/vnd" +#define VND_SDEV_ZROOT "/dev/vnd/zone" + +/* + * Statistic macros + */ +#define VND_STAT_INC(vsp, field, val) \ + atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) +#define VND_LATENCY_1MS 1000000 +#define VND_LATENCY_10MS 10000000 +#define VND_LATENCY_100MS 100000000 +#define VND_LATENCY_1S 1000000000 +#define VND_LATENCY_10S 10000000000 + +/* + * Constants for vnd hooks + */ +static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; +#define IPV4_MCAST_LEN 3 +static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +#define IPV6_MCAST_LEN 2 +static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* + * vnd internal data structures and types + */ + +struct vnd_str; +struct vnd_dev; +struct vnd_pnsd; + +/* + * As part of opening the device stream we need to properly communicate with our + * underlying stream. This is a bit of an asynchronous dance and we need to + * properly work with dld to get everything set up. We have to initiate the + * conversation with dld and as such we keep track of our state here. 
+ */ +typedef enum vnd_str_state { + VNS_S_INITIAL = 0, + VNS_S_INFO_SENT, + VNS_S_EXCLUSIVE_SENT, + VNS_S_ATTACH_SENT, + VNS_S_BIND_SENT, + VNS_S_SAP_PROMISC_SENT, + VNS_S_MULTI_PROMISC_SENT, + VNS_S_RX_ONLY_PROMISC_SENT, + VNS_S_FIXUP_PROMISC_SENT, + VNS_S_CAPAB_Q_SENT, + VNS_S_CAPAB_E_SENT, + VNS_S_ONLINE, + VNS_S_SHUTTING_DOWN, + VNS_S_MULTICAST_PROMISCOFF_SENT, + VNS_S_SAP_PROMISCOFF_SENT, + VNS_S_UNBIND_SENT, + VNS_S_ZOMBIE +} vnd_str_state_t; + +typedef enum vnd_str_flags { + VNS_F_NEED_ZONE = 0x1, + VNS_F_TASKQ_DISPATCHED = 0x2, + VNS_F_CONDEMNED = 0x4, + VNS_F_FLOW_CONTROLLED = 0x8, + VNS_F_DRAIN_SCHEDULED = 0x10, + VNS_F_BARRIER = 0x20, + VNS_F_BARRIER_DONE = 0x40 +} vnd_str_flags_t; + +typedef enum vnd_capab_flags { + VNS_C_HCKSUM = 0x1, + VNS_C_DLD = 0x2, + VNS_C_DIRECT = 0x4, + VNS_C_HCKSUM_BADVERS = 0x8 +} vnd_capab_flags_t; + +/* + * Definitions to interact with direct callbacks + */ +typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, + mac_header_info_t *); +typedef uintptr_t vnd_mac_cookie_t; +/* DLD Direct capability function */ +typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); +/* DLD Direct tx function */ +typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +/* DLD Direct function to set flow control callback */ +typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), + void *); +/* DLD Direct function to see if flow controlled still */ +typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); + +/* + * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. 
+ */ +typedef struct vnd_str_capab { + vnd_capab_flags_t vsc_flags; + t_uscalar_t vsc_hcksum_opts; + vnd_dld_cap_t vsc_capab_f; + void *vsc_capab_hdl; + vnd_dld_tx_t vsc_tx_f; + void *vsc_tx_hdl; + vnd_dld_set_fcb_t vsc_set_fcb_f; + void *vsc_set_fcb_hdl; + vnd_dld_is_fc_t vsc_is_fc_f; + void *vsc_is_fc_hdl; + vnd_mac_cookie_t vsc_fc_cookie; + void *vsc_tx_fc_hdl; +} vnd_str_capab_t; + +/* + * The vnd_data_queue is a simple construct for storing a series of messages in + * a queue. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_data_queue { + struct vnd_str *vdq_vns; /* E */ + kmutex_t vdq_lock; + kcondvar_t vdq_ready; /* Uses vdq_lock */ + ssize_t vdq_max; /* L */ + ssize_t vdq_cur; /* L */ + mblk_t *vdq_head; /* L */ + mblk_t *vdq_tail; /* L */ +} vnd_data_queue_t; + +typedef struct vnd_str_stat { + kstat_named_t vks_rbytes; + kstat_named_t vks_rpackets; + kstat_named_t vks_obytes; + kstat_named_t vks_opackets; + kstat_named_t vks_nhookindrops; + kstat_named_t vks_nhookoutdrops; + kstat_named_t vks_ndlpidrops; + kstat_named_t vks_ndataindrops; + kstat_named_t vks_ndataoutdrops; + kstat_named_t vks_tdrops; + kstat_named_t vks_linkname; + kstat_named_t vks_zonename; + kstat_named_t vks_nmacflow; + kstat_named_t vks_tmacflow; + kstat_named_t vks_mac_flow_1ms; + kstat_named_t vks_mac_flow_10ms; + kstat_named_t vks_mac_flow_100ms; + kstat_named_t vks_mac_flow_1s; + kstat_named_t vks_mac_flow_10s; +} vnd_str_stat_t; + +/* + * vnd stream structure + * + * See synchronization section of the big theory statement for member + * annotations. 
+ */ +typedef struct vnd_str { + kmutex_t vns_lock; + kcondvar_t vns_cancelcv; /* Uses vns_lock */ + kcondvar_t vns_barriercv; /* Uses vns_lock */ + kcondvar_t vns_stcv; /* Uses vns_lock */ + vnd_str_state_t vns_state; /* L */ + vnd_str_state_t vns_laststate; /* L */ + vnd_errno_t vns_errno; /* L */ + vnd_str_flags_t vns_flags; /* L */ + vnd_str_capab_t vns_caps; /* L */ + taskq_ent_t vns_tqe; /* L */ + vnd_data_queue_t vns_dq_read; /* E */ + vnd_data_queue_t vns_dq_write; /* E */ + mblk_t *vns_dlpi_inc; /* L */ + queue_t *vns_rq; /* E */ + queue_t *vns_wq; /* E */ + queue_t *vns_lrq; /* E */ + t_uscalar_t vns_dlpi_style; /* L */ + t_uscalar_t vns_minwrite; /* L */ + t_uscalar_t vns_maxwrite; /* L */ + hrtime_t vns_fclatch; /* L */ + hrtime_t vns_fcupdate; /* L */ + kstat_t *vns_kstat; /* E */ + gsqueue_t *vns_squeue; /* E */ + mblk_t vns_drainblk; /* E + X */ + mblk_t vns_barrierblk; /* E + X */ + vnd_str_stat_t vns_ksdata; /* A */ + size_t vns_nflush; /* L */ + size_t vns_bsize; /* L */ + struct vnd_dev *vns_dev; /* E + X */ + struct vnd_pnsd *vns_nsd; /* E + X */ +} vnd_str_t; + +typedef enum vnd_dev_flags { + VND_D_ATTACH_INFLIGHT = 0x001, + VND_D_ATTACHED = 0x002, + VND_D_LINK_INFLIGHT = 0x004, + VND_D_LINKED = 0x008, + VND_D_CONDEMNED = 0x010, + VND_D_ZONE_DYING = 0x020, + VND_D_OPENED = 0x040 +} vnd_dev_flags_t; + +/* + * This represents the data associated with a minor device instance. + * + * See synchronization section of the big theory statement for member + * annotations. 
+ */ +typedef struct vnd_dev { + kmutex_t vdd_lock; + list_node_t vdd_link; /* GL */ + list_node_t vdd_nslink; /* NSL */ + int vdd_ref; /* L */ + vnd_dev_flags_t vdd_flags; /* L */ + minor_t vdd_minor; /* E */ + dev_t vdd_devid; /* E */ + ldi_ident_t vdd_ldiid; /* E */ + ldi_handle_t vdd_ldih; /* X */ + cred_t *vdd_cr; /* X */ + vnd_str_t *vdd_str; /* L */ + struct pollhead vdd_ph; /* E */ + struct vnd_pnsd *vdd_nsd; /* E + X */ + char vdd_datalink[VND_NAMELEN]; /* L */ + char vdd_lname[VND_NAMELEN]; /* L */ +} vnd_dev_t; + +typedef enum vnd_pnsd_flags { + VND_NS_CONDEMNED = 0x1 +} vnd_pnsd_flags_t; + +/* + * Per netstack data structure. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_pnsd { + list_node_t vpnd_link; /* protected by global dev lock */ + zoneid_t vpnd_zid; /* E */ + netstackid_t vpnd_nsid; /* E */ + boolean_t vpnd_hooked; /* E */ + net_handle_t vpnd_neti_v4; /* E */ + hook_family_t vpnd_family_v4; /* E */ + hook_event_t vpnd_event_in_v4; /* E */ + hook_event_t vpnd_event_out_v4; /* E */ + hook_event_token_t vpnd_token_in_v4; /* E */ + hook_event_token_t vpnd_token_out_v4; /* E */ + net_handle_t vpnd_neti_v6; /* E */ + hook_family_t vpnd_family_v6; /* E */ + hook_event_t vpnd_event_in_v6; /* E */ + hook_event_t vpnd_event_out_v6; /* E */ + hook_event_token_t vpnd_token_in_v6; /* E */ + hook_event_token_t vpnd_token_out_v6; /* E */ + kmutex_t vpnd_lock; /* Protects remaining members */ + kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ + int vpnd_ref; /* L */ + vnd_pnsd_flags_t vpnd_flags; /* L */ + list_t vpnd_dev_list; /* L */ +} vnd_pnsd_t; + +static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); + +/* + * Drop function signature. 
+ */ +typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); + +static void +vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + panic("illegal vnd drop"); +} + +static void +vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + mblk_t *mp; + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp->b_next; + vnd_drop_hook_in(vsp, mp, "stream not associated"); + } +} + +static vnd_pnsd_t * +vnd_nsd_lookup(netstackid_t 
nsid) +{ + vnd_pnsd_t *nsp; + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + if (nsp->vpnd_nsid == nsid) { + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref >= 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zid(zoneid_t zid) +{ + netstack_t *ns; + vnd_pnsd_t *nsp; + ns = netstack_find_by_zoneid(zid); + if (ns == NULL) + return (NULL); + nsp = vnd_nsd_lookup(ns->netstack_stackid); + netstack_rele(ns); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zonename(char *zname) +{ + zone_t *zonep; + vnd_pnsd_t *nsp; + + zonep = zone_find_by_name(zname); + if (zonep == NULL) + return (NULL); + + nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); + zone_rele(zonep); + return (nsp); +} + +static void +vnd_nsd_ref(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + /* + * This can only be used on something that has been obtained through + * some other means. As such, the caller should already have a reference + * before adding another one. This function should not be used as a + * means of creating the initial reference. 
+ */ + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static void +vnd_nsd_rele(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref--; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static vnd_dev_t * +vnd_dev_lookup(minor_t m) +{ + vnd_dev_t *vdp; + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + if (vdp->vdd_minor == m) { + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (vdp); +} + +static void +vnd_dev_free(vnd_dev_t *vdp) +{ + /* + * When the STREAM exists we need to go through and make sure + * communication gets torn down. As part of closing the stream, we + * guarantee that nothing else should be able to enter the stream layer + * at this point. That means no one should be able to call + * read(),write() or one of the frameio ioctls. + */ + if (vdp->vdd_flags & VND_D_ATTACHED) { + ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + + /* + * We have to remove ourselves from our parents list now. It is + * really quite important that we have already set the condemend + * flag here so that our containing netstack basically knows + * that we're on the way down and knows not to wait for us. It's + * also important that we do that before we put a rele on the + * the device as that is the point at which it will check again. 
+ */ + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(vdp->vdd_nsd); + vdp->vdd_nsd = NULL; + } + ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); + id_free(vnd_minors, vdp->vdd_minor); + mutex_destroy(&vdp->vdd_lock); + kmem_cache_free(vnd_dev_cache, vdp); +} + +static void +vnd_dev_ref(vnd_dev_t *vdp) +{ + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); +} + +/* + * As part of releasing the hold on this we may tear down a given vnd_dev_t As + * such we need to make sure that we grab the list lock first before grabbing + * the vnd_dev_t's lock to ensure proper lock ordering. + */ +static void +vnd_dev_rele(vnd_dev_t *vdp) +{ + mutex_enter(&vnd_dev_lock); + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref--; + DTRACE_VND_REFDEC(vdp); + if (vdp->vdd_ref > 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return; + } + + /* + * Now that we've removed this from the list, we can go ahead and + * drop the list lock. No one else can find this device and reference + * it. As its reference count is zero, it by definition does not have + * any remaining entries in /devices that could lead someone back to + * this. + */ + vdp->vdd_flags |= VND_D_CONDEMNED; + list_remove(&vnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + vnd_dev_free(vdp); +} + +/* + * Insert a mesage block chain if there's space, otherwise drop it. Return one + * so someone who was waiting for data would now end up having found it. eg. + * caller should consider a broadcast. 
+ */ +static int +vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, + vnd_dropper_f dropf) +{ + size_t msize; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + if (reserved == B_FALSE) { + msize = msgsize(mp); + if (vqp->vdq_cur + msize > vqp->vdq_max) { + dropf(vqp->vdq_vns, mp, "buffer full"); + return (0); + } + vqp->vdq_cur += msize; + } + + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + vqp->vdq_head = mp; + vqp->vdq_tail = mp; + } else { + vqp->vdq_tail->b_next = mp; + vqp->vdq_tail = mp; + } + + return (1); +} + +/* + * Remove a message message block chain. If the amount of space in the buffer + * has changed we return 1. We have no way of knowing whether or not there is + * enough space overall for a given writer who is blocked, so we always end up + * having to return true and thus tell consumers that they should consider + * signalling. + */ +static int +vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) +{ + size_t msize; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(mpp != NULL); + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + *mpp = NULL; + return (0); + } + + mp = vqp->vdq_head; + msize = msgsize(mp); + + vqp->vdq_cur -= msize; + if (mp->b_next == NULL) { + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + /* + * We can't be certain that this is always going to be zero. + * Someone may have basically taken a reservation of space on + * the data queue, eg. claimed spae but not yet pushed it on + * yet. + */ + ASSERT(vqp->vdq_cur >= 0); + } else { + vqp->vdq_head = mp->b_next; + ASSERT(vqp->vdq_cur > 0); + } + mp->b_next = NULL; + *mpp = mp; + return (1); +} + +/* + * Reserve space in the queue. This will bump up the size of the queue and + * entitle the user to push something on later without bumping the space. 
+ */ +static int +vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size >= 0); + + if (size == 0) + return (0); + + if (size + vqp->vdq_cur > vqp->vdq_max) + return (0); + + vqp->vdq_cur += size; + return (1); +} + +static void +vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size > 0); + ASSERT(size <= vqp->vdq_cur); + + vqp->vdq_cur -= size; +} + +static void +vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) +{ + mblk_t *mp, *next; + + mutex_enter(&vqp->vdq_lock); + for (mp = vqp->vdq_head; mp != NULL; mp = next) { + next = mp->b_next; + mp->b_next = NULL; + dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); + } + vqp->vdq_cur = 0; + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + mutex_exit(&vqp->vdq_lock); +} + +static boolean_t +vnd_dq_is_empty(vnd_data_queue_t *vqp) +{ + boolean_t ret; + + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head == NULL) + ret = B_TRUE; + else + ret = B_FALSE; + mutex_exit(&vqp->vdq_lock); + + return (ret); +} + +/* + * Get a network uint16_t from the message and translate it into something the + * host understands. + */ +static int +vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. 
+ */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + int vlan = 0; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. 
+ */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + vlan = 1; + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. + */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. 
+ */ + mblen = msgsize(*mpp); + if ((etype == ETHERTYPE_IP && + mblen - offset < IP_SIMPLE_HDR_LENGTH) || + (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { + ddrop(vsp, *mpp, "packet has invalid IP header"); + *mpp = NULL; + return (1); + } + + info.hpe_protocol = neti; + info.hpe_ifp = (phy_if_t)vsp; + info.hpe_ofp = (phy_if_t)vsp; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) + info.hpe_flags |= HPE_BROADCAST; + else if (etype == ETHERTYPE_IP && + bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + else if (etype == ETHERTYPE_IPV6 && + bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + + if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, + (uintptr_t *)&info.hpe_hdr) != 0) { + ddrop(vsp, *mpp, "packet too small -- " + "unable to find payload"); + *mpp = NULL; + return (1); + } + + if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { + hdrop(vsp, *mpp, "drooped by hooks"); + return (1); + } + + return (0); +} + +/* + * This should not be used for DL_INFO_REQ. 
+ */ +static mblk_t * +vnd_dlpi_alloc(size_t len, t_uscalar_t prim) +{ + mblk_t *mp; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_PROTO; + mp->b_wptr = mp->b_rptr + len; + bzero(mp->b_rptr, len); + ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; + + return (mp); +} + +static void +vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) +{ + mblk_t **mpp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + ASSERT(mp->b_next == NULL); + mpp = &vsp->vns_dlpi_inc; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + *mpp = mp; +} + +static mblk_t * +vnd_dlpi_inc_pop(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vsp->vns_dlpi_inc; + if (mp != NULL) { + VERIFY(mp->b_next == NULL || mp->b_next != mp); + vsp->vns_dlpi_inc = mp->b_next; + mp->b_next = NULL; + } + return (mp); +} + +static int +vnd_st_sinfo(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_info_req_t *dlir; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + vsp->vns_state = VNS_S_INFO_SENT; + cv_broadcast(&vsp->vns_stcv); + + mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_info(vnd_str_t *vsp) +{ + dl_info_ack_t *dlia; + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + dlia = (dl_info_ack_t *)mp->b_rptr; + vsp->vns_dlpi_style = dlia->dl_provider_style; + vsp->vns_minwrite = dlia->dl_min_sdu; + vsp->vns_maxwrite = dlia->dl_max_sdu; + + /* + * At this time we only support DL_ETHER devices. + */ + if (dlia->dl_mac_type != DL_ETHER) { + freemsg(mp); + vsp->vns_errno = VND_E_NOTETHER; + return (1); + } + + /* + * Because vnd operates on entire packets, we need to manually account + * for the ethernet header information. 
We add the size of the + * ether_vlan_header to account for this, regardless if it is using + * vlans or not. + */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. 
+ */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); 
+ } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; mp = mp_chain) { 
+ uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path, we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. + */ + ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); + mp->b_rptr -= mhip->mhi_hdrsize; + vid = VLAN_ID(mhip->mhi_tci); + if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { + bcopy(mp->b_rptr, mp->b_rptr + 4, 12); + mp->b_rptr += 4; + } + + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, + nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, + nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) + continue; + + VND_STAT_INC(vsp, vks_rpackets, 1); + VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); + DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + mutex_enter(&vsp->vns_dq_read.vdq_lock); + signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, + vnd_drop_in); + mutex_exit(&vsp->vns_dq_read.vdq_lock); + + } + + if (signal != 0) { + cv_broadcast(&vsp->vns_dq_read.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); + } + +} + +static void +vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) +{ + VND_STAT_INC(vsp, vks_nmacflow, 1); + VND_STAT_INC(vsp, vks_tmacflow, diff); + if (diff >= VND_LATENCY_1MS) + VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); + if (diff >= VND_LATENCY_10MS) + VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); + if (diff >= VND_LATENCY_100MS) + VND_STAT_INC(vsp, vks_mac_flow_100ms, 
1); + if (diff >= VND_LATENCY_1S) + VND_STAT_INC(vsp, vks_mac_flow_1s, 1); + if (diff >= VND_LATENCY_10S) + VND_STAT_INC(vsp, vks_mac_flow_10s, 1); +} + +/* + * This is a callback from MAC that indicates that we are allowed to send + * packets again. + */ +static void +vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) +{ + vnd_str_t *vsp = arg; + hrtime_t now, diff; + + mutex_enter(&vsp->vns_lock); + now = gethrtime(); + + /* + * Check for the case that we beat vnd_squeue_tx_one to the punch. + * There's also an additional case here that we got notified because + * we're sharing a device that ran out of tx descriptors, even though it + * wasn't because of us. + */ + if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { + vsp->vns_fcupdate = now; + mutex_exit(&vsp->vns_lock); + return; + } + + ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); + ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); + vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = NULL; + diff = now - vsp->vns_fclatch; + vsp->vns_fclatch = 0; + DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, + vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); + /* + * If someone has asked to flush the squeue and thus inserted a barrier, + * than we shouldn't schedule a drain. 
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. 
+ */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= 
VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = (dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. 
+ */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. 
+ */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } +} + +static boolean_t +vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + boolean_t ret = B_TRUE; + mblk_t *mp; + dl_promiscoff_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "promiscoff request"); + ret = B_FALSE; + goto next; + } + + dprp = (dl_promiscoff_req_t *)mp->b_rptr; + dprp->dl_level = type; + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_promiscoff(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. 
+ */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + return; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promiscoff"); + return; + } + + if (cprim != DL_PROMISCOFF_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promiscoff: Got ack/nack for wrong primitive"); + return; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to disable promiscuos mode during " + "vnd teardown"); + } +} + +static boolean_t +vnd_st_sunbind(vnd_str_t *vsp) +{ + mblk_t *mp; + boolean_t ret = B_TRUE; + + mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "unbind request"); + ret = B_FALSE; + goto next; + } + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = VNS_S_UNBIND_SENT; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_unbind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. + */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + goto next; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_unbind"); + goto next; + } + + if (cprim != DL_UNBIND_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_unbind: Got ack/nack for wrong primitive"); + goto next; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to unbind stream during vnd " + "teardown"); + } + +next: + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); +} + +/* + * Perform state transitions. 
This is a one way shot down the flow chart + * described in the big theory statement. + */ +static void +vnd_str_state_transition(void *arg) +{ + boolean_t died = B_FALSE; + vnd_str_t *vsp = arg; + mblk_t *mp; + + mutex_enter(&vsp->vns_lock); + if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && + vsp->vns_state != VNS_S_SHUTTING_DOWN)) { + mutex_exit(&vsp->vns_lock); + return; + } + + /* + * When trying to shut down, or unwinding from a failed enabling, rather + * than immediately entering the ZOMBIE state, we may instead opt to try + * and enter the next state in the progression. This is especially + * important when trying to tear everything down. + */ +loop: + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + 
died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, + VNS_S_FIXUP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_FIXUP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, + VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_MULTICAST_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_SAP_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_sunbind(vsp) == B_FALSE) + goto loop; + break; + case VNS_S_UNBIND_SENT: + vnd_st_unbind(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & 
VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +static int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return (-1); +} + +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return (-1); +} + +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum 
+}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); 
+ net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = 
NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void +vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = 
net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. + */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. 
+ */ +static int +vnd_s_rput(queue_t *q, mblk_t *mp) +{ + t_uscalar_t prim; + int dispatch = 0; + vnd_str_t *vsp = q->q_ptr; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + vnd_drop_ctl(vsp, mp, "PROTO message too short"); + break; + } + + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; + if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { + vnd_drop_ctl(vsp, mp, + "recieved an unsupported dlpi DATA req"); + break; + } + + /* + * Enqueue the entry and fire off a taskq dispatch. + */ + mutex_enter(&vsp->vns_lock); + vnd_dlpi_inc_push(vsp, mp); + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + dispatch = 1; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + } + mutex_exit(&vsp->vns_lock); + if (dispatch != 0) + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, + vsp, 0, &vsp->vns_tqe); + break; + case M_DATA: + vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); + break; + default: + putnext(vsp->vns_rq, mp); + } + return (0); +} + +static void +vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) +{ + int error; + vnd_strioc_t *visp; + + if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || + iocp->ioc_count != TRANSPARENT) { + error = EINVAL; + goto nak; + } + + /* + * All streams ioctls that we support must use kcred as a means to + * distinguish that this is a layered open by the kernel as opposed to + * one by a user who has done an I_PUSH of the module. 
+ */ + if (iocp->ioc_cr != kcred) { + error = EPERM; + goto nak; + } + + if (mp->b_cont == NULL) { + error = EAGAIN; + goto nak; + } + + visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); + ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); + visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; + visp->vs_state = VSS_COPYIN; + + mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); + qreply(q, mp); + + return; + +nak: + if (mp->b_cont != NULL) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + + iocp->ioc_error = error; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); +} + +static void +vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + int error; + vnd_str_state_t state; + struct copyreq *crp; + vnd_strioc_associate_t *vss; + vnd_dev_t *vdp = NULL; + vnd_pnsd_t *nsp = NULL; + char iname[2*VND_NAMELEN]; + zone_t *zone; + vnd_strioc_t *visp; + + visp = (vnd_strioc_t *)csp->cp_private; + + /* If it's not ours, it's not our problem */ + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA"); + } + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* Data is sitting for us in b_cont */ + if (mp->b_cont == NULL || + MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { + kmem_free(visp, sizeof (vnd_strioc_t)); + miocnak(q, mp, 0, EINVAL); + return; + } + + vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; + vdp = vnd_dev_lookup(vss->vsa_minor); + if (vdp == NULL) { + error = EIO; + vss->vsa_errno = VND_E_NODEV; + goto nak; + } + + nsp = vnd_nsd_lookup(vss->vsa_nsid); + if (nsp == NULL) { + error = EIO; + vss->vsa_errno = VND_E_NONETSTACK; + goto nak; + } + + 
mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { + mutex_exit(&vsp->vns_lock); + error = EEXIST; + vss->vsa_errno = VND_E_ASSOCIATED; + goto nak; + } + + vsp->vns_nsd = nsp; + vsp->vns_flags &= ~VNS_F_NEED_ZONE; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + mutex_exit(&vsp->vns_lock); + + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, + &vsp->vns_tqe); + + + /* At this point we need to wait until we have transitioned to ONLINE */ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + state = vsp->vns_state; + mutex_exit(&vsp->vns_lock); + + if (state == VNS_S_ZOMBIE) { + vss->vsa_errno = vsp->vns_errno; + error = EIO; + goto nak; + } + + mutex_enter(&vdp->vdd_lock); + mutex_enter(&vsp->vns_lock); + VERIFY(vdp->vdd_str == NULL); + /* + * Now initialize the remaining kstat properties and let's go ahead and + * create it. + */ + (void) snprintf(iname, sizeof (iname), "z%d_%d", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); + vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", + KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (vsp->vns_kstat == NULL) { + error = EIO; + vss->vsa_errno = VND_E_KSTATCREATE; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + vdp->vdd_str = vsp; + vsp->vns_dev = vdp; + + /* + * Now, it's time to do the las thing that can fail, changing out the + * input function. After this we know that we can receive data, so we + * should make sure that we're ready. 
+ */ + if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { + error = EIO; + vss->vsa_errno = VND_E_DIRECTFAIL; + vdp->vdd_str = NULL; + vsp->vns_dev = NULL; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + + zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); + ASSERT(zone != NULL); + vsp->vns_kstat->ks_data = &vsp->vns_ksdata; + /* Account for zone name */ + vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; + /* Account for eventual link name */ + vsp->vns_kstat->ks_data_size += VND_NAMELEN; + kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + zone_rele(zone); + kstat_install(vsp->vns_kstat); + + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + + /* + * Note that the vnd_str_t does not keep a permanent hold on the + * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what + * the nestack goes through to take care of everything. + */ + vss->vsa_errno = VND_E_SUCCESS; +nak: + if (vdp != NULL) + vnd_dev_rele(vdp); + if (nsp != NULL) + vnd_nsd_rele(nsp); + /* + * Change the copyin request to a copyout. Note that we can't use + * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's + * okay, as the copyin vs. copyout is basically the same. 
+ */ + DB_TYPE(mp) = M_COPYOUT; + visp->vs_state = VSS_COPYOUT; + crp = (struct copyreq *)mp->b_rptr; + crp->cq_private = (void *)visp; + crp->cq_addr = visp->vs_addr; + crp->cq_size = sizeof (vnd_strioc_associate_t); + qreply(q, mp); +} + +static void +vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + ASSERT(csp->cp_private != NULL); + kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA"); + } + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); + return; + } + + /* Ack and let's be done with it all */ + miocack(q, mp, 0, 0); +} + +static int +vnd_s_wput(queue_t *q, mblk_t *mp) +{ + vnd_str_t *vsp = q->q_ptr; + struct copyresp *crp; + vnd_strioc_state_t vstate; + vnd_strioc_t *visp; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); + return (0); + case M_IOCDATA: + crp = (struct copyresp *)mp->b_rptr; + ASSERT(crp->cp_private != NULL); + visp = (vnd_strioc_t *)crp->cp_private; + vstate = visp->vs_state; + ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); + if (vstate == VSS_COPYIN) + vnd_striocdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + else + vnd_stroutdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + return (0); + default: + break; + } + if (q->q_next != NULL) + putnext(q, mp); + else + vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); + + return (0); +} + +static int +vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) +{ + vnd_str_t *vsp; + uint_t rand; + + if (q->q_ptr != NULL) + return (EINVAL); + + if (!(sflag & MODOPEN)) + return (ENXIO); + + if (credp != kcred) + return (EPERM); + + vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); + bzero(vsp, sizeof 
(*vsp)); + mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); + vsp->vns_state = VNS_S_INITIAL; + + mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&vnd_dev_lock); + vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_read.vdq_vns = vsp; + vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_write.vdq_vns = vsp; + mutex_exit(&vnd_dev_lock); + vsp->vns_rq = q; + vsp->vns_wq = WR(q); + q->q_ptr = WR(q)->q_ptr = vsp; + vsp->vns_flags = VNS_F_NEED_ZONE; + vsp->vns_nflush = vnd_flush_nburst; + vsp->vns_bsize = vnd_flush_burst_size; + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); + + /* + * We create our kstat and initialize all of its fields now, but we + * don't install it until we actually do the zone association so we can + * get everything. 
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. 
+ */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. + * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + 
kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
+ */ + if (vc != NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return (NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return (NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. 
+ */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != NULL) { + blocked = B_TRUE; + break; + } + } + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. 
+ */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. + */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. 
/*
 * Validate a caller-supplied device or link name.
 *
 * buf is not assumed to be NUL terminated; at most buflen bytes are
 * examined. A name is accepted only when it is non-empty, is terminated
 * within buflen bytes, and consists solely of alphanumeric characters,
 * ':', '-' and '_'.
 *
 * Returns 1 when the name is valid and 0 otherwise.
 */
static int
vnd_validate_name(const char *buf, size_t buflen)
{
	size_t i, len;

	/*
	 * First make sure a null terminator exists. The index is a size_t
	 * so that it is compared against buflen without a signed/unsigned
	 * conversion.
	 */
	for (len = 0; len < buflen; len++) {
		if (buf[len] == '\0')
			break;
	}
	if (len == 0 || len == buflen)
		return (0);

	for (i = 0; i < len; i++) {
		/* ctype classifiers must not be handed a negative char. */
		unsigned char c = (unsigned char)buf[i];

		if (!isalnum(c) && c != ':' && c != '-' && c != '_')
			return (0);
	}

	return (1);
}
+ */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. + */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. + */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. 
+ */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. 
+ */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. + */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if 
(vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakingly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. 
+ */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + ret = EFAULT; + return (ret); +} + +/* + * Common unlink function. This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. 
+ */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + 
mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; + 
mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t *vdp, 
intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EWOULDBLOCK); + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + } + + ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, + &nwrite, mode & FKIOCTL); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + while (nwrite > 0) { + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); + nwrite--; + } + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + + return (0); +} + +static int +vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + frameio_t *fio; + int ret, nonblock, nframes, i, nread; + size_t maxwrite, minwrite, total, flen; + mblk_t *mp_chain, *mp, *nmp; + vnd_data_queue_t *vqp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + 
mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + /* + * Make sure no single frame is larger than we can accept. + */ + mutex_enter(&vdp->vdd_str->vns_lock); + minwrite = vdp->vdd_str->vns_minwrite; + maxwrite = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + + nframes = fio->fio_nvpf / fio->fio_nvecs; + total = 0; + for (i = 0; i < nframes; i++) { + flen = frameio_frame_length(fio, + &fio->fio_vecs[i*fio->fio_nvpf]); + if (flen < minwrite || flen > maxwrite) { + frameio_free(fio); + return (ERANGE); + } + total += flen; + } + + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, total) == 0) { + if (nonblock != 0) { + frameio_free(fio); + mutex_exit(&vqp->vdq_lock); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * We've reserved our space, let's copyin and go from here. + */ + ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL); + if (ret != 0) { + frameio_free(fio); + vnd_dq_unreserve(vqp, total); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + return (ret); + } + + for (mp = mp_chain; mp != NULL; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + } + + /* + * Update the frameio structure to indicate that we wrote those frames. 
+ */ + frameio_mark_consumed(fio, nread); + ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode); + frameio_free(fio); + + return (ret); +} + +static int +vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode) +{ + const char *link; + uint32_t vers = 1; + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + /* + * Copy all of the members out to userland. + */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + 
ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. + */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = 
vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up new a state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. 
*/ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio. + */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags 
& VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if (ready != 0) { + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); + } + + *reventsp = 0; + if (!anyyet) + *phpp = &vdp->vdd_ph; + + vnd_dev_rele(vdp); + return (0); +} + +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); 
+} + +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. Note that as + * part of unlinking we end up doing a rele of the vnd_dev_t. If + * this is the final hold on the vnd_dev_t then it might try and + * remove itself. Our locking rules requires not to be holding + * any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + * it with the vnd_dev_t->vdd_lock held and will take care of it + * for us. 
Because we don't have a hold on it, we're done at + * this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename and + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack. 
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + dev_t dev; + vnd_dev_t *vdp; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + dev = (uintptr_t)sdev_ctx_vtype_data(ctx); + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + 
mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, S_IFCHR, + vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. + */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. 
+ */ + ret = sdev_plugin_mknod(ctx, "ctl", S_IFCHR, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_WAIT, + GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); +} + +static int 
+vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + &vnd_minfo, + 
NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. 
+ */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index 3cacbe395b..3cb7e7660a 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -53,6 +53,7 @@ #include <sys/vlan.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> +#include <sys/mac_impl.h> #include <sys/mac_flow_impl.h> #include <inet/ip_impl.h> @@ -369,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, bzero(vnic, sizeof (*vnic)); + vnic->vn_ls = LINK_STATE_UNKNOWN; vnic->vn_id = vnic_id; vnic->vn_link_id = linkid; vnic->vn_vrid = vrid; @@ -579,11 +581,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, vnic->vn_enabled = B_TRUE; if (is_anchor) { - mac_link_update(vnic->vn_mh, LINK_STATE_UP); + vnic->vn_ls = LINK_STATE_UP; } else { - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); } + mac_link_update(vnic->vn_mh, vnic->vn_ls); rw_exit(&vnic_lock); @@ -1072,6 +1075,18 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = mac_maxsdu_update(vn->vn_mh, mtu); break; } + case MAC_PROP_VN_PROMISC_FILTERED: { + boolean_t filtered; + + if (pr_valsize < sizeof (filtered)) { + err = EINVAL; + break; + } + + bcopy(pr_val, &filtered, sizeof (filtered)); + mac_set_promisc_filtered(vn->vn_mch, filtered); + break; + } case MAC_PROP_SECONDARY_ADDRS: { mac_secondary_addr_t msa; @@ -1079,6 +1094,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = vnic_set_secondary_macs(vn, &msa); break; } + case MAC_PROP_PRIVATE: { + long val, i; + const char *v; + + if (vn->vn_link_id != DATALINK_INVALID_LINKID || + strcmp(pr_name, "_linkstate") != 0) { + err = ENOTSUP; + break; + } + + for (v = pr_val, i = 0; i < pr_valsize; i++, v++) { + if (*v == '\0') + break; + } + if (i == pr_valsize) { + err = 
EINVAL; + break; + } + + (void) ddi_strtol(pr_val, (char **)NULL, 0, &val); + if (val != LINK_STATE_UP && val != LINK_STATE_DOWN) { + err = EINVAL; + break; + } + vn->vn_ls = val; + mac_link_update(vn->vn_mh, vn->vn_ls); + break; + } default: err = ENOTSUP; break; @@ -1093,11 +1136,29 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, { vnic_t *vn = arg; int ret = 0; + boolean_t out; switch (pr_num) { + case MAC_PROP_VN_PROMISC_FILTERED: + out = mac_get_promisc_filtered(vn->vn_mch); + ASSERT(pr_valsize >= sizeof (boolean_t)); + bcopy(&out, pr_val, sizeof (boolean_t)); + break; case MAC_PROP_SECONDARY_ADDRS: ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val); break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + ret = EINVAL; + break; + } + + if (strcmp(pr_name, "_linkstate") != 0) { + ret = EINVAL; + break; + } + (void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls); + break; default: ret = ENOTSUP; break; @@ -1107,7 +1168,8 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, } /* ARGSUSED */ -static void vnic_m_propinfo(void *m_driver, const char *pr_name, +static void +vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { vnic_t *vn = m_driver; @@ -1150,6 +1212,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_perim_exit(mph); } break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) + break; + + if (strcmp(pr_name, "_linkstate") == 0) { + char buf[16]; + + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + (void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls); + mac_prop_info_set_default_str(prh, buf); + } + break; } } @@ -1222,8 +1296,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type) break; case MAC_NOTE_LINK: - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); + 
mac_link_update(vnic->vn_mh, vnic->vn_ls); break; default: diff --git a/usr/src/uts/common/io/vscan/vscan_svc.c b/usr/src/uts/common/io/vscan/vscan_svc.c index a9817f571f..92eb0901c2 100644 --- a/usr/src/uts/common/io/vscan/vscan_svc.c +++ b/usr/src/uts/common/io/vscan/vscan_svc.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/stat.h> @@ -461,7 +462,7 @@ vscan_svc_scan_file(vnode_t *vp, cred_t *cr, int async) boolean_t allow; clock_t timeout, time_left; - if ((vp == NULL) || (vp->v_path == NULL) || cr == NULL) + if ((vp == NULL) || (vp->v_path == vn_vpath_empty) || cr == NULL) return (0); DTRACE_PROBE2(vscan__scan__file, char *, vp->v_path, int, async); @@ -1080,7 +1081,6 @@ vscan_svc_exempt_file(vnode_t *vp, boolean_t *allow) struct vattr attr; ASSERT(vp != NULL); - ASSERT(vp->v_path != NULL); attr.va_mask = AT_SIZE; diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..2da310ab8d --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,1154 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. + * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. 
The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. + * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + * + * The driver can also act as a multiplexer so that data written to the + * slave side within the zone is also redirected back to another zfd device + * inside the zone for consumption (i.e. it can be read). The intention is + * that a logging process within the zone can consume data that is being + * written by an application onto the primary stream. This is essentially + * a tee off of the primary stream into a log stream. This tee can also be + * configured to be flow controlled via an ioctl. Flow control happens on the + * primary stream and is used to ensure that the log stream receives all of + * the messages off the primary stream when consumption of the data off of + * the log stream gets behind. Configuring for flow control implies that the + * application writing to the primary stream will be blocked when the log + * consumer gets behind. Note that closing the log stream (e.g. 
when the zone + * halts) will cause the loss of all messages queued in the stream. + * + * The zone's zfd device configuration is driven by zoneadmd and a zone mode. + * The mode, which is controlled by the zone attribute "zlog-mode", is somewhat + * of a misnomer since its purpose has evolved. The attribute can have a + * variety of values, but the lowest two positions are used to control how many + * zfd devices are created inside the zone and if the primary stream is a tty. + * + * Here is a summary of how the 4 modes control what zfd devices are created + * and how they're used: + * + * t-: 1 stdio zdev (0) configured as a tty + * --: 3 stdio zdevs (0, 1, 2), not configured as a tty + * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1) + * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4) + * + * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex + * stream which is configured as a tty. That is, ptem, ldterm and ttcompat are + * autopushed onto the stream when the slave side is opened. There is only a + * single zfd dev (0) needed for the primary stream. + * + * When the 'n' flag is set, it is assumed that output logging will be done + * within the zone itself. In this configuration 1 or 2 additional zfd devices, + * depending on tty mode ('t' flag), are created within the zone. An application + * can then configure the zfd streams driver into a multiplexer. Output from + * the stdout/stderr zfd(s) will be teed into the corresponding logging zfd(s) + * within the zone. + * + * The following is a diagram of how this works for a '-n' configuration: + * + * + * zoneadmd (for zlogin -I stdout) + * GZ: ^ + * | + * -------------------------- + * ^ + * NGZ: | + * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout) + * + * There would be a similar path for the app's stderr into zfd4 for the logger + * to consume stderr. 
+ */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> +#include <sys/sdt.h> + +static kmutex_t zfd_mux_lock; + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static void zfd_wput(queue_t *, mblk_t *); +static void zfd_rsrv(queue_t *); +static void zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. + */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. 
+ */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + (int (*)()) zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + (int (*)()) zfd_wput, + (int (*)()) zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. + */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef enum { + ZFD_NO_MUX, + ZFD_PRIMARY_STREAM, + ZFD_LOG_STREAM +} zfd_mux_type_t; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; /* instance info */ + queue_t *zfd_master_rdq; /* GZ read queue */ + queue_t *zfd_slave_rdq; /* in-zone read queue */ + int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */ + int zfd_tty; /* ZFD_MAKETTY - strm mods will push */ + boolean_t zfd_is_flowcon; /* primary stream flow stopped */ + boolean_t zfd_allow_flowcon; /* use flow control */ + zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */ + struct zfd_state *zfd_inst_pri; /* log state's primary ptr */ + struct zfd_state *zfd_inst_log; /* primary state's log ptr */ +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of 
STREAMS modules that are autopushed onto a slave instance when its + * opened, but only if the ZFD_MAKETTY ioctl has first been received by the + * master. + */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL); + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + mutex_destroy(&zfd_mux_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. 
+ */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + zfds->zfd_muxt = ZFD_NO_MUX; + zfds->zfd_inst_log = NULL; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. + * e.g.: given the slave's write queue, return the master's write queue. 
+ */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. 
Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + /* A log stream is read-only */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + (oflag & (FREAD | FWRITE)) != FREAD) + return (EINVAL); + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. 
+ */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. 
+ */
+static int
+zfd_open(queue_t *rqp,		/* pointer to the read side queue */
+    dev_t   *devp,		/* pointer to stream tail's dev */
+    int	    oflag,		/* the user open(2) supplied flags */
+    int	    sflag,		/* open state flag */
+    cred_t  *credp)		/* credentials */
+{
+	zfd_state_t *state;
+	zfd_state_t *pri;
+	int err;
+
+	/* Only ordinary driver opens are accepted. */
+	if (sflag != 0)
+		return (EINVAL);
+
+	state = ddi_get_soft_state(zfd_soft_state, ZFD_INSTANCE(*devp));
+	if (state == NULL)
+		return (ENXIO);
+
+	/* Dispatch on which side of the instance this minor names. */
+	switch (ZFD_NODE(*devp)) {
+	case ZFD_MASTER_MINOR:
+		return (zfd_master_open(state, rqp, devp, oflag, sflag,
+		    credp));
+	case ZFD_SLAVE_MINOR:
+		break;
+	default:
+		return (ENXIO);
+	}
+
+	err = zfd_slave_open(state, rqp, devp, oflag, sflag, credp);
+	if (err != 0 || state->zfd_muxt != ZFD_LOG_STREAM)
+		return (err);
+
+	/*
+	 * The log stream was just opened. If flow control has been enabled
+	 * on its primary stream, release the primary and schedule its master
+	 * read queue so that it can start flowing.
+	 */
+	pri = state->zfd_inst_pri;
+	if (pri->zfd_allow_flowcon) {
+		pri->zfd_is_flowcon = B_FALSE;
+		if (pri->zfd_master_rdq != NULL)
+			qenable(RD(pri->zfd_master_rdq));
+	}
+
+	return (err);
+}
+
+/*
+ * close(9e) entrypoint.
+ */
+/*ARGSUSED1*/
+static int
+zfd_close(queue_t *rqp, int flag, cred_t *credp)
+{
+	zfd_state_t *state = (zfd_state_t *)rqp->q_ptr;
+	zfd_state_t *pri;
+	queue_t *wq;
+	mblk_t *msg;
+	major_t drv_major;
+	minor_t slave_minor;
+
+	if (rqp == state->zfd_master_rdq) {
+		DBG("Closing master side");
+
+		state->zfd_master_rdq = NULL;
+		state->zfd_state &= ~ZFD_STATE_MOPEN;
+
+		/*
+		 * The master's read queue is going away, so schedule the
+		 * slave's write queue to let it flush its messages.
+		 */
+		if (state->zfd_slave_rdq != NULL)
+			qenable(WR(state->zfd_slave_rdq));
+
+		qprocsoff(rqp);
+		rqp->q_ptr = NULL;
+		WR(rqp)->q_ptr = NULL;
+		return (0);
+	}
+
+	/* A queue that belongs to neither side needs no teardown. */
+	if (rqp != state->zfd_slave_rdq)
+		return (0);
+
+	DBG("Closing slave side");
+	state->zfd_state &= ~ZFD_STATE_SOPEN;
+	state->zfd_slave_rdq = NULL;
+
+	/*
+	 * Empty the slave's write queue: hand each message to the master if
+	 * it is still there, otherwise nak ioctls and free everything else.
+	 */
+	wq = WR(rqp);
+	for (msg = getq(wq); msg != NULL; msg = getq(wq)) {
+		if (state->zfd_master_rdq != NULL)
+			putnext(state->zfd_master_rdq, msg);
+		else if (msg->b_datap->db_type == M_IOCTL)
+			miocnak(wq, msg, 0, 0);
+		else
+			freemsg(msg);
+	}
+
+	/*
+	 * The slave's read queue is going away, so schedule the master's
+	 * write queue to let it flush its messages.
+	 */
+	if (state->zfd_master_rdq != NULL)
+		qenable(WR(state->zfd_master_rdq));
+
+	/*
+	 * When a log stream closes, release its primary stream if flow
+	 * control is in use on it.
+	 */
+	if (state->zfd_muxt == ZFD_LOG_STREAM) {
+		pri = state->zfd_inst_pri;
+		if (pri->zfd_allow_flowcon) {
+			pri->zfd_is_flowcon = B_FALSE;
+			if (pri->zfd_master_rdq != NULL)
+				qenable(RD(pri->zfd_master_rdq));
+		}
+	}
+
+	qprocsoff(rqp);
+	rqp->q_ptr = NULL;
+	WR(rqp)->q_ptr = NULL;
+
+	if (state->zfd_tty == 1) {
+		/*
+		 * Clear the sad configuration so that reopening
+		 * doesn't fail to set up sad configuration.
+		 */
+		drv_major = ddi_driver_major(state->zfd_devinfo);
+		slave_minor = ddi_get_instance(state->zfd_devinfo) << 1 |
+		    ZFD_SLAVE_MINOR;
+		(void) kstr_autopush(CLR_AUTOPUSH, &drv_major, &slave_minor,
+		    NULL, NULL, NULL);
+	}
+
+	return (0);
+}
+
+/*
+ * Process an M_FLUSH message that arrived on write queue qp. The write side
+ * is flushed here; the flush is then forwarded to the partner (if any) as a
+ * read-side flush and/or turned around with qreply().
+ */
+static void
+handle_mflush(queue_t *qp, mblk_t *mp)
+{
+	unsigned char *flagp = mp->b_rptr;
+	queue_t *peer;
+	mblk_t *dup;
+
+	DBG1("M_FLUSH on %s side", zfd_side(qp));
+
+	if (*flagp & FLUSHW) {
+		DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp));
+		flushq(qp, FLUSHDATA);
+		*flagp &= ~FLUSHW;
+		peer = zfd_switch(RD(qp));
+
+		if ((*flagp & FLUSHR) == 0) {
+			/*
+			 * FLUSHW only. Turn it into a FLUSHR; if there is a
+			 * partner, pass it on and we are done.
+			 */
+			*flagp |= FLUSHR;
+			if (peer != NULL) {
+				putnext(peer, mp);
+				return;
+			}
+		} else if (peer != NULL && (dup = copyb(mp)) != NULL) {
+			/*
+			 * FLUSHRW. The partner gets a copy, because the
+			 * original is still needed for the FLUSHR handling
+			 * below.
+			 */
+			putnext(peer, dup);
+		}
+	}
+
+	if ((*flagp & FLUSHR) == 0) {
+		freemsg(mp);
+		return;
+	}
+
+	DBG("qreply(qp) turning FLUSHR around\n");
+	qreply(qp, mp);
+}
+
+/*
+ * Evaluate the various conditionals to determine if we're teeing into a log
+ * stream and if the primary stream should be flow controlled. This function
+ * can set the zfd_is_flowcon flag as a side effect.
+ *
+ * When teeing with flow control, we always queue the teed msg here and if
+ * the queue is getting full, we set zfd_is_flowcon. The primary stream will
+ * always queue when zfd_is_flowcon and will also not be served when
+ * zfd_is_flowcon is set. This causes backpressure on the primary stream
+ * until the teed queue can drain.
+ */
+static void
+zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp)
+{
+	zfd_state_t *logst;
+	queue_t *logq;
+	mblk_t *copy;
+
+	/* Only M_DATA written to a primary stream is ever teed. */
+	if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM || type != M_DATA)
+		return;
+
+	logst = zfds->zfd_inst_log;
+	if (logst == NULL)
+		return;
+
+	ASSERT(logst->zfd_muxt == ZFD_LOG_STREAM);
+
+	/*
+	 * Nobody has the log device open. With flow control enabled the
+	 * primary stream is held back until somebody does.
+	 */
+	if ((logst->zfd_state & ZFD_STATE_SOPEN) == 0) {
+		if (zfds->zfd_allow_flowcon)
+			zfds->zfd_is_flowcon = B_TRUE;
+		return;
+	}
+
+	/* The zfd_slave_rdq is null until the log dev is opened in the zone */
+	logq = RD(logst->zfd_slave_rdq);
+	DTRACE_PROBE2(zfd__tee__check, void *, logq, void *, zfds);
+
+	/*
+	 * Without flow control a full log queue just means this message is
+	 * not teed.
+	 */
+	if (!zfds->zfd_allow_flowcon && (logq->q_flag & QFULL) != 0)
+		return;
+
+	/*
+	 * Duplicate the message for the log stream; the caller keeps the
+	 * original.
+	 */
+	copy = dupmsg(mp);
+	if (copy == NULL) {
+		if (zfds->zfd_allow_flowcon)
+			zfds->zfd_is_flowcon = B_TRUE;
+		return;
+	}
+
+	/* Pass it straight up when nothing is queued ahead of it. */
+	if (logq->q_first == NULL && bcanputnext(logq, copy->b_band)) {
+		putnext(logq, copy);
+		return;
+	}
+
+	/* The logger queue is full, free the msg. */
+	if (putq(logq, copy) == 0)
+		freemsg(copy);
+
+	/*
+	 * With flow control enabled, a log queue over its high water mark
+	 * stops the primary stream. It is released again once the log stream
+	 * has drained.
+	 */
+	if (zfds->zfd_allow_flowcon && logq->q_count > logq->q_hiwat)
+		zfds->zfd_is_flowcon = B_TRUE;
+}
+
+/*
+ * wput(9E) is symmetric for master and slave sides, so this handles both
+ * without splitting the codepath.  (The only exception to this is the
+ * processing of zfd ioctls, which is restricted to the master side.)
+ *
+ * zfd_wput() looks at the other side; if there is no process holding that
+ * side open, it frees the message.
This prevents processes from hanging
+ * if no one is holding open the fd.  Otherwise, it putnext's high
+ * priority messages, putnext's normal messages if possible, and otherwise
+ * enqueues the messages; in the case that something is enqueued, wsrv(9E)
+ * will take care of eventually shuttling I/O to the other side.
+ *
+ * When configured as a multiplexer, then anything written to the stream
+ * from inside the zone is also teed off to the corresponding log stream
+ * for consumption within the zone (i.e. the log stream can be read, but never
+ * written to, by an application inside the zone).
+ *
+ * NOTE(review): the ioctl switch in this function runs for an M_IOCTL
+ * arriving on either side's write queue; nothing here checks that qp belongs
+ * to the master, although this comment describes ioctl processing as
+ * master-only. Confirm whether that is intended.
+ */
+static void
+zfd_wput(queue_t *qp, mblk_t *mp)
+{
+	unsigned char type = mp->b_datap->db_type;
+	zfd_state_t *zfds;
+	struct iocblk *iocbp;
+	boolean_t must_queue = B_FALSE;
+
+	ASSERT(qp->q_ptr);
+
+	DBG1("entering zfd_wput, %s side", zfd_side(qp));
+
+	/*
+	 * Process zfd ioctl messages if qp is the master side's write queue.
+	 * Every recognized command is answered and returns from here;
+	 * unrecognized commands fall through to the generic handling below.
+	 *
+	 * NOTE(review): several paths below pass an errno-looking value
+	 * (EINVAL, ENOTTY) as the last argument of miocack(). Per miocack(9F)
+	 * that argument is the ioctl's return value, so these ioctls complete
+	 * with a non-zero rval instead of failing with that errno. Confirm
+	 * that the consumers rely on this before changing it.
+	 */
+	zfds = (zfd_state_t *)qp->q_ptr;
+
+	if (type == M_IOCTL) {
+		iocbp = (struct iocblk *)(void *)mp->b_rptr;
+
+		switch (iocbp->ioc_cmd) {
+		case ZFD_MAKETTY:
+			zfds->zfd_tty = 1;
+			miocack(qp, mp, 0, 0);
+			return;
+		case ZFD_EOF:
+			if (zfds->zfd_slave_rdq != NULL)
+				(void) putnextctl(zfds->zfd_slave_rdq,
+				    M_HANGUP);
+			miocack(qp, mp, 0, 0);
+			return;
+		case ZFD_HAS_SLAVE:
+			if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+				miocack(qp, mp, 0, 0);
+			} else {
+				miocack(qp, mp, 0, ENOTTY);
+			}
+			return;
+		case ZFD_MUX: {
+			/*
+			 * Setup the multiplexer configuration for the two
+			 * streams.
+			 *
+			 * We expect to be called on the stream that will
+			 * become the log stream and be passed one data block
+			 * with the minor number of the slave side of the
+			 * primary stream.
+			 *
+			 * The check and the update of both instances' mux
+			 * fields are done under zfd_mux_lock.
+			 */
+			int to;
+			int instance;
+			zfd_state_t *prim_zfds;
+
+			if (iocbp->ioc_count != TRANSPARENT ||
+			    mp->b_cont == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			/* Get the primary slave minor device number */
+			to = *(int *)mp->b_cont->b_rptr;
+			instance = ZFD_INSTANCE(to);
+
+			if ((prim_zfds = ddi_get_soft_state(zfd_soft_state,
+			    instance)) == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			/* Disallow changing primary/log once set. */
+			mutex_enter(&zfd_mux_lock);
+			if (zfds->zfd_muxt != ZFD_NO_MUX ||
+			    prim_zfds->zfd_muxt != ZFD_NO_MUX) {
+				mutex_exit(&zfd_mux_lock);
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			zfds->zfd_muxt = ZFD_LOG_STREAM;
+			zfds->zfd_inst_pri = prim_zfds;
+			prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM;
+			prim_zfds->zfd_inst_log = zfds;
+			mutex_exit(&zfd_mux_lock);
+			DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds,
+			    void *, zfds);
+
+			miocack(qp, mp, 0, 0);
+			return;
+		}
+		case ZFD_MUX_FLOWCON: {
+			/*
+			 * We expect this ioctl to be issued against the
+			 * log stream. We don't use the primary stream since
+			 * there can be other streams modules pushed onto that
+			 * stream which would interfere with the ioctl.
+			 *
+			 * The data block carries 0 or 1. Disabling flow
+			 * control also clears any flow-control stop that is
+			 * currently in effect on the primary stream.
+			 */
+			int val;
+			zfd_state_t *prim_zfds;
+
+			if (iocbp->ioc_count != TRANSPARENT ||
+			    mp->b_cont == NULL) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			if (zfds->zfd_muxt != ZFD_LOG_STREAM) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+			prim_zfds = zfds->zfd_inst_pri;
+
+			/* Get the flow control setting */
+			val = *(int *)mp->b_cont->b_rptr;
+			if (val != 0 && val != 1) {
+				miocack(qp, mp, 0, EINVAL);
+				return;
+			}
+
+			prim_zfds->zfd_allow_flowcon = (boolean_t)val;
+			if (!prim_zfds->zfd_allow_flowcon)
+				prim_zfds->zfd_is_flowcon = B_FALSE;
+
+			DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds);
+			miocack(qp, mp, 0, 0);
+			return;
+		}
+		default:
+			break;
+		}
+	}
+
+	/* if this is the slave's write queue, may need to tee */
+	if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) {
+		/* tee output to any attached log stream */
+		zfd_tee_handler(zfds, type, mp);
+
+		/* high-priority msgs are not subject to flow control */
+		if (zfds->zfd_is_flowcon && type == M_DATA)
+			must_queue = B_TRUE;
+	}
+
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG1("wput to %s side (no one listening)", zfd_side(qp));
+		switch (type) {
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		case M_IOCTL:
+			miocnak(qp, mp, 0, 0);
+			break;
+		default:
+			freemsg(mp);
+			break;
+		}
+		return;
+	}
+
+	if (type >= QPCTL) {
+		DBG1("(hipri) wput, %s side", zfd_side(qp));
+		switch (type) {
+		case M_READ: /* supposedly from ldterm? */
+			DBG("zfd_wput: tossing M_READ\n");
+			freemsg(mp);
+			break;
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		default:
+			/*
+			 * Put this to the other side.
+			 */
+			ASSERT(zfd_switch(RD(qp)) != NULL);
+			putnext(zfd_switch(RD(qp)), mp);
+			break;
+		}
+		DBG1("done (hipri) wput, %s side", zfd_side(qp));
+		return;
+	}
+
+	/*
+	 * If the primary stream has been stopped for flow control then
+	 * enqueue the msg, otherwise only putnext if there isn't already
+	 * something in the queue. If we don't do this then things would wind
+	 * up out of order.
+	 *
+	 * qp is a write queue, so zfd_switch(qp) yields the partner's write
+	 * queue and RD() of that is the partner's read queue.
+	 */
+	if (!must_queue && qp->q_first == NULL &&
+	    bcanputnext(RD(zfd_switch(qp)), mp->b_band)) {
+		putnext(RD(zfd_switch(qp)), mp);
+	} else {
+		/*
+		 * zfd_wsrv expects msgs queued on the primary queue. Those
+		 * will be handled by zfd_wsrv after zfd_rsrv performs the
+		 * qenable on the proper queue.
+		 */
+		(void) putq(qp, mp);
+	}
+
+	DBG1("done wput, %s side", zfd_side(qp));
+}
+
+/*
+ * Read server
+ *
+ * For primary stream:
+ * Under normal execution rsrv(9E) is symmetric for master and slave, so
+ * zfd_rsrv() can handle both without splitting up the codepath. We do this by
+ * enabling the write side of the partner. This triggers the partner to send
+ * messages queued on its write side to this queue's read side.
+ *
+ * For log stream:
+ * Internally we've queued up the msgs that we've teed off to the log stream
+ * so when we're invoked we need to pass these along. Once the log queue's
+ * q_count has dropped below q_lowat, the primary stream's zfd_is_flowcon is
+ * cleared and the primary's master read queue is enabled.
+ */
+static void
+zfd_rsrv(queue_t *qp)
+{
+	zfd_state_t *zfds;
+	zfds = (zfd_state_t *)qp->q_ptr;
+
+	/*
+	 * log stream server
+	 */
+	if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) {
+		queue_t *log_qp;
+		mblk_t *mp;
+
+		log_qp = RD(zfds->zfd_slave_rdq);
+
+		if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+			zfd_state_t *pzfds = zfds->zfd_inst_pri;
+
+			while ((mp = getq(qp)) != NULL) {
+				if (bcanputnext(log_qp, mp->b_band)) {
+					putnext(log_qp, mp);
+				} else {
+					(void) putbq(log_qp, mp);
+					break;
+				}
+			}
+
+			if (log_qp->q_count < log_qp->q_lowat) {
+				DTRACE_PROBE(zfd__flow__on);
+				pzfds->zfd_is_flowcon = B_FALSE;
+				if (pzfds->zfd_master_rdq != NULL)
+					qenable(RD(pzfds->zfd_master_rdq));
+			}
+		} else {
+			/* No longer open, drain the queue */
+			while ((mp = getq(qp)) != NULL) {
+				freemsg(mp);
+			}
+			flushq(qp, FLUSHALL);
+		}
+		return;
+	}
+
+	/*
+	 * Care must be taken here, as either of the master or slave side
+	 * qptr could be NULL.
+	 */
+	ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq);
+	if (zfd_switch(qp) == NULL) {
+		DBG("zfd_rsrv: other side isn't listening\n");
+		return;
+	}
+	qenable(WR(zfd_switch(qp)));
+}
+
+/*
+ * Write server
+ *
+ * This routine is symmetric for master and slave, so it handles both without
+ * splitting up the codepath.
+ *
+ * If there are messages on this queue that can be sent to the other, send
+ * them via putnext(). Else, if queued messages cannot be sent, leave them
+ * on this queue.
+ */
+static void
+zfd_wsrv(queue_t *qp)
+{
+	queue_t *swq;
+	mblk_t *mp;
+	zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+
+	ASSERT(zfds != NULL);
+
+	/*
+	 * Partner has no read queue, so take the data, and throw it away.
+	 * Queued ioctls are nak'ed rather than silently dropped.
+	 */
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG("zfd_wsrv: other side isn't listening");
+		while ((mp = getq(qp)) != NULL) {
+			if (mp->b_datap->db_type == M_IOCTL)
+				miocnak(qp, mp, 0, 0);
+			else
+				freemsg(mp);
+		}
+		flushq(qp, FLUSHALL);
+		return;
+	}
+
+	swq = RD(zfd_switch(qp));
+
+	/*
+	 * while there are messages on this write queue and this instance is
+	 * not stopped for flow control (zfd_is_flowcon)...
+	 */
+	while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) {
+		/*
+		 * Due to the way zfd_wput is implemented, we should never
+		 * see a high priority control message here.
+		 */
+		ASSERT(mp->b_datap->db_type < QPCTL);
+
+		if (bcanputnext(swq, mp->b_band)) {
+			putnext(swq, mp);
+		} else {
+			(void) putbq(qp, mp);
+			break;
+		}
+	}
+}
diff --git a/usr/src/uts/common/mapfiles/README b/usr/src/uts/common/mapfiles/README
new file mode 100644
index 0000000000..5b65771325
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/README
@@ -0,0 +1,68 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.
A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+Kernel Module Build Time Symbol Verification
+--------------------------------------------
+
+Historically, kernel modules have all been built as relocatable objects.
+They are not dynamic objects and dependency information is always noted
+in individual makefiles. Along with this, there has never been any
+verification of the symbols that are being used. This means that it's
+possible for a kernel module author to refer to a symbol that doesn't
+exist and not find out until they try to install the module.
+
+To help find these problems at build time, we provide an opt-in system
+for modules to use, leveraging the link-editor's '-z defs' option. This
+option ensures that there are no unknown definitions at link-edit time.
+To supply these definitions we supply a series of mapfiles in this
+directory.
+
+These mapfiles are not the traditional versioning mapfiles like those in
+usr/src/lib/README.mapfiles! Please review the following differences
+closely:
+
+* These mapfiles do not declare any versions!
+* These mapfiles do not use the 'SYMBOL_VERSION' directive, instead they
+  use the 'SYMBOL_SCOPE' directive.
+* These mapfiles do not hide symbols! Library mapfiles always have
+  something to catch all local symbols. That should *never* be used
+  here. These mapfiles should not affect visibility.
+* All symbols in these mapfiles should be marked 'EXTERN' to indicate
+  that they are not provided by the kernel module but by another.
+* These mapfiles do not declare what is or isn't a public interface,
+  though they are often grouped around interfaces, to make it easier for
+  a driver author to get this right.
+
+Mapfiles are organized based on kernel module. For example the GLDv3
+device driver interface is provided by the 'mac' module and thus is
+found in the file 'mac.mapfile'. The DDI is currently in the 'ddi'
+mapfile.
Functions that are found in genunix and unix that aren't in
+the DDI should not be put in that mapfile.
+
+Note, the existing files may not be complete. These are intended to only
+have the public interfaces provided by modules and thus should not
+include every symbol in them. As the need arises, add new symbols or
+modules as appropriate.
+
+To opt a module into this, first declare a series of MAPFILES that they
+should check against in the module. This should be a series of one or
+more files, for example:
+
+MAPFILES += ddi mac
+
+Next, you should add an include of Makefile.mapfile right before you
+include Makefile.targ. You can do this with the following line:
+
+include $(UTSBASE)/Makefile.mapfile
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
new file mode 100644
index 0000000000..25aa8ab045
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -0,0 +1,190 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+# usr/src/uts/common/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains core functions provided by the DDI and also items
+# required as part of the platform's runtime ABI (think compiler
+# functions).
+# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + __divdi3 { FLAGS = EXTERN }; + __stack_chk_fail { FLAGS = EXTERN }; + __stack_chk_guard { FLAGS = EXTERN }; + allocb { FLAGS = EXTERN }; + assfail { FLAGS = EXTERN }; + assfail3 { FLAGS = EXTERN }; + atomic_dec_32_nv { FLAGS = EXTERN }; + bcmp { FLAGS = EXTERN }; + bcopy { FLAGS = EXTERN }; + bzero { FLAGS = EXTERN }; + cmn_err { FLAGS = EXTERN }; + cv_broadcast { FLAGS = EXTERN }; + cv_destroy { FLAGS = EXTERN }; + cv_init { FLAGS = EXTERN }; + cv_reltimedwait { FLAGS = EXTERN }; + ddi_cb_register { FLAGS = EXTERN }; + ddi_cb_unregister { FLAGS = EXTERN }; + ddi_dev_regsize { FLAGS = EXTERN }; + ddi_dma_addr_bind_handle { FLAGS = EXTERN }; + ddi_dma_alloc_handle { FLAGS = EXTERN }; + ddi_dma_free_handle { FLAGS = EXTERN }; + ddi_dma_mem_alloc { FLAGS = EXTERN }; + ddi_dma_mem_free { FLAGS = EXTERN }; + ddi_dma_nextcookie { FLAGS = EXTERN }; + ddi_dma_sync { FLAGS = EXTERN }; + ddi_dma_unbind_handle { FLAGS = EXTERN }; + ddi_fm_acc_err_clear { FLAGS = EXTERN }; + ddi_fm_acc_err_get { FLAGS = EXTERN }; + ddi_fm_dma_err_get { FLAGS = EXTERN }; + ddi_fm_ereport_post { FLAGS = EXTERN }; + ddi_fm_fini { FLAGS = EXTERN }; + ddi_fm_handler_register { FLAGS = EXTERN }; + ddi_fm_handler_unregister { FLAGS = EXTERN }; + ddi_fm_init { FLAGS = EXTERN }; + ddi_fm_service_impact { FLAGS = EXTERN }; + ddi_get_driver_private { FLAGS = EXTERN }; + ddi_get_instance { FLAGS = EXTERN }; + ddi_get_lbolt { FLAGS = EXTERN }; + ddi_get_lbolt64 { FLAGS = EXTERN }; + ddi_get_name { FLAGS = EXTERN }; + ddi_get_parent { FLAGS = EXTERN }; + ddi_get16 { FLAGS = EXTERN }; + ddi_get32 { FLAGS = EXTERN }; + ddi_get64 { FLAGS = EXTERN }; + ddi_intr_add_handler { FLAGS = EXTERN }; + ddi_intr_alloc { FLAGS = EXTERN }; + ddi_intr_block_disable { FLAGS = EXTERN }; + ddi_intr_block_enable { FLAGS = EXTERN }; + ddi_intr_disable { FLAGS = EXTERN }; + ddi_intr_enable { FLAGS = EXTERN }; + ddi_intr_free { FLAGS = EXTERN }; + ddi_intr_get_cap { FLAGS = 
EXTERN }; + ddi_intr_get_navail { FLAGS = EXTERN }; + ddi_intr_get_nintrs { FLAGS = EXTERN }; + ddi_intr_get_pri { FLAGS = EXTERN }; + ddi_intr_get_supported_types { FLAGS = EXTERN }; + ddi_intr_remove_handler { FLAGS = EXTERN }; + ddi_periodic_add { FLAGS = EXTERN }; + ddi_periodic_delete { FLAGS = EXTERN }; + ddi_power { FLAGS = EXTERN }; + ddi_prop_free { FLAGS = EXTERN }; + ddi_prop_get_int { FLAGS = EXTERN }; + ddi_prop_lookup_int_array { FLAGS = EXTERN }; + ddi_prop_op { FLAGS = EXTERN }; + ddi_prop_remove_all { FLAGS = EXTERN }; + ddi_prop_update_int_array { FLAGS = EXTERN }; + ddi_prop_update_string { FLAGS = EXTERN }; + ddi_ptob { FLAGS = EXTERN }; + ddi_put16 { FLAGS = EXTERN }; + ddi_put32 { FLAGS = EXTERN }; + ddi_quiesce_not_supported { FLAGS = EXTERN }; + ddi_regs_map_free { FLAGS = EXTERN }; + ddi_regs_map_setup { FLAGS = EXTERN }; + ddi_set_driver_private { FLAGS = EXTERN }; + ddi_strtol { FLAGS = EXTERN }; + ddi_taskq_create { FLAGS = EXTERN }; + ddi_taskq_destroy { FLAGS = EXTERN }; + ddi_taskq_dispatch { FLAGS = EXTERN }; + delay { FLAGS = EXTERN }; + desballoc { FLAGS = EXTERN }; + dev_err { FLAGS = EXTERN }; + drv_usectohz { FLAGS = EXTERN }; + drv_usecwait { FLAGS = EXTERN }; + fm_ena_generate { FLAGS = EXTERN }; + freeb { FLAGS = EXTERN }; + freemsg { FLAGS = EXTERN }; + freemsgchain { FLAGS = EXTERN }; + gethrtime { FLAGS = EXTERN }; + kmem_alloc { FLAGS = EXTERN }; + kmem_free { FLAGS = EXTERN }; + kmem_zalloc { FLAGS = EXTERN }; + kstat_create { FLAGS = EXTERN }; + kstat_delete { FLAGS = EXTERN }; + kstat_install { FLAGS = EXTERN }; + kstat_named_init { FLAGS = EXTERN }; + list_create { FLAGS = EXTERN }; + list_destroy { FLAGS = EXTERN }; + list_head { FLAGS = EXTERN }; + list_insert_tail { FLAGS = EXTERN }; + list_next { FLAGS = EXTERN }; + list_remove { FLAGS = EXTERN }; + memcpy { FLAGS = EXTERN }; + memset { FLAGS = EXTERN }; + miocack { FLAGS = EXTERN }; + miocnak { FLAGS = EXTERN }; + mod_driverops { FLAGS = EXTERN }; + mod_info { 
FLAGS = EXTERN }; + mod_install { FLAGS = EXTERN }; + mod_remove { FLAGS = EXTERN }; + msgpullup { FLAGS = EXTERN }; + msgsize { FLAGS = EXTERN }; + mutex_destroy { FLAGS = EXTERN }; + mutex_enter { FLAGS = EXTERN }; + mutex_exit { FLAGS = EXTERN }; + mutex_init { FLAGS = EXTERN }; + mutex_owned { FLAGS = EXTERN }; + mutex_tryenter { FLAGS = EXTERN }; + nochpoll { FLAGS = EXTERN }; + nodev { FLAGS = EXTERN }; + nulldev { FLAGS = EXTERN }; + panic { FLAGS = EXTERN }; + pci_config_get16 { FLAGS = EXTERN }; + pci_config_get32 { FLAGS = EXTERN }; + pci_config_get64 { FLAGS = EXTERN }; + pci_config_get8 { FLAGS = EXTERN }; + pci_config_put16 { FLAGS = EXTERN }; + pci_config_put32 { FLAGS = EXTERN }; + pci_config_put64 { FLAGS = EXTERN }; + pci_config_put8 { FLAGS = EXTERN }; + pci_config_setup { FLAGS = EXTERN }; + pci_config_teardown { FLAGS = EXTERN }; + pci_ereport_post { FLAGS = EXTERN }; + pci_ereport_setup { FLAGS = EXTERN }; + pci_ereport_teardown { FLAGS = EXTERN }; + pci_lcap_locate { FLAGS = EXTERN }; + qreply { FLAGS = EXTERN }; + rw_destroy { FLAGS = EXTERN }; + rw_enter { FLAGS = EXTERN }; + rw_exit { FLAGS = EXTERN }; + rw_init { FLAGS = EXTERN }; + snprintf { FLAGS = EXTERN }; + sprintf { FLAGS = EXTERN }; + strcat { FLAGS = EXTERN }; + strcmp { FLAGS = EXTERN }; + strcpy { FLAGS = EXTERN }; + strlen { FLAGS = EXTERN }; + timeout { FLAGS = EXTERN }; + untimeout { FLAGS = EXTERN }; + vsnprintf { FLAGS = EXTERN }; + vsprintf { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/dtrace.mapfile.awk b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk new file mode 100644 index 0000000000..b8a7e2d372 --- /dev/null +++ b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk @@ -0,0 +1,34 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# This script is designed to assemble a mapfile for DTrace probes. +# +BEGIN { + print "#" + print "# This file is autogenerated by dtrace.mapfile.awk" + print "#" + print "$mapfile_version 2" + print "SYMBOL_SCOPE {" + print " global:" +} + +/__dtrace_probe_/ { + printf "\t%s\t{ FLAGS = EXTERN };\n", $1 +} + +END { + print "};" +} diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile new file mode 100644 index 0000000000..6bddb3c7ef --- /dev/null +++ b/usr/src/uts/common/mapfiles/kernel.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +# +# This file contains functions provided by the kernel that various +# modules use. This is a combination of things in both unix and genunix. 
+# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + bt_getlowbit { FLAGS = EXTERN }; + servicing_interrupt { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/mac.mapfile b/usr/src/uts/common/mapfiles/mac.mapfile new file mode 100644 index 0000000000..30462f80d5 --- /dev/null +++ b/usr/src/uts/common/mapfiles/mac.mapfile @@ -0,0 +1,55 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + mac_alloc { FLAGS = EXTERN }; + mac_fini_ops { FLAGS = EXTERN }; + mac_free { FLAGS = EXTERN }; + mac_hcksum_get { FLAGS = EXTERN }; + mac_hcksum_set { FLAGS = EXTERN }; + mac_init_ops { FLAGS = EXTERN }; + mac_link_update { FLAGS = EXTERN }; + mac_lso_get { FLAGS = EXTERN }; + mac_maxsdu_update { FLAGS = EXTERN }; + mac_prop_info_set_default_link_flowctrl { FLAGS = EXTERN }; + mac_prop_info_set_default_str { FLAGS = EXTERN }; + mac_prop_info_set_default_uint8 { FLAGS = EXTERN }; + mac_prop_info_set_perm { FLAGS = EXTERN }; + mac_prop_info_set_range_uint32 { FLAGS = EXTERN }; + mac_ring_intr_set { FLAGS = EXTERN }; + mac_register { FLAGS = EXTERN }; + mac_rx { FLAGS = EXTERN }; + mac_rx_ring { FLAGS = EXTERN }; + mac_tx_ring_update { FLAGS = EXTERN }; + mac_tx_update { FLAGS = EXTERN }; + mac_unregister { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/random.mapfile b/usr/src/uts/common/mapfiles/random.mapfile new file mode 100644 index 0000000000..d3d8bc89fa --- /dev/null +++ b/usr/src/uts/common/mapfiles/random.mapfile @@ -0,0 +1,37 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + random_get_bytes { FLAGS = EXTERN }; + random_get_blocking_bytes { FLAGS = EXTERN }; + random_get_pseudo_bytes { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index d530b7f36e..7927cf5e24 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -3,6 +3,7 @@ * Use is subject to license terms. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* * Copyright (c) 1982, 1986 Regents of the University of California. @@ -225,6 +226,7 @@ typedef uint16_t sa_family_t; #define IPPORT_SLP 427 #define IPPORT_MIP 434 #define IPPORT_SMB 445 /* a.k.a. microsoft-ds */ +#define IPPORT_VXLAN 4789 /* * Internet Key Exchange (IKE) ports @@ -268,6 +270,11 @@ typedef uint16_t sa_family_t; #define IPPORT_RESERVED 1024 #define IPPORT_USERRESERVED 5000 +#ifdef _KERNEL +#define IPPORT_DYNAMIC_MIN 49152 +#define IPPORT_DYNAMIC_MAX 65535 +#endif + /* * Link numbers */ diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h index c65a9bad3a..74cff75d43 100644 --- a/usr/src/uts/common/netinet/udp.h +++ b/usr/src/uts/common/netinet/udp.h @@ -1,6 +1,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ /* @@ -17,9 +18,6 @@ #ifndef _NETINET_UDP_H #define _NETINET_UDP_H -#pragma ident "%Z%%M% %I% %E% SMI" -/* udp.h 1.7 88/08/19 SMI; from UCB 7.1 6/5/86 */ - #ifdef __cplusplus extern "C" { #endif @@ -36,6 +34,16 @@ struct udphdr { #define UDP_EXCLBIND 0x0101 /* for internal use only */ #define UDP_RCVHDR 0x0102 /* for internal use only */ #define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */ +#define UDP_SRCPORT_HASH 0x0104 /* for internal use only */ +#define UDP_SND_TO_CONNECTED 0x0105 /* for internal use only */ + +/* + * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to go + * handle UDP_SRCPORT_HASH. + */ +#define UDP_HASH_DISABLE 0x0000 /* for internal use only */ +#define UDP_HASH_VXLAN 0x0001 /* for internal use only */ + /* * Following option in UDP_ namespace required to be exposed through * <xti.h> (It also requires exposing options not implemented). The options diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 0af67f5d98..02901d023d 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,112 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. 
+ */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from an exec() context where tolerating + * the existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. 
+ * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. + */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +551,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. */ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +669,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +687,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +703,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +790,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, 
exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +798,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +809,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +830,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +846,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +867,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1007,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. 
* @@ -1078,55 +1151,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1194,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent Inc. 
All rights reserved. + * Copyright 2016, Joyent Inc. */ #include <sys/timer.h> @@ -41,6 +41,9 @@ static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; + /*ARGSUSED*/ static int clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts) static int clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { - /* - * CLOCK_HIGHRES timers of sufficiently high resolution can deny - * service; only allow privileged users to create such timers. - * Sites that do not wish to have this restriction should - * give users the "proc_clock_highres" privilege. - */ - if (secpolicy_clock_highres(CRED()) != 0) { - it->it_arg = NULL; - return (EPERM); - } - it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP); it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags, cpu_t *cpu; cpupart_t *cpupart; int pset; + boolean_t value_need_clamp = B_FALSE; + boolean_t intval_need_clamp = B_FALSE; + cred_t *cr = CRED(); + struct itimerspec clamped; + + /* + * CLOCK_HIGHRES timers of sufficiently high resolution can deny + * service; only allow privileged users to create such timers. + * Non-privileged users (those without the "proc_clock_highres" + * privilege) can create timers with lower resolution but if they + * attempt to use a very low time value (< 200us) then their + * timer will be clamped at 200us. 
+ */ + if (when->it_value.tv_sec == 0 && + when->it_value.tv_nsec > 0 && + when->it_value.tv_nsec < clock_highres_interval_min) + value_need_clamp = B_TRUE; + + if (when->it_interval.tv_sec == 0 && + when->it_interval.tv_nsec > 0 && + when->it_interval.tv_nsec < clock_highres_interval_min) + intval_need_clamp = B_TRUE; + + if ((value_need_clamp || intval_need_clamp) && + secpolicy_clock_highres(cr) != 0) { + clamped.it_value.tv_sec = when->it_value.tv_sec; + clamped.it_interval.tv_sec = when->it_interval.tv_sec; + + if (value_need_clamp) { + clamped.it_value.tv_nsec = clock_highres_interval_min; + } else { + clamped.it_value.tv_nsec = when->it_value.tv_nsec; + } + + if (intval_need_clamp) { + clamped.it_interval.tv_nsec = + clock_highres_interval_min; + } else { + clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; + } + + when = &clamped; + } cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 249066674e..9ea08f5535 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* @@ -287,7 +288,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. 
+ */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 733fd03a92..b0098946b3 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -727,6 +727,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c index 781c564233..38d5f1ab18 100644 --- a/usr/src/uts/common/os/dumpsubr.c +++ b/usr/src/uts/common/os/dumpsubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -342,6 +343,7 @@ typedef struct dumpsync { uint_t neednl; /* will need to print a newline */ uint_t percent; /* dump progress */ uint_t percent_done; /* dump progress reported */ + int sec_done; /* dump progress last report time */ cqueue_t freebufq; /* free kmem bufs for writing */ cqueue_t mainq; /* input for main task */ cqueue_t helperq; /* input for helpers */ @@ -2285,7 +2287,7 @@ dumpsys_main_task(void *arg) cbuf_t *cp; pgcnt_t baseoff, pfnoff; pfn_t base, pfn; - int sec, i, dumpserial; + int i, dumpserial; /* * Fall back to serial mode if there are no helpers. @@ -2311,13 +2313,20 @@ dumpsys_main_task(void *arg) dump_init_memlist_walker(&mlw); - /* CONSTCOND */ - while (1) { + for (;;) { + int sec = (gethrtime() - ds->start) / NANOSEC; - if (ds->percent > ds->percent_done) { + /* + * Render a simple progress display on the system console to + * make clear to the operator that the system has not hung. + * Emit an update when dump progress has advanced by one + * percent, or when no update has been drawn in the last + * second. + */ + if (ds->percent > ds->percent_done || sec > ds->sec_done) { + ds->sec_done = sec; ds->percent_done = ds->percent; - sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; - uprintf("^\r%2d:%02d %3d%% done", + uprintf("^\rdumping: %2d:%02d %3d%% done", sec / 60, sec % 60, ds->percent); ds->neednl = 1; } @@ -2501,8 +2510,7 @@ dumpsys_main_task(void *arg) break; } /* end switch */ - - } /* end while(1) */ + } } #ifdef COLLECT_METRICS diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 172fce8d89..d46b8538a9 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -69,6 +69,7 @@ #include <sys/sdt.h> #include <sys/brand.h> #include <sys/klpd.h> +#include <sys/random.h> #include <c2/audit.h> @@ -97,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * exece() - system call wrapper around exec_common() @@ -297,14 +299,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. + */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -336,7 +367,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -360,6 +391,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. 
*/ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -419,8 +452,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -544,7 +579,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -858,8 +893,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1517,6 +1558,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1553,6 +1615,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. 
@@ -1635,8 +1698,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1649,6 +1713,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1755,7 +1833,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. 
*/ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1768,6 +1846,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1780,6 +1863,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1868,6 +1956,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index f0c0983a3a..0e213deb21 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -230,7 +230,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -366,19 +366,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); + + /* + * If the init process should be restarted, the + * "zone_restart_init" member will be set. Some init + * programs in branded zones do not tolerate a restart + * in the traditional manner; setting the + * "zone_reboot_on_init_exit" member will cause the + * entire zone to be rebooted instead. If neither of + * these flags is set the zone will shut down. + */ + if (z->zone_reboot_on_init_exit == B_TRUE && + z->zone_restart_init == B_TRUE) { + /* + * Trigger a zone reboot and continue + * with exit processing. 
+ */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } + + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); + zone_kcred()); } } @@ -407,6 +417,32 @@ proc_exit(int why, int what) z->zone_proc_initpid = -1; } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -658,10 +694,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -673,7 +721,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -847,8 +896,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happens in zone_destroy(). That + * depends on zone_shutdown() having been completed.
+ * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. + */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1018,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1048,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. 
*/ mutex_enter(&pidlock); @@ -978,10 +1069,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1107,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1156,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! 
*/ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1234,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1258,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. */ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1354,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..bfee77130d 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -852,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index fe3a362fa7..d5ba123894 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -696,7 +697,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -745,7 +746,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -782,6 +783,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. */ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -800,6 +804,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -843,8 +848,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. 
*/ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -902,7 +917,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -943,7 +959,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1004,6 +1024,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1071,9 +1094,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1153,6 +1173,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. 
+ * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index f5e92cfd94..0c4c0bcad6 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@ * CDDL HEADER END */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. @@ -52,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -522,6 +526,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -537,8 +555,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -547,9 +563,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -717,8 +732,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -726,10 +739,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. - * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) 
- */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. 
- */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 9381019cd1..6a6f5d84ef 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. 
*/ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index cc53c2fb76..734fa910e4 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -158,10 +159,22 @@ * find known objects and is about to free it, or * c) the client has freed the object. * In all these cases (a, b, and c) kmem frees the new object (the - * unused copy destination) and searches for the old object in the - * magazine layer. If found, the object is removed from the magazine - * layer and freed to the slab layer so it will no longer hold the - * slab hostage. + * unused copy destination). In the first case, the object is in + * use and the correct action is that for LATER; in the latter two + * cases, we know that the object is either freed or about to be + * freed, in which case it is either already in a magazine or about + * to be in one. In these cases, we know that the object will either + * be reallocated and reused, or it will end up in a full magazine + * that will be reaped (thereby liberating the slab). Because it + * is prohibitively expensive to differentiate these cases, and + * because the defrag code is executed when we're low on memory + * (thereby biasing the system to reclaim full magazines) we treat + * all DONT_KNOW cases as LATER and rely on cache reaping to + * generally clean up full magazines. 
While we take the same action + * for these cases, we maintain their semantic distinction: if + * defragmentation is not occurring, it is useful to know if this + * is due to objects in use (LATER) or objects in an unknown state + * of transition (DONT_KNOW). * * 2.3 Object States * @@ -284,10 +297,10 @@ * view of the slab layer, making it a candidate for the move callback. Most * objects unrecognized by the client in the move callback fall into this * category and are cheaply distinguished from known objects by the test - * described earlier. Since recognition is cheap for the client, and searching - * magazines is expensive for kmem, kmem defers searching until the client first - * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem - * elsewhere does what it can to avoid bothering the client unnecessarily. + * described earlier. Because searching magazines is prohibitively expensive + * for kmem, clients that do not mark freed objects (and therefore return + * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation + * efficacy reduced. 
* * Invalidating the designated pointer member before freeing the object marks * the object to be avoided in the callback, and conversely, assigning a valid @@ -997,6 +1010,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1004,6 +1018,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1037,21 +1059,7 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; -/* - * Define KMEM_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. 
- */ -#ifdef DEBUG -#define KMEM_STATS -#endif /* DEBUG */ - -#ifdef KMEM_STATS -#define KMEM_STAT_ADD(stat) ((stat)++) -#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) -#else -#define KMEM_STAT_ADD(stat) /* nothing */ -#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ -#endif /* KMEM_STATS */ +static int kmem_zerosized; /* # of zero-sized allocs */ /* * kmem slab consolidator thresholds (tunables) @@ -1070,47 +1078,6 @@ size_t kmem_reclaim_max_slabs = 1; */ size_t kmem_reclaim_scan_range = 12; -#ifdef KMEM_STATS -static struct { - uint64_t kms_callbacks; - uint64_t kms_yes; - uint64_t kms_no; - uint64_t kms_later; - uint64_t kms_dont_need; - uint64_t kms_dont_know; - uint64_t kms_hunt_found_mag; - uint64_t kms_hunt_found_slab; - uint64_t kms_hunt_alloc_fail; - uint64_t kms_hunt_lucky; - uint64_t kms_notify; - uint64_t kms_notify_callbacks; - uint64_t kms_disbelief; - uint64_t kms_already_pending; - uint64_t kms_callback_alloc_fail; - uint64_t kms_callback_taskq_fail; - uint64_t kms_endscan_slab_dead; - uint64_t kms_endscan_slab_destroyed; - uint64_t kms_endscan_nomem; - uint64_t kms_endscan_refcnt_changed; - uint64_t kms_endscan_nomove_changed; - uint64_t kms_endscan_freelist; - uint64_t kms_avl_update; - uint64_t kms_avl_noupdate; - uint64_t kms_no_longer_reclaimable; - uint64_t kms_notify_no_longer_reclaimable; - uint64_t kms_notify_slab_dead; - uint64_t kms_notify_slab_destroyed; - uint64_t kms_alloc_fail; - uint64_t kms_constructor_fail; - uint64_t kms_dead_slabs_freed; - uint64_t kms_defrags; - uint64_t kms_scans; - uint64_t kms_scan_depot_ws_reaps; - uint64_t kms_debug_reaps; - uint64_t kms_debug_scans; -} kmem_move_stats; -#endif /* KMEM_STATS */ - /* consolidator knobs */ static boolean_t kmem_move_noreap; static boolean_t kmem_move_blocked; @@ -1141,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; +kmem_log_header_t 
*kmem_zerosized_log; static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -1921,15 +1889,7 @@ kmem_slab_free(kmem_cache_t *cp, void *buf) cp->cache_complete_slab_count--; avl_add(&cp->cache_partial_slabs, sp); } else { -#ifdef DEBUG - if (avl_update_gt(&cp->cache_partial_slabs, sp)) { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); - } else { - KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); - } -#else (void) avl_update_gt(&cp->cache_partial_slabs, sp); -#endif } ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == @@ -2941,8 +2901,33 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we log to the kmem_zerosized_log that + * this condition has occurred (which gives us + * enough information to be able to debug it).
+ */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0; " + "see kmem_zerosized_log for details"); + } + + kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -3556,7 +3541,7 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw) kmcp->kmc_move_later.value.ui64 = kd->kmd_later; kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; - kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found; + kmcp->kmc_move_hunt_found.value.ui64 = 0; kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; kmcp->kmc_scan.value.ui64 = kd->kmd_scans; @@ -4127,7 +4112,8 @@ kmem_cache_destroy(kmem_cache_t *cp) if (kmem_taskq != NULL) taskq_wait(kmem_taskq); - if (kmem_move_taskq != NULL) + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) taskq_wait(kmem_move_taskq); kmem_cache_magazine_purge(cp); @@ -4465,8 +4451,8 @@ kmem_init(void) } kmem_failure_log = kmem_log_init(kmem_failure_log_size); - kmem_slab_log = kmem_log_init(kmem_slab_log_size); + kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size); /* * Initialize STREAMS message caches so allocb() is available. @@ -4654,94 +4640,6 @@ kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); } -static void * -kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, - void *tbuf) -{ - int i; /* magazine round index */ - - for (i = 0; i < n; i++) { - if (buf == m->mag_round[i]) { - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, tbuf, - caller()); - } - m->mag_round[i] = tbuf; - return (buf); - } - } - - return (NULL); -} - -/* - * Hunt the magazine layer for the given buffer. 
If found, the buffer is - * removed from the magazine layer and returned, otherwise NULL is returned. - * The state of the returned buffer is freed and constructed. - */ -static void * -kmem_hunt_mags(kmem_cache_t *cp, void *buf) -{ - kmem_cpu_cache_t *ccp; - kmem_magazine_t *m; - int cpu_seqid; - int n; /* magazine rounds */ - void *tbuf; /* temporary swap buffer */ - - ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); - - /* - * Allocated a buffer to swap with the one we hope to pull out of a - * magazine when found. - */ - tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); - if (tbuf == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); - return (NULL); - } - if (tbuf == buf) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); - if (cp->cache_flags & KMF_BUFTAG) { - (void) kmem_cache_free_debug(cp, buf, caller()); - } - return (buf); - } - - /* Hunt the depot. */ - mutex_enter(&cp->cache_depot_lock); - n = cp->cache_magtype->mt_magsize; - for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&cp->cache_depot_lock); - return (buf); - } - } - mutex_exit(&cp->cache_depot_lock); - - /* Hunt the per-CPU magazines. */ - for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { - ccp = &cp->cache_cpu[cpu_seqid]; - - mutex_enter(&ccp->cc_lock); - m = ccp->cc_loaded; - n = ccp->cc_rounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - m = ccp->cc_ploaded; - n = ccp->cc_prounds; - if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { - mutex_exit(&ccp->cc_lock); - return (buf); - } - mutex_exit(&ccp->cc_lock); - } - - kmem_cache_free(cp, tbuf); - return (NULL); -} - /* * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), * or when the buffer is freed. 
@@ -4805,7 +4703,7 @@ static void kmem_move_end(kmem_cache_t *, kmem_move_t *); * NO kmem frees the new buffer, marks the slab of the old buffer * non-reclaimable to avoid bothering the client again * LATER kmem frees the new buffer, increments slab_later_count - * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_KNOW kmem frees the new buffer * DONT_NEED kmem frees both the old buffer and the new buffer * * The pending callback argument now being processed contains both of the @@ -4839,19 +4737,14 @@ kmem_move_buffer(kmem_move_t *callback) * another buffer on the same slab. */ if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { - KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_no_longer_reclaimable); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } /* - * Hunting magazines is expensive, so we'll wait to do that until the - * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer - * is cheap, so we might as well do that here in case we can avoid - * bothering the client. + * Checking the slab layer is easy, so we might as well do that here + * in case we can avoid bothering the client. 
*/ mutex_enter(&cp->cache_lock); free_on_slab = (kmem_slab_allocated(cp, sp, @@ -4859,7 +4752,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); if (free_on_slab) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; @@ -4871,7 +4763,6 @@ kmem_move_buffer(kmem_move_t *callback) */ if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, KM_NOSLEEP, 1, caller()) != 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); kmem_move_end(cp, callback); return; } @@ -4879,15 +4770,11 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, KM_NOSLEEP) != 0) { atomic_inc_64(&cp->cache_alloc_fail); - KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); kmem_slab_free(cp, callback->kmm_to_buf); kmem_move_end(cp, callback); return; } - KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); - KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), - kmem_move_stats.kms_notify_callbacks); cp->cache_defrag->kmd_callbacks++; cp->cache_defrag->kmd_thread = curthread; cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; @@ -4905,7 +4792,6 @@ kmem_move_buffer(kmem_move_t *callback) cp->cache_defrag->kmd_to_buf = NULL; if (response == KMEM_CBRC_YES) { - KMEM_STAT_ADD(kmem_move_stats.kms_yes); cp->cache_defrag->kmd_yes++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); /* slab safe to access until kmem_move_end() */ @@ -4920,14 +4806,12 @@ kmem_move_buffer(kmem_move_t *callback) switch (response) { case KMEM_CBRC_NO: - KMEM_STAT_ADD(kmem_move_stats.kms_no); cp->cache_defrag->kmd_no++; mutex_enter(&cp->cache_lock); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_LATER: - KMEM_STAT_ADD(kmem_move_stats.kms_later); cp->cache_defrag->kmd_later++; mutex_enter(&cp->cache_lock); if (!KMEM_SLAB_IS_PARTIAL(sp)) { @@ -4936,7 +4820,6 @@ kmem_move_buffer(kmem_move_t *callback) } if 
(++sp->slab_later_count >= KMEM_DISBELIEF) { - KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); kmem_slab_move_no(cp, sp, callback->kmm_from_buf); } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, @@ -4945,7 +4828,6 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_NEED: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); cp->cache_defrag->kmd_dont_need++; kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); if (sp->slab_refcnt == 0) @@ -4955,19 +4837,21 @@ kmem_move_buffer(kmem_move_t *callback) mutex_exit(&cp->cache_lock); break; case KMEM_CBRC_DONT_KNOW: - KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + /* + * If we don't know if we can move this buffer or not, we'll + * just assume that we can't: if the buffer is in fact free, + * then it is sitting in one of the per-CPU magazines or in + * a full magazine in the depot layer. Either way, because + * defrag is induced in the same logic that reaps a cache, + * it's likely that full magazines will be returned to the + * system soon (thereby accomplishing what we're trying to + * accomplish here: return those magazines to their slabs). + * Given this, any work that we might do now to locate a buffer + * in a magazine is wasted (and expensive!) work; we bump + * a counter in this case and otherwise assume that we can't + * move it. 
+ */ cp->cache_defrag->kmd_dont_know++; - if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); - cp->cache_defrag->kmd_hunt_found++; - kmem_slab_free_constructed(cp, callback->kmm_from_buf, - B_TRUE); - if (sp->slab_refcnt == 0) - cp->cache_defrag->kmd_slabs_freed++; - mutex_enter(&cp->cache_lock); - kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); - mutex_exit(&cp->cache_lock); - } break; default: panic("'%s' (%p) unexpected move callback response %d\n", @@ -4992,10 +4876,9 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); - if (callback == NULL) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + + if (callback == NULL) return (B_FALSE); - } callback->kmm_from_slab = sp; callback->kmm_from_buf = buf; @@ -5020,7 +4903,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) pending->kmm_flags |= KMM_DESPERATE; } mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); kmem_cache_free(kmem_move_cache, callback); return (B_TRUE); } @@ -5034,7 +4916,6 @@ kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, callback, TQ_NOSLEEP)) { - KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail); mutex_enter(&cp->cache_lock); avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); mutex_exit(&cp->cache_lock); @@ -5080,7 +4961,6 @@ kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); mutex_enter(&cp->cache_lock); } } @@ -5225,8 +5105,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * pending move completes. */ list_insert_head(deadlist, sp); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_endscan_slab_dead); return (-1); } @@ -5241,10 +5119,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats. - kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. - kms_endscan_slab_destroyed); mutex_enter(&cp->cache_lock); /* * Since we can't pick up the scan where we left @@ -5260,8 +5134,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * for the request and say nothing about the * number of reclaimable slabs. */ - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomem); return (-1); } @@ -5277,16 +5149,10 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, * destination buffer on the same slab. In that * case, we're not interested in counting it. */ - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs), - kmem_move_stats.kms_endscan_refcnt_changed); return (-1); } - if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { - KMEM_STAT_COND_ADD(s < max_slabs, - kmem_move_stats.kms_endscan_nomove_changed); + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) return (-1); - } /* * Generating a move request allocates a destination @@ -5313,11 +5179,6 @@ kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, } end_scan: - KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && - (s < max_slabs) && - (sp == avl_first(&cp->cache_partial_slabs)), - kmem_move_stats.kms_endscan_freelist); - return (s); } @@ -5377,8 +5238,6 @@ kmem_cache_move_notify_task(void *arg) &cp->cache_defrag->kmd_moves_pending)) { list_insert_head(deadlist, sp); mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats. - kms_notify_slab_dead); return; } @@ -5386,9 +5245,6 @@ kmem_cache_move_notify_task(void *arg) cp->cache_slab_destroy++; mutex_exit(&cp->cache_lock); kmem_slab_destroy(cp, sp); - KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); - KMEM_STAT_ADD(kmem_move_stats. 
- kms_notify_slab_destroyed); return; } } else { @@ -5402,7 +5258,6 @@ kmem_cache_move_notify(kmem_cache_t *cp, void *buf) { kmem_move_notify_args_t *args; - KMEM_STAT_ADD(kmem_move_stats.kms_notify); args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); if (args != NULL) { args->kmna_cache = cp; @@ -5425,7 +5280,6 @@ kmem_cache_defrag(kmem_cache_t *cp) n = avl_numnodes(&cp->cache_partial_slabs); if (n > 1) { /* kmem_move_buffers() drops and reacquires cache_lock */ - KMEM_STAT_ADD(kmem_move_stats.kms_defrags); cp->cache_defrag->kmd_defrags++; (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); } @@ -5524,7 +5378,6 @@ kmem_cache_scan(kmem_cache_t *cp) * * kmem_move_buffers() drops and reacquires cache_lock. */ - KMEM_STAT_ADD(kmem_move_stats.kms_scans); kmd->kmd_scans++; slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, kmem_reclaim_max_slabs, 0); @@ -5565,12 +5418,9 @@ kmem_cache_scan(kmem_cache_t *cp) if (!kmem_move_noreap && ((debug_rand % kmem_mtb_reap) == 0)) { mutex_exit(&cp->cache_lock); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); kmem_cache_reap(cp); return; } else if ((debug_rand % kmem_mtb_move) == 0) { - KMEM_STAT_ADD(kmem_move_stats.kms_scans); - KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans); kmd->kmd_scans++; (void) kmem_move_buffers(cp, kmem_reclaim_scan_range, 1, KMM_DEBUG); @@ -5581,8 +5431,6 @@ kmem_cache_scan(kmem_cache_t *cp) mutex_exit(&cp->cache_lock); - if (reap) { - KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + if (reap) kmem_depot_ws_reap(cp); - } } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..cbc4fa0000 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. 
*/ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2016, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..a7de7b513f 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. 
- */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + if (state == TS_RUN) { /* * We set the new lwp running immediately. @@ -753,8 +781,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -827,8 +856,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. 
This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -836,21 +884,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. 
+ */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -891,13 +951,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -941,6 +994,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. * Otherwise null out the lwp's le_thread pointer in the lwp @@ -1101,7 +1166,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. 
It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)(uintptr_t)exec_fnamep, (const char **)(uintptr_t)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index b6b5446d71..596c855a45 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1360,10 +1360,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that it is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.) */ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -522,6 +535,20 @@ sprunlock(proc_t *p) THREAD_KPRI_RELEASE(); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + THREAD_KPRI_RELEASE(); +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 07bc2920da..d2bdb4ce37 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +55,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1243,6 +1244,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2078,6 +2095,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2581,3 +2605,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a3cdaccc2a..cc1c5e03a6 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -372,6 +376,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/project.c b/usr/src/uts/common/os/project.c index 7bd3dd963f..d89f62bea7 100644 --- a/usr/src/uts/common/os/project.c +++ b/usr/src/uts/common/os/project.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #include <sys/project.h> @@ -659,6 +660,14 @@ static rctl_ops_t project_tasks_ops = { */ /*ARGSUSED*/ +static rctl_qty_t +project_shmmax_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_task->tk_proj->kpj_data.kpd_shmmax); +} + +/*ARGSUSED*/ static int project_shmmax_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t inc, uint_t flags) @@ -675,7 +684,7 @@ project_shmmax_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, static rctl_ops_t project_shmmax_ops = { rcop_no_action, - rcop_no_usage, + project_shmmax_usage, rcop_no_set, project_shmmax_test }; @@ -685,6 +694,14 @@ static rctl_ops_t project_shmmax_ops = { */ /*ARGSUSED*/ +static rctl_qty_t +project_shmmni_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_shmmni); +} + +/*ARGSUSED*/ static int project_shmmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t inc, uint_t flags) @@ -701,7 +718,7 @@ project_shmmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, static rctl_ops_t project_shmmni_ops = { rcop_no_action, - rcop_no_usage, + project_shmmni_usage, rcop_no_set, project_shmmni_test }; @@ -711,6 +728,14 @@ static rctl_ops_t project_shmmni_ops = { */ /*ARGSUSED*/ +static rctl_qty_t +project_semmni_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_semmni); +} + +/*ARGSUSED*/ static int project_semmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t inc, uint_t flags) @@ -727,7 +752,7 @@ project_semmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, static rctl_ops_t project_semmni_ops = { rcop_no_action, - rcop_no_usage, + project_semmni_usage, rcop_no_set, project_semmni_test }; @@ -737,6 +762,14 @@ static rctl_ops_t project_semmni_ops = { */ /*ARGSUSED*/ +static 
rctl_qty_t +project_msgmni_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_task->tk_proj->kpj_data.kpd_ipc.ipcq_msgmni); +} + +/*ARGSUSED*/ static int project_msgmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t inc, uint_t flags) @@ -753,7 +786,7 @@ project_msgmni_test(struct rctl *rctl, struct proc *p, rctl_entity_p_t *e, static rctl_ops_t project_msgmni_ops = { rcop_no_action, - rcop_no_usage, + project_msgmni_usage, rcop_no_set, project_msgmni_test }; diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. 
*/ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. 
+ */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..5ef12f3ae4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. 
+ */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. 
@@ -623,6 +640,21 @@ issig_forreal(void) } /* + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + /* + * The brand hook will return 0 if it would like + * us to drive on, or -1 if we should restart + * the loop to check other conditions. + */ + if (BROP(p)->b_issig_stop(p, lwp) != 0) { + continue; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +688,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +740,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +754,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +986,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. 
+ */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1107,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1213,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1241,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1376,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1820,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. 
*/ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1856,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2176,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2202,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? 
kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 62f94729cf..0a1406e0cd 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,7 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -77,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -985,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). */ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1029,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1048,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). 
@@ -2544,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2635,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2659,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2677,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2693,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + 
kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3178,6 +3215,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 0d1bb6a8a1..aa44ccf788 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -1093,18 +1093,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1112,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1126,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - 
(uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1145,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1153,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1167,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1202,5 +1209,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index a554f8c3f3..0a6fe0ef96 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1618,7 +1618,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, 
(vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index c997f8fd8d..e86fe138e3 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. */ /* @@ -250,6 +250,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -370,8 +372,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -417,8 +423,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1377,6 +1384,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. 
+ */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1486,6 +1601,14 @@ static rctl_ops_t zone_procs_ops = { }; /*ARGSUSED*/ +static rctl_qty_t +zone_shmmax_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_shmmax); +} + +/*ARGSUSED*/ static int zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t incr, uint_t flags) @@ -1501,12 +1624,20 @@ zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, static rctl_ops_t zone_shmmax_ops = { rcop_no_action, - rcop_no_usage, + zone_shmmax_usage, rcop_no_set, zone_shmmax_test }; /*ARGSUSED*/ +static rctl_qty_t +zone_shmmni_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_ipc.ipcq_shmmni); +} + +/*ARGSUSED*/ static int zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t incr, uint_t flags) @@ -1522,12 +1653,20 @@ zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, static rctl_ops_t zone_shmmni_ops = { rcop_no_action, - rcop_no_usage, + zone_shmmni_usage, rcop_no_set, zone_shmmni_test }; /*ARGSUSED*/ +static rctl_qty_t +zone_semmni_usage(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_ipc.ipcq_semmni); +} + +/*ARGSUSED*/ static int zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t incr, uint_t flags) @@ -1543,12 +1682,20 @@ zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, static rctl_ops_t zone_semmni_ops = { rcop_no_action, - rcop_no_usage, + zone_semmni_usage, rcop_no_set, zone_semmni_test }; /*ARGSUSED*/ +static rctl_qty_t +zone_msgmni_usage(rctl_t *rctl, struct proc *p) +{ + 
ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_ipc.ipcq_msgmni); +} + +/*ARGSUSED*/ static int zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, rctl_qty_t incr, uint_t flags) @@ -1564,7 +1711,7 @@ zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, static rctl_ops_t zone_msgmni_ops = { rcop_no_action, - rcop_no_usage, + zone_msgmni_usage, rcop_no_set, zone_msgmni_test }; @@ -1671,6 +1818,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1764,6 +1944,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1792,7 +1986,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t 
*ksp; @@ -1817,6 +2011,160 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field 
is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) @@ -1827,11 +2175,19 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw) if (rw == KSTAT_WRITE) return (EACCES); + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; 
+ zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; return (0); } @@ -1859,12 +2215,22 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); ksp->ks_update = zone_mcap_kstat_update; ksp->ks_private = zone; @@ -1960,13 +2326,25 @@ zone_misc_kstat_create(zone_t *zone) static void 
zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -1998,8 +2376,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2037,6 +2422,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ 
-2060,8 +2447,9 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - + zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; @@ -2172,6 +2560,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2213,6 +2616,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. 
+ */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2223,6 +2640,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2244,6 +2666,8 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2323,6 +2747,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2351,6 +2777,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. 
+ */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2395,12 +2834,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2408,7 +2853,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2486,9 +2931,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. 
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2534,14 +2984,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/ +static int +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) +{ + zone->zone_mcap_nover++; + + return (0); +} + static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) { - uint64_t mcap; - int err = 0; + uint64_t pageout; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physcial + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. 
+ */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2953,6 +3454,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3696,6 +4203,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3724,9 +4242,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). 
+ */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. */ lwp_rtt(); } @@ -3768,6 +4331,7 @@ zsched(void *arg) PTOU(pp)->u_argc = 0; PTOU(pp)->u_argv = NULL; PTOU(pp)->u_envp = NULL; + PTOU(pp)->u_commpagep = NULL; closeall(P_FINFO(pp)); /* @@ -4210,8 +4774,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4329,7 +4894,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4352,6 +4917,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id 
= id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4359,6 +4925,8 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4420,10 +4988,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4578,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4718,6 +5290,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4728,7 +5301,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5479,14 +6061,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5541,6 +6115,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5572,10 +6163,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. 
*/ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5592,7 +6184,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. */ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5614,8 +6208,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -5644,6 +6247,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6336,6 +6955,7 @@ zone(int cmd, void 
*arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6346,7 +6966,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6447,6 +7067,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6624,7 +7245,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6687,16 +7308,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6764,7 +7384,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6801,6 +7422,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/refhash/refhash.c index 8f96c2d9f1..e2de00597e 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c +++ b/usr/src/uts/common/refhash/refhash.c @@ -10,16 +10,18 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> +#include <sys/refhash.h> #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/kmem.h> #include <sys/list.h> #include <sys/ddi.h> +#define RHL_F_DEAD 0x01 + #ifdef lint extern refhash_link_t *obj_to_link(refhash_t *, void *); extern void *link_to_obj(refhash_t *, refhash_link_t *); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index d5dd20bff9..052a28a5e2 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -23,6 +23,7 @@ # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2014, Joyent, Inc. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> +# Copyright 2015, Joyent, Inc. All rights reserved. # Copyright 2013 Saso Kiselkov. All rights reserved. # Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com> # Copyright 2016 Nexenta Systems, Inc. @@ -250,6 +251,7 @@ CHKHDRS= \ flock.h \ flock_impl.h \ fork.h \ + frameio.h \ fss.h \ fsspriocntl.h \ fsid.h \ @@ -275,6 +277,7 @@ CHKHDRS= \ idmap.h \ ieeefp.h \ id_space.h \ + inotify.h \ instance.h \ int_const.h \ int_fmtio.h \ @@ -343,6 +346,7 @@ CHKHDRS= \ lgrp.h \ lgrp_user.h \ libc_kernel.h \ + limits.h \ link.h \ list.h \ list_impl.h \ @@ -424,6 +428,9 @@ CHKHDRS= \ ontrap.h \ open.h \ openpromio.h \ + overlay.h \ + overlay_common.h \ + overlay_target.h \ panic.h \ param.h \ pathconf.h \ @@ -646,6 +653,8 @@ CHKHDRS= \ vmem.h \ vmem_impl.h \ vmsystm.h \ + vnd.h \ + vnd_errno.h \ vnic.h \ vnic_impl.h \ vnode.h \ @@ -657,12 +666,14 @@ CHKHDRS= \ vuid_queue.h \ vuid_state.h \ vuid_store.h \ + vxlan.h \ wait.h \ waitq.h \ wanboot_impl.h \ watchpoint.h \ winlockio.h \ zcons.h \ + zfd.h \ zone.h \ xti_inet.h \ xti_osi.h \ @@ -856,13 +867,14 @@ FSHDRS= \ autofs.h \ decomp.h \ dv_node.h \ - sdev_impl.h \ fifonode.h \ hsfs_isospec.h \ hsfs_node.h \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ 
lofs_node.h \ mntdata.h \ @@ -872,6 +884,8 @@ FSHDRS= \ pc_label.h \ pc_node.h \ pxfs_ki.h \ + sdev_impl.h \ + sdev_plugin.h \ snode.h \ swapnode.h \ tmp.h \ @@ -996,6 +1010,7 @@ SATAGENHDRS= \ SYSEVENTHDRS= \ ap_driver.h \ + datalink.h \ dev.h \ domain.h \ dr.h \ diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 547c9cc241..a4c0409304 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ #ifndef _SYS_AGGR_IMPL_H @@ -308,6 +309,8 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); extern void aggr_tx_ring_update(void *, uintptr_t); extern void aggr_tx_notify_thread(void *); diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 3a2e705850..48b94e2951 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -29,7 +29,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_AUXV_H @@ -78,6 +78,9 @@ typedef struct { #define AT_FLAGS 8 /* processor flags */ #define AT_ENTRY 9 /* a.out entry point */ +/* First introduced on Linux */ +#define AT_RANDOM 25 /* address of 16 random bytes */ + /* * These relate to the original PPC ABI document; Linux reused * the values for other things (see below), so disambiguation of @@ -90,19 +93,18 @@ typedef struct { * These are the values from LSB 1.3, the first five are also described * in the draft amd64 ABI. 
* - * At the time of writing, Solaris doesn't place any of these values into - * the aux vector, except AT_CLKTCK which is placed on the aux vector for - * lx branded processes; also, we do similar things via AT_SUN_ values. + * At the time of writing, illumos doesn't place any of these values into the + * aux vector, except where noted. We do similar things via AT_SUN_ values. * * AT_NOTELF 10 program is not ELF? - * AT_UID 11 real user id - * AT_EUID 12 effective user id - * AT_GID 13 real group id - * AT_EGID 14 effective group id + * AT_UID 11 real user id (provided in LX) + * AT_EUID 12 effective user id (provided in LX) + * AT_GID 13 real group id (provided in LX) + * AT_EGID 14 effective group id (provided in LX) * * AT_PLATFORM 15 * AT_HWCAP 16 - * AT_CLKTCK 17 c.f. _SC_CLK_TCK + * AT_CLKTCK 17 c.f. _SC_CLK_TCK (provided in LX) * AT_FPUCW 18 * * AT_DCACHEBSIZE 19 (moved from 10) @@ -110,6 +112,16 @@ typedef struct { * AT_UCACHEBSIZE 21 (moved from 12) * * AT_IGNOREPPC 22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE 23 secure mode boolean (provided in LX) + * AT_BASE_PLATFORM 24 string identifying real platform, may + * differ from AT_PLATFORM. + * AT_HWCAP2 26 extension of AT_HWCAP + * AT_EXECFN 31 filename of program + * AT_SYSINFO 32 + * AT_SYSINFO_EHDR 33 The vDSO location */ /* @@ -186,6 +198,13 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_AUX1 2020 #define AT_SUN_BRAND_AUX2 2021 #define AT_SUN_BRAND_AUX3 2022 +#define AT_SUN_BRAND_AUX4 2025 +#define AT_SUN_BRAND_NROOT 2024 + +/* + * Aux vector for comm page + */ +#define AT_SUN_COMMPAGE 2026 /* * Note that 2023 is reserved for the AT_SUN_HWCAP2 word defined above. 
diff --git a/usr/src/uts/common/sys/auxv_386.h b/usr/src/uts/common/sys/auxv_386.h index ec4c8b0f19..a3256a464f 100644 --- a/usr/src/uts/common/sys/auxv_386.h +++ b/usr/src/uts/common/sys/auxv_386.h @@ -89,10 +89,12 @@ extern "C" { #define AV_386_2_BMI2 0x00008 /* BMI2 insns */ #define AV_386_2_FMA 0x00010 /* FMA insns */ #define AV_386_2_AVX2 0x00020 /* AVX2 insns */ +#define AV_386_2_ADX 0x00040 /* ADX insns */ +#define AV_386_2_RDSEED 0x00080 /* RDSEED insn */ #define FMT_AV_386_2 \ "\020" \ - "\06avx2\05fma\04bmi2\03bmi1\02rdrand\01f16c" + "\10rdseed\07adx\06avx2\05fma\04bmi2\03bmi1\02rdrand\01f16c" #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index badc3faff8..d88d5683a7 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #ifndef _SYS_BRAND_H @@ -102,29 +103,101 @@ struct brand_mach_ops; struct intpdata; struct execa; +/* + * Common structure to define hooks for brand operation. 
+ * + * Required Fields: + * b_init_brand_data - Setup zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock) + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maximum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp + * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur + * during lwp creation before b_initlwp could be called.
+ * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock) + * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + */ struct brand_ops { - void (*b_init_brand_data)(zone_t *); + void (*b_init_brand_data)(zone_t *, kmutex_t *); void (*b_free_brand_data)(zone_t *); int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void (*b_setbrand)(struct proc *); int (*b_getattr)(zone_t *, int, void *, size_t *); int (*b_setattr)(zone_t *, int, void *, size_t); void (*b_copy_procdata)(struct proc *, struct proc *); - void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_proc_exit)(struct proc *); void (*b_exec)(); void (*b_lwp_setrval)(klwp_t *, int, int); - int (*b_initlwp)(klwp_t *); + void *(*b_lwpdata_alloc)(struct proc *); + void (*b_lwpdata_free)(void *); + void (*b_initlwp)(klwp_t *, void *); + void (*b_initlwp_post)(klwp_t *); void (*b_forklwp)(klwp_t *, klwp_t *); void (*b_freelwp)(klwp_t *); void (*b_lwpexit)(klwp_t *); int 
(*b_elfexec)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); void (*b_sigset_native_to_brand)(sigset_t *); void (*b_sigset_brand_to_native)(sigset_t *); + void (*b_sigfd_translate)(k_siginfo_t *); int b_nsig; + void (*b_exit_with_sig)(proc_t *, sigqueue_t *); + boolean_t (*b_wait_filter)(proc_t *, proc_t *); + boolean_t (*b_native_exec)(uint8_t, const char **); + uint32_t (*b_map32limit)(proc_t *); + void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); + int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, + boolean_t *, int *); + int (*b_sigcld_repost)(proc_t *, sigqueue_t *); + int (*b_issig_stop)(proc_t *, klwp_t *); + boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int); + void (*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void (*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); + int (*b_setid_clear)(vattr_t *vap, cred_t *cr); + int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); + boolean_t b_intp_parse_arg; }; /* @@ -135,6 +208,7 @@ typedef struct brand { char *b_name; struct brand_ops *b_ops; struct brand_mach_ops *b_machops; + size_t b_data_size; } brand_t; extern brand_t native_brand; @@ -165,7 +239,7 @@ extern brand_t *brand_register_zone(struct brand_attr *); extern brand_t *brand_find_name(char *); extern void brand_unregister_zone(brand_t *); extern int brand_zone_count(brand_t *); -extern void brand_setbrand(proc_t *); +extern int brand_setbrand(proc_t *, boolean_t); extern void brand_clearbrand(proc_t *, boolean_t); /* @@ -178,17 +252,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t, extern void brand_solaris_copy_procdata(proc_t *, proc_t *, struct brand *); extern int brand_solaris_elfexec(vnode_t *, 
execa_t *, uarg_t *, - intpdata_t *, int, long *, int, caddr_t, cred_t *, int, - struct brand *, char *, char *, char *, char *, char *); + intpdata_t *, int, long *, int, caddr_t, cred_t *, int *, + struct brand *, char *, char *, char *); extern void brand_solaris_exec(struct brand *); extern int brand_solaris_fini(char **, struct modlinkage *, struct brand *); extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *); extern void brand_solaris_freelwp(klwp_t *, struct brand *); -extern int brand_solaris_initlwp(klwp_t *, struct brand *); +extern void brand_solaris_initlwp(klwp_t *, struct brand *); extern void brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void brand_solaris_proc_exit(struct proc *, klwp_t *, - struct brand *); +extern void brand_solaris_proc_exit(struct proc *, struct brand *); extern void brand_solaris_setbrand(proc_t *, struct brand *); #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index a9191aed7c..cb8a6012fc 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,6 +187,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -198,6 +200,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. 
* + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h index 21cf94dcf9..2c70d7c9f1 100644 --- a/usr/src/uts/common/sys/contract/process.h +++ b/usr/src/uts/common/sys/contract/process.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_CONTRACT_PROCESS_H #define _SYS_CONTRACT_PROCESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/contract.h> #include <sys/time.h> @@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t; #define CT_PR_NOORPHAN 0x2 /* kill when contract is abandoned */ #define CT_PR_PGRPONLY 0x4 /* only kill process group on fatal errors */ #define CT_PR_REGENT 0x8 /* automatically detach inherited contracts */ -#define CT_PR_ALLPARAM 0xf +#define CT_PR_KEEP_EXEC 0x10 /* preserve template across exec */ +#define CT_PR_ALLPARAM 0x1f /* * ctr_ev_* flags diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. 
@@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index 5056f9a511..914f132dc0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/ctf_api.h b/usr/src/uts/common/sys/ctf_api.h index 04d73c3181..bc99f67d3f 100644 --- a/usr/src/uts/common/sys/ctf_api.h +++ b/usr/src/uts/common/sys/ctf_api.h @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
*/ /* @@ -60,6 +60,65 @@ extern "C" { typedef struct ctf_file ctf_file_t; typedef long ctf_id_t; +#define ECTF_BASE 1000 /* base value for libctf errnos */ + +enum { + ECTF_FMT = ECTF_BASE, /* file is not in CTF or ELF format */ + ECTF_ELFVERS, /* ELF version is more recent than libctf */ + ECTF_CTFVERS, /* CTF version is more recent than libctf */ + ECTF_ENDIAN, /* data is different endian-ness than lib */ + ECTF_SYMTAB, /* symbol table uses invalid entry size */ + ECTF_SYMBAD, /* symbol table data buffer invalid */ + ECTF_STRBAD, /* string table data buffer invalid */ + ECTF_CORRUPT, /* file data corruption detected */ + ECTF_NOCTFDATA, /* ELF file does not contain CTF data */ + ECTF_NOCTFBUF, /* buffer does not contain CTF data */ + ECTF_NOSYMTAB, /* symbol table data is not available */ + ECTF_NOPARENT, /* parent CTF container is not available */ + ECTF_DMODEL, /* data model mismatch */ + ECTF_MMAP, /* failed to mmap a data section */ + ECTF_ZMISSING, /* decompression library not installed */ + ECTF_ZINIT, /* failed to initialize decompression library */ + ECTF_ZALLOC, /* failed to allocate decompression buffer */ + ECTF_DECOMPRESS, /* failed to decompress CTF data */ + ECTF_STRTAB, /* string table for this string is missing */ + ECTF_BADNAME, /* string offset is corrupt w.r.t. 
strtab */ + ECTF_BADID, /* invalid type ID number */ + ECTF_NOTSOU, /* type is not a struct or union */ + ECTF_NOTENUM, /* type is not an enum */ + ECTF_NOTSUE, /* type is not a struct, union, or enum */ + ECTF_NOTINTFP, /* type is not an integer or float */ + ECTF_NOTARRAY, /* type is not an array */ + ECTF_NOTREF, /* type does not reference another type */ + ECTF_NAMELEN, /* buffer is too small to hold type name */ + ECTF_NOTYPE, /* no type found corresponding to name */ + ECTF_SYNTAX, /* syntax error in type name */ + ECTF_NOTFUNC, /* symtab entry does not refer to a function */ + ECTF_NOFUNCDAT, /* no func info available for function */ + ECTF_NOTDATA, /* symtab entry does not refer to a data obj */ + ECTF_NOTYPEDAT, /* no type info available for object */ + ECTF_NOLABEL, /* no label found corresponding to name */ + ECTF_NOLABELDATA, /* file does not contain any labels */ + ECTF_NOTSUP, /* feature not supported */ + ECTF_NOENUMNAM, /* enum element name not found */ + ECTF_NOMEMBNAM, /* member name not found */ + ECTF_RDONLY, /* CTF container is read-only */ + ECTF_DTFULL, /* CTF type is full (no more members allowed) */ + ECTF_FULL, /* CTF container is full */ + ECTF_DUPMEMBER, /* duplicate member name definition */ + ECTF_CONFLICT, /* conflicting type definition present */ + ECTF_REFERENCED, /* type has outstanding references */ + ECTF_NOTDYN, /* type is not a dynamic type */ + ECTF_ELF, /* elf library failure */ + ECTF_MCHILD, /* cannot merge child container */ + ECTF_LABELEXISTS, /* label already exists */ + ECTF_LCONFLICT, /* merged labels conflict */ + ECTF_ZLIB, /* zlib library failure */ + ECTF_CONVBKERR, /* CTF conversion backend error */ + ECTF_CONVNOCSRC, /* No C source to convert from */ + ECTF_NOCONVBKEND /* No applicable conversion backend */ +}; + /* * If the debugger needs to provide the CTF library with a set of raw buffers * for use as the CTF data, symbol table, and string table, it can do so by @@ -143,19 +202,24 @@ typedef struct ctf_lblinfo 
{ typedef int ctf_visit_f(const char *, ctf_id_t, ulong_t, int, void *); typedef int ctf_member_f(const char *, ctf_id_t, ulong_t, void *); typedef int ctf_enum_f(const char *, int, void *); -typedef int ctf_type_f(ctf_id_t, void *); +typedef int ctf_type_f(ctf_id_t, boolean_t, void *); typedef int ctf_label_f(const char *, const ctf_lblinfo_t *, void *); +typedef int ctf_function_f(const char *, ulong_t, ctf_funcinfo_t *, void *); +typedef int ctf_object_f(const char *, ctf_id_t, ulong_t, void *); +typedef int ctf_string_f(const char *, void *); extern ctf_file_t *ctf_bufopen(const ctf_sect_t *, const ctf_sect_t *, const ctf_sect_t *, int *); extern ctf_file_t *ctf_fdopen(int, int *); extern ctf_file_t *ctf_open(const char *, int *); extern ctf_file_t *ctf_create(int *); +extern ctf_file_t *ctf_fdcreate(int, int *); extern ctf_file_t *ctf_dup(ctf_file_t *); extern void ctf_close(ctf_file_t *); extern ctf_file_t *ctf_parent_file(ctf_file_t *); extern const char *ctf_parent_name(ctf_file_t *); +extern const char *ctf_parent_label(ctf_file_t *); extern int ctf_import(ctf_file_t *, ctf_file_t *); extern int ctf_setmodel(ctf_file_t *, int); @@ -165,15 +229,20 @@ extern void ctf_setspecific(ctf_file_t *, void *); extern void *ctf_getspecific(ctf_file_t *); extern int ctf_errno(ctf_file_t *); +extern uint_t ctf_flags(ctf_file_t *); extern const char *ctf_errmsg(int); extern int ctf_version(int); extern int ctf_func_info(ctf_file_t *, ulong_t, ctf_funcinfo_t *); +extern int ctf_func_info_by_id(ctf_file_t *, ctf_id_t, ctf_funcinfo_t *); extern int ctf_func_args(ctf_file_t *, ulong_t, uint_t, ctf_id_t *); +extern int ctf_func_args_by_id(ctf_file_t *, ctf_id_t, uint_t, ctf_id_t *); extern ctf_id_t ctf_lookup_by_name(ctf_file_t *, const char *); extern ctf_id_t ctf_lookup_by_symbol(ctf_file_t *, ulong_t); +extern char *ctf_symbol_name(ctf_file_t *, ulong_t, char *, size_t); + extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t); extern ssize_t ctf_type_lname(ctf_file_t *, 
ctf_id_t, char *, size_t); extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t); @@ -182,6 +251,7 @@ extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t, extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t); extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t); extern int ctf_type_kind(ctf_file_t *, ctf_id_t); +extern const char *ctf_kind_name(ctf_file_t *, int); extern ctf_id_t ctf_type_reference(ctf_file_t *, ctf_id_t); extern ctf_id_t ctf_type_pointer(ctf_file_t *, ctf_id_t); extern int ctf_type_encoding(ctf_file_t *, ctf_id_t, ctf_encoding_t *); @@ -201,37 +271,50 @@ extern int ctf_label_info(ctf_file_t *, const char *, ctf_lblinfo_t *); extern int ctf_member_iter(ctf_file_t *, ctf_id_t, ctf_member_f *, void *); extern int ctf_enum_iter(ctf_file_t *, ctf_id_t, ctf_enum_f *, void *); -extern int ctf_type_iter(ctf_file_t *, ctf_type_f *, void *); +extern int ctf_type_iter(ctf_file_t *, boolean_t, ctf_type_f *, void *); extern int ctf_label_iter(ctf_file_t *, ctf_label_f *, void *); +extern int ctf_function_iter(ctf_file_t *, ctf_function_f *, void *); +extern int ctf_object_iter(ctf_file_t *, ctf_object_f *, void *); +extern int ctf_string_iter(ctf_file_t *, ctf_string_f *, void *); extern ctf_id_t ctf_add_array(ctf_file_t *, uint_t, const ctf_arinfo_t *); -extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t ctf_add_enum(ctf_file_t *, uint_t, const char *); extern ctf_id_t ctf_add_float(ctf_file_t *, uint_t, const char *, const ctf_encoding_t *); extern ctf_id_t ctf_add_forward(ctf_file_t *, uint_t, const char *, uint_t); -extern ctf_id_t ctf_add_function(ctf_file_t *, uint_t, - const ctf_funcinfo_t *, const ctf_id_t *); +extern ctf_id_t ctf_add_funcptr(ctf_file_t *, uint_t, const ctf_funcinfo_t *, + const ctf_id_t *); extern ctf_id_t ctf_add_integer(ctf_file_t *, uint_t, const char *, const ctf_encoding_t *); -extern ctf_id_t 
ctf_add_pointer(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t ctf_add_type(ctf_file_t *, ctf_file_t *, ctf_id_t); extern ctf_id_t ctf_add_typedef(ctf_file_t *, uint_t, const char *, ctf_id_t); -extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t ctf_add_struct(ctf_file_t *, uint_t, const char *); extern ctf_id_t ctf_add_union(ctf_file_t *, uint_t, const char *); -extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, const char *, ctf_id_t); extern int ctf_add_enumerator(ctf_file_t *, ctf_id_t, const char *, int); -extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t); +extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t, + ulong_t); + + +extern int ctf_add_function(ctf_file_t *, ulong_t, const ctf_funcinfo_t *, + const ctf_id_t *); +extern int ctf_add_object(ctf_file_t *, ulong_t, ctf_id_t); +extern int ctf_add_label(ctf_file_t *, const char *, ctf_id_t, uint_t); extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *); +extern int ctf_set_root(ctf_file_t *, ctf_id_t, const boolean_t); +extern int ctf_set_size(ctf_file_t *, ctf_id_t, const ulong_t); extern int ctf_delete_type(ctf_file_t *, ctf_id_t); extern int ctf_update(ctf_file_t *); extern int ctf_discard(ctf_file_t *); extern int ctf_write(ctf_file_t *, int); +extern void ctf_dataptr(ctf_file_t *, const void **, size_t *); #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent 
errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index fb2a0749d3..4cd93be56e 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLD_H @@ -191,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -203,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* @@ -350,6 +353,7 @@ typedef struct dld_hwgrpinfo { */ typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define DI_DIRECT_RAW 0x1 /* * Direct Tx/Rx capability. 
*/ @@ -374,6 +378,9 @@ typedef struct dld_capab_direct_s { /* flow control "can I put on a ring" callback */ uintptr_t di_tx_fctl_df; /* canput-like callback */ void *di_tx_fctl_dh; + + /* flags that control our behavior */ + uint_t di_flags; } dld_capab_direct_t; /* diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index a76a927e59..81708aad38 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum { typedef enum { DLD_UNINITIALIZED, DLD_PASSIVE, - DLD_ACTIVE + DLD_ACTIVE, + DLD_EXCLUSIVE } dld_passivestate_t; /* @@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, extern void dld_str_notify_ind(dld_str_t *); extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, uintptr_t, uint16_t); +extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" { #define IPTUN_IOC 0x454A #define BRIDGE_IOC 0xB81D #define IBPART_IOC 0x6171 +#define OVERLAY_IOC 0x2005 /* GLDv3 modules use these macros to generate unique ioctl commands */ #define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" { #define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid)) #define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid)) #define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid)) #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..34f1c17236 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -107,6 +107,7 @@ typedef struct dl_ipnetinfo { #define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */ #define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */ #define DL_NOTIFY_CONF 0x116 /* Notification from upstream */ +#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */ /* * Primitives used for Connectionless Service @@ -388,6 +389,8 @@ typedef struct dl_ipnetinfo { #define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */ #define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */ #define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */ +#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */ +#define DL_PROMISC_FIXUPS 0x05 /* above will be fixed up */ /* * DLPI notification codes for DL_NOTIFY_REQ primitives. 
@@ -1107,6 +1110,13 @@ typedef struct { } dl_intr_mode_req_t; /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { + t_uscalar_t dl_primitive; +} dl_exclusive_req_t; + +/* * CONNECTION-ORIENTED SERVICE PRIMITIVES */ @@ -1528,6 +1538,7 @@ union DL_primitives { dl_control_ack_t control_ack; dl_passive_req_t passive_req; dl_intr_mode_req_t intr_mode_req; + dl_exclusive_req_t exclusive_req; }; #define DL_INFO_REQ_SIZE sizeof (dl_info_req_t) @@ -1596,6 +1607,7 @@ union DL_primitives { #define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t) #define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t) #define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t) +#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t) #ifdef _KERNEL /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..81f9e2abac 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_DLS_H @@ -85,6 +86,8 @@ typedef struct dls_link_s dls_link_t; #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 +#define DLS_PROMISC_RX_ONLY 0x00000008 +#define DLS_PROMISC_FIXUPS 0x00000010 extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); extern void dls_close(dld_str_t *); @@ -106,11 +109,13 @@ extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); +extern int dls_devnet_open_in_zone(const char *, + dls_dl_handle_t *, dev_t *, zoneid_t); extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -122,12 +127,13 @@ extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *); extern void dls_devnet_rele(dls_dl_handle_t); extern void dls_devnet_prop_task_wait(dls_dl_handle_t); +extern const char *dls_devnet_link(dls_dl_handle_t); extern const char *dls_devnet_mac(dls_dl_handle_t); extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -141,6 +147,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t, extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, datalink_class_t *, uint32_t *, uint32_t *); extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int 
dls_mgmt_get_linkid_in_zone(const char *, + datalink_id_t *, zoneid_t); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); extern int dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..329f8dd08e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLS_IMPL_H @@ -46,11 +47,12 @@ typedef struct dls_multicst_addr_s { } dls_multicst_addr_t; struct dls_link_s { /* Protected by */ - char dl_name[MAXNAMELEN]; /* SL */ + char dl_name[MAXNAMELEN]; /* RO */ uint_t dl_ddi_instance; /* SL */ mac_handle_t dl_mh; /* SL */ mac_client_handle_t dl_mch; /* SL */ mac_unicast_handle_t dl_mah; /* SL */ + mac_notify_handle_t dl_mnh; /* SL */ const mac_info_t *dl_mip; /* SL */ uint_t dl_ref; /* SL */ mod_hash_t *dl_str_hash; /* SL, modhash lock */ @@ -61,6 +63,7 @@ struct dls_link_s { /* Protected by */ uint_t dl_zone_ref; link_tagmode_t dl_tagmode; /* atomic */ uint_t dl_nonip_cnt; /* SL */ + uint_t dl_exclusive; /* SL */ }; typedef struct dls_head_s { @@ -96,13 +99,16 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *, dls_link_t **); extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *); +extern int dls_devnet_hold_tmp_by_link(dls_link_t *, + dls_dl_handle_t *); extern void dls_init(void); extern int dls_fini(void); @@ -126,6 +132,7 @@ extern void 
dls_mgmt_init(void); extern void dls_mgmt_fini(void); extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int dls_exclusive_set(dld_str_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..214e225ac9 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum { DATALINK_CLASS_SIMNET = 0x20, DATALINK_CLASS_BRIDGE = 0x40, DATALINK_CLASS_IPTUN = 0x80, - DATALINK_CLASS_PART = 0x100 + DATALINK_CLASS_PART = 0x100, + DATALINK_CLASS_OVERLAY = 0x200 } datalink_class_t; #define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ - DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) + DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ + DATALINK_CLASS_OVERLAY) /* * A combination of flags and media. @@ -165,6 +168,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { @@ -225,6 +229,7 @@ typedef struct dlmgmt_getattr_retval_s { char lr_attrval[MAXLINKATTRVALLEN]; } dlmgmt_getattr_retval_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index dd1eecc70d..b88d215336 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -20,7 +20,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 
@@ -348,6 +348,11 @@ typedef struct { #define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */ #define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */ +/* + * Linux specific program headers not even used by Linux (!!) + */ +#define PT_PAX_FLAGS 0x65041580 /* PaX flags (see below) */ + #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment (unused) */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ @@ -363,6 +368,45 @@ typedef struct { #define PF_W 0x2 #define PF_X 0x1 +/* + * PaX is a regrettable series of never-integrated Linux patches for a + * facility to provide additional protections on memory pages for purposes of + * increasing security, and for allowing binaries to demand (or refuse) those + * protections via the PT_PAX_FLAGS program header. (Portents of its + * rudderless existence, "PaX" is a term of indefinite origin written by an + * unknown group of people.) This facility is unfortunate in any number of + * ways, and was largely obviated by the broad adoption of non-executable + * stacks at any rate -- but it lives on in binaries that continue to mark + * themselves to explicitly refuse the (never-integrated, now-obviated) + * facility. One might cringe that PaX overloads the meaning of the p_flags + * to specify protections, but that is the least of its transgressions: + * instead of using one p_type constant to explicitly enable a series of + * protections and another to explicitly disable others, it insists on + * conflating both actions into PT_PAX_FLAGS. The resulting doubling of + * constant definitions (two constant definitions for every protection instead + * of merely one) assures that the values can't even fit in the eight + * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for + * such filthy functionality. 
And were all of this not enough, there is one + * final nomenclature insult to be added to this semantic injury: the + * constants for the p_flags don't even embed "_PAX_" in their name -- despite + * the fact that this is their only purpose! We resist the temptation to + * right this final wrong here; we grit our teeth and provide exactly the + * Linux definitions -- or rather, what would have been the Linux definitions + * had this belching jalopy ever been permitted to crash itself into mainline. + */ +#define PF_PAGEEXEC 0x00000010 /* PaX: enable PAGEEXEC */ +#define PF_NOPAGEEXEC 0x00000020 /* PaX: disable PAGEEXEC */ +#define PF_SEGMEXEC 0x00000040 /* PaX: enable SEGMEXEC */ +#define PF_NOSEGMEXEC 0x00000080 /* PaX: disable SEGMEXEC */ +#define PF_MPROTECT 0x00000100 /* PaX: enable MPROTECT */ +#define PF_NOMPROTECT 0x00000200 /* PaX: disable MPROTECT */ +#define PF_RANDEXEC 0x00000400 /* PaX: enable RANDEXEC */ +#define PF_NORANDEXEC 0x00000800 /* PaX: disable RANDEXEC */ +#define PF_EMUTRAMP 0x00001000 /* PaX: enable EMUTRAMP */ +#define PF_NOEMUTRAMP 0x00002000 /* PaX: disable EMUTRAMP */ +#define PF_RANDMMAP 0x00004000 /* PaX: enable RANDMMAP */ +#define PF_NORANDMMAP 0x00008000 /* PaX: disable RANDMMAP */ + #define PF_MASKOS 0x0ff00000 /* OS specific values */ #define PF_MASKPROC 0xf0000000 /* processor specific values */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index b5e2c58be5..b2db3f2987 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -26,6 +26,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2016, Joyent, Inc. 
+ */ + #ifndef _SYS_EXEC_H #define _SYS_EXEC_H @@ -102,10 +106,14 @@ typedef struct uarg { vnode_t *ex_vp; char *emulator; char *brandname; + const char *brand_nroot; char *auxp_auxflags; /* addr of auxflags auxv on the user stack */ char *auxp_brand; /* address of first brand auxv on user stack */ cred_t *pfcred; boolean_t scrubenv; + uintptr_t maxstack; + boolean_t stk_prot_override; + uintptr_t commpage; } uarg_t; /* @@ -175,7 +183,7 @@ struct execsw { int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -213,7 +221,7 @@ extern int exec_common(const char *fname, const char **argp, const char **envp, int brand_action); extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); @@ -238,16 +246,22 @@ extern void exec_set_sp(size_t); * when compiling the 32-bit compatability elf code in the elfexec module. 
*/ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, int *, caddr_t *, + ssize_t *); #endif /* !_ELF32_COMPAT */ #if defined(_LP64) extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, int *, caddr_t *, + ssize_t *); #endif /* _LP64 */ /* diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FRAMEIO_H +#define _SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. + */ +typedef struct framevec { + void *fv_buf; /* Buffer with data */ + size_t fv_buflen; /* Size of the buffer */ + size_t fv_actlen; /* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { + uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */ + uint_t fio_nvpf; /* How many vectors make up one frame */ + uint_t fio_nvecs; /* The total number of vectors */ + framevec_t fio_vecs[]; /* C99 flexible array member */ +} frameio_t; + + +#define FRAMEIO_VERSION_ONE 1 +#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE + +#define FRAMEIO_NVECS_MAX 32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { + caddr32_t fv_buf; + size32_t fv_buflen; + size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { + uint_t fio_version; + uint_t fio_vecspframe; + uint_t fio_nvecs; + framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. 
+ */ +typedef enum frameio_write_mblk_map { + MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, + int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. 
+ */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..38389f77d9 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
 + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_INFO_H +#define _SYS_FS_HYPRLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks. + */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. 
 + */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of hld_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid 
 +#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. + */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index 561939fc20..e6fa895060 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -36,6 +36,7 @@ extern "C" { #include <sys/vfs_opreg.h> #include <sys/list.h> 
#include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h> #include <sys/sunddi.h> /* @@ -128,6 +129,21 @@ typedef struct sdev_local_data { struct sdev_dprof sdev_lprof; /* profile for multi-inst */ } sdev_local_data_t; +/* sdev_flags */ +typedef enum sdev_flags { + SDEV_BUILD = 0x0001, /* directory cache out-of-date */ + SDEV_GLOBAL = 0x0002, /* global /dev nodes */ + SDEV_PERSIST = 0x0004, /* backing store persisted node */ + SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */ + SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */ + /* (ex: pts) */ + SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */ + SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */ + /* need update */ + SDEV_SUBDIR = 0x0080, /* match all subdirs under here */ + SDEV_ZONED = 0x0100 /* zoned subdir */ +} sdev_flags_t; + /* * /dev filesystem sdev_node defines */ @@ -150,7 +166,7 @@ typedef struct sdev_node { ino64_t sdev_ino; /* inode */ uint_t sdev_nlink; /* link count */ int sdev_state; /* state of this node */ - int sdev_flags; /* flags bit */ + sdev_flags_t sdev_flags; /* flags bit */ kmutex_t sdev_lookup_lock; /* node creation synch lock */ kcondvar_t sdev_lookup_cv; /* node creation sync cv */ @@ -161,7 +177,7 @@ typedef struct sdev_node { struct sdev_global_data sdev_globaldata; struct sdev_local_data sdev_localdata; } sdev_instance_data; - + list_node_t sdev_plist; /* link on plugin list */ void *sdev_private; } sdev_node_t; @@ -192,29 +208,11 @@ typedef enum { SDEV_READY } sdev_node_state_t; -/* sdev_flags */ -#define SDEV_BUILD 0x0001 /* directory cache out-of-date */ -#define SDEV_GLOBAL 0x0002 /* global /dev nodes */ -#define SDEV_PERSIST 0x0004 /* backing store persisted node */ -#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. 
cache */ -#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */ - /* (ex: pts) */ -#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */ -#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */ - /* need update */ -#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */ -#define SDEV_ZONED 0x0100 /* zoned subdir */ - /* sdev_lookup_flags */ #define SDEV_LOOKUP 0x0001 /* node creation in progress */ #define SDEV_READDIR 0x0002 /* VDIR readdir in progress */ #define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */ -#define SDEV_VTOR_INVALID -1 -#define SDEV_VTOR_SKIP 0 -#define SDEV_VTOR_VALID 1 -#define SDEV_VTOR_STALE 2 - /* convenient macros */ #define SDEV_IS_GLOBAL(dv) \ (dv->sdev_flags & SDEV_GLOBAL) @@ -366,8 +364,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *, extern int devname_profile_update(char *, size_t); extern struct sdev_data *sdev_find_mntinfo(char *); void sdev_mntinfo_rele(struct sdev_data *); +typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *); +void sdev_mnt_walk(sdev_mnt_walk_f, void *); extern struct vnodeops *devpts_getvnodeops(void); extern struct vnodeops *devvt_getvnodeops(void); +extern void sdev_plugin_nodeready(struct sdev_node *); +extern int sdev_plugin_init(void); +extern int sdev_plugin_fini(void); /* * boot states - warning, the ordering here is significant @@ -513,6 +516,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *); extern void sdev_modctl_dump_files(void); /* + * plugin and legacy vtab stuff + */ +/* directory dependent vop table */ +typedef struct sdev_vop_table { + char *vt_name; /* subdirectory name */ + const fs_operation_def_t *vt_service; /* vnodeops table */ + struct vnodeops **vt_global_vops; /* global container for vop */ + int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ + int vt_flags; +} sdev_vop_table_t; + +extern struct sdev_vop_table vtab[]; +extern struct vnodeops *sdev_get_vop(struct sdev_node *); 
+extern void sdev_set_no_negcache(struct sdev_node *); +extern void *sdev_get_vtor(struct sdev_node *dv); + +/* * globals */ extern kmutex_t sdev_lock; @@ -525,6 +545,7 @@ extern struct vnodeops *devipnet_vnodeops; extern struct vnodeops *devvt_vnodeops; extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */ extern struct vnodeops *devzvol_vnodeops; +extern int sdev_vnodeops_tbl_size; extern const fs_operation_def_t sdev_vnodeops_tbl[]; extern const fs_operation_def_t devpts_vnodeops_tbl[]; diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h new file mode 100644 index 0000000000..8783df58e6 --- /dev/null +++ b/usr/src/uts/common/sys/fs/sdev_plugin.h @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_SDEV_PLUGIN_H +#define _SYS_SDEV_PLUGIN_H + +/* + * Kernel sdev plugin interface + */ + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/vnode.h> + +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef uintptr_t sdev_plugin_hdl_t; +typedef uintptr_t sdev_ctx_t; + +/* + * Valid return values for sdev_plugin_validate_t. 
+ */ +typedef enum sdev_plugin_validate { + SDEV_VTOR_INVALID = -1, + SDEV_VTOR_SKIP = 0, + SDEV_VTOR_VALID = 1, + SDEV_VTOR_STALE = 2 +} sdev_plugin_validate_t; + +/* + * Valid flags + */ +typedef enum sdev_plugin_flags { + SDEV_PLUGIN_NO_NCACHE = 0x1, + SDEV_PLUGIN_SUBDIR = 0x2 +} sdev_plugin_flags_t; + +#define SDEV_PLUGIN_FLAGS_MASK 0x3 + +/* + * Functions a module must implement + */ +typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t); +typedef int (*sp_filldir_f)(sdev_ctx_t); +typedef void (*sp_inactive_f)(sdev_ctx_t); + +#define SDEV_PLUGIN_VERSION 1 + +typedef struct sdev_plugin_ops { + int spo_version; + sdev_plugin_flags_t spo_flags; + sp_valid_f spo_validate; + sp_filldir_f spo_filldir; + sp_inactive_f spo_inactive; +} sdev_plugin_ops_t; + +extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *, + int *); +extern int sdev_plugin_unregister(sdev_plugin_hdl_t); + +typedef enum sdev_ctx_flags { + SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */ +} sdev_ctx_flags_t; + +/* + * Context helper functions + */ +extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t); +extern const char *sdev_ctx_name(sdev_ctx_t); +extern const char *sdev_ctx_path(sdev_ctx_t); +extern enum vtype sdev_ctx_vtype(sdev_ctx_t); +extern const void *sdev_ctx_vtype_data(sdev_ctx_t); + +/* + * Callbacks to manipulate nodes + */ +extern int sdev_plugin_mkdir(sdev_ctx_t, char *); +extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDEV_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index 68dd67c61e..f8740e8873 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -22,12 +22,13 @@ * Copyright 2007 Sun Microsystems, Inc. * All rights reserved. Use is subject to license terms. */ +/* + * Copyright 2015 Joyent, Inc. 
 + */ #ifndef _SYS_FS_TMP_H #define _SYS_FS_TMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,29 +69,28 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ /* * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system. E.g. in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. - * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system. In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become a + * factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care + * should be exercised in changing this: tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will allow tmpfs consumption alone to induce application-level + * memory allocation failure. 
*/ -#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */ +#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */ extern size_t tmpfs_minfree; /* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem. - */ -#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */ - -extern size_t tmp_kmemspace; -extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */ - extern void tmpnode_init(struct tmount *, struct tmpnode *, struct vattr *, struct cred *); extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t); @@ -101,13 +101,12 @@ extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *, enum dr_op, struct cred *); extern void tdirinit(struct tmpnode *, struct tmpnode *); extern void tdirtrunc(struct tmpnode *); -extern void *tmp_memalloc(size_t, int); -extern void tmp_memfree(void *, size_t); extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int); extern int tmp_taccess(void *, int, struct cred *); extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, struct cred *); extern int tmp_convnum(char *, pgcnt_t *); +extern int tmp_convmode(char *, mode_t *); extern int tdirenter(struct tmount *, struct tmpnode *, char *, enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *, struct tmpnode **, struct cred *, caller_context_t *); diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 2d4e1aa7fb..4a48af52a1 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -21,13 +21,12 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
 */ #ifndef _SYS_FX_H #define _SYS_FX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/thread.h> #include <sys/ddi.h> @@ -145,7 +144,14 @@ typedef struct fxkparms { uint_t fx_cflags; } fxkparms_t; +/* + * control flags (kparms->fx_cflags). + */ +#define FX_DOUPRILIM 0x01 /* change user priority limit */ +#define FX_DOUPRI 0x02 /* change user priority */ +#define FX_DOTQ 0x04 /* change FX time quantum */ +#define FXMAXUPRI 60 /* maximum user priority setting */ /* * Interface for partner private code. This is not a public interface. diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..40ef4ce982 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_GSQUEUE_H +#define _SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP). 
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(uint_t, pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define GSQUEUE_FILL 0x0001 +#define GSQUEUE_NODRAIN 0x0002 +#define GSQUEUE_PROCESS 0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, + int, uint8_t); + +/* + * The default wait is inherited from IP. This determines the amount of time + * that must pass after queuing work, before we wake up the worker thread. This + * value is in milliseconds. + */ +#define GSQUEUE_DEFAULT_WAIT 10 +#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */ diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h index d56fcceb5a..46d25f207f 100644 --- a/usr/src/uts/common/sys/id_space.h +++ b/usr/src/uts/common/sys/id_space.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All Rights reserved. 
*/ #ifndef _ID_SPACE_H @@ -34,8 +35,6 @@ extern "C" { #include <sys/mutex.h> #include <sys/vmem.h> -#ifdef _KERNEL - typedef vmem_t id_space_t; id_space_t *id_space_create(const char *, id_t, id_t); @@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *); id_t id_alloc_specific_nosleep(id_space_t *, id_t); void id_free(id_space_t *, id_t); -#endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support for the inotify facility. Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define _SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. 
+ */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ISDIR 0x40000000 +#define IN_ONESHOT 0x80000000 + +/* + * Helpful constants. + */ +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_ALL_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ + IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define IN_CHILD_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +struct inotify_event { + int32_t wd; /* watch descriptor */ + uint32_t mask; /* mask of events */ + uint32_t cookie; /* event association cookie, if any */ + uint32_t len; /* size of name field */ + char name[]; /* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
+ */ +#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8)) +#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */ +#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */ +#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */ +#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name +#else +#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define INOTIFY_PTR(type, name) type *name +#endif + +typedef struct inotify_addwatch { + int inaw_fd; /* open fd for object */ + uint32_t inaw_mask; /* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { + INOTIFY_PTR(char, inac_name); /* pointer to name */ + int inac_fd; /* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define IN_UNMASKABLE \ + (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define IN_MODIFIERS \ + (IN_EXCL_UNLINK | IN_ONESHOT) + +#define IN_FLAGS \ + (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define IN_REMOVAL (1ULL << 32) +#define INOTIFYMNRN_INOTIFY 0 +#define INOTIFYMNRN_CLONE 1 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INOTIFY_H */ diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h index 0569c3e967..d7dc365c09 100644 --- a/usr/src/uts/common/sys/ipc_impl.h +++ b/usr/src/uts/common/sys/ipc_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #ifndef _IPC_IMPL_H @@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *); kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *); void ipc_cleanup(ipc_service_t *, kipc_perm_t *); +void ipc_rmsvc(ipc_service_t *, kipc_perm_t *); int ipc_rmid(ipc_service_t *, int, cred_t *); int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index b1990121b8..0ae64b45d7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -95,7 +96,7 @@ extern "C" { /* insert new signals here, and move _SIGRTM* appropriately */ #define _SIGRTMIN 42 /* first (highest-priority) realtime signal */ -#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */ +#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */ extern long _sysconf(int); /* System Private interface to sysconf() */ #define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */ #define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..bdbff0be9b 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_KLWP_H @@ -191,7 +191,19 @@ typedef struct _klwp { struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */ struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ - void *lwp_brand; /* per-lwp brand data */ + /* + * Branding: + * lwp_brand - per-lwp brand data + * lwp_brand_syscall - brand syscall interposer + * lwp_brand_syscall_fast - brand "fast path" syscall interposer + * + * The lwp_brand_syscall_fast handler should only be used if an + * lwp_brand_syscall handler is also in place. + */ + void *lwp_brand; + int (*lwp_brand_syscall)(void); + int (*lwp_brand_syscall_fast)(void); + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/common/sys/kmem_impl.h b/usr/src/uts/common/sys/kmem_impl.h index 90e0477c45..26ab055dbc 100644 --- a/usr/src/uts/common/sys/kmem_impl.h +++ b/usr/src/uts/common/sys/kmem_impl.h @@ -302,7 +302,6 @@ typedef struct kmem_defrag { uint64_t kmd_later; /* LATER responses */ uint64_t kmd_dont_need; /* DONT_NEED responses */ uint64_t kmd_dont_know; /* DONT_KNOW responses */ - uint64_t kmd_hunt_found; /* DONT_KNOW: # found in mag */ uint64_t kmd_slabs_freed; /* slabs freed by moves */ uint64_t kmd_defrags; /* kmem_cache_defrag() */ uint64_t kmd_scans; /* kmem_cache_scan() */ diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index dfe25eec76..be669cb78d 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -21,6 +21,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_KSOCKET_H_ @@ -121,6 +122,10 @@ extern int ksocket_close(ksocket_t, struct cred *); extern void ksocket_hold(ksocket_t); extern void ksocket_rele(ksocket_t); +typedef boolean_t (*ksocket_krecv_f)(ksocket_t, mblk_t *, size_t, int, void *); +extern int ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *); +extern void ksocket_krecv_unblock(ksocket_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h new file mode 100644 index 0000000000..88625d1829 --- /dev/null +++ b/usr/src/uts/common/sys/limits.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LIMITS_H +#define _SYS_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IOV_MAX 1024 + +#ifdef _KERNEL +#define IOV_MAX_STACK 16 /* max. IOV on-stack allocation */ +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIMITS_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 247c3bd48d..cdbbe4ce62 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> */ @@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s { } mac_propval_uint32_range_t; /* + * Defines ranges which are a series of C style strings. 
+ */ +typedef struct mac_propval_str_range_s { + uint32_t mpur_nextbyte; + char mpur_data[1]; +} mac_propval_str_range_t; + +/* * Data type of property values. */ typedef enum { @@ -120,6 +128,7 @@ typedef struct mac_propval_range_s { mac_propval_type_t mpr_type; /* type of value */ union { mac_propval_uint32_range_t mpr_uint32[1]; + mac_propval_str_range_t mpr_str; } u; } mac_propval_range_t; @@ -214,6 +223,7 @@ typedef enum { MAC_PROP_MAX_RXHWCLNT_AVAIL, MAC_PROP_MAX_TXHWCLNT_AVAIL, MAC_PROP_IB_LINKMODE, + MAC_PROP_VN_PROMISC_FILTERED, MAC_PROP_SECONDARY_ADDRS, MAC_PROP_ADV_40GFDX_CAP, MAC_PROP_EN_40GFDX_CAP, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 0fc4939503..1f2c732e6d 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* @@ -115,6 +115,7 @@ typedef enum { #define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 #define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004 #define MAC_PROMISC_FLAGS_NO_COPY 0x0008 +#define MAC_PROMISC_FLAGS_DO_FIXUPS 0x0010 /* flags passed to mac_tx() */ #define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 0904b28645..d2fd145375 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_MAC_CLIENT_IMPL_H @@ -82,6 +82,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */ boolean_t mpi_no_phys; /* WO */ boolean_t mpi_strip_vlan_tag; /* WO */ boolean_t mpi_no_copy; /* WO */ + boolean_t mpi_do_fixups; /* WO */ } mac_promisc_impl_t; typedef union mac_tx_percpu_s { @@ -330,13 +331,14 @@ extern int mac_tx_percpu_cnt; /* Mac protection flags */ #define MPT_FLAG_V6_LOCAL_ADDR_SET 0x0001 +#define MPT_FLAG_PROMISC_FILTERED 0x0002 /* in mac_client.c */ extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); extern void mac_client_init(void); extern void mac_client_fini(void); extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, - mac_client_impl_t *); + mac_client_impl_t *, boolean_t); extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 6b409513a6..a5848625c2 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -171,6 +171,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); extern void *mac_get_devinfo(mac_handle_t); extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t); extern uint32_t mac_no_notification(mac_handle_t); extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 2286b587e8..46293b1a74 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_MAC_IMPL_H @@ -331,7 +331,7 @@ struct mac_group_s { if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \ rhandle = (mip)->mi_default_tx_ring; \ if (mip->mi_promisc_list != NULL) \ - mac_promisc_dispatch(mip, mp, src_mcip); \ + mac_promisc_dispatch(mip, mp, src_mcip, B_TRUE); \ /* \ * Grab the proper transmit pointer and handle. Special \ * optimization: we can test mi_bridge_link itself atomically, \ @@ -643,6 +643,7 @@ struct mac_impl_s { #define MIS_LEGACY 0x0040 #define MIS_NO_ACTIVE 0x0080 #define MIS_POLL_DISABLE 0x0100 +#define MIS_IS_OVERLAY 0x0200 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -894,6 +895,8 @@ extern void mac_protect_fini(mac_client_impl_t *); extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_effective_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_set_promisc_filtered(mac_client_handle_t, boolean_t); +extern boolean_t mac_get_promisc_filtered(mac_client_handle_t); extern cpupart_t *mac_pset_find(mac_resource_props_t *, boolean_t *); extern void mac_set_pool_effective(boolean_t, cpupart_t *, diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 9f7f2a1a73..5f02451542 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_MAC_PROVIDER_H @@ -106,7 +107,8 @@ typedef enum { MAC_CAPAB_NO_NATIVEVLAN = 0x00080000, /* boolean only, no data */ MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */ MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */ - MAC_CAPAB_VRRP = 0x00400000 /* data is mac_capab_vrrp_t */ + MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */ + MAC_CAPAB_OVERLAY = 0x00800000 /* boolean only, no data */ } mac_capab_t; /* diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 6ec5f4ff41..34e491fd3b 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -338,6 +338,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index 88c98dc5a4..7196f7b3ac 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 14b1aa55db..883c329aed 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@ * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_NETCONFIG_H @@ -147,6 +148,8 @@ extern int endnetpath(void *); extern struct netconfig *getnetpath(void *); extern void nc_perror(const char *); extern char *nc_sperror(void); +extern void _nsl_brand_set_hooks(int (*)(void), + struct netconfig *(*)(int)); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index 93b5fc3e01..ea85c78f6b 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -44,6 +44,8 @@ extern "C" { #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VND_INET "NHF_VND_INET" +#define NHF_VND_INET6 "NHF_VND_INET6" /* * Event identification diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 2c77e1be96..73f29d1e63 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -81,7 +81,8 @@ typedef id_t netstackid_t; #define NS_IPSECESP 16 #define NS_IPNET 17 #define NS_ILB 18 -#define NS_MAX (NS_ILB+1) +#define NS_VND 19 +#define NS_MAX (NS_VND+1) /* * State maintained for each module which tracks the 
state of diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_H +#define _SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVERLAY_IOC_CREATE OVERLAYIOC(1) +#define OVERLAY_IOC_DELETE OVERLAYIOC(2) +#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3) +#define OVERLAY_IOC_GETPROP OVERLAYIOC(4) +#define OVERLAY_IOC_SETPROP OVERLAYIOC(5) +#define OVERLAY_IOC_NPROPS OVERLAYIOC(6) +#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7) +#define OVERLAY_IOC_STATUS OVERLAYIOC(8) + +typedef struct overlay_ioc_create { + datalink_id_t oic_linkid; + uint32_t oic_filler; + uint64_t oic_vnetid; + char oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { + datalink_id_t oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { + datalink_id_t oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { + datalink_id_t oipn_linkid; + int32_t oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { + datalink_id_t oipi_linkid; + int32_t oipi_id; + char oipi_name[OVERLAY_PROP_NAMELEN]; + uint_t oipi_type; + uint_t oipi_prot; + uint8_t oipi_default[OVERLAY_PROP_SIZEMAX]; + uint32_t oipi_defsize; + uint32_t 
oipi_posssize; + uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { + datalink_id_t oip_linkid; + int32_t oip_id; + char oip_name[OVERLAY_PROP_NAMELEN]; + uint8_t oip_value[OVERLAY_PROP_SIZEMAX]; + uint32_t oip_size; +} overlay_ioc_prop_t; + +typedef enum overlay_status { + OVERLAY_I_OK = 0x00, + OVERLAY_I_DEGRADED = 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { + datalink_id_t ois_linkid; + uint_t ois_status; + char ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..d638096006 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define _SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { + OVERLAY_TARGET_NONE = 0x0, + OVERLAY_TARGET_POINT, + OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { + OVERLAY_PLUGIN_D_INVALID = 0x0, + OVERLAY_PLUGIN_D_ETHERNET = 0x1, + OVERLAY_PLUGIN_D_IP = 0x2, + OVERLAY_PLUGIN_D_PORT = 0x4, + OVERLAY_PLUGIN_D_MASK = 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { + OVERLAY_PROP_T_INT = 0x1, /* signed int */ + OVERLAY_PROP_T_UINT, /* unsigned int */ + OVERLAY_PROP_T_IP, /* sinaddr6 */ + OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { + OVERLAY_PROP_PERM_REQ = 0x1, + OVERLAY_PROP_PERM_READ = 0x2, + OVERLAY_PROP_PERM_WRITE = 0x4, + OVERLAY_PROP_PERM_RW = 0x6, + OVERLAY_PROP_PERM_RRW = 0x7, + OVERLAY_PROP_PERM_MASK = 0x7 +} overlay_prop_prot_t; + +#define OVERLAY_PROP_NAMELEN 64 +#define OVERLAY_PROP_SIZEMAX 256 +#define OVERLAY_STATUS_BUFLEN 256 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..7fb8b8da1d --- /dev/null +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define _SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION_ONE 0x1 + +typedef struct overlay_plugin { + kmutex_t ovp_mutex; + list_node_t ovp_link; /* overlay_plugin_lock */ + uint_t ovp_active; /* ovp_mutex */ + const char *ovp_name; /* RO */ + const overlay_plugin_ops_t *ovp_ops; /* RO */ + const char *const *ovp_props; /* RO */ + uint_t ovp_nprops; /* RO */ + uint_t ovp_id_size; /* RO */ + overlay_plugin_flags_t ovp_flags; /* RO */ + overlay_plugin_dest_t ovp_dest; /* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { + list_node_t omux_lnode; + ksocket_t omux_ksock; /* RO */ + overlay_plugin_t *omux_plugin; /* RO: associated encap */ + int omux_domain; /* RO: socket domain */ + int omux_family; /* RO: socket family */ + int omux_protocol; /* RO: socket protocol */ + struct sockaddr *omux_addr; /* RO: socket address */ + socklen_t omux_alen; /* RO: sockaddr len */ + kmutex_t omux_lock; /* Protects everything below */ + uint_t omux_count; /* Active instances */ + avl_tree_t omux_devices; /* Tree of devices */ +} overlay_mux_t; + +typedef enum overlay_target_flag { + OVERLAY_T_TEARDOWN = 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { + kmutex_t ott_lock; + kcondvar_t ott_cond; + overlay_target_mode_t ott_mode; /* RO */ + overlay_plugin_dest_t ott_dest; /* RO */ + uint64_t ott_id; /* RO */ + overlay_target_flag_t ott_flags; /* ott_lock */ + uint_t ott_ocount; /* ott_lock */ + union { /* ott_lock */ + overlay_target_point_t ott_point; + struct overlay_target_dyn { + refhash_t *ott_dhash; + avl_tree_t ott_tree; + 
} ott_dyn; + } ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { + OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */ + OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */ + OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */ + OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */ + OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */ + OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */ + OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */ + OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */ + OVERLAY_F_DEGRADED = 0x40, /* device is degraded */ + OVERLAY_F_MASK = 0x7f /* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { + kmutex_t odd_lock; + kcondvar_t odd_iowait; + list_node_t odd_link; /* overlay_dev_lock */ + mac_handle_t odd_mh; /* RO */ + overlay_plugin_t *odd_plugin; /* RO */ + datalink_id_t odd_linkid; /* RO */ + void *odd_pvoid; /* RO -- only used by plugin */ + uint_t odd_ref; /* protected by odd_lock */ + uint_t odd_mtu; /* protected by odd_lock */ + overlay_dev_flag_t odd_flags; /* protected by odd_lock */ + uint_t odd_rxcount; /* protected by odd_lock */ + uint_t odd_txcount; /* protected by odd_lock */ + overlay_mux_t *odd_mux; /* protected by odd_lock */ + uint64_t odd_vid; /* RO if active else odd_lock */ + avl_node_t odd_muxnode; /* managed by mux */ + overlay_target_t *odd_target; /* See big theory statement */ + char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ +} overlay_dev_t; + +typedef enum overlay_target_entry_flags { + OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */ + OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ + OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ + OVERLAY_ENTRY_F_VALID_MASK = 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { + kmutex_t ote_lock; + refhash_link_t ote_reflink; /* hashtable link */ + avl_node_t ote_avllink; /* iteration link */ + list_node_t ote_qlink; + overlay_target_entry_flags_t ote_flags; 
/* RW: state flags */ + uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ + overlay_target_t *ote_ott; /* RO */ + overlay_dev_t *ote_odd; /* RO */ + overlay_target_point_t ote_dest; /* RW: destination */ + mblk_t *ote_chead; /* RW: blocked mb chain head */ + mblk_t *ote_ctail; /* RW: blocked mb chain tail */ + size_t ote_mbsize; /* RW: outstanding mblk size */ + hrtime_t ote_vtime; /* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define OVERLAY_CTL "overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, + struct sockaddr *, socklen_t, int *); +extern void overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + 
+#define OVERLAY_TARGET_OK 0 +#define OVERLAY_TARGET_DROP 1 +#define OVERLAY_TARGET_ASYNC 2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, + socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..07efaa05df --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define _SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, then we can move to make this a more-stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. 
+ * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), that it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If the mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY. In such cases, it should + * be returned as the return status of _fini(9E). This is quite necessary, it + * ensures that if the module is in use it doesn't get unloaded out from under + * the broader subsystem while it's still in use. A driver can use that to + * know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * information to the callback routine. Here are the routines and their + * purposes. The full signatures are available below. + * + * overlay_plugin_init_t + * + * This interface is used to create a new instance of a plugin. An instance + * of a plugin will be created for each overlay device that is created. For + * example, if a device is created with VXLAN ID 23 and ID 42, then there + * will be two different calls to this function. + * + * This function gives the plugin a chance to create a private data + * structure that will be returned on subsequent calls to the system. + * + * overlay_plugin_fini_t + * + * This is the opposite of overlay_plugin_init_t. 
It will be called when it + * is safe to remove any private data that is associated with this instance + * of the plugin. + * + * overlay_plugin_propinfo_t + * + * This is called with the name of a property that is registered when the + * plugin is created. This function will be called with the name of the + * property that information is being requested about. The plugin is + * responsible for filling out information such as setting the name, the + * type of property it is, the protection of the property (can a user + * update it?), whether the property is required, an optional default value + * for the property, and an optional set of values or ranges that are + * allowed. + * + * overlay_plugin_getprop_t + * + * Return the value of the named property from the current instance of the + * plugin. + * + * overlay_plugin_setprop_t + * + * Set the value of the named property to the specified value for the + * current instance of the plugin. Note, that it is the plugin's + * responsibility to ensure that the value of the property is valid and to + * update state as appropriate. + * + * overlay_plugin_socket_t + * + * Every overlay device has a corresponding socket that it uses to send and + * receive traffic. This routine is used to get the parameters that should + * be used to define such a socket. The actual socket may be multiplexed + * with other uses of it. + * + * overlay_plugin_sockopt_t + * + * Allow a plugin to set any necessary socket options that it needs on the + * kernel socket that is being used by a mux. This will only be called once + * for a given mux, if additional devices are added to a mux, it will not + * be called additional times. + * + * overlay_plugin_encap_t + * + * In this routine you're given a message block and information about the + * packet, such as the identifier and are asked to fill out a message block + * that represents the encapsulation header and optionally manipulate the + * input message if required. 
+ * + * overlay_plugin_decap_t + * + * In this routine, you're given the encapsulated message block. The + * requirement is to decapsulate it and determine what is the correct + * overlay identifier for this network and to fill in the header size so + * the broader system knows how much of this data should be considered + * consumed. + * + * ovpo_callbacks + * + * This should be set to zero, it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + * ovep_version + * + * Should be set to the value of the macro OVEP_VERSION. + * + * ovep_name + * + * Should be set to a character string that has the name of the module. + * Generally this should match the name of the kernel module; however, this + * is the name that users will use to refer to this module when creating + * devices. + * + * overlay_plugin_ops_t + * + * Should be set to the functions as described above. + * + * ovep_props + * + * This is an array of character strings that holds the names of the + * properties of the encapsulation plugin. + * + * + * ovep_id_size + * + * This is the size in bytes of the valid range for the identifier. The + * valid identifier range is considered a ovep_id_size byte unsigned + * integer, [ 0, 1 << (ovep_id_size * 8) ). + * + * ovep_flags + * + * A series of flags that indicate optional features that are supported. + * Valid flags include: + * + * OVEP_F_VLAN_TAG + * + * The encapsulation format allows for the encapsulated + * packet to maintain a VLAN tag. + * + * ovep_dest + * + * Describes the kind of destination that the overlay plugin supports for + * sending traffic. For example, vxlan uses UDP, therefore it requires both + * an IP address and a port; however, nvgre uses the gre header and + * therefore only requires an IP address. 
The following flags may be + * combined: + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * Indicates that to send a packet to its destination, we + * require a link-layer ethernet address. + * + * OVERLAY_PLUGIN_D_IP + * + * Indicates that to send a packet to its destination, we + * require an IP address. Note, all IP addresses are + * transmitted as IPv6 addresses and for an IPv4 + * destination, using an IPv4-mapped IPv6 address is the + * expected way to transmit that. + * + * OVERLAY_PLUGIN_D_PORT + * + * Indicates that to send a packet to its destination, a + * port is required, this usually indicates that the + * protocol uses something like TCP or UDP. + * + * + * ------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * ------------------------------------------------- + * + * Every instance of a given module is independent. The kernel only guarantees + * that it will probably perform downcalls into different instances in parallel + * at some point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it will be up to it + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make, is that calls to getprop and setprop will be done + * synchronized by a caller holding the MAC perimeter. + * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, e.g. the property related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only thing that it is safe for it to use are its own + * locks. 
+ * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context, the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION 0x1 + +typedef enum overlay_plugin_flags { + OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's either a 24-64 bit value. How should we future + * proof ourselves here? + */ +typedef struct ovep_encap_info { + uint64_t ovdi_id; + size_t ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise. 
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, + ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, + ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, + struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, + uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, + uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { + uint_t ovpo_callbacks; + overlay_plugin_init_t ovpo_init; + overlay_plugin_fini_t ovpo_fini; + overlay_plugin_encap_t ovpo_encap; + overlay_plugin_decap_t ovpo_decap; + overlay_plugin_socket_t ovpo_socket; + overlay_plugin_sockopt_t ovpo_sockopt; + overlay_plugin_getprop_t ovpo_getprop; + overlay_plugin_setprop_t ovpo_setprop; + overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { + uint_t ovep_version; + const char *ovep_name; + const overlay_plugin_ops_t *ovep_ops; + const char **ovep_props; + uint_t ovep_id_size; + uint_t ovep_flags; + uint_t ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t); +extern int 
overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, + uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..cae193c334 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,292 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _OVERLAY_TARGET_H +#define _OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { + uint8_t otp_mac[ETHERADDRL]; + struct in6_addr otp_ip; + uint16_t otp_port; +} overlay_target_point_t; + +#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { + OVERLAY_TARG_INFO_F_ACTIVE = 0x01, + OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { + datalink_id_t oti_linkid; + uint32_t oti_needs; + uint64_t oti_flags; + uint64_t oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { + datalink_id_t ota_linkid; + uint32_t ota_mode; + uint64_t ota_id; + uint32_t ota_provides; + overlay_target_point_t ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { + datalink_id_t otd_linkid; + uint32_t otd_pad; + char otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. 
+ */ +#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { + datalink_id_t otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that user land will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second of time before they will + * be sent back out to user land to give user land a chance to clean itself up + * or more generally, come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + * OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + * The overlay_targ_resp_t has the appropriate information from + * which a reply can be generated. The information is filled into + * an overlay_target_point_t as appropriate based on the + * overlay_plugin_dest_t type. + * + * + * OVERLAY_TARG_DROP - overlay_targ_resp_t + * + * The overlay_targ_resp_t should identify a request for which to + * drop a packet. + * + * + * OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * The overlay_targ_pkt_t injects a fully formed packet into the + * virtual network. It may either be identified by its data link id + * or by the request id. If both are specified, the + * datalink id will be used. Note, that an injection is not + * considered a reply and if this corresponds to a request, then + * that individual packet must still be dropped. + * + * + * OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * This ioctl can be used to copy data from a given request into a + * user buffer. This can be used in combination with + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. 
+ * + * + * OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * This ioctl is similar to the OVERLAY_TARG_INJECT, except instead + * of receiving it on the local mac handle, it queues it for + * retransmission again. This is useful if you have a packet that + * was originally destined for some broadcast or multicast address + * that you now want to send to a unicast address. + */ +#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10) +#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11) +#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12) +#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13) +#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) +#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { + uint64_t otl_dlid; + uint64_t otl_reqid; + uint64_t otl_varpdid; + uint64_t otl_vnetid; + uint64_t otl_hdrsize; + uint64_t otl_pktsize; + uint8_t otl_srcaddr[ETHERADDRL]; + uint8_t otl_dstaddr[ETHERADDRL]; + uint32_t otl_dsttype; + uint32_t otl_sap; + int32_t otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { + uint64_t otr_reqid; + overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + void *otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +typedef struct overlay_targ_pkt32 { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + caddr32_t otp_buf; +} overlay_targ_pkt32_t; + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd which is an implementation of libdladm not to end up + * needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. 
+ */ + +#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { + uint32_t otl_nents; + uint32_t otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * The overlay_targ_cache_t should have its link identifier and + * the desired mac address filled in. On return, it will fill in + * the otc_dest member, if the entry exists in the table. + * + * + * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * The cache table entry of the mac address referred to by otc_mac + * and otc_linkid will be filled in with the details provided in + * the otc_dest member. + * + * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * Removes the cache entry identified by otc_mac from the table. + * Note that this does not stop any in-flight lookups or deal with + * any data that is awaiting a lookup. + * + * + * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * entire table identified by otc_linkid. All other parameters are + * ignored. + * + * + * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * Iterates over the contents of a target cache identified by + * otci_linkid. Iteration is guaranteed to be exactly once for + * items which are in the hashtable at the beginning and end of + * iteration. For items which are added or removed after iteration + * has begun, only at most once semantics are guaranteed. Consumers + * should ensure that otci_marker is zeroed before starting + * iteration and should preserve its contents across calls. + * + * Before calling in, otci_count should be set to the number of + * entries that space has been allocated for in otci_ents. The + * value will be updated to indicate the total number written out. 
+ */ + +#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30) +#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31) +#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) +#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) +#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define OVERLAY_TARGET_ITER_MAX 500 + +#define OVERLAY_TARGET_CACHE_DROP 0x01 + +typedef struct overlay_targ_cache_entry { + uint8_t otce_mac[ETHERADDRL]; + uint16_t otce_flags; + overlay_target_point_t otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { + datalink_id_t otc_linkid; + overlay_targ_cache_entry_t otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { + datalink_id_t otci_linkid; + uint32_t otci_pad; + uint64_t otci_marker; + uint16_t otci_count; + overlay_targ_cache_entry_t otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index c3a1b9a97b..ea2fdfd886 100644 --- a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -104,7 +104,7 @@ extern "C" { #define DEFAULT_MAXPID 999999 #define DEFAULT_JUMPPID 100000 #else -#define DEFAULT_MAXPID 30000 +#define DEFAULT_MAXPID 99999 #define DEFAULT_JUMPPID 0 #endif diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index 5f5b66d437..5328d02c59 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *); int secpolicy_kmdb(const cred_t *); int secpolicy_lock_memory(const cred_t *); int 
secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *); int secpolicy_modctl(const cred_t *, int); int secpolicy_net(const cred_t *, int, boolean_t); int secpolicy_net_bindmlp(const cred_t *); @@ -174,6 +175,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 5abf8fd3cd..ff4a1abce4 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -348,7 +349,9 @@ typedef struct proc { struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that p_exec came from */ struct brand *p_brand; /* process's brand */ - void *p_brand_data; /* per-process brand state */ + + /* per-process brand state */ + void *p_brand_data; /* additional lock to protect p_sessp (but not its contents) */ kmutex_t p_splock; @@ -363,7 +366,6 @@ typedef struct proc { */ struct user p_user; /* (see sys/user.h) */ } proc_t; - #define PROC_T /* headers relying on proc_t are OK */ #ifdef _KERNEL @@ -629,6 +631,7 @@ extern int signal_is_blocked(kthread_t *, int); extern int sigcheck(proc_t *, kthread_t *); extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid); extern void pid_setmin(void); extern pid_t pid_allocate(proc_t *, pid_t, int); extern int pid_rele(struct pid *); @@ -644,6 +647,7 @@ extern int sprtrylock_proc(proc_t *); extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); 
+extern void sprunprlock(proc_t *); extern void pid_init(void); extern proc_t *pid_entry(int); extern int pid_slot(proc_t *); @@ -718,6 +722,10 @@ extern kthread_t *thread_unpin(void); extern void thread_init(void); extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern void thread_splitstack(void (*)(void *), void *, size_t); +extern void thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern void thread_splitstack_cleanup(void); + extern void tsd_create(uint_t *, void (*)(void *)); extern void tsd_destroy(uint_t *); extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); @@ -759,7 +767,7 @@ extern void pokelwps(proc_t *); extern void continuelwps(proc_t *); extern int exitlwps(int); extern void lwp_ctmpl_copy(klwp_t *, klwp_t *); -extern void lwp_ctmpl_clear(klwp_t *); +extern void lwp_ctmpl_clear(klwp_t *, boolean_t); extern klwp_t *forklwp(klwp_t *, proc_t *, id_t); extern void lwp_load(klwp_t *, gregset_t, uintptr_t); extern void lwp_setrval(klwp_t *, int, int); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index f592fd9dcf..501af712ef 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -25,6 +25,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_PROCFS_H @@ -233,6 +234,7 @@ typedef struct pstatus { #define PR_FAULTED 6 #define PR_SUSPENDED 7 #define PR_CHECKPOINT 8 +#define PR_BRAND 9 /* * lwp ps(1) information file. 
/proc/<pid>/lwp/<lwpid>/lwpsinfo diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 6c79ee266d..ba8b2b1210 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ typedef struct pt_own { @@ -157,6 +163,19 @@ typedef struct pt_own { #define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ #define OWNERPT (('P'<<8)|5) /* set owner/group for slave device */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + * The return value of the callback function when it's invoked + * with the opaque argument passed to it will indicate if the + * pts slave device is currently open. + */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h index 2069e6d3f1..b7427a454d 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -10,11 +10,11 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ -#ifndef _SYS_SCSI_ADAPTERS_MPTHASH_H -#define _SYS_SCSI_ADAPTERS_MPTHASH_H +#ifndef _SYS_REFHASH_H +#define _SYS_REFHASH_H #include <sys/types.h> #include <sys/list.h> @@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *); extern void *refhash_next(refhash_t *, void *); extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); -#endif /* _SYS_SCSI_ADAPTERS_MPTHASH_H */ +#endif /* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 2d3800b946..4b70a77db8 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -191,6 +192,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ #define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ +#define _RUSAGESYS_INVALMAP 4 /* vm_map_inval */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h index ca52f8d995..82cc08d326 100644 --- a/usr/src/uts/common/sys/rt.h +++ b/usr/src/uts/common/sys/rt.h @@ -22,6 +22,7 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -31,8 +32,6 @@ #ifndef _SYS_RT_H #define _SYS_RT_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.4 */ - #include <sys/types.h> #include <sys/thread.h> @@ -77,6 +76,16 @@ typedef struct rtkparms { int rt_tqsig; /* real-time time quantum signal */ uint_t rt_cflags; /* real-time control flags */ } rtkparms_t; + +#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ + +/* + * control flags (kparms->rt_cflags). 
+ */ +#define RT_DOPRI 0x01 /* change priority */ +#define RT_DOTQ 0x02 /* change RT time quantum */ +#define RT_DOSIG 0x04 /* change RT time quantum signal */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h index 3983188fce..02116b45c4 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright (c) 2014, Tegile Systems Inc. All rights reserved. */ @@ -58,10 +58,10 @@ #include <sys/byteorder.h> #include <sys/queue.h> +#include <sys/refhash.h> #include <sys/isa_defs.h> #include <sys/sunmdi.h> #include <sys/mdi_impldefs.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h> diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h index e3bd2a77d3..030379488f 100644 --- a/usr/src/uts/common/sys/shm.h +++ b/usr/src/uts/common/sys/shm.h @@ -21,6 +21,7 @@ */ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2016 Joyent, Inc. * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
@@ -120,6 +121,10 @@ struct shmid_ds { #define SHM_LOCK 3 /* Lock segment in core */ #define SHM_UNLOCK 4 /* Unlock segment */ +#if defined(_KERNEL) +#define SHM_RMID 5 /* Private RMID for lx support */ +#endif + #if !defined(_KERNEL) int shmget(key_t, size_t, int); int shmids(int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h index 4d8cdcede5..1eae2ca0a4 100644 --- a/usr/src/uts/common/sys/shm_impl.h +++ b/usr/src/uts/common/sys/shm_impl.h @@ -21,13 +21,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SHM_IMPL_H #define _SYS_SHM_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ipc_impl.h> #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/shm.h> @@ -70,7 +69,11 @@ typedef struct kshmid { time_t shm_ctime; /* last change time */ struct sptinfo *shm_sptinfo; /* info about ISM segment */ struct seg *shm_sptseg; /* pointer to ISM segment */ - long shm_sptprot; /* was reserved (still a "long") */ + ulong_t shm_opts; + /* + * Composed of: sptprot (uchar_t) and + * RM_PENDING flag (1 bit). + */ } kshmid_t; /* @@ -78,6 +81,14 @@ typedef struct kshmid { */ #define SHMSA_ISM 1 /* uses shared page table */ +/* + * shm_opts definitions + * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are + * used for additional options. + */ +#define SHM_PROT_MASK 0xff +#define SHM_RM_PENDING 0x100 + typedef struct sptinfo { struct as *sptas; /* dummy as ptr. for spt segment */ } sptinfo_t; diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index 8f0e1794f4..139784d578 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ struct sigaction32 { * use of these symbols by applications is injurious * to binary compatibility */ -#define NSIG 74 /* valid signals range from 1 to NSIG-1 */ -#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */ +#define NSIG 75 /* valid signals range from 1 to NSIG-1 */ +#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */ #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */ #define MINSIGSTKSZ 2048 diff --git a/usr/src/uts/common/sys/signalfd.h b/usr/src/uts/common/sys/signalfd.h index 2661d5a05f..89d0647020 100644 --- a/usr/src/uts/common/sys/signalfd.h +++ b/usr/src/uts/common/sys/signalfd.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* @@ -75,13 +75,9 @@ extern int signalfd(int, const sigset_t *, int); #define SIGNALFDMNRN_SIGNALFD 0 #define SIGNALFDMNRN_CLONE 1 -typedef struct sigfd_wake_list { - list_node_t sigfd_wl_lst; - void *sigfd_wl_state; -} sigfd_wake_list_t; - /* * This holds the proc_t state for a process which is using signalfd. + * Its presence and contents are protected by p_lock. */ typedef struct sigfd_proc_state { void (*sigfd_pollwake_cb)(void *, int); diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index da8e3ab351..e55cd165aa 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -22,6 +22,7 @@ * Copyright 2014 Garrett D'Amore <garrett@damore.org> * * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -39,6 +40,9 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ #ifndef _SYS_SOCKET_H #define _SYS_SOCKET_H @@ -194,6 +198,7 @@ struct so_snd_bufinfo { #define SO_SRCADDR 0x2001 /* Internal: AF_UNIX source address */ #define SO_FILEP 0x2002 /* Internal: AF_UNIX file pointer */ #define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */ +#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */ #endif /* _KERNEL */ /* @@ -293,8 +298,9 @@ struct linger { #define AF_INET_OFFLOAD 30 /* Sun private; do not use */ #define AF_TRILL 31 /* TRILL interface */ #define AF_PACKET 32 /* PF_PACKET Linux socket interface */ +#define AF_LX_NETLINK 33 /* Linux-compatible netlink */ -#define AF_MAX 32 +#define AF_MAX 33 /* * Protocol families, same as address families for now. @@ -334,6 +340,7 @@ struct linger { #define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */ #define PF_TRILL AF_TRILL #define PF_PACKET AF_PACKET +#define PF_LX_NETLINK AF_LX_NETLINK #define PF_MAX AF_MAX @@ -420,6 +427,7 @@ struct msghdr32 { #define MSG_NOTIFICATION 0x100 /* Notification, not data */ #define MSG_XPG4_2 0x8000 /* Private: XPG4.2 flag */ +/* Obsolete but kept for compilation compatibility. Use IOV_MAX. */ #define MSG_MAXIOVLEN 16 #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 52fa3a5822..da61975904 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -102,6 +103,7 @@ struct sockaddr_ux { typedef struct sonodeops sonodeops_t; typedef struct sonode sonode_t; +typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *); struct sodirect_s; @@ -244,6 +246,10 @@ struct sonode { struct sof_instance *so_filter_top; /* top of stack */ struct sof_instance *so_filter_bottom; /* bottom of stack */ clock_t so_filter_defertime; /* time when deferred */ + + /* Kernel direct receive callbacks */ + so_krecv_f so_krecv_cb; /* recv callback */ + void *so_krecv_arg; /* recv cb arg */ }; #define SO_HAVE_DATA(so) \ @@ -297,15 +303,16 @@ struct sonode { #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ -#define SS_CLOSING 0x00010000 /* in process of closing */ +#define SS_CLOSING 0x00010000 /* in process of closing */ #define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ #define SS_FILOP_OK 0x00040000 /* socket can attach filters */ #define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ + #define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ #define SS_FIL_STOP 0x00200000 /* no more filter actions */ - #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ +#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */ #define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ #define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ @@ -321,7 +328,8 @@ struct sonode { /* * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment. 
*/ #define SO_BLOCK_FALLBACK(so, fn) \ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ @@ -337,6 +345,24 @@ struct sonode { } \ } +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. Safe filters can still attach after + * we execute the function in which this macro is used. + */ +#define SO_BLOCK_FALLBACK_SAFE(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state |= SS_FILOP_UNSF; \ + mutex_exit(&(so)->so_lock); \ + } + #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } @@ -368,6 +394,7 @@ struct sonode { /* The modes below are only for non-streams sockets */ #define SM_ACCEPTSUPP 0x400 /* can handle accept() */ #define SM_SENDFILESUPP 0x800 /* Private: proto supp sendfile */ +#define SM_DEFERERR 0x1000 /* Private: defer so_error delivery */ /* * Socket versions. Used by the socket library when calling _so_socket(). @@ -946,6 +973,15 @@ extern struct sonode *socreate(struct sockparams *, int, int, int, int, extern int so_copyin(const void *, void *, size_t, int); extern int so_copyout(const void *, void *, size_t, int); +/* + * Functions to manipulate the use of direct receive callbacks. This should not + * be used outside of sockfs and ksocket. These are generally considered a use + * once interface for a socket and will cause all outstanding data on the socket + * to be flushed. 
+ */ +extern int so_krecv_set(sonode_t *, so_krecv_f, void *); +extern void so_krecv_unblock(sonode_t *); + #endif /* diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops { #define SOF_VERSION 1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * so may not be attached after any socket operation. However, a safe filter + * can still be attached after one of the above calls. This makes attaching + * the filter less dependent on the initial socket setup order. + */ +#define SOF_ATT_SAFE 0x1 + extern int sof_register(int, const char *, const sof_ops_t *, int); extern int sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index f1bd429815..35e1cf64c7 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -29,6 +29,17 @@ extern "C" { #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it which makes it + * unsuitable for other consumers. To enable them, we created another interface, + * but opted not to port all of the functionality that IP uses in the form of + * ip_squeue.c. As other consumers need the functionality that IP has in squeues, + * then we'll come up with more genericized methods and add that functionality + * to <sys/gsqueue.h>. Please do not continue to use this header. 
+ */ + #include <sys/types.h> #include <sys/processor.h> #include <sys/stream.h> @@ -76,12 +87,13 @@ typedef enum { struct ip_recv_attr_s; extern void squeue_init(void); -extern squeue_t *squeue_create(clock_t, pri_t); +extern squeue_t *squeue_create(clock_t, pri_t, boolean_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *); struct conn_s; extern int squeue_synch_enter(struct conn_s *, mblk_t *); diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 22550886eb..d2418bbc15 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -117,6 +117,7 @@ struct squeue_s { squeue_set_t *sq_set; /* managed by squeue creator */ pri_t sq_priority; /* squeue thread priority */ + boolean_t sq_isip; /* use IP-centric features */ /* Keep the debug-only fields at the end of the structure */ #ifdef DEBUG @@ -165,6 +166,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 #define SQS_PAUSE 0x04000000 /* The squeue has been paused */ +#define SQS_EXIT 0x08000000 /* squeue is being torn down */ #define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index a04019a9ce..28289649dd 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -628,16 +629,11 @@ struct stroptions { /* * Structure for rw (read/write) procedure calls. 
A pointer * to a struiod_t is passed as a parameter to the rwnext() call. - * - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? */ -#define DEF_IOV_MAX 16 - typedef struct struiod { mblk_t *d_mp; /* pointer to mblk (chain) */ uio_t d_uio; /* uio info */ - iovec_t d_iov[DEF_IOV_MAX]; /* iov referenced by uio */ + iovec_t *d_iov; /* iov referenced by uio */ } struiod_t; /* diff --git a/usr/src/uts/common/sys/sysevent.h b/usr/src/uts/common/sys/sysevent.h index 46a800e62b..255e98b871 100644 --- a/usr/src/uts/common/sys/sysevent.h +++ b/usr/src/uts/common/sys/sysevent.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SYSEVENT_H @@ -67,10 +68,12 @@ extern "C" { #define SE_KERN_PID 0 #define SUNW_VENDOR "SUNW" +#define ILLUMOS_VENDOR "ILLUMOS" #define SE_USR_PUB "usr:" #define SE_KERN_PUB "kern:" #define SUNW_KERN_PUB SUNW_VENDOR":"SE_KERN_PUB #define SUNW_USR_PUB SUNW_VENDOR":"SE_USR_PUB +#define ILLUMOS_KERN_PUB ILLUMOS_VENDOR":"SE_KERN_PUB /* * Event header and attribute value limits diff --git a/usr/src/uts/common/sys/sysevent/datalink.h b/usr/src/uts/common/sys/sysevent/datalink.h new file mode 100644 index 0000000000..592ef5bdde --- /dev/null +++ b/usr/src/uts/common/sys/sysevent/datalink.h @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_SYSEVENT_DATALINK_H +#define _SYS_SYSEVENT_DATALINK_H + +/* + * Datalink System Event payloads + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DATALINK_LINK_STATE + * + * Event Class - EC_DATALINK + * Event Sub-Class - EC_DATALINK_LINK_STATE + * + * Attribute Name - DATALINK_EV_LINK_NAME + * Attribute Type - SE_DATA_TYPE_STRING + * Attribute Value - [Name of the datalink] + * + * Attribute Name - DATALINK_EV_LINK_ID + * Attribute Type - SE_DATA_TYPE_INT32 + * Attribute Value - [datalink_id_t for the device] + * + * Attribute Name - DATALINK_EV_ZONE_ID + * Attribute Type - SE_DATA_TYPE_INT32 + * Attribute Value - [zoneid_t of the zone the datalink is in] + */ + +#define DATALINK_EV_LINK_NAME "link" +#define DATALINK_EV_LINK_ID "linkid" +#define DATALINK_EV_ZONE_ID "zone" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DATALINK_H */ diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index 25401cec53..0a78d0310b 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -267,9 +267,11 @@ extern "C" { #define ESC_ZFS_POOL_REGUID "ESC_ZFS_pool_reguid" /* - * datalink subclass definitions. + * datalink subclass definitions. Supporting attributes for datalink state found + * in sys/sysevent/datalink.h. */ #define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ +#define ESC_DATALINK_LINK_STATE "ESC_datalink_link_state" /* link state */ /* * VRRP subclass definitions. Supporting attributes (name/value paris) are diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dtrace.h> #ifdef __cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent; extern systrace_sysent_t *systrace_sysent32; extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #ifdef _SYSCALL32_IMPL extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #endif #endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 09be20858d..889a7096cd 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -361,6 +361,24 @@ extern pid_t tcgetsid(int); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of + */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * On Solaris, VMIN and VTIME overlap with VEOF and VEOL. This is + * perfectly legal, except that Linux expects them to be separate, so we keep + * them separately. 
+ */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index d917944edf..6a1c36f2e7 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_THREAD_H #define _SYS_THREAD_H @@ -68,6 +72,8 @@ typedef struct ctxop { void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ struct ctxop *next; /* next context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* @@ -366,7 +372,7 @@ typedef struct _kthread { #define T_WOULDBLOCK 0x0020 /* for lockfs */ #define T_DONTBLOCK 0x0040 /* for lockfs */ #define T_DONTPEND 0x0080 /* for lockfs */ -#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */ +#define T_SPLITSTK 0x0100 /* kernel stack is currently split */ #define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */ #define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */ #define T_PANIC 0x0800 /* thread initiated a system panic */ @@ -414,8 +420,9 @@ typedef struct _kthread { #define TS_RESUME 0x1000 /* setrun() by CPR resume process */ #define TS_CREATE 0x2000 /* setrun() by syslwp_create() */ #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ +#define TS_BSTART 0x8000 /* setrun() by brand */ #define TS_ALLSTART \ - (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) + (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART) #define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* @@ -443,6 +450,10 @@ typedef struct 
_kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \ + !((t)->t_schedflag & TS_BSTART)) + /* True if thread is asleep and wakeable */ #define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ ((t)->t_flag & T_WAKEABLE))) diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index d5168c9b2c..c14a3bf11e 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -23,6 +23,7 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -159,7 +160,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index e803efeb45..5663929bfb 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -145,7 +145,8 @@ typedef struct uioa_s { */ typedef enum xuio_type { UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY + UIOTYPE_ZEROCOPY, + UIOTYPE_PEEKSIZE } xuio_type_t; typedef struct xuio { @@ -175,6 +176,15 @@ typedef struct xuio { int xu_zc_rw; /* read or write buffer */ void *xu_zc_priv; /* fs specific */ } xu_zc; + + /* + * Peek Size Support -- facilitate peeking at the size of a + * waiting message on a socket. + */ + struct { + ssize_t xu_ps_size; /* size of waiting msg */ + boolean_t xu_ps_set; /* was size calculated? 
*/ + } xu_ps; } xu_ext; } xuio_t; diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index a7bff8dd52..66250a3f2b 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2012 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ @@ -185,9 +185,9 @@ typedef struct { /* kernel syscall set type */ * This value should not be changed in a patch. */ #if defined(__sparc) -#define __KERN_NAUXV_IMPL 20 +#define __KERN_NAUXV_IMPL 24 #elif defined(__i386) || defined(__amd64) -#define __KERN_NAUXV_IMPL 22 +#define __KERN_NAUXV_IMPL 26 #endif struct execsw; @@ -211,6 +211,7 @@ typedef struct user { int u_argc; /* value of argc passed to main() */ uintptr_t u_argv; /* value of argv passed to main() */ uintptr_t u_envp; /* value of envp passed to main() */ + uintptr_t u_commpagep; /* address of mapped comm page */ /* * These fields are protected by p_lock: diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..c2954cbc29 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,9 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ @@ -108,6 +110,7 @@ extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); int vm_getusage(uint_t, time_t, vmusage_t *, size_t *, int); void vm_usage_init(); +int vm_map_inval(pid_t, caddr_t, size_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index 6122b6cd2f..c7b41730b6 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ +/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -159,6 +162,8 @@ extern void *boot_virt_alloc(void *addr, size_t size); extern size_t exec_get_spslew(void); +extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_VND_H +#define _SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We distinguish between normal ioctls and private ioctls we issue to our + * streams version. Streams ioctls have the upper bit set in the lowest byte. + * Note that there are no STREAMs ioctls for userland and all definitions + * related to them are not present in this file. + */ +#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note that + * unlike the other ioctls, this is passed directly as opposed to every other + * function which is passed as a pointer to the value. + */ +#define VND_IOC_ATTACH (VND_IOC | 0x1) + +#define VND_NAMELEN 32 + +typedef struct vnd_ioc_attach { + char via_name[VND_NAMELEN]; + zoneid_t via_zoneid; + uint32_t via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * At this time, a given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in. + */ +#define VND_IOC_LINK (VND_IOC | 0x2) + +typedef struct vnd_ioc_link { + char vil_name[VND_NAMELEN]; + uint32_t vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may use + * this to unlink an extent entry in /dev; however, they will not be able to + * link it in again. 
+ */ +#define VND_IOC_UNLINK (VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { + uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current receive buffer size. + */ +typedef struct vnd_ioc_buf { + uint64_t vib_size; + uint32_t vib_filler; + uint32_t vib_errno; +} vnd_ioc_buf_t; + +#define VND_IOC_GETRXBUF (VND_IOC | 0x04) +#define VND_IOC_SETRXBUF (VND_IOC | 0x05) +#define VND_IOC_GETMAXBUF (VND_IOC | 0x06) +#define VND_IOC_GETTXBUF (VND_IOC | 0x07) +#define VND_IOC_SETTXBUF (VND_IOC | 0x08) +#define VND_IOC_GETMINTU (VND_IOC | 0x09) +#define VND_IOC_GETMAXTU (VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances. vl_actents is + * always updated to the number around and vl_nents is the number of + * vnd_ioc_info_t elements that are allocated in vl_ents. + */ +typedef struct vnd_ioc_info { + uint32_t vii_version; + zoneid_t vii_zone; + char vii_name[VND_NAMELEN]; + char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { + uint_t vl_nents; + uint_t vl_actents; + vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { + uint_t vl_nents; + uint_t vl_actents; + caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif /* _KERNEL */ + +#define VND_IOC_LIST (VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending further stability in the form of + * specific system work. 
+ */ +#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30) +#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define _SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vnd_errno { + VND_E_SUCCESS = 0, /* no error */ + VND_E_NOMEM, /* no memory */ + VND_E_NODATALINK, /* no such datalink */ + VND_E_NOTETHER, /* not DL_ETHER */ + VND_E_DLPIINVAL, /* Unknown DLPI failures */ + VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */ + VND_E_BINDFAIL, /* DL_BIND_REQ failed */ + VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */ + VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */ + VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */ + VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */ + VND_E_DLDBADVERS, /* bad dld version */ + VND_E_KSTATCREATE, /* failed to create kstats */ + VND_E_NODEV, /* no such vnd link */ + VND_E_NONETSTACK, /* netstack doesn't exist */ + VND_E_ASSOCIATED, /* device already associated */ + VND_E_ATTACHED, /* device already attached */ + VND_E_LINKED, /* device already linked */ + VND_E_BADNAME, /* invalid name */ + VND_E_PERM, /* can't touch this */ + VND_E_NOZONE, /* no such zone */ + VND_E_STRINIT, /* 
failed to initialize vnd stream module */ + VND_E_NOTATTACHED, /* device not attached */ + VND_E_NOTLINKED, /* device not linked */ + VND_E_LINKEXISTS, /* another device has the same link name */ + VND_E_MINORNODE, /* failed to create minor node */ + VND_E_BUFTOOBIG, /* requested buffer size is too large */ + VND_E_BUFTOOSMALL, /* requested buffer size is too small */ + VND_E_DLEXCL, /* unable to get dlpi excl access */ + VND_E_DIRECTNOTSUP, + /* DLD direct capability not supported over data link */ + VND_E_BADPROPSIZE, /* invalid property size */ + VND_E_BADPROP, /* invalid property */ + VND_E_PROPRDONLY, /* property is read only */ + VND_E_SYS, /* unexpected system error */ + VND_E_CAPABPASS, + /* capabilities invalid, pass-through module detected */ + VND_E_UNKNOWN /* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 7e50091347..1a91158da6 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_VNIC_IMPL_H @@ -65,6 +65,7 @@ typedef struct vnic_s { uint32_t vn_hcksum_txflags; uint32_t vn_mtu; + link_state_t vn_ls; } vnic_t; #define vn_mch vn_mc_handles[0] diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index e4d43cea7f..d12f6c4046 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -738,12 +738,14 @@ typedef enum vnevent { VE_RMDIR = 4, /* Remove of directory vnode's name */ VE_CREATE = 5, /* Create with vnode's name which exists */ VE_LINK = 6, /* Link with vnode's name as source */ - VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ + VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ VE_TRUNCATE = 9, /* Truncate */ VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */ VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */ - VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */ + VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */ + VE_RENAME_SRC_DIR = 13, /* Rename with vnode as source dir */ + VE_RESIZE = 14 /* Resize/truncate to non-zero offset */ } vnevent_t; /* @@ -1298,7 +1300,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_create(vnode_t *, caller_context_t *); void vnevent_link(vnode_t *, caller_context_t *); -void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, + caller_context_t *ct); void vnevent_mountedover(vnode_t *, caller_context_t *); void vnevent_truncate(vnode_t *, caller_context_t *); int vnevent_support(vnode_t *, caller_context_t *); @@ -1308,6 +1311,7 @@ void vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *, caller_context_t *); +void vnevent_resize(vnode_t *, caller_context_t *); /* Vnode specific data */ void vsd_create(uint_t *, void (*)(void *)); @@ -1337,6 +1341,9 @@ u_longlong_t fs_new_caller_id(); int vn_vmpss_usepageio(vnode_t *); +/* Empty v_path placeholder */ +extern char *vn_vpath_empty; + /* * Needed for use of IS_VMODSORT() 
in kernel. */ diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h new file mode 100644 index 0000000000..d87786b507 --- /dev/null +++ b/usr/src/uts/common/sys/vxlan.h @@ -0,0 +1,47 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_VXLAN_H +#define _SYS_VXLAN_H + +/* + * Common VXLAN information + */ + +#include <sys/inttypes.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Sizes in bytes */ +#define VXLAN_HDR_LEN 8 +#define VXLAN_ID_LEN 3 + +#define VXLAN_F_VDI 0x08000000 +#define VXLAN_ID_SHIFT 8 + +#pragma pack(1) +typedef struct vxlan_hdr { + uint32_t vxlan_flags; + uint32_t vxlan_id; +} vxlan_hdr_t; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VXLAN_H */ diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..e08d75ecba --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,78 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_ZFD_H +#define _SYS_ZFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define ZFD_MASTER_NAME "master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define ZFD_SLAVE_NAME "slave" + +#define ZFD_NAME_LEN 16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define ZFD_MAKETTY (ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define ZFD_EOF (ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define ZFD_HAS_SLAVE (ZFD_IOC | 2) + +/* + * This ioctl links two streams into a multiplexer configuration for in-zone + * logging. + */ +#define ZFD_MUX (ZFD_IOC | 3) + +/* + * This ioctl controls the flow control setting for the log multiplexer stream + * (1 = true, 0 = false). The default is false which implies teeing into the + * log stream is "best-effort" but data will be discarded if the stream + * becomes full. If set and the log stream begins to fill up, the primary + * stream will stop flowing. + */ +#define ZFD_MUX_FLOWCON (ZFD_IOC | 4) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 2e69b0d1c7..754f8e3978 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -20,9 +20,9 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2016, Joyent, Inc. 
*/ #ifndef _SYS_ZONE_H @@ -97,13 +97,19 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_PMCAP_NOVER 12 #define ZONE_ATTR_SCHED_CLASS 13 #define ZONE_ATTR_FLAGS 14 #define ZONE_ATTR_HOSTID 15 #define ZONE_ATTR_FS_ALLOWED 16 #define ZONE_ATTR_NETWORK 17 +#define ZONE_ATTR_DID 18 +#define ZONE_ATTR_PMCAP_PAGEOUT 19 #define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_PG_FLT_DELAY 21 +#define ZONE_ATTR_RSS 22 +#define ZONE_ATTR_APP_SVC_CT 23 +#define ZONE_ATTR_SCHED_FIXEDHI 24 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -184,6 +190,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -200,6 +207,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -244,9 +252,12 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + int status; /* init status on shutdown */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ + /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. 
*/ + int pad; } zone_cmd_arg_t; /* @@ -372,7 +383,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -383,12 +394,57 @@ typedef struct zone_kstat { struct cpucap; typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_rcnt; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_wcnt; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; kstat_named_t zm_pgpgin; kstat_named_t zm_anonpgin; kstat_named_t zm_execpgin; kstat_named_t zm_fspgin; kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; } zone_mcap_kstat_t; typedef struct { @@ -447,6 +503,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -499,10 +556,11 @@ typedef struct zone { kcondvar_t zone_cv; /* used to signal state 
changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ pid_t zone_proc_initpid; /* pid of "init" for this zone */ - char *zone_initname; /* fs path to 'init' */ + char *zone_initname; /* fs path to 'init' */ + int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ + rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -540,9 +598,12 @@ typedef struct zone { tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */ boolean_t zone_restart_init; /* Restart init if it dies? */ + boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */ + boolean_t zone_setup_app_contract; /* setup contract? */ struct brand *zone_brand; /* zone's brand */ void *zone_brand_data; /* store brand specific data */ id_t zone_defaultcid; /* dflt scheduling class id */ + boolean_t zone_fixed_hipri; /* fixed sched. hi prio */ kstat_t *zone_swapresv_kstat; kstat_t *zone_lockedmem_kstat; /* @@ -551,6 +612,37 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * Data and counters used for ZFS fair-share disk IO. + */ + rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ + uint_t zone_zfs_queued[2]; /* sync I/O enqueued count */ + uint64_t zone_zfs_weight; /* used to prevent starvation */ + uint64_t zone_io_util; /* IO utilization metric */ + boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ + uint16_t zone_io_delay; /* IO delay on logical r/w */ + kmutex_t zone_stg_io_lock; /* protects IO window data */ + sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zone_wr_ops; /* writes and */ + sys_zio_cntr_t zone_lwr_ops; /* logical writes. */ + + /* + * kstats and counters for VFS ops and bytes. 
+ */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. + */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + kstat_io_t zone_zfs_rwstats; + zone_zfs_kstat_t *zone_zfs_stats; + /* * Solaris Auditing per-zone audit context */ @@ -569,6 +661,13 @@ typedef struct zone { /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; + /* + * kstats and counters for physical memory capping. + */ + rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */ + kstat_t *zone_physmem_kstat; + uint64_t zone_mcap_nover; /* # of times over phys. cap */ + uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */ kmutex_t zone_mcap_lock; /* protects mcap statistics */ kstat_t *zone_mcap_ksp; zone_mcap_kstat_t *zone_mcap_stats; @@ -577,6 +676,11 @@ typedef struct zone { uint64_t zone_execpgin; /* exec pages paged in */ uint64_t zone_fspgin; /* fs pages paged in */ uint64_t zone_anon_alloc_fail; /* cnt of anon alloc fails */ + uint64_t zone_pf_throttle; /* cnt of page flt throttles */ + uint64_t zone_pf_throttle_usec; /* time of page flt throttles */ + + /* Num usecs to throttle page fault when zone is over phys. mem cap */ + uint32_t zone_pg_flt_delay; /* * Misc. kstats and counters for zone cpu-usage aggregation. @@ -658,6 +762,7 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -838,6 +943,7 @@ extern int zone_ncpus_online_get(zone_t *); * Returns true if the named pool/dataset is visible in the current zone. 
*/ extern int zone_dataset_visible(const char *, int *); +extern int zone_dataset_visible_inzone(zone_t *, const char *, int *); /* * zone version of kadmin() @@ -852,6 +958,7 @@ extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c index 9b4bd38baa..8ee5511fd0 100644 --- a/usr/src/uts/common/syscall/brandsys.c +++ b/usr/src/uts/common/syscall/brandsys.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #include <sys/brand.h> #include <sys/systm.h> @@ -35,7 +37,7 @@ */ int64_t brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg4, uintptr_t arg5) { struct proc *p = curthread->t_procp; int64_t rval = 0; @@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (set_errno(ENOSYS)); if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, - arg4, arg5, arg6)) != 0) + arg4, arg5)) != 0) return (set_errno(err)); return (rval); diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c index 371bc83c29..d631fe62f6 100644 --- a/usr/src/uts/common/syscall/fcntl.c +++ b/usr/src/uts/common/syscall/fcntl.c @@ -54,7 +54,8 @@ #include <sys/cmn_err.h> -static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +/* This is global so that it can be used by brand emulation. 
*/ +int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); static void fd_too_big(proc_t *); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 1ee4b6a395..721f884a7e 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. */ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c index edb04c824b..874e31869c 100644 --- a/usr/src/uts/common/syscall/open.c +++ b/usr/src/uts/common/syscall/open.c @@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode) if (filemode & (FSEARCH|FEXEC)) { /* - * Must be one or the other and neither FREAD nor FWRITE + * Must be one or the other. * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN - * XXX: Should these just be silently ignored? + * XXX: Should these just be silently ignored like we + * silently ignore FREAD|FWRITE? 
*/ - if ((filemode & (FREAD|FWRITE)) || - (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || + if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index cc125f127a..3d0a5cc04b 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2016, Joyent, Inc. */ /* @@ -317,20 +317,58 @@ polllock(pollhead_t *php, kmutex_t *lp) return (0); } -static int -poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +int +poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds) +{ + pollfd_t *pollfdp; + nfds_t old_nfds; + + /* + * NOTE: for performance, buffers are saved across poll() calls. + * The theory is that if a process polls heavily, it tends to poll + * on the same set of descriptors. Therefore, we only reallocate + * buffers when nfds changes. There is no hysteresis control, + * because there is no data to suggest that this is necessary; + * the penalty of reallocating is not *that* great in any event. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pollfdp; + ps->ps_nfds = nfds; + } + + pollfdp = ps->ps_pollfd; + if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { + return (EFAULT); + } + + if (fds == NULL) { + /* + * If the process has page 0 mapped, then the copyin() above + * will succeed even if fds is NULL. However, our cached + * poll lists are keyed by the address of the passed-in fds + * structure, and we use the value NULL to indicate an unused + * poll cache list entry. As such, we elect not to support + * NULL as a valid (user) memory address and fail the poll() + * call. 
+ */ + return (EFAULT); + } + return (0); +} + +int +poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp, + int *fdcnt) { kthread_t *t = curthread; - klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); - int fdcnt = 0; - int i; hrtime_t deadline; /* hrtime value when we want to return */ pollfd_t *pollfdp; - pollstate_t *ps; pollcache_t *pcp; int error = 0; - nfds_t old_nfds; int cacheindex = 0; /* which cache set is used */ /* @@ -348,32 +386,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) } /* - * Reset our signal mask, if requested. - */ - if (ksetp != NULL) { - mutex_enter(&p->p_lock); - schedctl_finish_sigblock(t); - lwp->lwp_sigoldmask = t->t_hold; - t->t_hold = *ksetp; - t->t_flag |= T_TOMASK; - /* - * Call cv_reltimedwait_sig() just to check for signals. - * We will return immediately with either 0 or -1. - */ - if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, - TR_CLOCK_TICK)) { - mutex_exit(&p->p_lock); - error = EINTR; - goto pollout; - } - mutex_exit(&p->p_lock); - } - - /* - * Check to see if this guy just wants to use poll() as a timeout. + * Check to see if the caller just wants to use poll() as a timeout. * If yes then bypass all the other stuff and make him sleep. */ if (nfds == 0) { + *fdcnt = 0; /* * Sleep until we have passed the requested future * time or until interrupted by a signal. @@ -385,66 +402,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) &t->t_delay_lock, deadline)) > 0) continue; mutex_exit(&t->t_delay_lock); - error = (error == 0) ? EINTR : 0; + return ((error == 0) ? 
EINTR : 0); } - goto pollout; - } - - if (nfds > p->p_fno_ctl) { - mutex_enter(&p->p_lock); - (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], - p->p_rctls, p, RCA_SAFE); - mutex_exit(&p->p_lock); - error = EINVAL; - goto pollout; - } - - /* - * Need to allocate memory for pollstate before anything because - * the mutex and cv are created in this space - */ - ps = pollstate_create(); - - if (ps->ps_pcache == NULL) - ps->ps_pcache = pcache_alloc(); - pcp = ps->ps_pcache; - - /* - * NOTE: for performance, buffers are saved across poll() calls. - * The theory is that if a process polls heavily, it tends to poll - * on the same set of descriptors. Therefore, we only reallocate - * buffers when nfds changes. There is no hysteresis control, - * because there is no data to suggest that this is necessary; - * the penalty of reallocating is not *that* great in any event. - */ - old_nfds = ps->ps_nfds; - if (nfds != old_nfds) { - - kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); - pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); - ps->ps_pollfd = pollfdp; - ps->ps_nfds = nfds; + return (0); } + VERIFY(ps != NULL); pollfdp = ps->ps_pollfd; - if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { - error = EFAULT; - goto pollout; - } - - if (fds == NULL) { - /* - * If the process has page 0 mapped, then the copyin() above - * will succeed even if fds is NULL. However, our cached - * poll lists are keyed by the address of the passed-in fds - * structure, and we use the value NULL to indicate an unused - * poll cache list entry. As such, we elect not to support - * NULL as a valid (user) memory address and fail the poll() - * call. - */ - error = EINVAL; - goto pollout; - } + VERIFY(pollfdp != NULL); /* * If this thread polls for the first time, allocate ALL poll @@ -460,10 +425,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) /* * poll and cache this poll fd list in ps_pcacheset[0]. 
*/ - error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } else { pollcacheset_t *pcset = ps->ps_pcacheset; @@ -488,11 +453,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * the callee will guarantee the consistency * of cached poll list and cache content. */ - error = pcacheset_resolve(ps, nfds, &fdcnt, + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -509,11 +474,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * found an unused entry. Use it to cache * this poll list. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); - if (fdcnt || error) { + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -527,10 +492,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) cacheindex = pcacheset_replace(ps); ASSERT(cacheindex < ps->ps_nsets); pcset[cacheindex].pcs_usradr = (uintptr_t)fds; - error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } } @@ -548,8 +513,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); for (;;) { pcp->pc_flag = 0; - error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&pcp->pc_lock); mutex_exit(&ps->ps_lock); break; @@ -595,13 +560,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); } + return (error); +} + 
+/* + * This is the system call trap that poll(), + * select() and pselect() are built upon. + * It is a private interface between libc and the kernel. + */ +int +pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + timespec_t ts; + timespec_t *tsp; + k_sigset_t kset; + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + int error = 0, fdcnt = 0; + + /* + * Copy in timeout + */ + if (timeoutp == NULL) { + tsp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + /* + * Copy in and reset signal mask, if requested. + */ + if (setp != NULL) { + sigset_t set; + + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = kset; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + * If nfds == 0, we will skip all of the copying and check steps and + * proceed directly into poll_common to process the supplied timeout. 
+ */ + if (nfds != 0) { + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + if ((error = poll_copyin(ps, fds, nfds)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + pollout: /* - * If we changed the signal mask but we received - * no signal then restore the signal mask. - * Otherwise psig() will deal with the signal mask. + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. */ - if (ksetp != NULL) { + if (setp != NULL) { mutex_enter(&p->p_lock); if (lwp->lwp_cursig == 0) { t->t_hold = lwp->lwp_sigoldmask; @@ -612,12 +680,10 @@ pollout: if (error) return (set_errno(error)); - /* * Copy out the events and return the fdcnt to the user. */ - if (nfds != 0 && - copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) + if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) return (set_errno(EFAULT)); #ifdef DEBUG @@ -625,7 +691,7 @@ pollout: * Another sanity check: */ if (fdcnt) { - int reventcnt = 0; + int i, reventcnt = 0; for (i = 0; i < nfds; i++) { if (pollfdp[i].fd < 0) { @@ -638,6 +704,8 @@ pollout: } ASSERT(fdcnt == reventcnt); } else { + int i; + for (i = 0; i < nfds; i++) { ASSERT(pollfdp[i].revents == 0); } @@ -648,52 +716,6 @@ pollout: } /* - * This is the system call trap that poll(), - * select() and pselect() are built upon. - * It is a private interface between libc and the kernel. 
- */ -int -pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) -{ - timespec_t ts; - timespec_t *tsp; - sigset_t set; - k_sigset_t kset; - k_sigset_t *ksetp; - model_t datamodel = get_udatamodel(); - - if (timeoutp == NULL) - tsp = NULL; - else { - if (datamodel == DATAMODEL_NATIVE) { - if (copyin(timeoutp, &ts, sizeof (ts))) - return (set_errno(EFAULT)); - } else { - timespec32_t ts32; - - if (copyin(timeoutp, &ts32, sizeof (ts32))) - return (set_errno(EFAULT)); - TIMESPEC32_TO_TIMESPEC(&ts, &ts32) - } - - if (itimerspecfix(&ts)) - return (set_errno(EINVAL)); - tsp = &ts; - } - - if (setp == NULL) - ksetp = NULL; - else { - if (copyin(setp, &set, sizeof (set))) - return (set_errno(EFAULT)); - sigutok(&set, &kset); - ksetp = &kset; - } - - return (poll_common(fds, nfds, tsp, ksetp)); -} - -/* * Clean up any state left around by poll(2). Called when a thread exits. */ void diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e0e63f4c0..417c629168 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* @@ -257,6 +258,19 @@ rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) case _RUSAGESYS_GETVMUSAGE: return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, (vmusage_t *)arg3, (size_t *)arg4, 0)); + case _RUSAGESYS_INVALMAP: + /* + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD + * handling so callers on SPARC should get simple sync + * handling with invalidation to all processes. 
+ */ +#if defined(__sparc) + return (memcntl((caddr_t)arg2, (size_t)arg3, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0)); +#else + return (vm_map_inval((pid_t)(uintptr_t)arg1, (caddr_t)arg2, + (size_t)arg3)); +#endif default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c index a28894b2c9..943b7d244e 100644 --- a/usr/src/uts/common/syscall/rw.c +++ b/usr/src/uts/common/syscall/rw.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -50,6 +50,7 @@ #include <sys/debug.h> #include <sys/rctl.h> #include <sys/nbmlock.h> +#include <sys/limits.h> #define COPYOUT_MAX_CACHE (1<<17) /* 128K */ @@ -607,19 +608,12 @@ out: return (bcount); } -/* - * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr.... - * XXX -- However, SVVS expects readv() and writev() to fail if - * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source), - * XXX -- so I guess that's the "interface". - */ -#define DEF_IOV_MAX 16 - ssize_t readv(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. 
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -768,6 +794,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -777,7 +805,8 @@ ssize_t writev(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register 
vnode_t *vp; struct cpu *cp; @@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen = aiov32[i].iov_len; count32 += iovlen; - if (iovlen < 0 || count32 < 0) + if (iovlen < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen 
!= 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -917,6 +977,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -952,9 +1015,14 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -962,39 +1030,68 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. 
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = (ssize_t)count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -1099,6 +1196,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -1109,7 +1208,8 @@ pwritev(int fdes, 
struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -1134,9 +1234,14 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -1144,39 +1249,68 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = 
aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = (ssize_t)count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -1308,6 +1442,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index cb8246f584..ccceca7c6d 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *); * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. 
*/ -int +static int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { @@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, return (0); } -ssize32_t +static ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { @@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, } #endif -int +static int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { @@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, } -int +static int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { @@ -1160,6 +1160,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, } else { maxblk = (int)vp->v_stream->sd_maxblk; } + + /* + * We need to make sure that the socket that we're sending on + * supports sendfile behavior. sockfs doesn't know that the APIs + * we want to use are coming from sendfile, so we can't rely on + * it to check for us. + */ + if ((so->so_mode & SM_SENDFILESUPP) == 0) { + error = EOPNOTSUPP; + goto err; + } break; case VREG: break; diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c index 4085104cc7..93f26121bc 100644 --- a/usr/src/uts/common/syscall/stat.c +++ b/usr/src/uts/common/syscall/stat.c @@ -61,7 +61,7 @@ * to VOP_GETATTR */ -static int +int cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred) { vnode_t *startvp; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 03f2fabe13..26ea859224 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -170,8 +171,8 @@ sysconfig(int which) * even though rcapd can be used on the global zone too. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) + return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), physinstalled)); return (physinstalled); @@ -179,26 +180,23 @@ sysconfig(int which) case _CONFIG_AVPHYS_PAGES: /* * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always + * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - cap = btop(curproc->p_zone->zone_phys_mcap); + cap = btop(curproc->p_zone->zone_phys_mem_ctl); if (cap > physinstalled) return (freemem); - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); + rss = btop(curproc->p_zone->zone_phys_mem); /* - * Because rcapd implements a soft cap, it is possible + * Because this is a soft cap, it is possible * for rss to be temporarily over the cap. */ if (cap > rss) diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 2dda4001bf..68aa1a95f5 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. 
*/ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -108,7 +108,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -245,7 +245,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index 1d91475e38..c908a9e16c 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. 
* * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index 90e1b73b70..439c859d96 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -773,7 +774,7 @@ segkmem_capable(struct seg *seg, segcapability_t capability) return (0); } -static struct seg_ops segkmem_ops = { +struct seg_ops segkmem_ops = { SEGKMEM_BADOP(int), /* dup */ SEGKMEM_BADOP(int), /* unmap */ SEGKMEM_BADOP(void), /* free */ diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h index 2a4ed3b2aa..3ad4202e91 100644 --- a/usr/src/uts/common/vm/seg_kmem.h +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _VM_SEG_KMEM_H @@ -136,6 +137,8 @@ extern size_t segkmem_kmemlp_max; #define IS_KMEM_VA_LARGEPAGE(vaddr) \ (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) +extern struct seg_ops segkmem_ops; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_umap.c b/usr/src/uts/common/vm/seg_umap.c new file mode 100644 index 0000000000..ccad71c5d6 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.c @@ -0,0 +1,466 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * VM - Kernel-to-user mapping segment + * + * The umap segment driver was primarily designed to facilitate the comm page: + * a portion of kernel memory shared with userspace so that certain (namely + * clock-related) actions could operate without making an expensive trip into + * the kernel. + * + * Since the initial requirements for the comm page are slim, advanced features + * of the segment driver such as per-page protection have been left + * unimplemented at this time. 
+ */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_umap.h> + + +static boolean_t segumap_verify_safe(caddr_t, size_t); +static int segumap_dup(struct seg *, struct seg *); +static int segumap_unmap(struct seg *, caddr_t, size_t); +static void segumap_free(struct seg *); +static faultcode_t segumap_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segumap_faulta(struct seg *, caddr_t); +static int segumap_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segumap_incore(struct seg *, caddr_t, size_t, char *); +static int segumap_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int segumap_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segumap_getoffset(struct seg *, caddr_t); +static int segumap_gettype(struct seg *, caddr_t); +static int segumap_getvp(struct seg *, caddr_t, struct vnode **); +static int segumap_advise(struct seg *, caddr_t, size_t, uint_t); +static void segumap_dump(struct seg *); +static int segumap_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int segumap_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segumap_getmemid(struct seg *, caddr_t, memid_t *); +static int segumap_capable(struct seg *, segcapability_t); + +static struct seg_ops segumap_ops = { + segumap_dup, + segumap_unmap, + segumap_free, + segumap_fault, + segumap_faulta, + segumap_setprot, + segumap_checkprot, + NULL, /* kluster: disabled */ + NULL, /* swapout: disabled */ + segumap_sync, + segumap_incore, + segumap_lockop, + 
segumap_getprot, + segumap_getoffset, + segumap_gettype, + segumap_getvp, + segumap_advise, + segumap_dump, + segumap_pagelock, + segumap_setpagesize, + segumap_getmemid, + NULL, /* getpolicy: disabled */ + segumap_capable, + seg_inherit_notsup +}; + + +/* + * Create a kernel/user-mapped segment. + */ +int +segumap_create(struct seg *seg, void *argsp) +{ + segumap_crargs_t *a = (struct segumap_crargs *)argsp; + segumap_data_t *data; + + ASSERT((uintptr_t)a->kaddr > _userlimit); + + /* + * Check several aspects of the mapping request to ensure validity: + * - kernel pages must reside entirely in kernel space + * - target protection must be user-accessible + * - kernel address must be page-aligned + * - kernel address must reside inside a "safe" segment + */ + if ((uintptr_t)a->kaddr <= _userlimit || + ((uintptr_t)a->kaddr + seg->s_size) < (uintptr_t)a->kaddr || + (a->prot & PROT_USER) == 0 || + ((uintptr_t)a->kaddr & PAGEOFFSET) != 0 || + !segumap_verify_safe(a->kaddr, seg->s_size)) { + return (EINVAL); + } + + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + rw_init(&data->sud_lock, NULL, RW_DEFAULT, NULL); + data->sud_kaddr = a->kaddr; + data->sud_prot = a->prot; + data->sud_loaded = B_FALSE; + + seg->s_ops = &segumap_ops; + seg->s_data = data; + return (0); +} + +static boolean_t +segumap_verify_safe(caddr_t kaddr, size_t len) +{ + struct seg *seg; + + /* + * Presently, only pages which are backed by segkmem are allowed to be + * shared with userspace. This prevents nasty paging behavior with + * other drivers such as seg_kp. Furthermore, the backing kernel + * segment must completely contain the region to be mapped. + * + * Failing these checks is fatal for now since such mappings are done + * in a very limited context from the kernel. 
+ */ + AS_LOCK_ENTER(&kas, RW_READER); + seg = as_segat(&kas, kaddr); + VERIFY(seg != NULL); + VERIFY(seg->s_base + seg->s_size >= kaddr + len); + VERIFY(seg->s_ops == &segkmem_ops); + AS_LOCK_EXIT(&kas); + + return (B_TRUE); +} + +static int +segumap_dup(struct seg *seg, struct seg *newseg) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + segumap_data_t *newsud; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newsud = kmem_zalloc(sizeof (segumap_data_t), KM_SLEEP); + rw_init(&newsud->sud_lock, NULL, RW_DEFAULT, NULL); + newsud->sud_kaddr = sud->sud_kaddr; + newsud->sud_prot = sud->sud_prot; + newsud->sud_loaded = B_FALSE; + + newseg->s_ops = seg->s_ops; + newseg->s_data = newsud; + return (0); +} + +static int +segumap_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Only allow unmap of entire segment */ + if (addr != seg->s_base || len != seg->s_size) { + return (EINVAL); + } + if (sud->sud_softlockcnt != 0) { + return (EAGAIN); + } + + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + /* + * While setting this field before immediately freeing the segment is + * not necessary, it is done for the sake of completeness. Doing so + * outside sud_lock is safe with the AS write-locked. 
+ */ + sud->sud_loaded = B_FALSE; + + seg_free(seg); + return (0); +} + +static void +segumap_free(struct seg *seg) +{ + segumap_data_t *data = (segumap_data_t *)seg->s_data; + + ASSERT(data != NULL); + + rw_destroy(&data->sud_lock); + VERIFY(data->sud_loaded == B_FALSE); + VERIFY(data->sud_softlockcnt == 0); + kmem_free(data, sizeof (*data)); + seg->s_data = NULL; +} + +/* ARGSUSED */ +static faultcode_t +segumap_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw tw) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + if (type == F_PROT) { + /* + * Since protection on the segment is fixed, there is nothing + * to do but report an error for protection faults. + */ + return (FC_PROT); + } else if (type == F_SOFTUNLOCK) { + size_t plen = btop(len); + + rw_enter(&sud->sud_lock, RW_WRITER); + VERIFY(sud->sud_softlockcnt >= plen); + sud->sud_softlockcnt -= plen; + rw_exit(&sud->sud_lock); + return (0); + } + + ASSERT(type == F_INVAL || type == F_SOFTLOCK); + rw_enter(&sud->sud_lock, RW_WRITER); + + if (type == F_INVAL && sud->sud_loaded) { + rw_exit(&sud->sud_lock); + return (FC_NOMAP); + } + + /* + * Load the (entire) segment into the HAT if that has not yet been done. + */ + if (!sud->sud_loaded) { + for (uintptr_t i = 0; i < seg->s_size; i += PAGESIZE) { + pfn_t pfn; + + pfn = hat_getpfnum(kas.a_hat, sud->sud_kaddr + i); + VERIFY(pfn != PFN_INVALID); + hat_devload(seg->s_as->a_hat, seg->s_base + i, + PAGESIZE, pfn, sud->sud_prot, HAT_LOAD); + } + sud->sud_loaded = B_TRUE; + } else { + /* + * If the segment has already been loaded, there is no + * reason to take an F_INVAL fault. 
+ */ + VERIFY(type != F_INVAL); + } + + if (type == F_SOFTLOCK) { + size_t nval = sud->sud_softlockcnt + btop(len); + + if (sud->sud_softlockcnt >= nval) { + rw_exit(&sud->sud_lock); + return (FC_MAKE_ERR(EOVERFLOW)); + } + sud->sud_softlockcnt = nval; + } + rw_exit(&sud->sud_lock); + return (0); +} + +/* ARGSUSED */ +static faultcode_t +segumap_faulta(struct seg *seg, caddr_t addr) +{ + /* Do nothing since asynch pagefault should not load translation. */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* + * The seg_umap driver does not yet allow protection to be changed. + */ + return (EACCES); +} + +/* ARGSUSED */ +static int +segumap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + int error = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + if ((sud->sud_prot & prot) != prot) { + error = EACCES; + } + rw_exit(&sud->sud_lock); + return (error); +} + +/* ARGSUSED */ +static int +segumap_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there is no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +segumap_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t sz = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + len = (len + PAGEOFFSET) & PAGEMASK; + while (len > 0) { + *vec = 1; + sz += PAGESIZE; + vec++; + len -= PAGESIZE; + } + return (sz); +} + +/* ARGSUSED */ +static int +segumap_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* Report success since kernel pages are always in memory. 
*/ + return (0); +} + +static int +segumap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + size_t pgno; + uint_t prot; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + prot = sud->sud_prot; + rw_exit(&sud->sud_lock); + + /* + * Reporting protection is simple since it is not tracked per-page. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = prot; + } + return (0); +} + +/* ARGSUSED */ +static u_offset_t +segumap_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. + */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_gettype(struct seg *seg, caddr_t addr) +{ + /* + * Since already-existing kernel pages are being mapped into userspace, + * always report the segment type as shared. + */ + return (MAP_SHARED); +} + +/* ARGSUSED */ +static int +segumap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + *vpp = NULL; + return (0); +} + +/* ARGSUSED */ +static int +segumap_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + if (behav == MADV_PURGE) { + /* Purge does not make sense for this mapping */ + return (EINVAL); + } + /* Indicate success for everything else. */ + return (0); +} + +/* ARGSUSED */ +static void +segumap_dump(struct seg *seg) +{ + /* + * Since this is a mapping to share kernel data with userspace, nothing + * additional should be dumped. 
+ */ +} + +/* ARGSUSED */ +static int +segumap_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +segumap_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +static int +segumap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + memidp->val[0] = (uintptr_t)sud->sud_kaddr; + memidp->val[1] = (uintptr_t)(addr - seg->s_base); + return (0); +} + +/* ARGSUSED */ +static int +segumap_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capabilities */ + return (0); +} diff --git a/usr/src/uts/common/vm/seg_umap.h b/usr/src/uts/common/vm/seg_umap.h new file mode 100644 index 0000000000..bcf7447509 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.h @@ -0,0 +1,43 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _VM_SEG_UMAP_H +#define _VM_SEG_UMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct segumap_crargs { + caddr_t kaddr; + uchar_t prot; /* protection */ + uchar_t maxprot; /* maximum protection */ +} segumap_crargs_t; + +typedef struct segumap_data { + krwlock_t sud_lock; + caddr_t sud_kaddr; + uchar_t sud_prot; + size_t sud_softlockcnt; + boolean_t sud_loaded; +} segumap_data_t; + +extern int segumap_create(struct seg *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_UMAP_H */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 875dec7fe9..f143c1e464 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7308,7 +7308,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7333,11 +7334,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7389,7 +7390,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. 
*/ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index 4fd32a3f4a..01db9b23d7 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -788,14 +788,21 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) pgcnt_t pswap_pages = 0; proc_t *p = curproc; - if (zone != NULL && takemem) { + if (zone != NULL) { /* test zone.max-swap resource control */ mutex_enter(&p->p_lock); if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { mutex_exit(&p->p_lock); - atomic_add_64(&zone->zone_anon_alloc_fail, 1); + + if (takemem) + atomic_add_64(&zone->zone_anon_alloc_fail, 1); + return (0); } + + if (!takemem) + rctl_decr_swap(zone, ptob(npages)); + mutex_exit(&p->p_lock); } mutex_enter(&anoninfo_lock); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index bb5a96eb0f..b0a5e7fb33 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -57,6 +57,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/as.h> @@ -848,8 +849,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); - - + zone_t *zonep = curzone; retry: /* @@ -885,6 +885,22 @@ retry: if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); + if (zonep->zone_pg_flt_delay != 0) { + /* + * The zone in which this process is running is + * currently over its physical memory cap. Throttle + * page faults to help the user-land memory capper + * catch up. Note that drv_usectohz() rounds up. 
+ */ + atomic_add_64(&zonep->zone_pf_throttle, 1); + atomic_add_64(&zonep->zone_pf_throttle_usec, + zonep->zone_pg_flt_delay); + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) { + drv_usecwait(zonep->zone_pg_flt_delay); + } else { + delay(drv_usectohz(zonep->zone_pg_flt_delay)); + } + } break; } diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 1b8d12eb8d..a206320a30 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. 
+ * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_swap.c b/usr/src/uts/common/vm/vm_swap.c index 1a28c04357..2a008e114b 100644 --- a/usr/src/uts/common/vm/vm_swap.c +++ b/usr/src/uts/common/vm/vm_swap.c @@ -18,6 +18,11 @@ * * CDDL HEADER END */ + +/* + * Copyright 2015 Joyent, Inc. + */ + /* * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ @@ -625,7 +630,18 @@ swapctl(int sc_cmd, void *sc_arg, int *rv) return (0); } beginning: + mutex_enter(&swapinfo_lock); tmp_nswapfiles = nswapfiles; + mutex_exit(&swapinfo_lock); + + /* + * Return early if there are no swap entries to report: + */ + if (tmp_nswapfiles < 1) { + *rv = 0; + return (0); + } + /* Return an error if not enough space for the whole table. */ if (length < tmp_nswapfiles) return (ENOMEM); @@ -920,7 +936,18 @@ swapctl32(int sc_cmd, void *sc_arg, int *rv) return (0); } beginning: + mutex_enter(&swapinfo_lock); tmp_nswapfiles = nswapfiles; + mutex_exit(&swapinfo_lock); + + /* + * Return early if there are no swap entries to report: + */ + if (tmp_nswapfiles < 1) { + *rv = 0; + return (0); + } + /* Return an error if not enough space for the whole table. */ if (length < tmp_nswapfiles) return (ENOMEM); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 57166b4e63..8b9fd0d7a3 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2016, Joyent, Inc. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. 
* * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. */ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + 
offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = 
VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. 
*/ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1594,8 +1647,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1739,12 +1791,34 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. + */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. 
*/ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1763,7 +1837,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. */ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1826,26 +1900,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if 
(result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1901,10 +1982,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. */ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1924,6 +2007,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1934,6 +2021,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1953,7 +2055,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2009,8 +2111,11 @@ start: 
mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); @@ -2030,3 +2135,185 @@ start: vmu_data.vmu_pending_waiters--; goto start; } + +#if defined(__x86) +/* + * Attempt to invalidate all of the pages in the mapping for the given process. + */ +static void +map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size) +{ + page_t *pp; + size_t psize; + u_offset_t off; + caddr_t eaddr; + struct vnode *vp; + struct segvn_data *svd; + struct hat *victim_hat; + + ASSERT((addr + size) <= (seg->s_base + seg->s_size)); + + victim_hat = p->p_as->a_hat; + svd = (struct segvn_data *)seg->s_data; + vp = svd->vp; + psize = page_get_pagesize(seg->s_szc); + + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) { + pp = page_lookup_nowait(vp, off, SE_SHARED); + + if (pp != NULL) { + /* following logic based on pvn_getdirty() */ + + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + continue; + } + + page_io_lock(pp); + hat_page_inval(pp, 0, victim_hat); + page_io_unlock(pp); + + /* + * For B_INVALCURONLY-style handling we let + * page_release call VN_DISPOSE if no one else is using + * the page. + * + * A hat_ismod() check would be useless because: + * (1) we are not holding SE_EXCL lock + * (2) we've not unloaded _all_ translations + * + * Let page_release() do the heavy-lifting. + */ + (void) page_release(pp, 1); + } + } +} + +/* + * vm_map_inval() + * + * Invalidate as many pages as possible within the given mapping for the given + * process. addr is expected to be the base address of the mapping and size is + * the length of the mapping.
In some cases a mapping will encompass an + * entire segment, but at least for anon or stack mappings, these will be + * regions within a single large segment. Thus, the invalidation is oriented + * around a single mapping and not an entire segment. + * + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so + * this code is only applicable to x86. + */ +int +vm_map_inval(pid_t pid, caddr_t addr, size_t size) +{ + int ret; + int error = 0; + proc_t *p; /* target proc */ + struct as *as; /* target proc's address space */ + struct seg *seg; /* working segment */ + + if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0) + return (set_errno(EPERM)); + + /* If not a valid mapping address, return an error */ + if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr) + return (set_errno(EINVAL)); + +again: + mutex_enter(&pidlock); + p = prfind(pid); + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr != NULL) { + mutex_exit(&p->p_lock); + return (0); + } + + as = p->p_as; + + /* + * Try to set P_PR_LOCK - prevents process "changing shape" + * - blocks fork + * - blocks sigkill + * - cannot be a system proc + * - must be fully created proc + */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + return (set_errno(ESRCH)); + } + + if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. This also + * drops p_lock so p may no longer be valid since the proc may + * have exited. + */ + sprwaitlock_proc(p); + goto again; + } + + /* P_PR_LOCK is now set */ + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + if ((seg = as_segat(as, addr)) == NULL) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(ENOMEM)); + } + + /* + * The invalidation behavior only makes sense for vnode-backed segments. 
+ */ + if (seg->s_ops != &segvn_ops) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (0); + } + + /* + * If the mapping is out of bounds of the segment return an error. + */ + if ((addr + size) > (seg->s_base + seg->s_size)) { + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(EINVAL)); + } + + /* + * Don't use MS_INVALCURPROC flag here since that would eventually + * initiate hat invalidation based on curthread. Since we're doing this + * on behalf of a different process, that would erroneously invalidate + * our own process mappings. + */ + error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC); + if (error == 0) { + /* + * Since we didn't invalidate during the sync above, we now + * try to invalidate all of the pages in the mapping. + */ + map_inval(p, seg, addr, size); + } + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + sprunlock(p); + + if (error) + (void) set_errno(error); + return (error); +} +#endif |