Diffstat (limited to 'usr/src/uts/common')
635 files changed, 135572 insertions, 7180 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index e344b15a00..0cef482d82 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -288,6 +288,7 @@ GENUNIX_OBJS += \ rctl.o \ rctlsys.o \ readlink.o \ + refhash.o \ refstr.o \ rename.o \ resolvepath.o \ @@ -437,6 +438,8 @@ PROFILE_OBJS += profile.o SYSTRACE_OBJS += systrace.o +LX_SYSTRACE_OBJS += lx_systrace.o + LOCKSTAT_OBJS += lockstat.o FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o @@ -499,6 +502,10 @@ PTSL_OBJS += tty_pts.o PTM_OBJS += ptm.o +LX_PTM_OBJS += lx_ptm.o + +LX_NETLINK_OBJS += lx_netlink.o + MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ mii_marvell.o mii_realtek.o mii_other.o @@ -556,6 +563,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \ sctp_misc.o IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o +IP_COMM_OBJS = inet_hash.o IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ @@ -566,12 +574,14 @@ IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ ip_helper_stream.o ip_tunables.o \ ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ conn_opt.o ip_attr.o ip_dce.o \ + bpf_filter.o \ $(IP_ICMP_OBJS) \ $(IP_RTS_OBJS) \ $(IP_TCP_OBJS) \ $(IP_UDP_OBJS) \ $(IP_SCTP_OBJS) \ - $(IP_ILB_OBJS) + $(IP_ILB_OBJS) \ + $(IP_COMM_OBJS) IP6_OBJS += ip6ddi.o @@ -589,6 +599,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o +DATAFILT_OBJS += datafilt.o + SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o SPPPTUN_OBJS += sppptun.o sppptun_mod.o @@ -642,7 +654,7 @@ TL_OBJS += tl.o DUMP_OBJS += dump.o -BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o +BPF_OBJS += bpf.o bpf_wrap.o bpf_mod.o bpf_dlt.o bpf_mac.o CLONE_OBJS += clone.o @@ -686,6 +698,15 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ VNIC_OBJS += vnic_ctl.o vnic_dev.o +OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \ + overlay_prop.o overlay_target.o + +OVERLAY_VXLAN_OBJS += overlay_vxlan.o + +VND_OBJS += vnd.o frameio.o + +GSQUEUE_OBJS += gsqueue.o + SIMNET_OBJS += simnet.o IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o @@ -832,7 +853,7 @@ SATA_OBJS += sata.o USBA_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \ usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \ - usba_devdb.o usba10_calls.o usba_ugen.o + usba_devdb.o usba10_calls.o usba_ugen.o usba_bos.o USBA10_OBJS += usba10.o @@ -938,6 +959,8 @@ SIGNALFD_OBJS += signalfd.o I8042_OBJS += i8042.o +INOTIFY_OBJS += inotify.o + KB8042_OBJS += \ at_keyprocess.o \ kb8042.o \ @@ -1012,6 +1035,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o ZCONS_OBJS += zcons.o +ZFD_OBJS += zfd.o + NV_SATA_OBJS += nv_sata.o SI3124_OBJS += si3124.o @@ -1065,8 +1090,7 @@ DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \ sdev_ptsops.o sdev_zvolops.o sdev_comm.o \ sdev_profile.o sdev_ncache.o sdev_netops.o \ - sdev_ipnetops.o \ - sdev_vtops.o + sdev_ipnetops.o sdev_vtops.o sdev_plugin.o CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \ ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o @@ -1083,8 +1107,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o +HYPRLOFS_OBJS += hyprlofs_dir.o 
hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ @@ -1236,8 +1265,8 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ pc_vfsops.o pc_vnops.o -PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ - prvfsops.o prvnops.o +PROC_OBJS += prargv.o prcontrol.o prioctl.o prsubr.o \ + prusrio.o prvfsops.o prvnops.o MNTFS_OBJS += mntvfsops.o mntvnops.o @@ -1402,6 +1431,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1867,7 +1897,7 @@ ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o MXFE_OBJS += mxfe.o -MPTSAS_OBJS += mptsas.o mptsas_hash.o mptsas_impl.o mptsas_init.o \ +MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o \ mptsas_raid.o mptsas_smhba.o SFE_OBJS += sfe.o sfe_util.o @@ -1902,9 +1932,9 @@ LINT_DEFS += -Dunix # It is a bug in the current compilation system that the assember # can't process the -Y I, flag. # -NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common -AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common -INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +NATIVE_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +AS_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) -I$(UTSBASE)/common +INCLUDE_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common PCIEB_OBJS += pcieb.o @@ -2102,6 +2132,11 @@ MEGA_SAS_OBJS = megaraid_sas.o MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o # +# DR_SAS module +# +DR_SAS_OBJS = dr_sas.o + +# # CPQARY3 module # CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \ @@ -2110,6 +2145,20 @@ CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \ cpqary3_bd.o # +# HP Smart Array driver module (smrt) +# +SMRT_OBJS = smrt.o \ + smrt_device.o \ + smrt_interrupts.o \ + smrt_commands.o \ + smrt_logvol.o \ + smrt_hba.o \ + smrt_ciss_simple.o \ + smrt_ciss.o \ + smrt_physical.o \ + smrt_sata.o + +# # ISCSI_INITIATOR module # ISCSI_INITIATOR_OBJS = chap.o iscsi_io.o iscsi_thread.o \ @@ -2149,6 +2198,11 @@ URF_OBJS = urf_usbgem.o UPF_OBJS = upf_usbgem.o # +# NFP objects +# +NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o + +# # BNXE objects # BNXE_OBJS += bnxe_cfg.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index c369cd3b63..e739dae95f 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -102,6 +102,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/avl/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/inet/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/ucode/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -265,10 +269,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -759,6 +771,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + 
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -959,6 +975,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/net80211/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nfp/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -978,6 +998,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s $(COMPILE.s) -o $@ $< +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1082,6 +1110,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/fops/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/scsi/adapters/smrt/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/fibre-channel/ulp/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1122,6 +1154,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1134,6 +1170,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1478,9 +1518,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/idspace/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioif/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) + # # krtld must refer to its own bzero/bcopy until the kernel is fully linked # @@ -1545,6 +1590,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1672,6 +1721,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/acl/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/avl/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(COMMONBASE)/inet/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(COMMONBASE)/ucode/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1786,9 +1838,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2137,6 +2195,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2284,6 +2345,9 @@ $(LINTS_DIR)/%.ln: 
$(UTSBASE)/common/io/mwl/mwl_fw/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/net80211/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nfp/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2299,6 +2363,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.s $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/npi/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/plugins/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/pci-ide/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2368,6 +2438,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/scsi_vhci/fops/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/scsi/adapters/smrt/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/fibre-channel/ulp/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2407,6 +2480,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sdcard/impl/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/gsqueue/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/sfe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2416,6 +2492,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/simnet/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/softmac/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/vnd/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/uath/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2677,6 +2756,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/nexus/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/pcs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/refhash/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2770,3 +2852,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/skd/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/fsreparse/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(COMMONBASE)/idspace/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c new file mode 100644 index 0000000000..730deae80e --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c @@ -0,0 +1,3174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * See the big theory statement in ../sys/lx_autofs.h + */ + +#include <fs/fs_subr.h> +#include <sys/stat.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/dirent.h> +#include <sys/fs/fifonode.h> +#include <sys/modctl.h> +#include <sys/mount.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/conf.h> +#include <sys/sdt.h> + +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> + +#include <sys/dnlc.h> +#include <nfs/rnode.h> +#include <nfs/rnode4.h> +#include <sys/lx_autofs_impl.h> +#include <sys/lx_types.h> + +/* + * External functions + */ +extern uintptr_t space_fetch(char *key); +extern int space_store(char *key, uintptr_t ptr); +extern int umount2_engine(vfs_t *, int, cred_t *, int); + +/* + * Globals + */ +static vfsops_t *lx_autofs_vfsops; +static vnodeops_t *lx_autofs_vn_ops = NULL; +static int lx_autofs_fstype; +static major_t lx_autofs_major; +static minor_t lx_autofs_minor = 0; +static dev_info_t *lx_autofs_dip = NULL; + +#define LX_AUTOFS_DEV_VERSION_MAJOR 1 +#define LX_AUTOFS_DEV_VERSION_MINOR 0 + +/* The Linux autofs superblock magic number */ +#define LX_AUTOFS_SB_MAGIC 0x0187 + +/* Linux autofs mount types */ +#define LX_AUTOFS_TYPE_INDIRECT 1 +#define LX_AUTOFS_TYPE_DIRECT 2 +#define LX_AUTOFS_TYPE_OFFSET 4 + +/* Structure passed for autofs dev ioctls */ +typedef struct lx_autofs_dv_ioctl { + uint32_t lad_ver_major; + uint32_t lad_ver_minor; + uint32_t lad_size; + uint32_t lad_ioctlfd; + uint32_t lad_arg1; + uint32_t lad_arg2; + char lad_path[0]; +} lx_autofs_dv_ioctl_t; + +/* + * Support functions + */ +static void +lx_autofs_strfree(char *str) +{ + kmem_free(str, strlen(str) + 1); +} + +static char * +lx_autofs_strdup(char *str) +{ + int n = strlen(str); + char *ptr = kmem_alloc(n + 1, KM_SLEEP); + bcopy(str, ptr, n + 1); + return (ptr); +} + +static int +lx_autofs_str_to_int(char *str, int *val) +{ + long res; + + if (str == NULL) + return (-1); + + if ((ddi_strtol(str, NULL, 10, &res) != 0) || + (res < INT_MIN) || (res > INT_MAX)) + return (-1); + + *val = res; + return (0); +} + +static void +ls_autofs_stack_init(list_t *lp) +{ + list_create(lp, + sizeof (stack_elem_t), offsetof(stack_elem_t, se_list)); +} + +static void +lx_autofs_stack_fini(list_t *lp) +{ + ASSERT(list_head(lp) == NULL); + list_destroy(lp); +} + +static void +lx_autofs_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3) +{ + stack_elem_t *se; + + se = kmem_alloc(sizeof (*se), KM_SLEEP); + se->se_ptr1 = ptr1; + se->se_ptr2 = ptr2; + se->se_ptr3 = ptr3; + list_insert_head(lp, se); +} + +static int +lx_autofs_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3) +{ + stack_elem_t *se; + + if ((se = list_head(lp)) == NULL) + return (-1); + list_remove(lp, se); + if (ptr1 != NULL) + *ptr1 = se->se_ptr1; + if (ptr2 != NULL) + *ptr2 = se->se_ptr2; + if (ptr3 != NULL) + *ptr3 = se->se_ptr3; + kmem_free(se, sizeof (*se)); + return (0); +} + +static vnode_t * +lx_autofs_fifo_peer_vp(vnode_t *vp) +{ + fifonode_t *fnp = VTOF(vp); + fifonode_t *fn_dest = fnp->fn_dest; + return (FTOV(fn_dest)); +} + +static vnode_t * +lx_autofs_vn_alloc(vfs_t *vfsp, vnode_t *uvp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *vp, *vp_old; + + /* Allocate a new vnode structure in case we need it. 
*/ + vp = vn_alloc(KM_SLEEP); + vn_setops(vp, lx_autofs_vn_ops); + VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev); + vp->v_data = uvp; + ASSERT(vp->v_count == 1); + + /* + * Take a hold on the vfs structure. This is how unmount will + * determine if there are any active vnodes in the file system. + */ + VFS_HOLD(vfsp); + + /* + * Check if we already have a vnode allocated for this underlying + * vnode_t. + */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) { + + /* + * Didn't find an existing node. + * Add this node to the hash and return. + */ + VERIFY(mod_hash_insert(data->lav_vn_hash, + (mod_hash_key_t)uvp, + (mod_hash_val_t)vp) == 0); + mutex_exit(&data->lav_lock); + return (vp); + } + + /* Get a hold on the existing vnode and free up the one we allocated. */ + VN_HOLD(vp_old); + mutex_exit(&data->lav_lock); + + /* Free up the new vnode we allocated. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); + + return (vp_old); +} + +static void +lx_autofs_vn_free(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *uvp = vp->v_data; + vnode_t *vp_tmp; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + ASSERT(MUTEX_HELD((&vp->v_lock))); + + ASSERT(vp->v_count == 0); + + /* We're about to free this vnode so take it out of the hash. */ + (void) mod_hash_remove(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp); + + /* + * No one else can lookup this vnode any more so there's no need + * to hold locks. + */ + mutex_exit(&data->lav_lock); + mutex_exit(&vp->v_lock); + + /* Release the underlying vnode. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); +} + +static lx_autofs_automnt_req_t * +lx_autofs_la_alloc(lx_autofs_vfs_t *data, boolean_t *is_dup, boolean_t expire, + char *nm) +{ + lx_autofs_automnt_req_t *laar, *laar_dup; + + /* Pre-allocate a new automounter request before grabbing locks. */ + laar = kmem_zalloc(sizeof (*laar), KM_SLEEP); + mutex_init(&laar->laar_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&laar->laar_cv, NULL, CV_DEFAULT, NULL); + laar->laar_ref = 1; + + if (data->lav_min_proto == 5) { + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS5; + + if (data->lav_mnttype == LXAMT_INDIR) { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_INDIR; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_INDIR; + } + } else { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_DIRECT; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_DIRECT; + } + } + laar->laar_pkt_size = sizeof (lx_autofs_v5_pkt_t); + + laar->laar_pkt.lap_v5.lap_dev = data->lav_dev; + laar->laar_pkt.lap_v5.lap_ino = data->lav_ino; + /* + * Note that we're currently not filling in the other v5 pkt + * fields (pid, uid, etc.) since they don't appear to be used + * by the automounter. We can fill those in later if it proves + * necessary. + */ + + /* + * For indirect mounts the token expected by the automounter is + * the name of the directory entry to look up (not the entire + * path that is being accessed.) For direct mounts the Linux + * kernel passes a dummy name, so this is just as good. 
+ */ + laar->laar_pkt.lap_v5.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v5.lap_name_len > + (sizeof (laar->laar_pkt.lap_v5.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs automnt req: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v5.lap_name, nm, + sizeof (laar->laar_pkt.lap_v5.lap_name)); + + } else if (expire) { + zcmn_err(getzoneid(), CE_WARN, + "unsupported expire protocol request: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + + } else { + ASSERT(expire == B_FALSE); + + /* Older protocol pkt (really v2) */ + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS2; + laar->laar_pkt.lap_type = LX_AUTOFS_PTYPE_MISSING; + laar->laar_pkt_size = sizeof (lx_autofs_v2_pkt_t); + + /* + * The token expected by the linux automount is the name of + * the directory entry to look up. (And not the entire + * path that is being accessed.) + */ + laar->laar_pkt.lap_v2.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v2.lap_name_len > + (sizeof (laar->laar_pkt.lap_v2.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs lookup: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v2.lap_name, nm, + sizeof (laar->laar_pkt.lap_v2.lap_name)); + } + + /* Assign a unique id for this request. */ + laar->laar_pkt.lap_id = id_alloc(data->lav_ids); + + /* Check for an outstanding request for this path. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_path_hash, + (mod_hash_key_t)nm, (mod_hash_val_t *)&laar_dup) == 0) { + /* + * There's already an outstanding request for this + * path so we don't need a new one. + */ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); + laar = laar_dup; + + /* Bump the ref count on the old request. */ + atomic_add_int(&laar->laar_ref, 1); + + *is_dup = 1; + } else { + /* Add it to the hashes. */ + VERIFY(mod_hash_insert(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)laar) == 0); + VERIFY(mod_hash_insert(data->lav_path_hash, + (mod_hash_key_t)lx_autofs_strdup(nm), + (mod_hash_val_t)laar) == 0); + + *is_dup = 0; + } + mutex_exit(&data->lav_lock); + + return (laar); +} + +static lx_autofs_automnt_req_t * +lx_autofs_la_find(lx_autofs_vfs_t *data, int id) +{ + lx_autofs_automnt_req_t *laar; + + /* Check for an outstanding request for this id. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&laar) != 0) { + mutex_exit(&data->lav_lock); + return (NULL); + } + atomic_add_int(&laar->laar_ref, 1); + mutex_exit(&data->lav_lock); + return (laar); +} + +static void +lx_autofs_la_complete(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* Remove this request from the hashes so no one can look it up. */ + mutex_enter(&data->lav_lock); + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* Mark this requst as complete and wakeup anyone waiting on it. 
*/ + mutex_enter(&laar->laar_lock); + laar->laar_complete = 1; + cv_broadcast(&laar->laar_cv); + mutex_exit(&laar->laar_lock); +} + +static void +lx_autofs_la_release(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + ASSERT(!MUTEX_HELD(&laar->laar_lock)); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) + return; + ASSERT(laar->laar_ref == 0); + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} + +static void +lx_autofs_la_abort(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* + * This is a little tricky. We're aborting the wait for this + * request. So if anyone else is waiting for this request we + * can't free it, but if no one else is waiting for the request + * we should free it. + */ + mutex_enter(&data->lav_lock); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) { + mutex_exit(&data->lav_lock); + return; + } + ASSERT(laar->laar_ref == 0); + + /* Remove this request from the hashes so no one can look it up. */ + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* It's ok to free this now because the ref count was zero. */ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} + +static int +lx_autofs_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_wr, *ufp_rd = NULL; + file_t *fp_wr, *fp_rd = NULL; + vnode_t *vp_wr, *vp_rd; + int i; + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of it's zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and its going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the processes open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* Sanity check fifo write fd. */ + if (fd >= fip->fi_nfiles) { + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* Get a pointer to the write fifo. */ + UF_ENTER(ufp_wr, fip, fd); + if (((fp_wr = ufp_wr->uf_file) == NULL) || + ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) { + /* Invalid fifo fd. */ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + vp_rd = lx_autofs_fifo_peer_vp(fp_wr->f_vnode); + for (i = 0; i < fip->fi_nfiles; i++) { + if (i == fd) + continue; + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. 
*/ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * We need to drop fi_lock before we can try to acquire f_tlock + * the good news is that the file pointers are protected because + * we're still holding uf_lock. + */ + mutex_exit(&fip->fi_lock); + + /* + * Here we bump the open counts on the fifos. The reason + * that we do this is because when we go to write to the + * fifo we want to ensure that they are actually open (and + * not in the process of being closed) without having to + * stop the automounter. (If the write end of the fifo + * were closed and we tried to write to it we would panic. + * If the read end of the fifo was closed and we tried to + * write to the other end, the process that invoked the + * lookup operation would get an unexpected SIGPIPE.) + */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + ASSERT(fp_wr->f_count >= 2); + mutex_exit(&fp_wr->f_tlock); + + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + ASSERT(fp_rd->f_count >= 2); + mutex_exit(&fp_rd->f_tlock); + + /* Release all our locks. */ + UF_EXIT(ufp_wr); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + /* Return the file pointers. */ + *fpp_rd = fp_rd; + *fpp_wr = fp_wr; + return (0); +} + +static uint_t +/*ARGSUSED*/ +lx_autofs_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + int *id = (int *)arg; + /* Return the key and terminate the walk. */ + *id = (uintptr_t)key; + return (MH_WALK_TERMINATE); +} + +static void +lx_autofs_fifo_close(lx_autofs_vfs_t *data) +{ + /* + * Close the fifo to prevent any future requests from + * getting sent to the automounter. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr != NULL) { + (void) closef(data->lav_fifo_wr); + data->lav_fifo_wr = NULL; + } + if (data->lav_fifo_rd != NULL) { + (void) closef(data->lav_fifo_rd); + data->lav_fifo_rd = NULL; + } + mutex_exit(&data->lav_lock); + + /* + * Wakeup any threads currently waiting for the automounter + * note that it's possible for multiple threads to have entered + * this function and to be doing the work below simultaneously. + */ + for (;;) { + lx_autofs_automnt_req_t *laar; + int id; + + /* Lookup the first entry in the hash. */ + id = -1; + mod_hash_walk(data->lav_id_hash, + lx_autofs_fifo_close_cb, &id); + if (id == -1) { + /* No more id's in the hash. */ + break; + } + if ((laar = lx_autofs_la_find(data, id)) == NULL) { + /* Someone else beat us to it. */ + continue; + } + + /* Mark the request as complete and release it. */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + } +} + +static int +lx_autofs_fifo_verify_rd(lx_autofs_vfs_t *data) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_rd = NULL; + file_t *fp_rd = NULL; + vnode_t *vp_rd; + int i; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + + /* Check if we've already been shut down. */ + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + return (-1); + } + vp_rd = lx_autofs_fifo_peer_vp(data->lav_fifo_wr->f_vnode); + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of it's zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and its going to be the group leader for pgrp, so its + * pid will be equal to pgrp. 
+ */ + prp = sprlock(data->lav_pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the processes open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. */ + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Seems the automounter still has the read end of the fifo + * open, we're done here. Release all our locks and exit. + */ + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + return (0); +} + +static int +lx_autofs_fifo_write(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laarp) +{ + struct uio uio; + struct iovec iov; + file_t *fp_wr, *fp_rd; + int error; + + /* + * The catch here is we need to make sure _we_ don't close + * the the fifo while writing to it. (Another thread could come + * along and realize the automounter process is gone and close + * the fifo. To do this we bump the open count before we + * write to the fifo. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + fp_wr = data->lav_fifo_wr; + fp_rd = data->lav_fifo_rd; + + /* Bump the open count on the write fifo. */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + mutex_exit(&fp_wr->f_tlock); + + /* Bump the open count on the read fifo. */ + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + mutex_exit(&fp_rd->f_tlock); + + mutex_exit(&data->lav_lock); + + iov.iov_base = (caddr_t)&laarp->laar_pkt; + iov.iov_len = laarp->laar_pkt_size; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_loffset = 0; + uio.uio_segflg = (short)UIO_SYSSPACE; + uio.uio_resid = laarp->laar_pkt_size; + uio.uio_llimit = 0; + uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK; + + error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL); + (void) closef(fp_wr); + (void) closef(fp_rd); + + /* + * After every write we verify that the automounter still has + * these files open. + */ + mutex_enter(&data->lav_lock); + if (lx_autofs_fifo_verify_rd(data) != 0) { + /* + * Something happened to the automounter. + * Close down the communication pipe we setup. 
+ */ + mutex_exit(&data->lav_lock); + lx_autofs_fifo_close(data); + if (error != 0) + return (error); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + return (error); +} + +static int +lx_autofs_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack) +{ + struct iovec iov; + struct uio uio; + dirent64_t *dp, *dbuf; + vnode_t *vp; + size_t dlen, dbuflen; + int eof, error, ndirents = 64; + char *nm; + + dlen = ndirents * (sizeof (*dbuf)); + dbuf = kmem_alloc(dlen, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_loffset = 0; + uio.uio_llimit = MAXOFFSET_T; + + eof = 0; + error = 0; + while (!error && !eof) { + uio.uio_resid = dlen; + iov.iov_base = (char *)dbuf; + iov.iov_len = dlen; + + (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL); + if (VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0) != 0) { + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + kmem_free(dbuf, dlen); + return (-1); + } + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + + if ((dbuflen = dlen - uio.uio_resid) == 0) { + /* We're done. */ + break; + } + + for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen); + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) { + + nm = dp->d_name; + + if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0) + continue; + + if (VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + kmem_free(dbuf, dlen); + return (-1); + } + if (vp->v_type == VDIR) { + if (dir_stack != NULL) { + lx_autofs_stack_push(dir_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } else { + if (file_stack != NULL) { + lx_autofs_stack_push(file_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } + } + } + kmem_free(dbuf, dlen); + return (0); +} + +static void +lx_autofs_bs_destroy(vnode_t *dvp, char *path) +{ + list_t search_stack; + list_t dir_stack; + list_t file_stack; + vnode_t *pdvp, *vp; + char *dpath, *fpath; + int ret; + + if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + /* A directory entry with this name doesn't actually exist. */ + return; + } + + if ((vp->v_type & VDIR) == 0) { + /* Easy, the directory entry is a file so delete it. */ + VN_RELE(vp); + (void) VOP_REMOVE(dvp, path, kcred, NULL, 0); + return; + } + + /* + * The directory entry is a subdirectory, now we have a bit more + * work to do. (We'll have to recurse into the sub directory.) + * It would have been much easier to do this recursively but kernel + * stacks are notoriously small. + */ + ls_autofs_stack_init(&search_stack); + ls_autofs_stack_init(&dir_stack); + ls_autofs_stack_init(&file_stack); + + /* Save our newfound subdirectory into a list. */ + lx_autofs_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp, + lx_autofs_strdup(path)); + + /* Do a recursive depth first search into the subdirectories. */ + while (lx_autofs_stack_pop(&search_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the subdirectories in this directory. */ + if (lx_autofs_bs_readdir(dvp, &search_stack, NULL) != 0) + goto exit; + + /* Save the current directory a separate stack. */ + lx_autofs_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp, + dpath); + } + + /* + * Now dir_stack contains a list of directories, the deepest paths + * are at the top of the list. So let's go through and process them. 
+ */ + while (lx_autofs_stack_pop(&dir_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the files in this directory. */ + if (lx_autofs_bs_readdir(dvp, NULL, &file_stack) != 0) { + VN_RELE(dvp); + lx_autofs_strfree(dpath); + goto exit; + } + + /* Delete all the files in this directory. */ + while (lx_autofs_stack_pop(&file_stack, + NULL, (caddr_t *)&vp, &fpath) == 0) { + VN_RELE(vp) + ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0); + lx_autofs_strfree(fpath); + if (ret != 0) { + lx_autofs_strfree(dpath); + goto exit; + } + } + + /* Delete this directory. */ + VN_RELE(dvp); + ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0); + lx_autofs_strfree(dpath); + if (ret != 0) + goto exit; + } + +exit: + while ( + (lx_autofs_stack_pop(&search_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&dir_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&file_stack, NULL, (caddr_t *)&vp, + &path) == 0)) { + VN_RELE(vp); + lx_autofs_strfree(path); + } + lx_autofs_stack_fini(&search_stack); + lx_autofs_stack_fini(&dir_stack); + lx_autofs_stack_fini(&file_stack); +} + +static vnode_t * +lx_autofs_bs_create(vnode_t *dvp, char *bs_name) +{ + vnode_t *vp; + vattr_t vattr; + + /* + * After looking at the mkdir syscall path it seems we don't need + * to initialize all of the vattr_t structure. + */ + bzero(&vattr, sizeof (vattr)); + vattr.va_type = VDIR; + vattr.va_mode = 0755; /* u+rwx,og=rx */ + vattr.va_mask = AT_TYPE|AT_MODE; + + if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred, NULL, 0, NULL) != 0) + return (NULL); + return (vp); +} + +static int +lx_autofs_automounter_call(vnode_t *dvp, char *nm) +{ + lx_autofs_automnt_req_t *laar; + lx_autofs_vfs_t *data; + int error; + boolean_t is_dup; + + /* Get a pointer to the vfs mount data. */ + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* The automounter only supports queries in the root directory. */ + if (dvp != data->lav_root) + return (ENOENT); + + /* + * Check if the current process is in the automounters process + * group. (If it is, the current process is either the autmounter + * itself or one of it's forked child processes.) If so, don't + * redirect this call back into the automounter because we'll + * hang. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter request structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_FALSE, + nm)) == NULL) + return (ENOENT); + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if ((!is_dup) && + ((error = lx_autofs_fifo_write(data, laar)) != 0)) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. + */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (error); + } + + /* Wait for someone to signal us that this request has completed. */ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this call. 
*/ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + return (EINTR); + } + } + mutex_exit(&laar->laar_lock); + + if (laar->laar_result == LXACR_READY) { + /* + * Mount succeeded, keep track for future expire calls. + * + * See vfs lav_vn_hash. Is this something we could use for + * iterating mounts under this autofs? Used by + * lx_autofs_vn_alloc + */ + lx_autofs_mntent_t *mp; + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, mp->lxafme_len); + + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } + + lx_autofs_la_release(data, laar); + + return (0); +} + +/* + * Same preliminary checks as in lx_autofs_unmount. + */ +static boolean_t +lx_autofs_may_unmount(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (B_FALSE); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (B_FALSE); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + + /* Check for any remaining holds on the root vnode. */ + if (data->lav_root->v_count > 1) + return (B_FALSE); + + return (B_TRUE); +} + +static vfs_t * +lx_autofs_get_mountvfs(char *fs_mntpt, int *cnt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + int fsmplen; + int acnt = 0; + + fsmplen = strlen(fs_mntpt); + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + *cnt = 0; + return (NULL); + } + + do { + /* Skip mounts we shouldn't show. */ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt; + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) == 0 && + (mntpt[fsmplen] == '\0' || mntpt[fsmplen] == '/')) { + /* + * We'll return the first one we find but don't + * return a mount that is actually autofs (i.e. + * autofs direct or offset mount). + */ + if (vfsp->vfs_op == lx_autofs_vfsops) { + acnt++; + } else if (fnd_vfs == NULL) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + } + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + *cnt = acnt; + return (fnd_vfs); +} + +/* + * Unmount all autofs offset mounts below the given path. + */ +static boolean_t +lx_autofs_umount_offset(char *fs_mntpt, struct cred *cr) +{ + struct vfs *vfsp; + struct vfs *vfslist; + boolean_t busy = B_FALSE; + int fsmplen = strlen(fs_mntpt); + +restart: + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (B_FALSE); + } + + do { + char *mntpt; + lx_autofs_vfs_t *data; + + /* Skip mounts we should ignore. */ + if ((vfsp->vfs_flag & VFS_NOMNTTAB)) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) != 0 || + (mntpt[fsmplen] != '\0' && mntpt[fsmplen] != '/')) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Something got mounted over the autofs mountpoint + * after we checked that this inidrect hierarchy was + * not busy. 
+ */ + busy = B_TRUE; + break; + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_mnttype != LXAMT_OFFSET) { + /* + * Something mounted a non-offset autofs fs under this + * indirect mnt! + */ + busy = B_TRUE; + break; + } + + /* + * Attempt to umount - set busy if fails. + * + * umount2_engine will call VFS_RELE, so we need to take an + * extra hold to match the behavior during the normal umount + * path. + * + * We also need to drop the list lock to prevent deadlock + * during umount. + */ + VFS_HOLD(vfsp); + vfs_list_unlock(); + if (umount2_engine(vfsp, 0, cr, 0) != 0) { + busy = B_TRUE; + goto errexit; + } + + /* Retake list lock and look for more. */ + goto restart; + } while (vfsp != vfslist); + + vfs_list_unlock(); + +errexit: + return (busy); +} + + +/* + * Note that lx_autofs_automounter_call() only supports queries in the root + * directory, so all mntent names are relative to that. + */ +static int +lx_autofs_expire(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + lx_autofs_mntent_t *mp; + lx_autofs_automnt_req_t *laar; + boolean_t is_dup; + vfs_t *fnd_vfs; + int autofs_cnt; + boolean_t busy = B_FALSE; + char exp_path[MAXPATHLEN]; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* + * We process only the first element (i.e. do not do multi). This + * works fine for the automounter. + */ + mutex_enter(&data->lav_lock); + mp = (lx_autofs_mntent_t *)list_remove_head(&data->lav_mnt_list); + mutex_exit(&data->lav_lock); + if (mp == NULL) { + if (data->lav_mnttype == LXAMT_OFFSET) { + /* + * During restart the automounter will openmount each + * offset mount for management. It won't closemount the + * offset mount until we expire it, even though nothing + * is mounted over that offset. We handle this as a + * special expiration case. + */ + int cnt; + + mutex_enter(&data->lav_lock); + cnt = data->lav_openmnt_cnt; + mutex_exit(&data->lav_lock); + + if (cnt == 1 && vn_ismntpt(data->lav_root) == 0) { + char *mntpt = (char *) + refstr_value(vfsp->vfs_mntpt); + char *nm = ZONE_PATH_TRANSLATE(mntpt, curzone); + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), + KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, + KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, + mp->lxafme_len); + + goto exp_offset; + } + } + + return (EAGAIN); + } + + /* + * We only return an expired mount if it is inactive for the full + * timeout. This reduces overly aggressive umount/mount activity. + */ + if (data->lav_timeout > 0) { + uint64_t now = TICK_TO_SEC(ddi_get_lbolt64()); + + if ((now - mp->lxafme_ts) < data->lav_timeout) { + /* put it back at the end of the line */ + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + } + + if (data->lav_mnttype == LXAMT_INDIR) { + (void) snprintf(exp_path, sizeof (exp_path), "%s/%s", + (char *)refstr_value(vfsp->vfs_mntpt), mp->lxafme_path); + } else { + (void) strlcpy(exp_path, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (exp_path)); + } + + fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt); + if (fnd_vfs != NULL) { + boolean_t skip = B_FALSE; + vfssw_t *vfssw; + + /* + * If it's an NFS file system (typical) then we check in + * advance to see if it can be unmounted, otherwise, proceed. + * The fs-specific umount attempted by the automounter will + * either succeed or fail. 
Both are valid outcomes but checking + * now for nfs will save a bunch of work by the automounter + * if the fs is busy. + * + * Unfortunately, for NFS the vfs_fstype is the same for all + * versions of NFS, so we need to check the vfs_op member to + * determine which version of NFS we're dealing with. + */ + if (!skip && (vfssw = vfs_getvfssw("nfs4")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable4(fnd_vfs)) + busy = B_TRUE; + skip = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + if (!skip && (vfssw = vfs_getvfssw("nfs3")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable(fnd_vfs)) + busy = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + VFS_RELE(fnd_vfs); + + } else if (autofs_cnt > 0) { + /* + * The automounter is asking us to expire and we pulled this + * name from our vfs mountpoint list, but if + * lx_autofs_get_mountvfs returns null then that means we + * didn't find a non-autofs mount under this name. Thus, the + * name could be a subdirectory under an autofs toplevel + * indirect mount with one or more offset mounts below. + * autofs_cnt will indicate how many autofs mounts exist below + * this subdirectory name. + * + * The automounter will take care of unmounting any fs mounted + * over one of these offset mounts (i.e. offset is like a + * direct mount which the automounter will manage) but the + * automounter will not unmount the actual autofs offset mount + * itself, so we have to do that before we can expire the + * top-level subrectory name. + */ + busy = lx_autofs_umount_offset(exp_path, cr); + } + + if (busy) { + /* + * Can't unmount this one right now, put it at the end of the + * list and return. The caller will return EAGAIN for the + * expire ioctl and the automounter will check again later. + */ + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + + /* + * See lx_autofs_automounter_call. We want to send a msg up the pipe + * to the automounter in a similar way. + */ + +exp_offset: + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + goto err_free; + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter expire structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_TRUE, + mp->lxafme_path)) == NULL) + goto err_free; + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if (!is_dup && lx_autofs_fifo_write(data, laar) != 0) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. + */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + goto err_free; + } + + /* Wait for someone to signal us that this request has completed. */ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this request. 
*/ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + goto err_free; + } + } + mutex_exit(&laar->laar_lock); + + /* + * If it failed or if the file system is still mounted after we get the + * response from our expire msg, then that means the automounter tried + * to unmount it but failed because the file system is busy, so we put + * this entry back on our list to try to expire it again later. + */ + fnd_vfs = NULL; + if (laar->laar_result == LXACR_FAIL || + (fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt)) != NULL || + autofs_cnt > 0) { + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } else { + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + lx_autofs_la_release(data, laar); + return (0); + +err_free: + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + return (EAGAIN); +} + +static int +lx_autofs_ack(int reqid, vfs_t *vfsp, enum lx_autofs_callres result) +{ + lx_autofs_vfs_t *data; + lx_autofs_automnt_req_t *laar; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if ((laar = lx_autofs_la_find(data, reqid)) == NULL) + return (ENXIO); + + /* Mark the request as complete and release it. */ + laar->laar_result = result; + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (0); +} + +static int +lx_autofs_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg, cred_t *cr) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + int id = arg; + int v; + int err; + + /* + * Be strict. + * We only accept ioctls from the automounter process group. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp != curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + switch (cmd) { + case LX_AUTOFS_IOC_READY: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_READY)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_FAIL: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_FAIL)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_CATATONIC: + /* The automounter is shutting down. */ + lx_autofs_fifo_close(data); + return (0); + + case LX_AUTOFS_IOC_PROTOVER: + v = LX_AUTOFS_PROTO_VERS5; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_PROTOSUBVER: + v = LX_AUTOFS_PROTO_SUBVERSION; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_ASKUMOUNT: + /* + * This is asking if autofs can be unmounted, not asking to + * actually unmount it. We return 1 if it is busy or 0 if it + * can be unmounted. + */ + v = 1; + if (lx_autofs_may_unmount(vp->v_vfsp, cr)) + v = 0; + + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_SETTIMEOUT: + if (copyin((caddr_t)arg, &data->lav_timeout, sizeof (ulong_t)) + != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_EXPIRE: + return (ENOTSUP); + + case LX_AUTOFS_IOC_EXPIRE_MULTI: + lx_autofs_expire(vp->v_vfsp, cr); + return (EAGAIN); + + default: + ASSERT(0); + return (ENOTSUP); + } +} + +static int +lx_autofs_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data) +{ + char *fd_str, *pgrp_str, *minproto_str, *maxproto_str; + int fd, pgrp, minproto, maxproto; + file_t *fp_wr, *fp_rd; + + /* Require these options to be present. 
*/ + if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1)) + return (EINVAL); + + /* Get the values for each parameter. */ + if ((lx_autofs_str_to_int(fd_str, &fd) != 0) || + (lx_autofs_str_to_int(pgrp_str, &pgrp) != 0) || + (lx_autofs_str_to_int(minproto_str, &minproto) != 0) || + (lx_autofs_str_to_int(maxproto_str, &maxproto) != 0)) + return (EINVAL); + + /* + * We primarily support v2 & v5 of the linux kernel automounter + * protocol. The userland daemon typically needs v5. We'll reject + * unsupported ioctls later if we get one. + */ + if ((minproto > 5) || (maxproto < 2)) + return (EINVAL); + + /* + * Now we need to lookup the fifos we'll be using + * to talk to the userland automounter process. + */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + /* + * The automounter doesn't always have the same id as the pgrp. + * This happens when it is started via one of the various + * service managers. In this case the fifo lookup will fail + * so we retry with our own pid. + */ + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) + return (EINVAL); + } + + if (vfs_optionisset(vfsp, LX_MNTOPT_INDIRECT, NULL)) { + data->lav_mnttype = LXAMT_INDIR; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_DIRECT, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_DIRECT; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_OFFSET, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_OFFSET; + } + /* The automounter does test mounts with none of the options */ + if (data->lav_mnttype == LXAMT_NONE) + data->lav_mnttype = LXAMT_DIRECT; + + /* Save the mount options and fifo pointers. */ + data->lav_fd = fd; + data->lav_min_proto = minproto; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + return (0); +} + +static uint64_t +s2l_dev(dev_t dev) +{ + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + return (LX_MAKEDEVICE(maj, min)); +} + +/* + * VFS entry points + */ +static int +lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lx_autofs_vfs_t *data; + dev_t dev; + char name[40]; + int error; + vattr_t va; + + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) + return (EBUSY); + + /* We don't support mounts in the global zone. */ + if (getzoneid() == GLOBAL_ZONEID) + return (EPERM); + + /* + * Offset mounts will occur below the top-level mountpoint so we + * need to allow for autofs mounts even though mvp is an autofs. + */ + + /* Allocate a vfs struct. */ + data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP); + + /* Parse mount options. */ + if ((error = lx_autofs_parse_mntopt(vfsp, data)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + + /* Initialize the backing store. 
*/ + lx_autofs_bs_destroy(mvp, LX_AUTOFS_BS_DIR); + data->lav_bs_vp = lx_autofs_bs_create(mvp, LX_AUTOFS_BS_DIR); + if (data->lav_bs_vp == NULL) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (EBUSY); + } + data->lav_bs_name = LX_AUTOFS_BS_DIR; + + /* Get the backing store inode for use in v5 protocol msgs */ + va.va_mask = AT_STAT; + if ((error = VOP_GETATTR(data->lav_bs_vp, &va, 0, cr, NULL)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + data->lav_ino = va.va_nodeid; + + /* We have to hold the underlying vnode we're mounted on. */ + data->lav_mvp = mvp; + VN_HOLD(mvp); + + /* Initialize vfs fields */ + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lx_autofs_fstype; + vfsp->vfs_data = data; + + /* Invent a dev_t (sigh) */ + do { + dev = makedevice(lx_autofs_major, + atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32); + } while (vfs_devismounted(dev)); + vfsp->vfs_dev = dev; + vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype); + + data->lav_dev = s2l_dev(vfsp->vfs_dev); + + /* Create an id space arena for automounter requests. */ + (void) snprintf(name, sizeof (name), "lx_autofs_id_%d", + getminor(vfsp->vfs_dev)); + data->lav_ids = id_space_create(name, 1, INT_MAX); + + /* Create hashes to keep track of automounter requests. */ + mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL); + (void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_path_hash = mod_hash_create_strhash(name, + LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor); + (void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_id_hash = mod_hash_create_idhash(name, + LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor); + + /* Create a hash to keep track of vnodes. */ + (void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_vn_hash = mod_hash_create_ptrhash(name, + LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor, + sizeof (vnode_t)); + + list_create(&data->lav_mnt_list, sizeof (lx_autofs_mntent_t), + offsetof(lx_autofs_mntent_t, lxafme_lst)); + + /* Create root vnode */ + data->lav_root = lx_autofs_vn_alloc(vfsp, data->lav_bs_vp); + + data->lav_root->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP; + + /* + * For a direct mountpoint we need to allow a filesystem to be + * mounted overtop of this autofs mount. Otherwise, disallow that. + */ + if (data->lav_mnttype == LXAMT_INDIR) + data->lav_root->v_flag |= VNOMOUNT; + + return (0); +} + +static int +lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (EPERM); + + /* We do not currently support forced unmounts. */ + if (flag & MS_FORCE) + return (ENOTSUP); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (EBUSY); + + /* Check for any remaining holds on the root vnode. */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + if (data->lav_root->v_count > 1) + return (EBUSY); + + /* Close the fifo to the automount process. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + /* + * We have to release our hold on our root vnode before we can + * delete the backing store. 
(Since the root vnode is linked + * to the backing store.) + */ + VN_RELE(data->lav_root); + + /* Cleanup the backing store. */ + lx_autofs_bs_destroy(data->lav_mvp, data->lav_bs_name); + VN_RELE(data->lav_mvp); + + /* + * Delete all listed mounts. + */ + for (;;) { + lx_autofs_mntent_t *mp; + + mp = list_remove_head(&data->lav_mnt_list); + if (mp == NULL) + break; + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + /* Cleanup out remaining data structures. */ + mod_hash_destroy_strhash(data->lav_path_hash); + mod_hash_destroy_idhash(data->lav_id_hash); + mod_hash_destroy_ptrhash(data->lav_vn_hash); + id_space_destroy(data->lav_ids); + list_destroy(&data->lav_mnt_list); + kmem_free(data, sizeof (lx_autofs_vfs_t)); + + return (0); +} + +static int +lx_autofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + *vpp = data->lav_root; + VN_HOLD(*vpp); + + return (0); +} + +static int +lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *urvp = data->lav_root->v_data; + dev32_t d32; + int error; + + if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0) + return (error); + + /* Update some of values before returning. */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + (void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name, + sizeof (sp->f_basetype)); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + return (0); +} + +static const fs_operation_def_t lx_autofs_vfstops[] = { + { VFSNAME_MOUNT, { .vfs_mount = lx_autofs_mount } }, + { VFSNAME_UNMOUNT, { .vfs_unmount = lx_autofs_unmount } }, + { VFSNAME_ROOT, { .vfs_root = lx_autofs_root } }, + { VFSNAME_STATVFS, { .vfs_statvfs = lx_autofs_statvfs } }, + { NULL, NULL } +}; + +/* + * VOP entry points - simple passthrough + * + * For most VOP entry points we can simply pass the request on to + * the underlying filesystem we're mounted on. + */ +static int +lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_CLOSE(uvp, flag, count, offset, cr, ctp)); +} + +static int +lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ctp, int flags) +{ + vnode_t *uvp = vp->v_data; + return (VOP_READDIR(uvp, uiop, cr, eofp, ctp, flags)); +} + +static int +lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_ACCESS(uvp, mode, flags, cr, ctp)); +} + +static int +lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_RWLOCK(uvp, write_lock, ctp)); +} + +static void +lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + VOP_RWUNLOCK(uvp, write_lock, ctp); +} + +/* + * Check if attempting to access a 'direct' mount and if so, call the + * automounter to perform the mount. Once the mount occurs, the new filesystem + * will be mounted overtop of this autofs mountpoint and we will no longer + * come through this path. 
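+ * + * The covering filesystem's vnode is returned held and the caller must + * VN_RELE it. NULL is returned for indirect mounts, or when no new + * filesystem ends up mounted over the autofs vnode.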
+ */ +static vnode_t * +lx_autofs_do_direct(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *nvp; + boolean_t skip_am_call = B_FALSE; + + if (data->lav_mnttype == LXAMT_INDIR) + return (NULL); + + /* + * Check if the current process is in the automounter's process group. + * If it is, the current process is either the automounter itself or + * one of it's children. If so, don't call back into the automounter. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + skip_am_call = B_TRUE; + } + mutex_exit(&pidlock); + + /* + * It is possible there is already a new fs mounted on top of our vnode. + * This can happen if the caller first did a lookup of a file name + * using our vnode as the directory vp. The lookup would trigger the + * autofs mount on top of ourself, but if the caller then uses our + * vnode to do a getattr on the directory, it will use the autofs + * vnode and not the newly mounted vnode. We need to skip re-calling + * the automounter for this case. + */ + if (!skip_am_call && vn_mountedvfs(vp) == NULL) { + char tbuf[MAXPATHLEN]; + char *nm; + + (void) strlcpy(tbuf, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (tbuf)); + nm = tbuf + strlen(tbuf); + while (*nm != '/' && nm != tbuf) + nm--; + if (*nm == '/') + nm++; + (void) lx_autofs_automounter_call(vp, nm); + } + + /* + * We need to take an extra hold on our vp (which is the autofs + * root vp) to account for the rele done in traverse. traverse will + * take a hold on the new vp so the caller is responsible for calling + * VN_RELE on the returned vp. + */ + VN_HOLD(vp); + nvp = vp; + if (traverse(&nvp) != 0) { + VN_RELE(nvp); + return (NULL); + } + + /* Confirm that we have a non-autofs fs mounted now */ + if (nvp->v_op == lx_autofs_vn_ops) { + VN_RELE(nvp); + return (NULL); + } + + return (nvp); +} + +/*ARGSUSED*/ +static int +lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ctp, int flags) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + + /* handle direct mount here */ + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + int error; + + error = VOP_RMDIR(nvp, nm, cdir, cr, ctp, flags); + VN_RELE(nvp); + return (error); + } + + /* + * cdir is the calling processes current directory. + * If cdir is lx_autofs vnode then get its real underlying + * vnode ptr. (It seems like the only thing cdir is + * ever used for is to make sure the user doesn't delete + * their current directory.) + */ + if (vn_matchops(cdir, lx_autofs_vn_ops)) { + vnode_t *ucdir = cdir->v_data; + return (VOP_RMDIR(udvp, nm, ucdir, cr, ctp, flags)); + } + + return (VOP_RMDIR(udvp, nm, cdir, cr, ctp, flags)); +} + +/* + * VOP entry points - special passthrough + * + * For some VOP entry points we will first pass the request on to + * the underlying filesystem we're mounted on. If there's an error + * then we immediately return the error, but if the request succeeds + * we have to do some extra work before returning. + */ +static int +lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp) +{ + vnode_t *ovp = *vpp; + vnode_t *uvp = ovp->v_data; + int error; + + /* direct mounts were handled by the lookup to get *vpp */ + + if ((error = VOP_OPEN(&uvp, flag, cr, ctp)) != 0) + return (error); + + /* Check for clone opens. */ + if (uvp == ovp->v_data) + return (0); + + /* Deal with clone opens by returning a new vnode. 
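+ * A clone open is one where the underlying VOP_OPEN handed back a vnode + * other than the one we passed in, so we wrap the new vnode and drop our + * hold on the original.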
*/ + *vpp = lx_autofs_vn_alloc(ovp->v_vfsp, uvp); + VN_RELE(ovp); + return (0); +} + +/* + * Internally, we have already converted our autofs vfs device number into a + * Linux-format device during lx_autofs_mount and stored that device number + * in data->lav_dev. However, our lx emulation for the various stat() syscalls + * also wants to convert the fsid the same way. That obviously will be + * incorrect if we pass along an fsid that is already converted, so we always + * pass along the original vfs fsid here. Both lav_dev and lav_ino are passed + * in messages to the automounter, and these must match the values obtained by + * stat(). + */ +static int +lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + vnode_t *dvp; + int error; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + dev_t autofs_fsid = vp->v_vfsp->vfs_dev; + + if ((dvp = lx_autofs_do_direct(vp)) != NULL) { + uvp = dvp; + } + + error = VOP_GETATTR(uvp, vap, flags, cr, ctp); + + if (dvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(dvp); + if (error == 0) { + /* + * During automounter restart recovery, the automounter + * will fstat the fd provided in the setpipe ioctl. It + * uses the resulting inode & dev to correlate future + * autofs fifo requests to the correct entry. Thus, we + * have to update the attributes with the proper IDs. + */ + vap->va_fsid = autofs_fsid; + vap->va_nodeid = data->lav_ino; + } + } else if (error == 0) { + /* Update the attributes with our filesystem id. */ + vap->va_fsid = autofs_fsid; + } + + return (error); +} + +static int +lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + int error; + + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + udvp = nvp; + } + + error = VOP_MKDIR(udvp, nm, vap, vpp, cr, ctp, flags, vsecp); + + if (nvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(nvp); + } else if (error == 0) { + vnode_t *uvp = NULL; + + /* Update the attributes with our filesystem id. */ + vap->va_fsid = dvp->v_vfsp->vfs_dev; + + /* Allocate our new vnode. */ + uvp = *vpp; + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + } + + return (error); +} + +/* + * VOP entry points - custom + */ +/*ARGSUSED*/ +static void +lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + + /* + * We need to hold the vfs lock because if we're going to free + * this vnode we have to prevent anyone from looking it up + * in the vnode hash. + */ + mutex_enter(&data->lav_lock); + mutex_enter(&vp->v_lock); + + if (vp->v_count < 1) { + panic("lx_autofs_inactive: bad v_count"); + /*NOTREACHED*/ + } + + /* Drop the temporary hold by vn_rele now. */ + if (--vp->v_count > 0) { + mutex_exit(&vp->v_lock); + mutex_exit(&data->lav_lock); + return; + } + + /* + * No one should have been blocked on this lock because we're + * about to free this vnode. 
+ */ + lx_autofs_vn_free(vp); +} + +static int +lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *uvp = NULL; + lx_autofs_vfs_t *data; + int error = ENOENT; + + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* + * For an indirect mount first try to lookup if this path component + * already exists. + */ + if (data->lav_mnttype == LXAMT_INDIR) { + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } + + /* Only query the automounter if the path does not exist. */ + if (error != ENOENT) + return (error); + + if (data->lav_catatonic) + return (ENOENT); + + /* Save the uid/gid for the requestor ioctl. */ + data->lav_uid = crgetuid(cr); + data->lav_gid = crgetgid(cr); + + /* Refer the lookup to the automounter. */ + if ((error = lx_autofs_automounter_call(dvp, nm)) != 0) + return (error); + + if (data->lav_mnttype == LXAMT_INDIR) { + /* + * Indirect mount. The automounter call should have mounted + * something on nm. Retry the lookup operation. + */ + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } else { + /* + * Direct or offset mount. The automounter call should have + * covered our 'dvp' with a new filesystem. Traverse into the + * new mount and retry the lookup. + * + * We need to take an extra hold on our vp (which is the autofs + * root vp) to acount for the rele done in traverse. Our caller + * will also do a rele on the original dvp and that would leave + * us one ref short on our autofs root vnode. + */ + vnode_t *orig_dvp = dvp; + + VN_HOLD(dvp); + if ((error = traverse(&dvp)) != 0) { + VN_RELE(dvp); + return (error); + } + + if (dvp == orig_dvp) { + /* + * For some reason the automountd did not actually + * mount the new filesystem. Return an error. + */ + VN_RELE(dvp); + return (ENOENT); + } + + error = VOP_LOOKUP(dvp, nm, vpp, pnp, flags, rdir, cr, ctp, + direntflags, realpnp); + + /* release the traverse hold */ + VN_RELE(dvp); + } + return (error); +} + +static int +lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, + int *rvalp, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + + /* Intercept our ioctls. */ + switch ((uint_t)cmd) { + case LX_AUTOFS_IOC_READY: + case LX_AUTOFS_IOC_FAIL: + case LX_AUTOFS_IOC_CATATONIC: + case LX_AUTOFS_IOC_PROTOVER: + case LX_AUTOFS_IOC_SETTIMEOUT: + case LX_AUTOFS_IOC_EXPIRE: + case LX_AUTOFS_IOC_EXPIRE_MULTI: + case LX_AUTOFS_IOC_PROTOSUBVER: + case LX_AUTOFS_IOC_ASKUMOUNT: + return (lx_autofs_automounter_ioctl(vp, cmd, arg, cr)); + } + + /* Pass any remaining ioctl on. 
*/ + return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp, ctp)); +} + +/* + * VOP entry points definitions + */ +static const fs_operation_def_t lx_autofs_tops_root[] = { + { VOPNAME_OPEN, { .vop_open = lx_autofs_open } }, + { VOPNAME_CLOSE, { .vop_close = lx_autofs_close } }, + { VOPNAME_IOCTL, { .vop_ioctl = lx_autofs_ioctl } }, + { VOPNAME_RWLOCK, { .vop_rwlock = lx_autofs_rwlock } }, + { VOPNAME_RWUNLOCK, { .vop_rwunlock = lx_autofs_rwunlock } }, + { VOPNAME_GETATTR, { .vop_getattr = lx_autofs_getattr } }, + { VOPNAME_ACCESS, { .vop_access = lx_autofs_access } }, + { VOPNAME_READDIR, { .vop_readdir = lx_autofs_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = lx_autofs_lookup } }, + { VOPNAME_INACTIVE, { .vop_inactive = lx_autofs_inactive } }, + { VOPNAME_MKDIR, { .vop_mkdir = lx_autofs_mkdir } }, + { VOPNAME_RMDIR, { .vop_rmdir = lx_autofs_rmdir } }, + { NULL } +}; + +/* + * DEV-specific entry points + */ + +/*ARGSUSED*/ +static int +lx_autofs_dev_open(dev_t *devp, int flags, int otyp, cred_t *credp) +{ + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_dev_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + return (0); +} + +static int +lx_autofs_dev_validate_cmd(intptr_t arg, lx_autofs_dv_ioctl_t *dcmd) +{ + if (copyin((caddr_t)arg, dcmd, sizeof (lx_autofs_dv_ioctl_t)) != 0) + return (EFAULT); + + if (dcmd->lad_ver_major != LX_AUTOFS_DEV_VERSION_MAJOR || + dcmd->lad_ver_minor > LX_AUTOFS_DEV_VERSION_MINOR) + return (EINVAL); + + DTRACE_PROBE1(lx__dev__cmd, void *, dcmd); + + /* Fill in the version for return */ + dcmd->lad_ver_major = LX_AUTOFS_DEV_VERSION_MAJOR; + dcmd->lad_ver_minor = LX_AUTOFS_DEV_VERSION_MINOR; + return (0); +} + +static vfs_t * +lx_autofs_dev_getvfs_bypath(char *fs_mntpt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (NULL); + } + + do { + if (vfsp->vfs_op == lx_autofs_vfsops) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(fs_mntpt, ZONE_PATH_TRANSLATE(mntpt, zone)) + == 0) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + return (fnd_vfs); +} + +static int +lx_autofs_dev_fd_preamble(intptr_t arg, lx_autofs_dv_ioctl_t *dc, vfs_t **vfspp) +{ + int err; + lx_autofs_vfs_t *data; + file_t *fp; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_validate_cmd(arg, dc)) != 0) + return (err); + + if ((fp = getf(dc->lad_ioctlfd)) == NULL) + return (EBADF); + + vfsp = fp->f_vnode->v_vfsp; + if (vfsp->vfs_op != lx_autofs_vfsops) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_root->v_count <= 1) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + VFS_HOLD(vfsp); + *vfspp = vfsp; + + releasef(dc->lad_ioctlfd); + return (0); +} + +static int +lx_autofs_dev_vers(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int +lx_autofs_dev_protver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_VERS5; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int 
+lx_autofs_dev_protosubver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_SUBVERSION; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int +lx_autofs_dev_get_path_cmd(intptr_t arg, lx_autofs_dv_ioctl_t **dcp) +{ + int err; + lx_autofs_dv_ioctl_t dcmd, *dc; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if (dcmd.lad_size <= sizeof (dcmd) || + dcmd.lad_size > (sizeof (dcmd) + MAXPATHLEN)) + return (EINVAL); + + dc = kmem_alloc(dcmd.lad_size, KM_SLEEP); + + /* re-copyin the full struct with the path */ + if (copyin((caddr_t)arg, dc, dcmd.lad_size) != 0) { + kmem_free(dc, dcmd.lad_size); + return (EFAULT); + } + dc->lad_size = dcmd.lad_size; + + if (dc->lad_path[0] != '/' || + dc->lad_path[dcmd.lad_size - sizeof (dcmd) - 1] != '\0') { + kmem_free(dc, dcmd.lad_size); + return (EINVAL); + } + + *dcp = dc; + return (0); +} + +static int +lx_autofs_dev_openmount(intptr_t arg) +{ + int err; + int fd; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + if ((vfsp = lx_autofs_dev_getvfs_bypath(dc->lad_path)) == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + /* lad_arg1 is the dev number of the mnt but we don't check that */ + + /* + * Do an "open" on the root vnode. To fully simulate "open" we also add + * a hold on the root vnode itself since lx_autofs_open will only open + * (and hold) the underlying vnode. + */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + VN_HOLD(data->lav_root); + if ((err = fassign(&data->lav_root, FWRITE|FREAD, &fd)) != 0) { + VN_RELE(data->lav_root); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (err); + } + + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt++; + mutex_exit(&data->lav_lock); + + dc->lad_ioctlfd = fd; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + (void) closeandsetf(fd, NULL); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + VFS_RELE(vfsp); + + kmem_free(dc, dc->lad_size); + return (0); +} + +static int +lx_autofs_dev_closemount(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* "close" the vnode */ + if ((err = closeandsetf(dcmd.lad_ioctlfd, NULL)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + mutex_enter(&data->lav_lock); + ASSERT(data->lav_openmnt_cnt > 0); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + + VFS_RELE(vfsp); + return (0); +} + +static int +lx_autofs_dev_ready(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_READY)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + return (0); +} + +static int +lx_autofs_dev_fail(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_FAIL)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + 
return (0); +} + +/* + * Update the fifo pipe information we use to talk to the automounter. The + * ioctl is used when the automounter restarts. This logic is similar to the + * handling done in lx_autofs_parse_mntopt() when the filesytem is first + * mounted. + */ +static int +lx_autofs_dev_setpipefd(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + int fd, pgrp; + file_t *fp_wr, *fp_rd; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + mutex_enter(&pidlock); + pgrp = curproc->p_pgrp; + mutex_exit(&pidlock); + fd = dcmd.lad_arg1; + + /* Lookup the new fifos. See comment in lx_autofs_parse_mntopt. */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) { + VFS_RELE(vfsp); + return (EINVAL); + } + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* Close the old fifos. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + data->lav_fd = fd; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + /* + * Not explicitly in the ioctl spec. but necessary for correct recovery + */ + data->lav_catatonic = B_FALSE; + + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_catatonic(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_catatonic = B_TRUE; + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_expire(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + /* If it succeeds in expiring then we don't want to return EAGAIN */ + if ((err = lx_autofs_expire(vfsp, kcred)) == 0) { + VFS_RELE(vfsp); + return (0); + } + + VFS_RELE(vfsp); + return (EAGAIN); +} + +static int +lx_autofs_dev_timeout(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_timeout = dcmd.lad_arg1; + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_requestor(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + struct vfs *vfslist; + zone_t *zone = curzone; + lx_autofs_vfs_t *data; + uid_t uid; + gid_t gid; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + do { + /* Skip mounts we shouldn't show. */ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Found an indirect mount (probably + * NFS) so we need to get the vfs it's + * mounted onto. + */ + vnode_t *vn = vfsp->vfs_vnodecovered; + vfsp = vn->v_vfsp; + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * autofs doesn't manage this + * path. 
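+ * Leaving fnd_vfs NULL here causes us to fall + * through and return EINVAL below.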
+ */ + break; + } + } + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + data = (lx_autofs_vfs_t *)fnd_vfs->vfs_data; + uid = data->lav_uid; + gid = data->lav_gid; + VFS_RELE(fnd_vfs); + + dc->lad_arg1 = uid; + dc->lad_arg2 = gid; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + return (0); +} + +static int +lx_autofs_dev_ismntpt(intptr_t arg) +{ + int err = 0; + lx_autofs_dv_ioctl_t *dc; + struct vfs *vfslist; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + /* + * The automounter will always pass a path. It can also either pass an + * ioctlfd or, if it's -1, arg1 can be an LX_AUTOFS_TYPE_* value. We + * currently don't need those for our algorithm. + */ + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + do { + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + /* + * To handle direct mounts (on top of an autofs + * mount), we must prefer non-autofs vfs for + * this request. + */ + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + + if (fnd_vfs->vfs_op != lx_autofs_vfsops) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + /* + * arg1 is device number, arg2 is superblock magic number + * The superblock value only matters if autofs or not. + */ + dc->lad_arg1 = fnd_vfs->vfs_dev; + if (fnd_vfs->vfs_op == lx_autofs_vfsops) { + dc->lad_arg2 = LX_AUTOFS_SB_MAGIC; + } else { + dc->lad_arg2 = ~LX_AUTOFS_SB_MAGIC; + } + + VFS_RELE(fnd_vfs); + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + + /* + * We have to return 1 if it is a mount point. The lx ioctl autofs + * translator will convert a negative value back to a positive, + * non-error return value. 
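+ * (e.g. the -1 returned below is seen by the Linux caller as an ioctl + * return value of 1.)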
+ */ + return (-1); +} + +static int +lx_autofs_dev_askumount(intptr_t arg) +{ + int err; + int v; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if (lx_autofs_may_unmount(vfsp, kcred)) { + v = 0; + } else { + v = 1; + } + VFS_RELE(vfsp); + + dcmd.lad_arg1 = v; + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_dev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + switch (cmd) { + case LX_AUTOFS_DEV_IOC_VERSION_CMD: + return (lx_autofs_dev_vers(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOVER_CMD: + return (lx_autofs_dev_protver(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD: + return (lx_autofs_dev_protosubver(arg)); + + case LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD: + return (lx_autofs_dev_openmount(arg)); + + case LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD: + return (lx_autofs_dev_closemount(arg)); + + case LX_AUTOFS_DEV_IOC_READY_CMD: + return (lx_autofs_dev_ready(arg)); + + case LX_AUTOFS_DEV_IOC_FAIL_CMD: + return (lx_autofs_dev_fail(arg)); + + case LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD: + return (lx_autofs_dev_setpipefd(arg)); + + case LX_AUTOFS_DEV_IOC_CATATONIC_CMD: + return (lx_autofs_dev_catatonic(arg)); + + case LX_AUTOFS_DEV_IOC_TIMEOUT_CMD: + return (lx_autofs_dev_timeout(arg)); + + case LX_AUTOFS_DEV_IOC_REQUESTER_CMD: + return (lx_autofs_dev_requestor(arg)); + + case LX_AUTOFS_DEV_IOC_EXPIRE_CMD: + return (lx_autofs_dev_expire(arg)); + + case LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD: + return (lx_autofs_dev_askumount(arg)); + + case LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD: + return (lx_autofs_dev_ismntpt(arg)); + } + + return (EINVAL); +} + +/* + * lx_autofs_init() gets invoked via the mod_install() call in + * this module's _init() routine. Therefore, the code that cleans + * up the structures we allocate below is actually found in + * our _fini() routine. 
+ */ +/* ARGSUSED */ +static int +lx_autofs_init(int fstype, char *name) +{ + int error; + + lx_autofs_major = ddi_name_to_major(LX_AUTOFS_NAME); + + lx_autofs_fstype = fstype; + if ((error = vfs_setfsops(fstype, lx_autofs_vfstops, + &lx_autofs_vfsops)) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template"); + return (error); + } + + if ((error = vn_make_ops(name, lx_autofs_tops_root, + &lx_autofs_vn_ops)) != 0) { + VERIFY(vfs_freevfsops_by_type(fstype) == 0); + lx_autofs_vn_ops = NULL; + return (error); + } + + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance = ddi_get_instance(dip); + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + ASSERT(instance == 0); + if (instance != 0) + return (DDI_FAILURE); + + /* create our minor node */ + if (ddi_create_minor_node(dip, LX_AUTOFS_MINORNAME, S_IFCHR, 0, + DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + lx_autofs_dip = dip; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_autofs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + lx_autofs_dip = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_autofs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, + void **resultp) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = lx_autofs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Driver flags + */ +static struct cb_ops lx_autofs_cb_ops = { + lx_autofs_dev_open, /* open */ + lx_autofs_dev_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + lx_autofs_dev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* vb_prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +/* + * Module linkage + */ +static mntopt_t lx_autofs_mntopt[] = { + { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_INDIRECT, NULL, 0, 0 }, + { LX_MNTOPT_DIRECT, NULL, 0, 0 }, + { LX_MNTOPT_OFFSET, NULL, 0, 0 } +}; + +static mntopts_t lx_autofs_mntopts = { + sizeof (lx_autofs_mntopt) / sizeof (mntopt_t), + lx_autofs_mntopt +}; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + LX_AUTOFS_NAME, + lx_autofs_init, + VSW_HASPROTO | VSW_VOLATILEDEV | VSW_ZMOUNT, + &lx_autofs_mntopts +}; + +static struct dev_ops lx_autofs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + lx_autofs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_autofs_attach, /* attach */ + lx_autofs_detach, /* detach */ + nodev, /* reset */ + &lx_autofs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx autofs filesystem", &vfw +}; + +static struct modldrv modldrv = { + &mod_driverops, "lx autofs driver", &lx_autofs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlfs, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) { + return (error); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, 
modinfop)); +} + +int +_fini(void) +{ + int error; + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + if (lx_autofs_vn_ops != NULL) { + vn_freevnodeops(lx_autofs_vn_ops); + lx_autofs_vn_ops = NULL; + } + + /* + * In our init routine, if we get an error after calling + * vfs_setfsops() we cleanup by calling vfs_freevfsops_by_type(). + * But we don't need to call vfs_freevfsops_by_type() here + * because the fs framework did this for us as part of the + * mod_remove() call above. + */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/autofs/lxautofs.conf b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf new file mode 100644 index 0000000000..36e0119e33 --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2016 Joyent, Inc. +# + +name="lxautofs" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h new file mode 100644 index 0000000000..46e2cdd886 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h @@ -0,0 +1,222 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LXCGRPS_H +#define _LXCGRPS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * cgrps.h: declarations, data structures and macros for lx_cgroup + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> + +/* + * cgrpmgr ioctl interface. + */ +#define CGRPFS_IOC ('C' << 16 | 'G' << 8) +#define CGRPFS_GETEVNT (CGRPFS_IOC | 1) + +typedef struct cgrpmgr_info { + pid_t cgmi_pid; + char *cgmi_rel_agent_path; + char *cgmi_cgroup_path; +} cgrpmgr_info_t; + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +typedef struct cgrpmgr_info32 { + pid_t cgmi_pid; + caddr32_t cgmi_rel_agent_path; + caddr32_t cgmi_cgroup_path; +} cgrpmgr_info32_t; + +#define CG_PSNSIZE 256 /* max size of pseudo file name entries */ +#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */ + +/* + * The order of these entries must be in sync with the cg_ssde_dir array. 
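+ * cg_ssde_dir[] (in cgrps_node.c) is indexed directly by cgrp_ssid_t; for + * example, cgrp_num_pseudo_ents() simply returns + * cg_ssde_dir[ssid].cg_ssde_nfiles.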
+ */ +typedef enum cgrp_ssid { + CG_SSID_GENERIC = 1, + CG_SSID_NUM /* last ssid for range checking */ +} cgrp_ssid_t; + +typedef enum cgrp_nodetype { + CG_CGROUP_DIR = 1, /* cgroup directory entry */ + CG_NOTIFY, /* notify_on_release file */ + CG_PROCS, /* cgroup.procs file */ + CG_REL_AGENT, /* release_agent file */ + CG_TASKS, /* tasks file */ +} cgrp_nodetype_t; + +typedef struct cgrp_subsys_dirent { + cgrp_nodetype_t cgrp_ssd_type; + char *cgrp_ssd_name; +} cgrp_subsys_dirent_t; + +#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2) + +/* + * A modern systemd-based Linux system typically has 50-60 cgroups so + * we size the hash for 2x that number. + */ +#define CGRP_HASH_SZ 128 +#define CGRP_AGENT_LEN (MAXPATHLEN + 1) + +/* + * cgroups per-mount data structure. + * + * All but the event related fields are protected by cg_contents. + * The evnt_list and counter is protected by cg_events. + */ +typedef struct cgrp_mnt { + struct vfs *cg_vfsp; /* filesystem's vfs struct */ + struct cgrp_node *cg_rootnode; /* root cgrp_node */ + char *cg_mntpath; /* name of cgroup mount point */ + cgrp_ssid_t cg_ssid; /* subsystem type */ + dev_t cg_dev; /* unique dev # of mounted `device' */ + uint_t cg_gen; /* node ID source for files */ + uint_t cg_grp_gen; /* ID source for cgroups */ + kmutex_t cg_contents; /* global lock for most fs activity */ + char cg_agent[CGRP_AGENT_LEN]; /* release_agent path */ + /* ptr to zone data for containing zone */ + lx_zone_data_t *cg_lxzdata; + struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */ +} cgrp_mnt_t; + +/* + * cgrp_node is the file system dependent node for cgroups. + * + * The node is used to represent both directories (a cgroup) and pseudo files + * within the directory. + * + * Members are tagged in the comment to note which type of node they apply to: + * A - all + * D - dir (i.e. 
a cgroup) + * F - pseudo file + */ + +typedef struct cgrp_node { + struct cgrp_node *cgn_back; /* A lnked lst of cgrp_nodes */ + struct cgrp_node *cgn_forw; /* A lnked lst of cgrp_nodes */ + struct cgrp_dirent *cgn_dir; /* D dirent list */ + struct cgrp_node *cgn_parent; /* A dir containing this node */ + struct cgrp_node *cgn_next; /* D link in per-mount cgroup */ + /* hash table */ + uint_t cgn_dirents; /* D number of dirents */ + cgrp_nodetype_t cgn_type; /* A type for this node */ + uint_t cgn_notify; /* D notify_on_release value */ + uint_t cgn_task_cnt; /* D number of threads in grp */ + struct vnode *cgn_vnode; /* A vnode for this cgrp_node */ + uint_t cgn_id; /* D ID number for the cgroup */ + struct vattr cgn_attr; /* A attributes */ +} cgrp_node_t; + +/* + * File system independent to cgroups conversion macros + */ +#define VFSTOCGM(vfsp) ((cgrp_mnt_t *)(vfsp)->vfs_data) +#define VTOCGM(vp) ((cgrp_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOCGN(vp) ((struct cgrp_node *)(vp)->v_data) +#define CGNTOV(cn) ((cn)->cgn_vnode) +#define cgnode_hold(cn) VN_HOLD(CGNTOV(cn)) +#define cgnode_rele(cn) VN_RELE(CGNTOV(cn)) + +/* + * Attributes + */ +#define cgn_mask cgn_attr.va_mask +#define cgn_mode cgn_attr.va_mode +#define cgn_uid cgn_attr.va_uid +#define cgn_gid cgn_attr.va_gid +#define cgn_fsid cgn_attr.va_fsid +#define cgn_nodeid cgn_attr.va_nodeid +#define cgn_nlink cgn_attr.va_nlink +#define cgn_size cgn_attr.va_size +#define cgn_atime cgn_attr.va_atime +#define cgn_mtime cgn_attr.va_mtime +#define cgn_ctime cgn_attr.va_ctime +#define cgn_rdev cgn_attr.va_rdev +#define cgn_blksize cgn_attr.va_blksize +#define cgn_nblocks cgn_attr.va_nblocks +#define cgn_seq cgn_attr.va_seq + +/* + * cgroup directories are made up of a linked list of cg_dirent structures + * hanging off directory cgrp_nodes. File names are not fixed length, + * but are null terminated. 
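+ * Each entry is allocated as a single buffer of + * sizeof (cgrp_dirent_t) + strlen(name) + 1 bytes, with cgd_name pointing + * just past the structure.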
+ */ +typedef struct cgrp_dirent { + struct cgrp_node *cgd_cgrp_node; /* cg node for this file */ + struct cgrp_dirent *cgd_next; /* next directory entry */ + struct cgrp_dirent *cgd_prev; /* prev directory entry */ + uint_t cgd_offset; /* "offset" of dir entry */ + uint_t cgd_hash; /* a hash of cgd_name */ + struct cgrp_dirent *cgd_link; /* linked via hash table */ + struct cgrp_node *cgd_parent; /* parent, dir we are in */ + char *cgd_name; /* null terminated */ +} cgrp_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +extern struct vnodeops *cgrp_vnodeops; + +int cgrp_dirdelete(cgrp_node_t *, cgrp_node_t *, char *, enum dr_op, cred_t *); +int cgrp_direnter(cgrp_mnt_t *, cgrp_node_t *, char *, enum de_op, + cgrp_node_t *, struct vattr *, cgrp_node_t **, cred_t *); +void cgrp_dirinit(cgrp_node_t *, cgrp_node_t *, cred_t *); +int cgrp_dirlookup(cgrp_node_t *, char *, cgrp_node_t **, cred_t *); +void cgrp_dirtrunc(cgrp_node_t *); +void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *); +int cgrp_taccess(void *, int, cred_t *); +ino_t cgrp_inode(cgrp_nodetype_t, unsigned int); +int cgrp_num_pseudo_ents(cgrp_ssid_t); +cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t); +void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *, boolean_t); + +#endif /* KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXCGRPS_H */ diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c new file mode 100644 index 0000000000..66b6f60376 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c @@ -0,0 +1,1014 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "cgrps.h" + +static int cgrp_dirmakecgnode(cgrp_node_t *, cgrp_mnt_t *, struct vattr *, + enum de_op, cgrp_node_t **, struct cred *); +static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *); + +static cgrp_subsys_dirent_t cgrp_generic_dir[] = { + { CG_PROCS, "cgroup.procs" }, + { CG_NOTIFY, "notify_on_release" }, + { CG_TASKS, "tasks" } +}; + +typedef struct cgrp_ssde { + cgrp_subsys_dirent_t *cg_ssde_files; + int cg_ssde_nfiles; +} cgrp_ssde_t; + +#define CGDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +/* + * Note, these entries must be in the same order as the cgrp_ssid_t entries. 
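+ * cgrp_dirinit() walks the cg_ssde_files list for the mount's subsystem to + * create the pseudo file entries in each new cgroup directory.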
+ */ +static cgrp_ssde_t cg_ssde_dir[] = { + /* subsystems start at 1 */ + {NULL, 0}, + + /* CG_SSID_GENERIC */ + {cgrp_generic_dir, CGDIRLISTSZ(cgrp_generic_dir)}, +}; + + +#define CG_HASH_SIZE 8192 /* must be power of 2 */ +#define CG_MUTEX_SIZE 64 + +static cgrp_dirent_t *cg_hashtable[CG_HASH_SIZE]; +static kmutex_t cg_hashmutex[CG_MUTEX_SIZE]; + +#define CG_HASH_INDEX(a) ((a) & (CG_HASH_SIZE-1)) +#define CG_MUTEX_INDEX(a) ((a) & (CG_MUTEX_SIZE-1)) + +#define CG_HASH(cp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(cp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +#define MODESHIFT 3 + +typedef enum cgrp_nodehold { + NOHOLD, + HOLD +} cgrp_nodehold_t; + +void +cgrp_hash_init(void) +{ + int i; + + for (i = 0; i < CG_MUTEX_SIZE; i++) + mutex_init(&cg_hashmutex[i], NULL, MUTEX_DEFAULT, NULL); +} + +static void +cgrp_hash_in(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + CG_HASH(c->cgd_parent, c->cgd_name, hash); + c->cgd_hash = hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + c->cgd_link = *prevpp; + *prevpp = c; + mutex_exit(cg_hmtx); +} + +static void +cgrp_hash_out(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + hash = c->cgd_hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + while (*prevpp != c) + prevpp = &(*prevpp)->cgd_link; + *prevpp = c->cgd_link; + mutex_exit(cg_hmtx); +} + +static cgrp_dirent_t * +cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold, + cgrp_node_t **found) +{ + cgrp_dirent_t *l; + uint_t hash; + kmutex_t *cg_hmtx; + cgrp_node_t *cnp; + + CG_HASH(parent, name, hash); + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + l = cg_hashtable[CG_HASH_INDEX(hash)]; + while (l) { + if ((l->cgd_hash == hash) && + (l->cgd_parent == parent) && + (strcmp(l->cgd_name, name) == 0)) { + /* + * We need to make sure that the cgrp_node that + * we put a hold on is the same one that we pass back. + * Hence, temporary variable cnp is necessary. + */ + cnp = l->cgd_cgrp_node; + if (hold == HOLD) { + ASSERT(cnp); + cgnode_hold(cnp); + } + if (found) + *found = cnp; + mutex_exit(cg_hmtx); + return (l); + } else { + l = l->cgd_link; + } + } + mutex_exit(cg_hmtx); + return (NULL); +} + +/* + * The following functions maintain the per-mount cgroup hash table. + */ +static void +cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + cn->cgn_next = cgm->cg_grp_hash[hsh]; + cgm->cg_grp_hash[hsh] = cn; +} + +static void +cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + cgrp_node_t *np = NULL, *curp, *prevp = NULL; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + if (prevp == NULL) { + cgm->cg_grp_hash[hsh] = curp->cgn_next; + } else { + prevp->cgn_next = curp->cgn_next; + } + np = curp; + np->cgn_next = NULL; + break; + } + + prevp = curp; + } + + ASSERT(np != NULL); + ASSERT(np->cgn_task_cnt == 0); +} + +/* + * Count up the number of threads already running in the zone and initialize the + * first cgroup's task counter. 
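+ * The count is the sum of p_lwpcnt over the applicable lx-branded processes + * in the zone (zsched and other system processes are skipped).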
+ * + * We have to look at all of the processes to find applicable ones. + */ +static void +cgrp_cg_hash_init(cgrp_node_t *cn) +{ + int i; + int cnt = 0; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + /* Scan all of the process entries */ + mutex_enter(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, its not an + * lx-branded process and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_zone->zone_id != zoneid) { + continue; + } + + mutex_enter(&p->p_lock); + if (p->p_brand != &lx_brand) { + mutex_exit(&p->p_lock); + continue; + } + cnt += p->p_lwpcnt; + mutex_exit(&p->p_lock); + } + + /* + * There should be at least the init process with 1 thread in the zone + */ + ASSERT(cnt > 0); + cn->cgn_task_cnt = cnt; + + DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt); + + mutex_exit(&pidlock); +} + +cgrp_node_t * +cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid) +{ + int hsh = cgid % CGRP_HASH_SZ; + cgrp_node_t *curp; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + return (curp); + } + } + + return (NULL); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them to give the inode number for + * a cgrp pseudo file node. + */ +ino_t +cgrp_inode(cgrp_nodetype_t type, unsigned int cgrpid) +{ + /* + * cgroup inode format: + * 00000000AABBBBBB + * + * AA - node type (from subsystem list) + * BBBBBB - id of the cgroup + */ + + return ((ino_t)(type << 24) | (cgrpid & 0xffffff)); +} + +/* + * Return the number of pseudo file entries in a cgroup directory for the + * given subsystem. + */ +int +cgrp_num_pseudo_ents(cgrp_ssid_t ssid) +{ + cgrp_ssde_t *ssdp = &cg_ssde_dir[ssid]; + + return (ssdp->cg_ssde_nfiles); +} + +int +cgrp_taccess(void *vcp, int mode, cred_t *cred) +{ + cgrp_node_t *cn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in cgrp_node. + */ + if (crgetuid(cred) != cn->cgn_uid) { + shift += MODESHIFT; + if (groupmember(cn->cgn_gid, cred) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cred, CGNTOV(cn), cn->cgn_uid, + cn->cgn_mode << shift, mode)); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found cgrp_node with its vnode held. + */ +int +cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp, + cred_t *cred) +{ + int error; + + ASSERT(MUTEX_HELD(&VTOCGM(parent->cgn_vnode)->cg_contents)); + *foundcp = NULL; + if (parent->cgn_type != CG_CGROUP_DIR) + return (ENOTDIR); + + if ((error = cgrp_taccess(parent, VEXEC, cred))) + return (error); + + if (*name == '\0') { + cgnode_hold(parent); + *foundcp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the cgn_dir list + * so that it doesn't change out from underneath us. + * cgrp_hash_lookup() will pass back the cgrp_node + * with a hold on it. 
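+ * Callers release that hold with cgnode_rele() once they are done with the + * node.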
+ */ + + if (cgrp_hash_lookup(name, parent, HOLD, foundcp) != NULL) { + ASSERT(*foundcp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry for 'name' and 'cp' into directory 'dir' + * + * Returns 0 on success. + */ +int +cgrp_direnter( + cgrp_mnt_t *cgm, + cgrp_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + cgrp_node_t *cn, /* existing cgrp_node, if rename */ + struct vattr *va, + cgrp_node_t **cnp, /* return cgrp_node, if create/mkdir */ + cred_t *cred) +{ + cgrp_dirent_t *cdp; + cgrp_node_t *found = NULL; + int error = 0; + char *s; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("cgrp_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. + * Remember that we can only rename within the same directory. + */ + if (op == DE_RENAME) { + if (cn->cgn_nlink == 0) { + return (ENOENT); + } + + if (cn->cgn_nlink == MAXLINK) { + return (EMLINK); + } + cn->cgn_nlink++; + gethrestime(&cn->cgn_ctime); + } + + /* + * This might be a "dangling detached directory". + * it could have been removed, but a reference + * to it kept in u_cwd. don't bother searching + * it, and with any luck the user will get tired + * of dealing with us and cd to some absolute + * pathway. *sigh*, thus in ufs, too. + */ + if (dir->cgn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * Search for the entry. In all cases it is an error if it exists. + */ + cdp = cgrp_hash_lookup(name, dir, HOLD, &found); + + if (cdp) { + ASSERT(found != NULL); + error = EEXIST; + mutex_exit(&cgm->cg_contents); + cgnode_rele(found); + mutex_enter(&cgm->cg_contents); + } else { + + /* + * The entry does not exist. Check write permission in + * directory to see if entry can be created. + */ + if ((error = cgrp_taccess(dir, VWRITE, cred)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new cgrp_node and directory entry as required. + */ + error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred); + if (error) + goto out; + + if (op == DE_MKDIR) { + /* + * inherit notify_on_release value from parent + */ + cn->cgn_notify = dir->cgn_notify; + } + } + + error = cgrp_diraddentry(dir, cn, name); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + if ((cn->cgn_type) == CG_CGROUP_DIR) { + ASSERT(cdp == NULL); + /* + * cleanup allocs made by cgrp_dirinit + */ + cgrp_dirtrunc(cn); + } + cn->cgn_nlink = 0; + gethrestime(&cn->cgn_ctime); + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + cn = NULL; + } + } else if (cnp) { + *cnp = cn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + cn->cgn_nlink--; + gethrestime(&cn->cgn_ctime); + } + return (error); +} + +/* + * Delete entry cn of name "nm" from parent dir. This is used to both remove + * a cgroup directory and to remove the pseudo file nodes within the cgroup + * directory (by recursively calling itself). It frees the dir entry space + * and decrements link count on cgrp_node(s). + * + * Return 0 on success. 
+ */ +int +cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, + cred_t *cred) +{ + cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode); + cgrp_dirent_t *cndp; + int error; + size_t namelen; + cgrp_node_t *cnnp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + if (nm[0] == '\0') + panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn); + + /* + * return error when removing . and .. + */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = cgrp_taccess(dir, VEXEC|VWRITE, cred)) != 0) + return (error); + + if (dir->cgn_dir == NULL) + return (ENOENT); + + if (op == DR_RMDIR) { + /* + * This is the top-level removal of a cgroup dir. Start by + * removing the fixed pseudo file entries from the dir. We do + * this by recursively calling back into this function with + * a different op code. The caller of this function has + * already verified that it is safe to remove this directory. + */ + cgrp_dirent_t *cdp; + + ASSERT(cn->cgn_type == CG_CGROUP_DIR); + + cdp = cn->cgn_dir; + while (cdp) { + cgrp_node_t *pseudo_node; + cgrp_dirent_t *nextp; + + if (strcmp(cdp->cgd_name, ".") == 0 || + strcmp(cdp->cgd_name, "..") == 0) { + cdp = cdp->cgd_next; + continue; + } + + pseudo_node = cdp->cgd_cgrp_node; + nextp = cdp->cgd_next; + + cgnode_hold(pseudo_node); + error = cgrp_dirdelete(cn, pseudo_node, + cdp->cgd_name, DR_REMOVE, cred); + mutex_exit(&cgm->cg_contents); + cgnode_rele(pseudo_node); + mutex_enter(&cgm->cg_contents); + + cdp = nextp; + } + + cgrp_cg_hash_remove(cgm, cn); + } + + cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp); + VERIFY(cndp != NULL); + VERIFY(cn == cnnp); + + cgrp_hash_out(cndp); + + /* Take cndp out of the directory list. */ + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + if (cndp->cgd_prev) { + cndp->cgd_prev->cgd_next = cndp->cgd_next; + } + if (cndp->cgd_next) { + cndp->cgd_next->cgd_prev = cndp->cgd_prev; + } + + /* + * If the roving slot pointer happens to match cndp, + * point it at the previous dirent. + */ + if (dir->cgn_dir->cgd_prev == cndp) { + dir->cgn_dir->cgd_prev = cndp->cgd_prev; + } + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + + /* cndp points to the correct directory entry */ + namelen = strlen(cndp->cgd_name) + 1; + + kmem_free(cndp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + cn->cgn_ctime = now; + + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) { + cgrp_dirtrunc(cn); + ASSERT(cn->cgn_nlink == 0); + } + return (0); +} + +/* + * Initialize a cgrp_node and add it to file list under mount point. 
+ */ +void +cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(vap != NULL); + + cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + cn->cgn_mask = 0; + cn->cgn_attr.va_type = vap->va_type; + cn->cgn_nlink = 1; + cn->cgn_size = 0; + + if (cred == NULL) { + cn->cgn_uid = vap->va_uid; + cn->cgn_gid = vap->va_gid; + } else { + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + } + + cn->cgn_fsid = cgm->cg_dev; + cn->cgn_rdev = vap->va_rdev; + cn->cgn_blksize = PAGESIZE; + cn->cgn_nblocks = 0; + gethrestime(&now); + cn->cgn_atime = now; + cn->cgn_mtime = now; + cn->cgn_ctime = now; + cn->cgn_seq = 0; + cn->cgn_dir = NULL; + + cn->cgn_vnode = vn_alloc(KM_SLEEP); + vp = CGNTOV(cn); + vn_setops(vp, cgrp_vnodeops); + vp->v_vfsp = cgm->cg_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)cn; + + cn->cgn_nodeid = cgm->cg_gen++; + + /* + * Add new cgrp_node to end of linked list of cgrp_nodes for this + * cgroup fs. Root directory is handled specially in cgrp_mount. + */ + if (cgm->cg_rootnode != (cgrp_node_t *)NULL) { + cn->cgn_forw = NULL; + cn->cgn_back = cgm->cg_rootnode->cgn_back; + cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn; + } + vn_exists(vp); +} + +void +cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name, + cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr) +{ + cgrp_node_t *ncn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + VERIFY0(cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, + nattr, &ncn, cr)); + + /* + * Fix the inode and assign the pseudo file type to be correct. + */ + ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid); + ncn->cgn_type = type; + + /* + * Since we're creating these entries here and not via the + * normal VOP_CREATE code path, we need to do the rele to drop + * our hold. This will leave the vnode v_count at 0 when we + * come out of cgrp_inactive but we won't reclaim the vnode + * there since the cgn_nlink value will still be 1. + */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(ncn); + mutex_enter(&cgm->cg_contents); +} + +/* + * cgrp_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. + */ +void +cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) +{ + cgrp_dirent_t *dot, *dotdot; + timestruc_t now; + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); + cgrp_ssde_t *ssdp; + cgrp_subsys_dirent_t *pseudo_files; + struct vattr nattr; + int i; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM); + ssdp = &cg_ssde_dir[cgm->cg_ssid]; + + /* + * If this is the top-level cgroup created by the mount then we need to + * count up the number of procs and tasks already running in the zone. + */ + + /* + * Set the cgroup ID for this cgrp_node by using a counter on each + * mount. 
+ */ + dir->cgn_id = cgm->cg_grp_gen++; + cgrp_cg_hash_insert(cgm, dir); + /* Initialise the first cgroup if this is top-level group */ + if (parent == dir) + cgrp_cg_hash_init(dir); + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP); + dot->cgd_cgrp_node = dir; + dot->cgd_offset = 0; + dot->cgd_name = (char *)dot + sizeof (cgrp_dirent_t); + dot->cgd_name[0] = '.'; + dot->cgd_parent = dir; + cgrp_hash_in(dot); + + dotdot = kmem_zalloc(sizeof (cgrp_dirent_t) + 3, KM_SLEEP); + dotdot->cgd_cgrp_node = parent; + dotdot->cgd_offset = 1; + dotdot->cgd_name = (char *)dotdot + sizeof (cgrp_dirent_t); + dotdot->cgd_name[0] = '.'; + dotdot->cgd_name[1] = '.'; + dotdot->cgd_parent = dir; + cgrp_hash_in(dotdot); + + /* + * Initialize directory entry list. + */ + dot->cgd_next = dotdot; + dot->cgd_prev = dotdot; /* dot's cgd_prev holds roving slot pointer */ + dotdot->cgd_next = NULL; + dotdot->cgd_prev = dot; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + parent->cgn_nlink++; + parent->cgn_ctime = now; + + dir->cgn_dir = dot; + dir->cgn_size = 2 * sizeof (cgrp_dirent_t) + 5; /* dot and dotdot */ + dir->cgn_dirents = 2; + dir->cgn_nlink = 2; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; + + /* + * If this is the top-level dir in the file system then it always + * has a release_agent pseudo file. Only the top-level dir has this + * file. + */ + if (parent == dir) { + cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr, + cr); + } + + pseudo_files = ssdp->cg_ssde_files; + for (i = 0; i < ssdp->cg_ssde_nfiles; i++) { + cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name, + pseudo_files[i].cgrp_ssd_type, &nattr, cr); + } +} + +/* + * cgrp_dirtrunc is called to remove all directory entries under this directory. + */ +void +cgrp_dirtrunc(cgrp_node_t *dir) +{ + cgrp_dirent_t *cgdp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&VTOCGM(dir->cgn_vnode)->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) { + size_t namelen; + cgrp_node_t *cn; + + ASSERT(cgdp->cgd_next != cgdp); + ASSERT(cgdp->cgd_prev != cgdp); + ASSERT(cgdp->cgd_cgrp_node); + + dir->cgn_dir = cgdp->cgd_next; + namelen = strlen(cgdp->cgd_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + cn = cgdp->cgd_cgrp_node; + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + + cgrp_hash_out(cgdp); + kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + } + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + ASSERT(dir->cgn_dir == NULL); + ASSERT(dir->cgn_size == 0); + ASSERT(dir->cgn_dirents == 0); +} + +static int +cgrp_diraddentry(cgrp_node_t *dir, cgrp_node_t *cn, char *name) +{ + cgrp_dirent_t *cdp, *cpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->cgn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. 
*/ + if (cn->cgn_vnode->v_vfsp != dir->cgn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (cgrp_dirent_t); + cdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); + if (cdp == NULL) + return (ENOSPC); + + cn->cgn_parent = dir; + + dir->cgn_size += alloc_size; + dir->cgn_dirents++; + cdp->cgd_cgrp_node = cn; + cdp->cgd_parent = dir; + + /* The directory entry and its name were allocated sequentially. */ + cdp->cgd_name = (char *)cdp + sizeof (cgrp_dirent_t); + (void) strcpy(cdp->cgd_name, name); + + cgrp_hash_in(cdp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in cgrp_readdir. + */ + cpdp = dir->cgn_dir->cgd_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (cpdp->cgd_next != NULL && (cpdp->cgd_next->cgd_offset - + cpdp->cgd_offset) <= 1) { + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + ASSERT(cpdp->cgd_next->cgd_offset > cpdp->cgd_offset); + cpdp = cpdp->cgd_next; + } + cdp->cgd_offset = cpdp->cgd_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's cgd_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. 
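+	 *
+	 * As an illustrative model of the slot search performed above
+	 * (userland sketch with hypothetical, simplified types; the real
+	 * list is doubly linked and the roving pointer lives in the first
+	 * dirent's cgd_prev):
+	 *
+	 *	typedef struct dent {
+	 *		struct dent *next;
+	 *		int offset;
+	 *	} dent_t;
+	 *
+	 *	int
+	 *	next_offset(dent_t *slot)
+	 *	{
+	 *		while (slot->next != NULL &&
+	 *		    (slot->next->offset - slot->offset) <= 1)
+	 *			slot = slot->next;
+	 *		return (slot->offset + 1);
+	 *	}
+	 *
+	 * Starting that walk from the roving slot pointer rather than from
+	 * '.' is what keeps the amortized cost of a create constant.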
+ */ + if (cpdp->cgd_next == NULL && cpdp->cgd_offset > 2 * dir->cgn_dirents) + dir->cgn_dir->cgd_prev = dir->cgn_dir->cgd_next; + else + dir->cgn_dir->cgd_prev = cdp; + + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + cdp->cgd_next = cpdp->cgd_next; + if (cdp->cgd_next) { + cdp->cgd_next->cgd_prev = cdp; + } + cdp->cgd_prev = cpdp; + cpdp->cgd_next = cdp; + + ASSERT(cdp->cgd_next != cdp); + ASSERT(cdp->cgd_prev != cdp); + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + return (0); +} + +static int +cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va, + enum de_op op, cgrp_node_t **newnode, struct cred *cred) +{ + cgrp_node_t *cn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + cn = kmem_zalloc(sizeof (cgrp_node_t), KM_SLEEP); + cgrp_node_init(cgm, cn, va, cred); + + cn->cgn_vnode->v_rdev = cn->cgn_rdev = NODEV; + cn->cgn_vnode->v_type = va->va_type; + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + + if (va->va_mask & AT_ATIME) + cn->cgn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + cn->cgn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + cn->cgn_type = CG_CGROUP_DIR; + cgrp_dirinit(dir, cn, cred); + } + + *newnode = cn; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c new file mode 100644 index 0000000000..7805c3f2bd --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c @@ -0,0 +1,1071 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The cgroup file system implements a subset of the Linux cgroup functionality + * for use by lx-branded zones. On Linux, cgroups are a generic process grouping + * mechanism which is used to apply various behaviors to the processes within + * the group, although it's primary purpose is for resource management. + * + * In Linux, the cgroup file system provides two pieces of functionality: + * 1) A per-mount set of cgroups arranged in a tree, such that every task in + * the system is in one, and only one, of the cgroups in the tree. + * 2) A set of subsystems; each subsystem has subsystem-specific state and + * behavior and is associated with a cgroup mount. This provides a way to + * apply arbitrary functionality (but generally resource management related) + * to the processes associated with the nodes in the tree at that mount + * point. + * + * For example, it is common to see cgroup trees (each is its own mount with a + * different subsystem controller) for blkio, cpuset, memory, systemd (has no + * controller), etc. Within each tree there is a top-level directory with at + * least a cgroup.procs, notify_on_release, release_agent, and tasks file. + * The cgroup.procs file lists the processes within that group and the tasks + * file lists the threads in the group. 
There could be subdirectories, which
+ * define new cgroups, that then contain a subset of the processes. Each
+ * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and
+ * tasks file.
+ *
+ * Since we're using lx to run user-level code within zones, the majority (all?)
+ * of the cgroup resource management functionality simply doesn't apply to us.
+ * The primary need for cgroups is to support the init program 'systemd' as the
+ * consumer. systemd only requires the process grouping hierarchy of cgroups,
+ * although it can also use the resource management features if they are
+ * available. Given this, our cgroup file system only implements the process
+ * hierarchy and does not report that any resource management controllers are
+ * available for separate mounts.
+ *
+ * In addition to the hierarchy, the other important component of cgroups that
+ * is used by systemd is the 'release_agent'. This provides a mechanism to
+ * run a command when a cgroup becomes empty (the last task in the group
+ * leaves, either by exit or move, and there are no more sub-cgroups). The
+ * 'release_agent' file only exists in the top-level cgroup of the mounted
+ * file system and holds the path to a command to run. The 'notify_on_release'
+ * file exists in each cgroup dir. If that file contains a '1' then the agent
+ * is run when that group becomes empty. The agent is passed a path string of
+ * the cgroup, relative to the file system mount point (e.g. a mount on
+ * /sys/fs/cgroups/systemd with a sub-cgroup of /sys/fs/cgroups/systemd/foo/bar
+ * gets the arg /foo/bar).
+ *
+ * Cgroup membership is implemented via hooks into the lx brand code. When
+ * the cgroup file system loads it installs callbacks for:
+ * lx_cgrp_initlwp
+ * lx_cgrp_freelwp
+ * and when it unloads it clears those hooks. The lx brand code calls those
+ * hooks when an lwp starts and when it exits. Internally we use a
+ * simple reference counter (cgn_task_cnt) on the cgroup node to track how many
+ * threads are in the group, so we can tell when a group becomes empty.
+ * To make this quick, a hash table (cg_grp_hash) is maintained on the
+ * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is
+ * sized so that there should typically only be 0 or 1 cgroups per bucket.
+ * We also keep a reference to the file system in the zone-specific brand data
+ * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t
+ * when it runs the hook.
+ *
+ * Once a cgroup is about to become empty, the final process exiting the cgroup
+ * will launch a new user-level process which execs the release agent. The new
+ * process is created as a child of zsched (indicated by the -1 pid argument
+ * to newproc) and is not associated with the exiting process in any way.
+ *
+ * This file system is similar to tmpfs in that directories only exist in
+ * memory. Each subdirectory represents a different cgroup. Within the cgroup
+ * there are pseudo files (see cg_ssde_dir) with well-defined names which
+ * control the configuration and behavior of the cgroup (see cgrp_nodetype_t).
+ * The primary files within every cgroup are named 'cgroup.procs',
+ * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the
+ * top-level cgroup). The cgroup.procs and tasks files are used to control and
+ * list which processes/threads belong to the cgroup. In the general case there
+ * could be additional files in the cgroup, which define additional behavior
+ * (i.e. subsystem-specific pseudo files), although none exist at this time.
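+ *
+ * For illustration only: if a controller-specific mount were ever added (see
+ * EXTENDING THE FILE SYSTEM below), its extra pseudo files would be named by
+ * a table such as the hypothetical sketch below. Neither CG_SSID_CPUSET nor
+ * the CG_CPUSET_* node types exist today; only the cgrp_subsys_dirent_t field
+ * names match the existing code.
+ *
+ *	static cgrp_subsys_dirent_t cgrp_cpuset_dir[] = {
+ *		{ .cgrp_ssd_name = "cpuset.cpus",
+ *		    .cgrp_ssd_type = CG_CPUSET_CPUS },
+ *		{ .cgrp_ssd_name = "cpuset.mems",
+ *		    .cgrp_ssd_type = CG_CPUSET_MEMS }
+ *	};
+ *
+ * Such a table would be referenced from the cg_ssde_dir entry for the new
+ * subsystem via its cg_ssde_files and cg_ssde_nfiles members.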
+ *
+ * Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is
+ * used to correlate with the threads to determine cgroup membership. When
+ * assigning a PID to a cgroup (via write) the code updates the br_cgroupid
+ * member in the brand-specific lx_lwp_data structure to control which cgroup
+ * the thread belongs to. Note that because the br_cgroupid lives in
+ * lx_lwp_data, native processes will not appear in the cgroup hierarchy.
+ *
+ * An overview of the behavior for the various vnode operations is:
+ * - no hardlinks or symlinks
+ * - no file create (the subsystem-specific files are a fixed list of
+ * pseudo-files accessible within the directory)
+ * - no file remove
+ * - no file rename, but a directory (i.e. a cgroup) can be renamed within the
+ * containing directory, but not into a different directory
+ * - can mkdir and rmdir to create/destroy cgroups
+ * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup)
+ * - open, read/write, close on the subsystem-specific pseudo files are
+ * allowed, as this is the interface to configure and report on the cgroup.
+ * The pseudo file's mode controls write access and cannot be changed.
+ *
+ * The locking in this file system is simple since the file system is not
+ * subjected to heavy I/O activity and all data is in-memory. There is a single
+ * global mutex for each mount (cg_contents). This mutex is held for the life
+ * of most vnode operations. The most active path is probably the LWP start and
+ * exit hooks which increment/decrement the reference counter on the cgroup
+ * node. The lock is important for this case since we don't want concurrent
+ * activity (such as moving the process into another cgroup) while we're trying
+ * to look up the cgroup from the mount's hash table. We must be careful to
+ * avoid a deadlock while reading or writing since that code can take pidlock
+ * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of
+ * those is held. To prevent deadlock we always take cg_contents after pidlock
+ * and p_lock.
+ *
+ * EXTENDING THE FILE SYSTEM
+ *
+ * When adding support for a new subsystem, be sure to also update the
+ * lxpr_read_cgroups function in lx_procfs so that the subsystem is reported
+ * by proc.
+ *
+ * Although we don't currently support any subsystem controllers, the design
+ * allows for the file system to be extended to add controller emulation
+ * if needed. New controller IDs (i.e. different subsystems) for a mount can
+ * be defined in the cgrp_ssid_t enum (e.g. CG_SSID_CPUSET or CG_SSID_MEMORY)
+ * and new node types for additional pseudo files in the tree can be defined in
+ * the cgrp_nodetype_t enum (e.g. CG_CPUSET_CPUS or CG_MEMORY_USAGE_IN_BYTES).
+ * The cg_ssde_dir array would need a new entry for the new subsystem to
+ * control which nodes are visible in a directory for the new subsystem.
+ *
+ * New emulation would then need to be written to manage the behavior on the
+ * new pseudo file(s) associated with new cgrp_nodetype_t types.
+ *
+ * Within lx procfs the lxpr_read_pid_cgroup() function would need to be
+ * updated so that it reported the various subsystems used by the different
+ * mounts.
+ *
+ * In addition, in order to support more than one cgroup mount we would need a
+ * list of cgroup IDs associated with every thread, instead of just one ID
+ * (br_cgroupid).
The thread data would need to become a struct which held + * both an ID and an indication as to which mounted cgroup file system instance + * the ID was associated with. We would also need a list of cgroup mounts per + * zone, instead the current single zone reference. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/vmparam.h> +#include <sys/corectl.h> +#include <sys/contract_impl.h> +#include <sys/pool.h> +#include <sys/stack.h> +#include <sys/rt.h> +#include <sys/fx.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#include "cgrps.h" + +/* Module level parameters */ +static int cgrp_fstype; +static dev_t cgrp_dev; + +#define MAX_AGENT_EVENTS 32 /* max num queued events */ + +#define UMNT_DELAY_TIME drv_usectohz(50000) /* 500th of a second */ +#define UMNT_RETRY_MAX 100 /* 100 times - 2 secs */ + +/* + * cgrp_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t cgrp_mountcount; + +/* + * cgrp_minfree is the minimum amount of swap space that cgroups leaves for + * the rest of the zone. In other words, if the amount of free swap space + * in the zone drops below cgrp_minfree, cgroup anon allocations will fail. + * This number is only likely to become factor when DRAM and swap have both + * been capped low to allow for maximum tenancy. + */ +size_t cgrp_minfree = 0; + +/* + * CGMINFREE -- the value from which cgrp_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for cgroups in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow cgroups to consume + * no more than half of this, yielding a CGMINFREE of 64MB. + */ +#define CGMINFREE 64 * 1024 * 1024 /* 64 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +/* + * cgroup vfs operations. 
+ */ +static int cgrp_init(int, char *); +static int cgrp_mount(struct vfs *, struct vnode *, + struct mounta *, struct cred *); +static int cgrp_unmount(struct vfs *, int, struct cred *); +static int cgrp_root(struct vfs *, struct vnode **); +static int cgrp_statvfs(struct vfs *, struct statvfs64 *); +static void cgrp_freevfs(vfs_t *vfsp); + +/* Forward declarations for hooks */ +static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t); +static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_cgroup", + cgrp_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand cgroups", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (cgrp_mountcount) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* Disable hooks used by the lx brand module. */ + lx_cgrp_initlwp = NULL; + lx_cgrp_freelwp = NULL; + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(cgrp_fstype); + vn_freevnodeops(cgrp_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading cgroup module. + */ +static int +cgrp_init(int fstype, char *name) +{ + static const fs_operation_def_t cgrp_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = cgrp_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = cgrp_unmount }, + VFSNAME_ROOT, { .vfs_root = cgrp_root }, + VFSNAME_STATVFS, { .vfs_statvfs = cgrp_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = cgrp_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def cgrp_vnodeops_template[]; + int error; + extern void cgrp_hash_init(); + major_t dev; + + cgrp_hash_init(); + cgrp_fstype = fstype; + ASSERT(cgrp_fstype != 0); + + error = vfs_setfsops(fstype, cgrp_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "cgrp_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, cgrp_vnodeops_template, &cgrp_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "cgrp_init: bad vnode ops template"); + return (error); + } + + /* + * cgrp_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (cgrp_minfree == 0) { + /* Set if not patched */ + cgrp_minfree = btopr(CGMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "cgrp_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + cgrp_dev = makedevice(dev, 0); + + /* Install the hooks used by the lx brand module. 
*/ + lx_cgrp_initlwp = cgrp_lwp_fork_helper; + lx_cgrp_freelwp = cgrp_lwp_exit_helper; + + return (0); +} + +static int +cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + cgrp_mnt_t *cgm = NULL; + struct cgrp_node *cp; + struct pathname dpn; + int error; + struct vattr rattr; + cgrp_ssid_t ssid = CG_SSID_GENERIC; + lx_zone_data_t *lxzdata; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Since we depend on per-thread lx brand data, only allow mounting + * within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Having the resource be anything but "swap" doesn't make sense. + */ + vfs_setresource(vfsp, "swap", 0); + + /* cgroups don't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + /* + * Here is where we could support subsystem-specific controller + * mounting. For example, if mounting a cgroup fs with the 'cpuset' + * option to specify that particular controller. + * + * char *argstr; + * if (vfs_optionisset(vfsp, "cpuset", &argstr)) { + * if (ssid != CG_SSID_GENERIC) { + * error = EINVAL; + * goto out; + * } + * ssid = CG_SSID_CPUSET; + * } + */ + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * We currently only support one mount per zone. + */ + lxzdata = ztolxzd(curproc->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + if (lxzdata->lxzd_cgroup != NULL) { + mutex_exit(&lxzdata->lxzd_lock); + return (EINVAL); + } + + cgm = kmem_zalloc(sizeof (*cgm), KM_SLEEP); + + /* Set but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL); + + cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp; + mutex_exit(&lxzdata->lxzd_lock); + + cgm->cg_lxzdata = lxzdata; + cgm->cg_ssid = ssid; + + vfsp->vfs_data = (caddr_t)cgm; + vfsp->vfs_fstype = cgrp_fstype; + vfsp->vfs_dev = cgrp_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, cgrp_dev, cgrp_fstype); + cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(cgm->cg_mntpath, dpn.pn_path); + + cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ, + KM_SLEEP); + + /* allocate and initialize root cgrp_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP); + + mutex_enter(&cgm->cg_contents); + cgrp_node_init(cgm, cp, &rattr, cr); + + CGNTOV(cp)->v_flag |= VROOT; + + /* + * initialize linked list of cgrp_nodes so that the back pointer of + * the root cgrp_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + cp->cgn_back = cp; + cp->cgn_forw = NULL; + cp->cgn_nlink = 0; + cgm->cg_rootnode = cp; + + cp->cgn_type = CG_CGROUP_DIR; + cp->cgn_nodeid = cgrp_inode(CG_CGROUP_DIR, cgm->cg_gen); + + /* + * This initial cgrp_node will have an ID of 0. All existing processes + * inside the zone will have been started with, or inherited, a + * br_cgroupid of 0. 
The cgrp_cg_hash_init function will initialize the + * cgn_task_cnt for cgroup 0 to reflect the number of tasks already in + * the group. + * + * Because we must hold cg_contents in cgrp_lwp_fork_helper and + * cgrp_lwp_exit_helper, no process can be creating or exiting another + * thread (although that is unlikely anyway since the cgroup filesystem + * is normally mounted at the start of zone bootup, before anything + * else is started). + */ + cgrp_dirinit(cp, cp, cr); + + mutex_exit(&cgm->cg_contents); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&cgrp_mountcount); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cgnp, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + int retry_cnt = 0; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + +retry: + mutex_enter(&cgm->cg_contents); + + /* + * In the normal unmount case, if there were no open files, only the + * root node would have a reference count. However, the user-level + * agent manager should have the root vnode open and be waiting in + * ioctl. We need to wake the manager and it may take some retries + * before it closes its file descriptor. + * + * With cg_contents held, nothing can be added or removed. + * There may be some dirty pages. To prevent fsflush from + * disrupting the unmount, put a hold on each node while scanning. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + cgnp = cgm->cg_rootnode; + + ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL); + + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + return (EINVAL); + } + + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + /* Likely because the user-level manager hasn't exited yet */ + if (retry_cnt++ < UMNT_RETRY_MAX) { + delay(UMNT_DELAY_TIME); + goto retry; + } + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (cgnp = cgnp->cgn_forw; cgnp; cgnp = cgnp->cgn_forw) { + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = cgm->cg_rootnode->cgn_forw; + while (cancel != cgnp) { + vp = CGNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->cgn_forw; + } + mutex_exit(&cgm->cg_contents); + return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + mutex_enter(&cgm->cg_lxzdata->lxzd_lock); + cgm->cg_lxzdata->lxzd_cgroup = NULL; + mutex_exit(&cgm->cg_lxzdata->lxzd_lock); + kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ); + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&cgm->cg_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * cgrp_umount. 
+ */ +void +cgrp_freevfs(vfs_t *vfsp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + struct vnode *vp; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries + */ + for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) { + mutex_enter(&cgm->cg_contents); + if (cn->cgn_type == CG_CGROUP_DIR) + cgrp_dirtrunc(cn); + mutex_exit(&cgm->cg_contents); + } + + ASSERT(cgm->cg_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on + * a cgrp_node via its pages or anon slots from blowing it away + * (in cgrp_inactive) while we're trying to get to it here. Once + * we have a HOLD on it we know it'll stick around. + * + */ + mutex_enter(&cgm->cg_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((cn = cgm->cg_rootnode->cgn_back) != cgm->cg_rootnode) { + mutex_exit(&cgm->cg_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = CGNTOV(cn); + VN_RELE(vp); + mutex_enter(&cgm->cg_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. 
+ */ + if (cn == cgm->cg_rootnode->cgn_back) { + VN_HOLD(vp); + mutex_exit(&cgm->cg_contents); + delay(hz / 4); + mutex_enter(&cgm->cg_contents); + } + } + mutex_exit(&cgm->cg_contents); + + VN_RELE(CGNTOV(cgm->cg_rootnode)); + + ASSERT(cgm->cg_mntpath); + + kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1); + + mutex_destroy(&cgm->cg_contents); + kmem_free(cgm, sizeof (cgrp_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&cgrp_mountcount); +} + +/* + * return root cgnode for given vnode + */ +static int +cgrp_root(struct vfs *vfsp, struct vnode **vpp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cp = cgm->cg_rootnode; + struct vnode *vp; + + ASSERT(cp); + + vp = CGNTOV(cp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = cgm->cg_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > cgrp_minfree) + sbp->f_bfree = blocks - cgrp_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of cgrp_nodes we can allocate from the remaining kernel memory + * available to cgroups. This is fairly inaccurate since it doesn't + * take into account the names stored in the directory entries. + */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (cgrp_node_t) + sizeof (cgrp_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[cgrp_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, cgm->cg_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen) +{ + cgrp_node_t *parent; + cgrp_dirent_t *dp; + + buf[0] = '\0'; + + parent = cn->cgn_parent; + if (parent == NULL || parent == cn) { + (void) strlcpy(buf, ".", blen); + return (0); + } + + /* + * Search the parent dir list to find this cn's name. 
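+	 *
+	 * An equivalent userland model of this walk (hypothetical,
+	 * simplified types standing in for cgrp_dirent_t):
+	 *
+	 *	typedef struct dent {
+	 *		struct dent *next;
+	 *		unsigned int id;
+	 *		const char *name;
+	 *	} dent_t;
+	 *
+	 *	const char *
+	 *	name_of(dent_t *list, unsigned int id)
+	 *	{
+	 *		for (; list != NULL; list = list->next) {
+	 *			if (list->id == id)
+	 *				return (list->name);
+	 *		}
+	 *		return (NULL);
+	 *	}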
+ */ + for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) { + if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) { + (void) strlcpy(buf, dp->cgd_name, blen); + return (0); + } + } + + return (-1); +} + +typedef struct cgrp_rra_arg { + char *crraa_agent_path; + char *crraa_event_path; +} cgrp_rra_arg_t; + +static void +cgrp_run_rel_agent(void *a) +{ + cgrp_rra_arg_t *rarg = a; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + struct core_globals *cg; + int res; + + ASSERT(!INGLOBALZONE(curproc)); + + /* The following block is derived from start_init_common */ + ASSERT_STACK_ALIGNED(); + + p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; + p->p_usrstack = (caddr_t)USRSTACK32; + p->p_model = DATAMODEL_ILP32; + p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; + p->p_datprot = PROT_ZFOD & ~PROT_EXEC; + p->p_stk_ctl = INT32_MAX; + + p->p_as = as_alloc(); + p->p_as->a_proc = p; + p->p_as->a_userlimit = (caddr_t)USERLIMIT32; + (void) hat_setup(p->p_as->a_hat, HAT_INIT); + + VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + curproc->p_corefile = cg->core_default_path; + curproc->p_content = cg->core_default_content; + + init_mstate(curthread, LMS_SYSTEM); + res = exec_init(rarg->crraa_agent_path, rarg->crraa_event_path); + + /* End of code derived from start_init_common */ + + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, CGRP_AGENT_LEN); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + + /* The following is derived from zone_start_init - see comments there */ + if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + if (proc_exit(CLD_EXITED, res) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + } else { + id_t cid = curthread->t_cid; + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + (void) parmsset(&pcparms, curthread); + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + + /* cause the process to return to userland. */ + lwp_rtt(); + } +} + +/* + * Launch the user-level release_agent manager. The event data is the + * pathname (relative to the mount point of the file system) of the newly empty + * cgroup. + * + * The cg_contents mutex is held on entry and dropped before returning. 
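+ *
+ * For illustration, a minimal user-level release agent might look like the
+ * sketch below (hypothetical program; it assumes the file system is mounted
+ * at /sys/fs/cgroups/systemd and simply removes the now-empty cgroup named
+ * by argv[1], which is the path relative to the mount point):
+ *
+ *	#include <stdio.h>
+ *	#include <unistd.h>
+ *
+ *	int
+ *	main(int argc, char *argv[])
+ *	{
+ *		char path[1024];
+ *
+ *		if (argc != 2)
+ *			return (1);
+ *		(void) snprintf(path, sizeof (path),
+ *		    "/sys/fs/cgroups/systemd%s", argv[1]);
+ *		return (rmdir(path) == 0 ? 0 : 1);
+ *	}
+ *
+ * The full path of such a program is what gets written to the top-level
+ * 'release_agent' file; it runs in a brand new process with no relationship
+ * to the process whose exit emptied the cgroup.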
+ */ +void +cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn, boolean_t is_exit) +{ + cgrp_node_t *parent; + char nm[MAXNAMELEN]; + char *argstr, *oldstr, *tmp; + id_t cid; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + cgrp_rra_arg_t *rarg; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Nothing to do if the agent is not set */ + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return; + } + + parent = cn->cgn_parent; + /* Cannot remove the top-level cgroup (only via unmount) */ + if (parent == cn) { + mutex_exit(&cgm->cg_contents); + return; + } + + argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + *argstr = '\0'; + + /* + * Iterate up the directory tree to construct the agent argument string. + */ + do { + VERIFY0(cgrp_get_dirname(cn, nm, sizeof (nm))); + DTRACE_PROBE1(cgrp__dir__name, char *, nm); + if (*argstr == '\0') { + (void) snprintf(argstr, MAXPATHLEN, "/%s", nm); + } else { + tmp = oldstr; + oldstr = argstr; + argstr = tmp; + (void) snprintf(argstr, MAXPATHLEN, "/%s%s", nm, + oldstr); + } + + if (cn->cgn_parent == NULL) + break; + cn = cn->cgn_parent; + parent = cn->cgn_parent; + + /* + * The arg path is relative to the mountpoint so we stop when + * we get to the top level. + */ + if (parent == NULL || parent == cn) + break; + } while (parent != cn); + + kmem_free(oldstr, MAXPATHLEN); + + rarg = kmem_alloc(sizeof (cgrp_rra_arg_t), KM_SLEEP); + rarg->crraa_agent_path = kmem_alloc(sizeof (cgm->cg_agent), KM_SLEEP); + (void) strlcpy(rarg->crraa_agent_path, cgm->cg_agent, + sizeof (cgm->cg_agent)); + rarg->crraa_event_path = argstr; + + DTRACE_PROBE2(cgrp__agent__event, cgrp_rra_arg_t *, rarg, + int, plwpd->br_cgroupid); + + /* + * When we're exiting, the release agent process cannot belong to our + * cgroup. When the release agent is called for a move or rmdir, then + * we do not change our cgroupid. + */ + if (is_exit) { + plwpd->br_cgroupid = 0; + } + + /* + * The cg_contents mutex cannot be held while taking the pool lock + * or calling newproc. + */ + mutex_exit(&cgm->cg_contents); + + if (z->zone_defaultcid > 0) { + cid = z->zone_defaultcid; + } else { + pool_lock(); + cid = pool_get_class(z->zone_pool); + pool_unlock(); + } + if (cid == -1) + cid = defaultcid; + + if (newproc(cgrp_run_rel_agent, (void *)rarg, cid, minclsyspri - 1, + NULL, -1) != 0) { + /* There's nothing we can do if creating the proc fails. 
*/ + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, sizeof (cgm->cg_agent)); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + } +} + +/*ARGSUSED*/ +static void +cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + cn->cgn_task_cnt++; + mutex_exit(&cgm->cg_contents); + + DTRACE_PROBE1(cgrp__lwp__fork, void *, cn); +} + +/*ARGSUSED*/ +static void +cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + if (cn->cgn_task_cnt == 0) { + /* top-level cgroup cnt can be 0 during reboot */ + mutex_exit(&cgm->cg_contents); + return; + } + cn->cgn_task_cnt--; + DTRACE_PROBE1(cgrp__lwp__exit, void *, cn); + + if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) && + cn->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, cn, B_TRUE); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + } else { + mutex_exit(&cgm->cg_contents); + } +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c new file mode 100644 index 0000000000..0078ad7876 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c @@ -0,0 +1,1552 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <vm/seg_vn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/vm.h> +#include <sys/prsystm.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#include "cgrps.h" + +typedef enum cgrp_wr_type { + CG_WR_PROCS = 1, + CG_WR_TASKS +} cgrp_wr_type_t; + +/* ARGSUSED1 */ +static int +cgrp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct) +{ + /* + * swapon to a cgrp file is not supported so access is denied on open + * if VISSWAP is set. + */ + if ((*vpp)->v_flag & VISSWAP) + return (EINVAL); + + return (0); +} + +/* ARGSUSED1 */ +static int +cgrp_close(struct vnode *vp, int flag, int count, offset_t offset, + struct cred *cred, caller_context_t *ct) +{ + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); +} + +/* + * Lookup proc or task based on pid and typ. 
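+ *
+ * The distinction matters to user level: a pid written to 'cgroup.procs'
+ * names a whole process, while an id written to 'tasks' names a single
+ * thread (a Linux tid). An illustrative sketch that moves only the calling
+ * thread (the cgroup path is only an example):
+ *
+ *	#include <stdio.h>
+ *	#include <unistd.h>
+ *	#include <sys/syscall.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		FILE *f;
+ *
+ *		f = fopen("/sys/fs/cgroups/systemd/foo/tasks", "w");
+ *		if (f == NULL)
+ *			return (1);
+ *		(void) fprintf(f, "%ld\n", (long)syscall(SYS_gettid));
+ *		return (fclose(f) == 0 ? 0 : 1);
+ *	}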
+ */ +static proc_t * +cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ) +{ + int i; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + ASSERT(MUTEX_HELD(&pidlock)); + + /* getting a proc from a pid is easy */ + if (typ == CG_WR_PROCS) + return (prfind(pid)); + + ASSERT(typ == CG_WR_TASKS); + + /* + * We have to scan all of the process entries to find the proc + * containing this task. + */ + mutex_exit(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + kthread_t *t; + + mutex_enter(&pidlock); + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, its not an + * lx-branded process and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_brand != &lx_brand || + p->p_zone->zone_id != zoneid) { + mutex_exit(&pidlock); + continue; + } + + mutex_enter(&p->p_lock); + if ((t = p->p_tlist) == NULL) { + /* no threads, skip it */ + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + continue; + } + + /* + * Check all threads in this proc. + */ + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd != NULL && plwpd->br_pid == pid) { + mutex_exit(&p->p_lock); + return (p); + } + + t = t->t_forw; + } while (t != p->p_tlist); + + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + } + + mutex_enter(&pidlock); + return (NULL); +} + +/* + * Move a thread from one cgroup to another. If the old cgroup is empty + * we queue up an agent event. We return true in that case since we've + * dropped the locks and the caller needs to reacquire them. + */ +static boolean_t +cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn, + uint_t cg_id, proc_t *p) +{ + cgrp_node_t *ocn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(MUTEX_HELD(&p->p_lock)); + + ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid); + VERIFY(ocn != NULL); + + ASSERT(ocn->cgn_task_cnt > 0); + atomic_dec_32(&ocn->cgn_task_cnt); + atomic_inc_32(&ncn->cgn_task_cnt); + plwpd->br_cgroupid = cg_id; + + if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) && + ocn->cgn_notify == 1) { + /* + * We want to drop p_lock before queuing the event since + * that might sleep. Dropping p_lock might cause the caller to + * have to restart the move process from the beginning. + */ + mutex_exit(&p->p_lock); + cgrp_rel_agent_event(cgm, ocn, B_FALSE); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Assign either all of the threads, or a single thread, for the specified pid + * to the new cgroup. Controlled by the typ argument. + */ +static int +cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ) +{ + proc_t *p; + kthread_t *t; + int error; + cgrp_node_t *ncn; + + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + /* + * Move one or all threads to this cgroup. 
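+	 *
+	 * From the Linux side this corresponds to a write such as the
+	 * following sketch (hypothetical path), which moves every thread
+	 * of the calling process:
+	 *
+	 *	FILE *f;
+	 *
+	 *	f = fopen("/sys/fs/cgroups/systemd/foo/cgroup.procs", "w");
+	 *	(void) fprintf(f, "%d\n", (int)getpid());
+	 *	(void) fclose(f);
+	 *
+	 * A write of pid 1 is taken to mean the zone's init process, per
+	 * the translation just above.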
+ */ + if (typ == CG_WR_TASKS) { + error = ESRCH; + } else { + error = 0; + } + +restart: + mutex_enter(&pidlock); + + p = cgrp_p_for_wr(pid, typ); + if (p == NULL) { + mutex_exit(&pidlock); + return (ESRCH); + } + + /* + * Fail writes for pids for which there is no corresponding process, + * system processes, a pid of 0, the pid for our zsched process, + * anything the security policy doesn't allow us to look at, and + * processes that are not in the zone. + */ + if (p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == curproc->p_zone->zone_zsched->p_pid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_zone->zone_id != curproc->p_zone->zone_id) { + mutex_exit(&pidlock); + return (ESRCH); + } + + /* + * Ignore writes for PID which is not an lx-branded process or with + * no threads. + */ + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL || + p->p_flag & SEXITING) { + mutex_exit(&p->p_lock); + return (0); + } + + mutex_enter(&cgm->cg_contents); + + ncn = cgrp_cg_hash_lookup(cgm, cg_id); + VERIFY(ncn != NULL); + + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd != NULL && plwpd->br_cgroupid != cg_id) { + if (typ == CG_WR_PROCS) { + if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) { + /* + * We dropped all of the locks so we + * need to start over. + */ + goto restart; + } + + } else if (plwpd->br_pid == pid) { + /* type is CG_WR_TASKS and we found the task */ + error = 0; + if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) { + goto done; + } else { + break; + } + } + } + t = t->t_forw; + } while (t != p->p_tlist); + + mutex_exit(&cgm->cg_contents); + mutex_exit(&p->p_lock); +done: + + return (error); +} + +/* + * User-level is writing a pid string. We need to get that string and convert + * it to a pid. The user-level code has to completely write an entire pid + * string at once. The user-level code could write multiple strings (delimited + * by newline) although that is frowned upon. However, we must handle this + * case too. Thus we consume the input one byte at a time until we get a whole + * pid string. We can't consume more than a byte at a time since otherwise we + * might be left with a partial pid string. + */ +static int +cgrp_get_pid_str(struct uio *uio, pid_t *pid) +{ + char buf[16]; /* big enough for a pid string */ + int i; + int error; + char *p = &buf[0]; + char *ep; + long pidnum; + + bzero(buf, sizeof (buf)); + for (i = 0; uio->uio_resid > 0 && i < sizeof (buf); i++, p++) { + error = uiomove(p, 1, UIO_WRITE, uio); + if (error != 0) + return (error); + if (buf[i] == '\n') { + buf[i] = '\0'; + break; + } + } + + if (buf[0] == '\0' || i >= sizeof (buf)) /* no input or too long */ + return (EINVAL); + + error = ddi_strtol(buf, &ep, 10, &pidnum); + if (error != 0 || *ep != '\0' || pidnum > maxpid || pidnum < 0) + return (EINVAL); + + *pid = (pid_t)pidnum; + return (0); +} + +static int +cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio) +{ + int error; + uint_t value; + + /* + * This is cheesy but since we only take a 0 or 1 value we can + * let the pid_str function do the uio string conversion. + */ + error = cgrp_get_pid_str(uio, (pid_t *)&value); + if (error != 0) + return (error); + + if (value != 0 && value != 1) + return (EINVAL); + + /* + * The flag is on the containing dir. We don't bother taking the + * cg_contents lock since this is a simple assignment. 
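+	 *
+	 * From user level this amounts to, for example (hypothetical path):
+	 *
+	 *	int fd;
+	 *
+	 *	fd = open("/sys/fs/cgroups/systemd/foo/notify_on_release",
+	 *	    O_WRONLY);
+	 *	(void) write(fd, "1\n", 2);
+	 *	(void) close(fd);
+	 *
+	 * after which this cgroup triggers the release agent once it
+	 * becomes empty (assuming the agent path has been set).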
+ */ + cn->cgn_parent->cgn_notify = value; + return (0); +} + +static int +cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int error; + int len; + char *wrp; + + len = uio->uio_offset + uio->uio_resid; + if (len > MAXPATHLEN) + return (EFBIG); + + mutex_enter(&cgm->cg_contents); + + wrp = &cgm->cg_agent[uio->uio_offset]; + error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio); + cgm->cg_agent[len] = '\0'; + if (len > 1 && cgm->cg_agent[len - 1] == '\n') + cgm->cg_agent[len - 1] = '\0'; + + mutex_exit(&cgm->cg_contents); + return (error); +} + +static int +cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, + cgrp_wr_type_t typ) +{ + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + int error; + pid_t pidnum; + + while (uio->uio_resid > 0) { + error = cgrp_get_pid_str(uio, &pidnum); + if (error != 0) + return (error); + + error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ); + if (error != 0) + return (error); + } + + return (0); +} + +static int +cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int error = 0; + rlim64_t limit = uio->uio_llimit; + + ASSERT(CGNTOV(cn)->v_type == VREG); + + if (uio->uio_loffset < 0) + return (EINVAL); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + if (uio->uio_loffset >= MAXOFF_T) + return (EFBIG); + + if (uio->uio_resid == 0) + return (0); + + if (limit > MAXOFF_T) + limit = MAXOFF_T; + + switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_wr_notify(cn, uio); + break; + case CG_PROCS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS); + break; + case CG_REL_AGENT: + error = cgrp_wr_rel_agent(cgm, uio); + break; + case CG_TASKS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS); + break; + default: + VERIFY(0); + } + + return (error); +} + +/* + * Read value from the notify_on_release pseudo file on the parent node + * (which is the actual cgroup node). We don't bother taking the cg_contents + * lock since it's a single instruction so an empty group action/read will + * only see one value or the other. + */ +/* ARGSUSED */ +static int +cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int len; + int error = 0; + char buf[16]; + char *rdp; + /* the flag is on the containing dir */ + uint_t value = cn->cgn_parent->cgn_notify; + + len = snprintf(buf, sizeof (buf), "%u\n", value); + if (uio->uio_offset > len) + return (0); + + len -= uio->uio_offset; + rdp = &buf[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + return (error); +} + +/* + * Read value from the release_agent pseudo file. + */ +static int +cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int len; + int error = 0; + char *rdp; + + mutex_enter(&cgm->cg_contents); + + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len = strlen(cgm->cg_agent); + if (uio->uio_offset > len) { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len -= uio->uio_offset; + rdp = &cgm->cg_agent[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + + mutex_exit(&cgm->cg_contents); + + return (error); +} + +/* + * Read pids from the cgroup.procs pseudo file. We have to look at all of the + * processes to find applicable ones, then report pids for any process which + * has all of its threads in the same cgroup. 
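+ *
+ * The result reads like any other text file from user level; an illustrative
+ * sketch that lists the member processes of one cgroup (hypothetical path):
+ *
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		char line[32];
+ *		FILE *f;
+ *
+ *		f = fopen("/sys/fs/cgroups/systemd/foo/cgroup.procs", "r");
+ *		if (f == NULL)
+ *			return (1);
+ *		while (fgets(line, sizeof (line), f) != NULL)
+ *			(void) printf("member pid %s", line);
+ *		return (fclose(f) == 0 ? 0 : 1);
+ *	}
+ *
+ * A process only shows up here when every one of its threads is in this
+ * cgroup; threads are reported individually by the 'tasks' file instead.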
+ */ +static int +cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int i; + ssize_t offset = 0; + ssize_t uresid; + zoneid_t zoneid = curproc->p_zone->zone_id; + int error = 0; + pid_t initpid = curproc->p_zone->zone_proc_initpid; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + + /* Scan all of the process entries */ + for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) { + proc_t *p; + ssize_t len; + pid_t pid; + char buf[16]; + char *rdp; + kthread_t *t; + boolean_t in_cg; + + mutex_enter(&pidlock); + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, its not an + * lx-branded process and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_brand != &lx_brand || + p->p_zone->zone_id != zoneid) { + mutex_exit(&pidlock); + continue; + } + + mutex_enter(&p->p_lock); + if ((t = p->p_tlist) == NULL) { + /* no threads, skip it */ + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + continue; + } + + /* + * Check if all threads are in this cgroup. + */ + in_cg = B_TRUE; + mutex_enter(&cgm->cg_contents); + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd == NULL || plwpd->br_cgroupid != cg_id) { + in_cg = B_FALSE; + break; + } + + t = t->t_forw; + } while (t != p->p_tlist); + mutex_exit(&cgm->cg_contents); + + mutex_exit(&p->p_lock); + if (!in_cg) { + /* + * This proc, or at least one of its threads, is not + * in this cgroup. + */ + mutex_exit(&pidlock); + continue; + } + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc struct + */ + if (p->p_pid == initpid) { + pid = 1; + } else { + pid = p->p_pid; + } + + mutex_exit(&pidlock); + + /* + * Generate pid line and write all or part of it if we're + * in the right spot within the pseudo file. + */ + len = snprintf(buf, sizeof (buf), "%u\n", pid); + if ((offset + len) > uio->uio_offset) { + int diff = (int)(uio->uio_offset - offset); + + ASSERT(diff < len); + offset += diff; + rdp = &buf[diff]; + len -= diff; + if (len > uresid) + len = uresid; + + error = uiomove(rdp, len, UIO_READ, uio); + if (error != 0) + return (error); + } + offset += len; + } + + return (0); +} + +/* + * We are given a locked process we know is valid, report on any of its thresds + * that are in the cgroup. + */ +static int +cgrp_rd_proc_tasks(uint_t cg_id, proc_t *p, pid_t initpid, ssize_t *offset, + struct uio *uio) +{ + int error = 0; + uint_t tid; + char buf[16]; + char *rdp; + kthread_t *t; + + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * Report all threads in this cgroup. + */ + t = p->p_tlist; + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd == NULL) { + t = t->t_forw; + continue; + } + + if (plwpd->br_cgroupid == cg_id) { + int len; + + /* + * Convert taskid to the Linux default of 1 if + * we're the zone's init process. 
+ */
+ tid = plwpd->br_pid;
+ if (tid == initpid)
+ tid = 1;
+
+ len = snprintf(buf, sizeof (buf), "%u\n", tid);
+ if ((*offset + len) > uio->uio_offset) {
+ int diff;
+
+ diff = (int)(uio->uio_offset - *offset);
+ ASSERT(diff < len);
+ *offset = *offset + diff;
+ rdp = &buf[diff];
+ len -= diff;
+ if (len > uio->uio_resid)
+ len = uio->uio_resid;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ if (error != 0)
+ return (error);
+ }
+ *offset = *offset + len;
+ }
+
+ t = t->t_forw;
+ } while (t != p->p_tlist && uio->uio_resid > 0);
+
+ return (0);
+}
+
+/*
+ * Read PIDs from the tasks pseudo file. In order to do this, the process
+ * table is walked, searching for entries which are in the correct state and
+ * match this zone. The LX emulated PIDs will be reported from branded entries
+ * which fulfill the criteria. Since records are being emulated for every task
+ * in the process, PR_LOCK is acquired to prevent changes during output.
+ *
+ * Note: If the buffer is filled and the accessing process is forced into a
+ * subsequent read, the reported threads may change while locks are dropped in
+ * the meantime.
+ */
+static int
+cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int i;
+ ssize_t offset = 0;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ cred_t *cred = CRED();
+ int error = 0;
+ pid_t initpid = curproc->p_zone->zone_proc_initpid;
+ /* the cgroup ID is on the containing dir */
+ uint_t cg_id = cn->cgn_parent->cgn_id;
+
+ /* Scan all of the process entries */
+ for (i = 1; i < v.v_proc && uio->uio_resid > 0; i++) {
+ proc_t *p;
+
+ mutex_enter(&pidlock);
+ for (;;) {
+ if ((p = pid_entry(i)) == NULL) {
+ /* Quickly move onto the next slot */
+ if (++i < v.v_proc) {
+ continue;
+ } else {
+ mutex_exit(&pidlock);
+ break;
+ }
+ }
+
+ /*
+ * Check if this process would even be of interest to
+ * cgroupfs before attempting to acquire its PR_LOCK.
+ */
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ if (p->p_brand != &lx_brand ||
+ p->p_zone->zone_id != zoneid) {
+ mutex_exit(&p->p_lock);
+ p = NULL;
+ break;
+ }
+
+ /* Attempt to grab P_PR_LOCK. */
+ error = sprtrylock_proc(p);
+ if (error == 0) {
+ /* Success */
+ break;
+ } else if (error < 0) {
+ /*
+ * This process is not in a state where
+ * P_PR_LOCK can be acquired. It either
+ * belongs to the system or is a zombie.
+ * Regardless, give up and move on.
+ */
+ mutex_exit(&p->p_lock);
+ p = NULL;
+ break;
+ } else {
+ /*
+ * Wait until P_PR_LOCK is no longer contended
+ * and attempt to acquire it again. Since the
+ * process may have changed state, the entry
+ * lookup must be repeated.
+ */
+ sprwaitlock_proc(p);
+ mutex_enter(&pidlock);
+ }
+ }
+
+ if (p == NULL) {
+ continue;
+ } else if (secpolicy_basic_procinfo(cred, p, curproc) != 0) {
+ sprunlock(p);
+ continue;
+ }
+
+ /* Shuffle locks and output the entry. 
*/ + mutex_exit(&p->p_lock); + mutex_enter(&cgm->cg_contents); + error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio); + mutex_exit(&cgm->cg_contents); + mutex_enter(&p->p_lock); + + sprunlock(p); + if (error != 0) { + return (error); + } + } + + return (0); +} + +static int +cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int error = 0; + + if (uio->uio_loffset >= MAXOFF_T) + return (0); + if (uio->uio_loffset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + + switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_rd_notify(cgm, cn, uio); + break; + case CG_PROCS: + error = cgrp_rd_procs(cgm, cn, uio); + break; + case CG_REL_AGENT: + error = cgrp_rd_rel_agent(cgm, uio); + break; + case CG_TASKS: + error = cgrp_rd_tasks(cgm, cn, uio); + break; + default: + VERIFY(0); + } + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, + struct caller_context *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); + int error; + + /* + * We don't support reading non-regular files + */ + if (vp->v_type == VDIR) + return (EISDIR); + if (vp->v_type != VREG) + return (EINVAL); + error = cgrp_rd(cgm, cn, uiop); + + return (error); +} + +/* ARGSUSED */ +static int +cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, + struct caller_context *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); + int error; + + /* + * We don't support writing to non-regular files + */ + if (vp->v_type != VREG) + return (EINVAL); + + if (ioflag & FAPPEND) { + /* In append mode start at end of file. */ + uiop->uio_loffset = cn->cgn_size; + } + + error = cgrp_wr(cgm, cn, uiop); + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + vap->va_type = vp->v_type; + vap->va_mode = cn->cgn_mode & MODEMASK; + vap->va_uid = cn->cgn_uid; + vap->va_gid = cn->cgn_gid; + vap->va_fsid = cn->cgn_fsid; + vap->va_nodeid = (ino64_t)cn->cgn_nodeid; + vap->va_nlink = cn->cgn_nlink; + vap->va_size = (u_offset_t)cn->cgn_size; + vap->va_atime = cn->cgn_atime; + vap->va_mtime = cn->cgn_mtime; + vap->va_ctime = cn->cgn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = cn->cgn_rdev; + vap->va_seq = cn->cgn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + mutex_exit(&cgm->cg_contents); + return (0); +} + +/*ARGSUSED4*/ +static int +cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error = 0; + struct vattr *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR) || + (vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE)) + return (EINVAL); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + get = &cn->cgn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. 
+ */ + error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, cgrp_taccess, + cn); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&cn->cgn_ctime); + +out: + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_taccess(cn, mode, cred); + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED3 */ +static int +cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + cgrp_node_t *cn = VTOCGN(dvp); + cgrp_mnt_t *cgm; + cgrp_node_t *ncn = NULL; + int error; + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* + * Null component name is a synonym for directory being searched. + */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(cn); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(cn, nm, &ncn, cred); + mutex_exit(&cgm->cg_contents); + + if (error == 0) { + ASSERT(ncn); + *vpp = CGNTOV(ncn); + } + + return (error); +} + +/* ARGSUSED */ +static int +cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap, + enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred, + int flag, caller_context_t *ct, vsecattr_t *vsecp) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; + int error; + + if (*nm == '\0') + return (EPERM); + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &cn, cred); + if (error == 0) { /* name found */ + ASSERT(cn); + + mutex_exit(&cgm->cg_contents); + /* + * Creating an existing file, allow it except for the following + * errors. + */ + if (exclusive == EXCL) { + error = EEXIST; + } else if ((CGNTOV(cn)->v_type == VDIR) && (mode & VWRITE)) { + error = EISDIR; + } else { + error = cgrp_taccess(cn, mode, cred); + } + if (error != 0) { + cgnode_rele(cn); + return (error); + } + *vpp = CGNTOV(cn); + return (0); + } + mutex_exit(&cgm->cg_contents); + + /* + * cgroups doesn't allow creation of additional, non-subsystem specific + * files in a dir + */ + return (EPERM); +} + +/* ARGSUSED3 */ +static int +cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred, + caller_context_t *ct, int flags) +{ + cgrp_node_t *parent = VTOCGN(dvp); + int error; + cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; + + /* + * Removal of subsystem-specific files is not allowed but we need + * to return the correct error if they try to remove a non-existent + * file. 
+ */ + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &cn, cred); + mutex_exit(&cgm->cg_contents); + if (error) + return (error); + + ASSERT(cn); + cgnode_rele(cn); + return (EPERM); +} + +/* ARGSUSED */ +static int +cgrp_link(struct vnode *dvp, struct vnode *srcvp, char *cnm, struct cred *cred, + caller_context_t *ct, int flags) +{ + /* cgroups doesn't support hard links */ + return (EPERM); +} + +/* + * Rename of subsystem-specific files is not allowed but we can rename + * directories (i.e. sub-groups). We cannot mv subdirs from one group to + * another so the src and dest vnode must be the same. + */ +/* ARGSUSED5 */ +static int +cgrp_rename( + struct vnode *odvp, /* source parent vnode */ + char *onm, /* source name */ + struct vnode *ndvp, /* destination parent vnode */ + char *nnm, /* destination name */ + struct cred *cred, + caller_context_t *ct, + int flags) +{ + cgrp_node_t *fromparent; + cgrp_node_t *toparent; + cgrp_node_t *fromcn = NULL; /* source cgrp_node */ + cgrp_mnt_t *cgm = VTOCGM(odvp); + int error, err; + + fromparent = VTOCGN(odvp); + toparent = VTOCGN(ndvp); + + if (fromparent != toparent) + return (EIO); + + /* discourage additional use of toparent */ + toparent = NULL; + + mutex_enter(&cgm->cg_contents); + + /* + * Look up cgrp_node of file we're supposed to rename. + */ + error = cgrp_dirlookup(fromparent, onm, &fromcn, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + if (fromcn->cgn_type != CG_CGROUP_DIR) { + error = EPERM; + goto done; + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. + */ + if (((error = cgrp_taccess(fromparent, VWRITE, cred)) != 0)) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromcn == fromparent + */ + if ((onm[0] == '.' && + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' && + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (fromparent == fromcn)) { + error = EINVAL; + goto done; + } + + /* + * Link source to new target + */ + error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME, + fromcn, (struct vattr *)NULL, + (cgrp_node_t **)NULL, cred); + + if (error) + goto done; + + /* + * Unlink from source. + */ + error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred); + + /* + * The following handles the case where our source cgrp_node was + * removed before we got to it. + */ + if (error == ENOENT) + error = 0; + + if (err == 0) { + vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct); + } + +done: + mutex_exit(&cgm->cg_contents); + cgnode_rele(fromcn); + + return (error); +} + +/* ARGSUSED5 */ +static int +cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, + struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_node_t *self = NULL; + cgrp_mnt_t *cgm = VTOCGM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT + * return from cgrp_dirlookup() is an "ok return". 
+ */ + if (parent->cgn_nlink == 0) + return (ENOENT); + + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &self, cred); + if (error == 0) { + ASSERT(self != NULL); + mutex_exit(&cgm->cg_contents); + cgnode_rele(self); + return (EEXIST); + } + if (error != ENOENT) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL, + va, &self, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + if (self != NULL) + cgnode_rele(self); + return (error); + } + mutex_exit(&cgm->cg_contents); + *vpp = CGNTOV(self); + return (0); +} + +/* ARGSUSED4 */ +static int +cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, + caller_context_t *ct, int flags) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_mnt_t *cgm; + cgrp_node_t *self = NULL; + struct vnode *vp; + int error = 0; + + /* + * Return error when removing . and .. + */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + error = cgrp_dirlookup(parent, nm, &self, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + vp = CGNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done; + } + if (self->cgn_type != CG_CGROUP_DIR) { + error = ENOTDIR; + goto done; + } + + cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp); + + /* + * Check for the existence of any sub-cgroup directories or tasks in + * the cgroup. + */ + if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) { + error = EEXIST; + /* + * Update atime because checking cn_dirents is logically + * equivalent to reading the directory + */ + gethrestime(&self->cgn_atime); + goto done; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + } else { + error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred); + } + + vn_vfsunlock(vp); + + if (parent->cgn_task_cnt == 0 && + parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, parent, B_FALSE); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + goto dropped; + } + +done: + mutex_exit(&cgm->cg_contents); +dropped: + vnevent_rmdir(CGNTOV(self), dvp, nm, ct); + cgnode_rele(self); + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, + caller_context_t *ct, int flags) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + cgrp_dirent_t *cdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + if (cn->cgn_dir == NULL) { + VERIFY(cn->cgn_nlink == 0); + mutex_exit(&cgm->cg_contents); + return (0); + } + + /* + * Get space for multiple directory entries + */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + /* LINTED: alignment */ + dp = (struct dirent64 *)outbuf; + + offset = 0; + cdp = cn->cgn_dir; + while (cdp) { + namelen = strlen(cdp->cgd_name); /* no +1 needed */ + offset = 
cdp->cgd_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) { + /* Buffer too small for any entries. */ + error = EINVAL; + } + break; + } + ASSERT(cdp->cgd_cgrp_node != NULL); + + /* use strncpy(9f) to zero out uninitialized bytes */ + + (void) strncpy(dp->d_name, cdp->cgd_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)cdp->cgd_cgrp_node->cgn_nodeid; + dp->d_off = (offset_t)cdp->cgd_offset + 1; + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + cdp = cdp->cgd_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. + */ + if (!cdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&cn->cgn_atime); + + mutex_exit(&cgm->cg_contents); + + kmem_free(outbuf, bufsize); + return (error); +} + +/* ARGSUSED */ +static int +cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm, + struct cred *cred, caller_context_t *ct, int flags) +{ + /* cgroups doesn't support symlinks */ + return (EPERM); +} + +/* ARGSUSED */ +static void +cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp); + + mutex_enter(&cgm->cg_contents); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. + */ + if (vp->v_count > 1 || cn->cgn_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + return; + } + + if (cn->cgn_forw == NULL) + cgm->cg_rootnode->cgn_back = cn->cgn_back; + else + cn->cgn_forw->cgn_back = cn->cgn_back; + cn->cgn_back->cgn_forw = cn->cgn_forw; + + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + + /* Here's our chance to send invalid event */ + vn_invalid(CGNTOV(cn)); + + vn_free(CGNTOV(cn)); + kmem_free(cn, sizeof (cgrp_node_t)); +} + +/* ARGSUSED */ +static int +cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); +} + +/* ARGSUSED */ +static int +cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + return (write_lock); +} + +/* ARGSUSED */ +static void +cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ +} + +static int +cgrp_pathconf(struct vnode *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + switch (cmd) { + case _PC_XATTR_EXISTS: + if (vp->v_vfsp->vfs_flag & VFS_XATTR) { + *valp = 0; /* assume no attributes */ + error = 0; /* okay to ask */ + } else { + error = EINVAL; + } + break; + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_type == VREG || vp->v_type == VDIR); + error = 0; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *cgrp_vnodeops; + +const fs_operation_def_t cgrp_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = cgrp_open }, + VOPNAME_CLOSE, { .vop_close = cgrp_close }, + VOPNAME_READ, { .vop_read = cgrp_read }, + VOPNAME_WRITE, { .vop_write = cgrp_write }, + VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr }, + VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr }, + VOPNAME_ACCESS, { .vop_access = cgrp_access }, + VOPNAME_LOOKUP, { .vop_lookup = cgrp_lookup }, + VOPNAME_CREATE, { .vop_create = cgrp_create }, + VOPNAME_REMOVE, { .vop_remove = cgrp_remove }, + VOPNAME_LINK, { .vop_link = cgrp_link }, + VOPNAME_RENAME, { .vop_rename = cgrp_rename }, + VOPNAME_MKDIR, { .vop_mkdir = cgrp_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = cgrp_rmdir }, + VOPNAME_READDIR, { .vop_readdir = cgrp_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = cgrp_symlink }, + VOPNAME_INACTIVE, { .vop_inactive = cgrp_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = cgrp_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = cgrp_rwunlock }, + VOPNAME_SEEK, { .vop_seek = cgrp_seek }, + VOPNAME_PATHCONF, { .vop_pathconf = cgrp_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h new file mode 100644 index 0000000000..437b0b6162 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd.h @@ -0,0 +1,244 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LXD_H +#define _LXD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxd.h: declarations, data structures and macros for lxd (lxd devfs). 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> +#include <sys/lx_types.h> + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +/* + * It's unlikely that we need to create more than 50-60 subdirs/symlinks + * as front files so we size the file system hash for 2x that number. + * The back devfs typically has ~80 nodes so this is also a comfortable size + * for the back hash table. + */ +#define LXD_HASH_SZ 128 + +#define LXD_BACK_HASH(v) ((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1)) + +#define LXD_NM_HASH(ldn, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(ldn) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + hash &= (LXD_HASH_SZ - 1); \ + } + + +enum lxd_node_type { LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT }; + +typedef struct lxd_dev_attr { + list_node_t lxda_link; + char lxda_name[MAXPATHLEN]; + uid_t lxda_uid; + gid_t lxda_gid; + mode_t lxda_mode; +} lxd_dev_attr_t; + +/* + * lxd per-mount data structure. + * + * All fields are protected by lxd_contents. + * File renames on a specific file system are protected lxdm_renamelck. + */ +typedef struct lxd_mnt { + struct vfs *lxdm_vfsp; /* filesystem's vfs struct */ + struct lxd_node *lxdm_rootnode; /* root lxd_node */ + char *lxdm_mntpath; /* name of lxd mount point */ + dev_t lxdm_dev; /* unique dev # of mounted `device' */ + kmutex_t lxdm_contents; /* per-mount lock */ + kmutex_t lxdm_renamelck; /* rename lock for this mount */ + kmutex_t lxdm_attrlck; /* per-mount attr. file lock */ + list_t lxdm_devattrs; /* list of device attr. settings */ + uint_t lxdm_gen; /* node ID source for files */ + + /* protects buckets in both "dir ent" and "back" hash tables */ + kmutex_t lxdm_hash_mutex[LXD_HASH_SZ]; + + /* per-mount data for "back" vnodes in the fs */ + uint_t lxdm_back_refcnt; /* # outstanding "back" vnodes */ + struct lxd_node *lxdm_back_htable[LXD_HASH_SZ]; + + /* + * Per-mount directory data for "front" nodes in the fs. + * Each front node has a directory entry but directory entries can live + * on either front or back nodes. + */ + uint_t lxdm_dent_refcnt; /* # outstanding dir ents */ + struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ]; +} lxd_mnt_t; + +/* + * lxd_node is the file system dependent node for lxd. + * + * The node is used to represent both front and back files. For front files + * the node can represent either a directory or symlink. 
+ */ +typedef struct lxd_node { + enum lxd_node_type lxdn_type; + + /* Data for "front" nodes */ + struct lxd_node *lxdn_prev; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_next; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_parent; /* dir containing this node */ + krwlock_t lxdn_rwlock; /* serialize mods/dir updates */ + kmutex_t lxdn_tlock; /* time, flag, and nlink lock */ + + /* these could be in a union ala tmpfs but not really necessary */ + uint_t lxdn_dirents; /* number of dirents */ + struct lxd_dirent *lxdn_dir; /* dirent list */ + char *lxdn_symlink; /* pointer to symlink */ + struct vattr lxdn_attr; /* attributes */ + + /* Hash table link */ + struct lxd_node *lxdn_hnxt; /* link in per-mount entry */ + /* hash table */ + vnode_t *lxdn_vnode; /* vnode for this lxd_node */ + + vnode_t *lxdn_real_vp; /* back file - real vnode */ +} lxd_node_t; + +/* + * Attributes + */ +#define lxdn_mask lxdn_attr.va_mask +#define lxdn_mode lxdn_attr.va_mode +#define lxdn_uid lxdn_attr.va_uid +#define lxdn_gid lxdn_attr.va_gid +#define lxdn_fsid lxdn_attr.va_fsid +#define lxdn_nodeid lxdn_attr.va_nodeid +#define lxdn_nlink lxdn_attr.va_nlink +#define lxdn_size lxdn_attr.va_size +#define lxdn_atime lxdn_attr.va_atime +#define lxdn_mtime lxdn_attr.va_mtime +#define lxdn_ctime lxdn_attr.va_ctime +#define lxdn_rdev lxdn_attr.va_rdev +#define lxdn_blksize lxdn_attr.va_blksize +#define lxdn_nblocks lxdn_attr.va_nblocks +#define lxdn_seq lxdn_attr.va_seq + +/* + * lx devfs conversion macros + */ +#define VFSTOLXDM(vfsp) ((lxd_mnt_t *)(vfsp)->vfs_data) +#define VTOLXDM(vp) ((lxd_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOLDN(vp) ((lxd_node_t *)(vp)->v_data) +#define LDNTOV(ln) ((ln)->lxdn_vnode) +#define ldnode_hold(ln) VN_HOLD(LDNTOV(ln)) +#define ldnode_rele(ln) VN_RELE(LDNTOV(ln)) + +#define REALVP(vp) (VTOLDN(vp)->lxdn_real_vp) + +/* + * front directories are made up of a linked list of lxd_dirent structures + * hanging off directory lxdn_nodes. File names are not fixed length, but are + * null terminated. + */ +typedef struct lxd_dirent { + lxd_node_t *lddir_node; /* lxd node for this file */ + struct lxd_dirent *lddir_next; /* next directory entry */ + struct lxd_dirent *lddir_prev; /* prev directory entry */ + uint_t lddir_offset; /* "offset" of dir entry */ + uint_t lddir_hash; /* a hash of lddir_name */ + struct lxd_dirent *lddir_link; /* linked via hash table */ + lxd_node_t *lddir_parent; /* parent, dir we are in */ + char *lddir_name; /* null terminated */ +} lxd_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +typedef struct lxd_minor_translator { + char *lxd_mt_path; /* illumos minor node path */ + minor_t lxd_mt_minor; /* illumos minor node number */ + int lxd_mt_lx_major; /* linux major node number */ + int lxd_mt_lx_minor; /* linux minor node number */ +} lxd_minor_translator_t; + +enum lxd_xl_tp { DTT_INVALID, DTT_LIST, DTT_CUSTOM }; + +#define xl_list lxd_xl_minor.lxd_xl_list +#define xl_custom lxd_xl_minor.lxd_xl_custom + +typedef struct lxd_devt_translator { + char *lxd_xl_driver; /* driver name */ + major_t lxd_xl_major; /* driver number */ + + enum lxd_xl_tp lxd_xl_type; /* dictates how we intrep. 
xl_minor */ + union { + uintptr_t lxd_xl_foo; /* required to compile */ + lxd_minor_translator_t *lxd_xl_list; + void (*lxd_xl_custom)(dev_t, dev_t *); + } lxd_xl_minor; +} lxd_devt_translator_t; + +extern struct vnodeops *lxd_vnodeops; +extern lxd_devt_translator_t lxd_devt_translators[]; + +vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *); +void lxd_free_back_node(lxd_node_t *); +int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *); +int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *, + lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *); +void lxd_dirinit(lxd_node_t *, lxd_node_t *); +int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *); +void lxd_dirtrunc(lxd_node_t *); +void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *); +int lxd_naccess(void *, int, cred_t *); + +void lxd_save_attrs(lxd_mnt_t *, vnode_t *); +void lxd_apply_db(lxd_mnt_t *); + +#endif /* KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXD_H */ diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c new file mode 100644 index 0000000000..02d396a36d --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> + +#include "lxd.h" + +#define LX_ATTR_FILE "/etc/.lxd_dev_attr" + +#define RD_BUFSIZE MAXPATHLEN +#define ENTRY_BUFSIZE (MAXPATHLEN + 32) + +static int +lxd_db_open(int fmode, vnode_t **vpp) +{ + return (vn_open(LX_ATTR_FILE, UIO_SYSSPACE, fmode, + (int)(0644 & MODEMASK), vpp, CRCREAT, PTOU(curproc)->u_cmask)); +} + +static int +lxd_wr_entry(vnode_t *wvn, off_t offset, char *entry) +{ + int len, err; + struct uio auio; + struct iovec aiov; + + len = strlen(entry); + aiov.iov_base = entry; + aiov.iov_len = len; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = len; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = FWRITE; + auio.uio_extflg = UIO_COPY_DEFAULT; + + (void) VOP_RWLOCK(wvn, V_WRITELOCK_TRUE, NULL); + err = VOP_WRITE(wvn, &auio, FAPPEND, CRED(), NULL); + VOP_RWUNLOCK(wvn, V_WRITELOCK_TRUE, NULL); + + if (err != 0) + return (0); + return (len); +} + +/* + * Given an entry, apply a uid, gid and mode change to the given device. There + * is no strtok in the kernel but it's easy to tokenize the entry ourselves. 
+ * + * entries have the form (newline removed by caller): + * path uid gid mode\0 + */ +static int +lxd_apply_entry(char *entry, char **dpath, uid_t *uidp, gid_t *gidp, + mode_t *modep) +{ + char *dp, *up, *gp, *mp, *ep; + long uid, gid, mode; + int error, res = 0; + vnode_t *vp; + vattr_t va; + + dp = entry; + + /* find and delimit the first field (device name) */ + for (up = dp; *up != ' ' && *up != '\0'; up++) + ; + if (*up != ' ') + return (-1); + *up++ = '\0'; + + /* find and delimit the second field (uid) */ + for (gp = up; *gp != ' ' && *gp != '\0'; gp++) + ; + if (*gp != ' ') + return (-1); + *gp++ = '\0'; + + /* find and delimit the third field (gid) */ + for (mp = gp; *mp != ' ' && *mp != '\0'; mp++) + ; + if (*mp != ' ') + return (-1); + *mp++ = '\0'; + + /* validate the fourth field (mode) */ + ep = mp + strlen(mp); + if (*ep != '\0') + return (-1); + + if (*dp != '/') + return (-1); + + error = ddi_strtol(up, &ep, 10, &uid); + if (error != 0 || *ep != '\0' || uid > MAXUID || uid < 0) + return (-1); + + error = ddi_strtol(gp, &ep, 10, &gid); + if (error != 0 || *ep != '\0' || gid > MAXUID || gid < 0) + return (-1); + + /* note that the mode is octal */ + error = ddi_strtol(mp, &ep, 8, &mode); + if (error != 0 || *ep != '\0' || mode > 0777 || mode < 0) + return (-1); + + if (lookupname(dp, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0) { + /* + * It's likely the device is no longer visible to the zone. + * No matter the reason, we indicate failure. + */ + return (-1); + } + + va.va_mask = AT_UID | AT_GID | AT_MODE; + va.va_uid = (uid_t)uid; + va.va_gid = (gid_t)gid; + va.va_mode = (mode_t)mode; + + if (VOP_SETATTR(vp, &va, 0, CRED(), NULL) != 0) + res = -1; + + VN_RELE(vp); + + *dpath = dp; + *uidp = (uid_t)uid; + *gidp = (gid_t)gid; + *modep = (mode_t)mode; + return (res); +} + +/* + * Return true if this is a pre-existing record. + */ +static boolean_t +lxd_save_devattr(lxd_mnt_t *lxdm, char *dpath, uid_t uid, gid_t gid, + mode_t mode) +{ + lxd_dev_attr_t *da; + + da = list_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + if (strcmp(dpath, da->lxda_name) == 0) { + da->lxda_uid = uid; + da->lxda_gid = gid; + da->lxda_mode = mode; + return (B_TRUE); + } + da = list_next(&lxdm->lxdm_devattrs, da); + } + + da = kmem_zalloc(sizeof (lxd_dev_attr_t), KM_SLEEP); + (void) strlcpy(da->lxda_name, dpath, sizeof (da->lxda_name)); + da->lxda_uid = uid; + da->lxda_gid = gid; + da->lxda_mode = mode; + + list_insert_tail(&lxdm->lxdm_devattrs, da); + return (B_FALSE); +} + +static void +lxd_save_db(lxd_mnt_t *lxdm) +{ + lxd_dev_attr_t *da; + char *entry; + vnode_t *wvn; + off_t woff = 0; + + if (list_is_empty(&lxdm->lxdm_devattrs)) { + /* The attribute file is no longer needed. 
*/
+ (void) vn_remove(LX_ATTR_FILE, UIO_SYSSPACE, RMFILE);
+ return;
+ }
+
+ if (lxd_db_open(FWRITE | FCREAT | FTRUNC, &wvn) != 0)
+ return;
+
+ entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP);
+
+ woff = lxd_wr_entry(wvn, woff, "# DO NOT EDIT: this file is "
+ "automatically maintained for lx container devices\n");
+
+ da = list_head(&lxdm->lxdm_devattrs);
+ while (da != NULL) {
+ (void) snprintf(entry, ENTRY_BUFSIZE, "%s %d %d %o\n",
+ da->lxda_name, da->lxda_uid, da->lxda_gid,
+ da->lxda_mode & 0777);
+ woff += lxd_wr_entry(wvn, woff, entry);
+ da = list_next(&lxdm->lxdm_devattrs, da);
+ }
+
+ (void) VOP_CLOSE(wvn, FWRITE, 1, woff, CRED(), NULL);
+
+ kmem_free(entry, ENTRY_BUFSIZE);
+}
+
+/*
+ * This function records the uid, gid and mode information for an lx devfs
+ * block device node after a chown/chmod setattr operation so that these
+ * changes can persist across reboots. Since the actual setattr has
+ * already succeeded, the tracking of these changes is done on a "best effort"
+ * basis. That is, if we fail to record the change for some reason, the setattr
+ * will still return success. The vp passed in is the "real vp" for the back
+ * device node.
+ */
+void
+lxd_save_attrs(lxd_mnt_t *lxdm, vnode_t *vp)
+{
+ vattr_t va;
+ char devpath[MAXPATHLEN];
+
+ /* the path returned is relative to the zone's root */
+ if (vnodetopath(curproc->p_zone->zone_rootvp, vp, devpath,
+ sizeof (devpath), CRED()) != 0)
+ return;
+
+ va.va_mask = AT_MODE | AT_UID | AT_GID;
+
+ /*
+ * We just set attrs, so the getattr shouldn't fail. If the device
+ * is not a block device we don't persist the change.
+ */
+ if (VOP_GETATTR(vp, &va, 0, CRED(), NULL) != 0 ||
+ ((va.va_mode & S_IFBLK) != S_IFBLK))
+ return;
+
+ /*
+ * We serialize all updates to the attribute DB file. In practice this
+ * should not be a problem since concurrent device file mode changes
+ * are rare.
+ */
+ mutex_enter(&lxdm->lxdm_attrlck);
+
+ (void) lxd_save_devattr(lxdm, devpath, va.va_uid, va.va_gid,
+ va.va_mode & 0777);
+ lxd_save_db(lxdm);
+
+ mutex_exit(&lxdm->lxdm_attrlck);
+}
+
+/*
+ * Re-apply the persistent attribute settings to the devices when this lx
+ * devfs is mounted. As with lxd_save_attrs, this is done on a best-effort
+ * basis and we won't prevent the mount if there is a problem. No locking is
+ * needed while reading the DB file since this action is performed during the
+ * mount of the devfs.
+ */
+void
+lxd_apply_db(lxd_mnt_t *lxdm)
+{
+ vnode_t *rvn;
+ char *buf, *entry, *bp, *ep;
+ struct uio auio;
+ struct iovec aiov;
+ size_t cnt, len, ecnt, roff;
+ char *devpath;
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+ boolean_t needs_update = B_FALSE;
+
+ if (lxd_db_open(FREAD, &rvn) != 0)
+ return;
+
+ buf = kmem_alloc(RD_BUFSIZE, KM_SLEEP);
+ entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP);
+
+ roff = 0;
+ ep = entry;
+ ecnt = 0;
+ (void) VOP_RWLOCK(rvn, V_WRITELOCK_FALSE, NULL);
+loop:
+ aiov.iov_base = buf;
+ aiov.iov_len = RD_BUFSIZE;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = roff;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = RD_BUFSIZE;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ (void) VOP_READ(rvn, &auio, 0, CRED(), NULL);
+
+ len = RD_BUFSIZE - auio.uio_resid;
+ roff += len;
+
+ if (len > 0) {
+ for (bp = buf, cnt = 0; cnt < len; bp++, cnt++) {
+
+ /*
+ * We have an improperly formed entry in the file (too
+ * long). In an attempt to recover we reset the entry
+ * pointer so we can read the rest of the line and try
+ * to absorb the bad line. 
The code in lxd_apply_entry + * will handle any malformed or inapplicable entries. + */ + if (ecnt >= (ENTRY_BUFSIZE - 1)) { + ep = entry; + ecnt = 0; + needs_update = B_TRUE; + } + + if (*bp == '\n') { + *ep = '\0'; + + /* skip comments */ + if (entry[0] != '#') { + if (lxd_apply_entry(entry, &devpath, + &uid, &gid, &mode) != 0 || + lxd_save_devattr(lxdm, devpath, + uid, gid, mode)) { + /* + * An invalid entry, a + * non-existent device node or + * a duplicate entry. + */ + needs_update = B_TRUE; + } + } + ep = entry; + ecnt = 0; + } else { + *ep++ = *bp; + ecnt++; + } + } + goto loop; + } + VOP_RWUNLOCK(rvn, V_WRITELOCK_FALSE, NULL); + + kmem_free(buf, RD_BUFSIZE); + kmem_free(entry, ENTRY_BUFSIZE); + + (void) VOP_CLOSE(rvn, FREAD, 1, 0, CRED(), NULL); + + if (needs_update) + lxd_save_db(lxdm); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c new file mode 100644 index 0000000000..0d056ab167 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c @@ -0,0 +1,1003 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "lxd.h" + +#define LXD_HASH_SIZE 8192 /* must be power of 2 */ +#define LXD_MUTEX_SIZE 64 + + +#define MODESHIFT 3 + +typedef enum lxd_nodehold { + NOHOLD, + HOLD +} lxd_nodehold_t; + +/* + * The following functions maintain the per-mount "front" files. 
+ */ +static void +lxd_save_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(de->lddir_parent, de->lddir_name, hash); + de->lddir_hash = hash; + + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + ASSERT(de->lddir_link == NULL); + de->lddir_link = lxdm->lxdm_dent_htable[hash]; + lxdm->lxdm_dent_htable[hash] = de; + mutex_exit(hmtx); + + atomic_inc_32(&lxdm->lxdm_dent_refcnt); +} + +static void +lxd_rm_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + lxd_dirent_t **prevpp; + kmutex_t *hmtx; + + hash = de->lddir_hash; + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + prevpp = &lxdm->lxdm_dent_htable[hash]; + while (*prevpp != de) + prevpp = &(*prevpp)->lddir_link; + *prevpp = de->lddir_link; + de->lddir_link = NULL; + mutex_exit(hmtx); + + ASSERT(lxdm->lxdm_dent_refcnt > 0); + atomic_dec_32(&lxdm->lxdm_dent_refcnt); +} + +static lxd_dirent_t * +lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold, + lxd_node_t **found) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(parent)); + lxd_dirent_t *de; + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(parent, name, hash); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + de = lxdm->lxdm_dent_htable[hash]; + while (de) { + if (de->lddir_hash == hash && de->lddir_parent == parent && + strcmp(de->lddir_name, name) == 0) { + lxd_node_t *ldn = de->lddir_node; + + if (do_hold == HOLD) { + ASSERT(ldn != NULL); + ldnode_hold(ldn); + } + if (found != NULL) + *found = ldn; + mutex_exit(hmtx); + return (de); + } + + de = de->lddir_link; + } + mutex_exit(hmtx); + return (NULL); +} + +int +lxd_naccess(void *vcp, int mode, cred_t *cr) +{ + lxd_node_t *ldn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in lxd_node. 
+ */ + if (crgetuid(cr) != ldn->lxdn_uid) { + shift += MODESHIFT; + if (groupmember(ldn->lxdn_gid, cr) == 0) + shift += MODESHIFT; + } + + if (ldn->lxdn_type == LXDNT_FRONT) + return (secpolicy_vnode_access2(cr, LDNTOV(ldn), + ldn->lxdn_uid, ldn->lxdn_mode << shift, mode)); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL)); +} + +static lxd_node_t * +lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm) +{ + lxd_node_t *l; + + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; l = l->lxdn_hnxt) { + if (l->lxdn_real_vp == vp) { + ASSERT(l->lxdn_type == LXDNT_BACK); + + VN_HOLD(LDNTOV(l)); + return (l); + } + } + return (NULL); +} + +static void +lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm) +{ + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(l->lxdn_real_vp != NULL); + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + atomic_inc_32(&lxdm->lxdm_back_refcnt); + + l->lxdn_hnxt = lxdm->lxdm_back_htable[hash]; + lxdm->lxdm_back_htable[hash] = l; +} + + +struct vnode * +lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + + hash = LXD_BACK_HASH(vp); /* Note: hashing with realvp */ + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + l = lxd_find_back(vp, hash, lxdm); + if (l == NULL) { + vnode_t *nvp; + + l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + nvp = vn_alloc(KM_SLEEP); + + rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + + l->lxdn_vnode = nvp; + l->lxdn_type = LXDNT_BACK; + l->lxdn_real_vp = vp; + + VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type, + vp->v_rdev); + nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN)); + vn_setops(nvp, lxd_vnodeops); + nvp->v_data = (caddr_t)l; + + lxd_save_back(l, hash, lxdm); + vn_exists(vp); + } else { + VN_RELE(vp); + } + + mutex_exit(hmtx); + return (LDNTOV(l)); +} + +void +lxd_free_back_node(lxd_node_t *lp) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + lxd_node_t *lprev = NULL; + vnode_t *vp = LDNTOV(lp); + vnode_t *realvp = REALVP(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + /* in lxd_make_back_node we call lxd_find_back with the realvp */ + hash = LXD_BACK_HASH(realvp); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + mutex_enter(&vp->v_lock); + if (vp->v_count > 1) { + vp->v_count--; /* release our hold from vn_rele */ + mutex_exit(&vp->v_lock); + mutex_exit(hmtx); + return; + } + mutex_exit(&vp->v_lock); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; + lprev = l, l = l->lxdn_hnxt) { + + if (l != lp) + continue; + + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(lxdm->lxdm_back_refcnt > 0); + + atomic_dec_32(&lxdm->lxdm_back_refcnt); + vn_invalid(vp); + + if (lprev == NULL) { + lxdm->lxdm_back_htable[hash] = l->lxdn_hnxt; + } else { + lprev->lxdn_hnxt = l->lxdn_hnxt; + } + + mutex_exit(hmtx); + rw_destroy(&l->lxdn_rwlock); + mutex_destroy(&l->lxdn_tlock); + kmem_free(l, sizeof (lxd_node_t)); + vn_free(vp); + VN_RELE(realvp); + return; + } + + panic("lxd_free_back_node"); + /*NOTREACHED*/ +} +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found lxd_node with its vnode held. 
+ */ +int +lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr) +{ + int error; + + *foundnp = NULL; + if (parent->lxdn_vnode->v_type != VDIR) + return (ENOTDIR); + + if ((error = lxd_naccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + ldnode_hold(parent); + *foundnp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the lxdn_dir list + * so that it doesn't change out from underneath us. + * lxd_find_dirent() will pass back the lxd_node + * with a hold on it. + */ + + if (lxd_find_dirent(name, parent, HOLD, foundnp) != NULL) { + ASSERT(*foundnp); + return (0); + } + + return (ENOENT); +} + +/* + * Check if the source directory is in the path of the target directory. + * The target directory is locked by the caller. + */ +static int +lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent) +{ + int error = 0; + lxd_node_t *dir, *dotdot; + + ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock)); + ASSERT(toparent->lxdn_vnode->v_type == VDIR); + + dotdot = toparent->lxdn_parent; + if (dotdot == NULL) + return (ENOENT); + ldnode_hold(dotdot); + + if (dotdot == toparent) { + /* root of fs. search trivially satisfied. */ + ldnode_rele(dotdot); + return (0); + } + + for (;;) { + /* + * Return error for cases like "mv c c/d", + * "mv c c/d/e" and so on. + */ + if (dotdot == fromnode) { + ldnode_rele(dotdot); + error = EINVAL; + break; + } + + dir = dotdot; + dotdot = dir->lxdn_parent; + if (dotdot == NULL) { + ldnode_rele(dir); + error = ENOENT; + break; + } + ldnode_hold(dotdot); + + /* + * We're okay if we traverse the directory tree up to + * the root directory and don't run into the + * parent directory. + */ + if (dir == dotdot) { + ldnode_rele(dir); + ldnode_rele(dotdot); + break; + } + ldnode_rele(dir); + } + + return (error); +} + +static int +lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va, + enum de_op op, lxd_node_t **newnode, struct cred *cred) +{ + lxd_node_t *ldn; + + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + ldn = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + + ldn->lxdn_type = LXDNT_FRONT; + lxd_node_init(lxdm, ldn, NULL, va, cred); + + ldn->lxdn_vnode->v_rdev = ldn->lxdn_rdev = NODEV; + ldn->lxdn_vnode->v_type = va->va_type; + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + if (va->va_mask & AT_ATIME) + ldn->lxdn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + ldn->lxdn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + lxd_dirinit(dir, ldn); + } + + *newnode = ldn; + return (0); +} + +static int +lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name) +{ + lxd_dirent_t *dp, *pdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->lxdn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. 
*/ + if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (lxd_dirent_t); + dp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); + if (dp == NULL) + return (ENOSPC); + + ldn->lxdn_parent = dir; + + dir->lxdn_size += alloc_size; + dir->lxdn_dirents++; + dp->lddir_node = ldn; + dp->lddir_parent = dir; + + /* The directory entry and its name were allocated sequentially. */ + dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t); + (void) strcpy(dp->lddir_name, name); + + lxd_save_dirent(dp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in lxd_readdir. + */ + pdp = dir->lxdn_dir->lddir_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (pdp->lddir_next != NULL && + (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) { + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset); + pdp = pdp->lddir_next; + } + dp->lddir_offset = pdp->lddir_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's lddir_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. + */ + if (pdp->lddir_next == NULL && + pdp->lddir_offset > 2 * dir->lxdn_dirents) + dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next; + else + dir->lxdn_dir->lddir_prev = dp; + + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + dp->lddir_next = pdp->lddir_next; + if (dp->lddir_next) { + dp->lddir_next->lddir_prev = dp; + } + dp->lddir_prev = pdp; + pdp->lddir_next = dp; + + ASSERT(dp->lddir_next != dp); + ASSERT(dp->lddir_prev != dp); + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + return (0); +} + +/* + * Enter a directory entry for 'name' into directory 'dir' + * + * Returns 0 on success. 
+ */ +int +lxd_direnter( + lxd_mnt_t *lxdm, + lxd_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + lxd_node_t *fromparent, /* original directory if rename */ + lxd_node_t *ldn, /* existing lxd_node, if rename */ + struct vattr *va, + lxd_node_t **rnp, /* return lxd_node, if create/mkdir */ + cred_t *cr) +{ + lxd_dirent_t *dirp; + lxd_node_t *found = NULL; + int error = 0; + char *s; + + /* lxdn_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("lxd_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. + */ + if (op == DE_RENAME) { + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink == 0) { + mutex_exit(&ldn->lxdn_tlock); + return (ENOENT); + } + + if (ldn->lxdn_nlink == MAXLINK) { + mutex_exit(&ldn->lxdn_tlock); + return (EMLINK); + } + ldn->lxdn_nlink++; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + + /* + * This might be a "dangling detached directory" (it could have been + * removed, but a reference to it kept in u_cwd). Don't bother + * searching it, and with any luck the user will get tired of dealing + * with us and cd to some absolute pathway (thus in ufs, too). + */ + if (dir->lxdn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * If this is a rename of a directory and the parent is different + * (".." must be changed), then the source directory must not be in the + * directory hierarchy above the target, as this would orphan + * everything below the source directory. + */ + if (op == DE_RENAME) { + if (ldn == dir) { + error = EINVAL; + goto out; + } + if ((ldn->lxdn_vnode->v_type) == VDIR) { + if ((fromparent != dir) && + (error = lxd_dircheckpath(ldn, dir)) != 0) { + goto out; + } + } + } + + /* Search for an existing entry. */ + dirp = lxd_find_dirent(name, dir, HOLD, &found); + if (dirp != NULL) { + ASSERT(found != NULL); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (rnp != NULL) { + *rnp = found; + error = EEXIST; + } else { + ldnode_rele(found); + } + break; + + case DE_RENAME: + /* + * Note that we only hit this path when we're renaming + * a symlink from one directory to another and there is + * a pre-existing symlink as the target. lxd_rename + * will unlink the src from the original directory but + * here we need to unlink the dest that we collided + * with, then create the new directory entry as we do + * below when there is no pre-existing symlink. + */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + + ASSERT(found->lxdn_vnode->v_type == VLNK); + /* dir rw lock is already held and asserted above */ + rw_enter(&found->lxdn_rwlock, RW_WRITER); + error = lxd_dirdelete(dir, found, name, DR_RENAME, cr); + rw_exit(&found->lxdn_rwlock); + ldnode_rele(found); + if (error != 0) + goto out; + + error = lxd_diraddentry(dir, ldn, name); + if (error == 0 && rnp != NULL) + *rnp = ldn; + break; + } + } else { + + /* + * The directory entry does not exist, but the node might if + * this is a rename. Check write permission in directory to + * see if entry can be created. 
+ */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new lxd_node and directory entry as required. + */ + error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr); + if (error) + goto out; + } + + error = lxd_diraddentry(dir, ldn, name); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if ((ldn->lxdn_vnode->v_type) == VDIR) { + ASSERT(dirp == NULL); + /* + * cleanup allocs made by lxd_dirinit + */ + lxd_dirtrunc(ldn); + } + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink = 0; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + ldnode_rele(ldn); + ldn = NULL; + } + } else if (rnp != NULL) { + *rnp = ldn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + ldnode_rele(ldn); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + return (error); +} + +/* + * Delete entry ldn of name "nm" from parent dir. This is used to both remove + * a directory and to remove file nodes within the directory (by recursively + * calling itself). It frees the dir entry space and decrements link count on + * lxd_node(s). + * + * Return 0 on success. + */ +int +lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op, + cred_t *cred) +{ + lxd_dirent_t *dirp; + int error; + size_t namelen; + lxd_node_t *fndnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + if (nm[0] == '\0') + panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn); + + /* + * return error when removing . and .. + */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0) + return (error); + + if (dir->lxdn_dir == NULL) + return (ENOENT); + + if (op == DR_RMDIR) { + /* + * This is the top-level removal of a directory. Start by + * removing any file entries from the dir. We do this by + * recursively calling back into this function with a different + * op code. The caller of this function has already verified + * that it is safe to remove this directory. + */ + lxd_dirent_t *dirp; + + ASSERT(ldn->lxdn_vnode->v_type == VDIR); + + dirp = ldn->lxdn_dir; + while (dirp) { + lxd_node_t *dn; + lxd_dirent_t *nextp; + + if (strcmp(dirp->lddir_name, ".") == 0 || + strcmp(dirp->lddir_name, "..") == 0) { + dirp = dirp->lddir_next; + continue; + } + + dn = dirp->lddir_node; + nextp = dirp->lddir_next; + + ldnode_hold(dn); + error = lxd_dirdelete(ldn, dn, dirp->lddir_name, + DR_REMOVE, cred); + ldnode_rele(dn); + + dirp = nextp; + } + } + + dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp); + VERIFY(dirp != NULL); + VERIFY(ldn == fndnp); + + lxd_rm_dirent(dirp); + + /* Take dirp out of the directory list. */ + ASSERT(dirp->lddir_next != dirp); + ASSERT(dirp->lddir_prev != dirp); + if (dirp->lddir_prev) { + dirp->lddir_prev->lddir_next = dirp->lddir_next; + } + if (dirp->lddir_next) { + dirp->lddir_next->lddir_prev = dirp->lddir_prev; + } + + /* + * If the roving slot pointer happens to match dirp, + * point it at the previous dirent. 
+ */ + if (dir->lxdn_dir->lddir_prev == dirp) { + dir->lxdn_dir->lddir_prev = dirp->lddir_prev; + } + ASSERT(dirp->lddir_next != dirp); + ASSERT(dirp->lddir_prev != dirp); + + /* dirp points to the correct directory entry */ + namelen = strlen(dirp->lddir_name) + 1; + + kmem_free(dirp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + ldn->lxdn_ctime = now; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) { + lxd_dirtrunc(ldn); + ASSERT(ldn->lxdn_nlink == 0); + } + return (0); +} + +/* + * Initialize a lxd_node and add it to file list under mount point. + */ +void +lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap, + cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + ldn->lxdn_mask = 0; + ldn->lxdn_attr.va_type = vap->va_type; + ldn->lxdn_nlink = 1; + ldn->lxdn_size = 0; + + if (cred == NULL) { + ldn->lxdn_uid = vap->va_uid; + ldn->lxdn_gid = vap->va_gid; + } else { + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + } + + ldn->lxdn_fsid = lxdm->lxdm_dev; + ldn->lxdn_rdev = vap->va_rdev; + ldn->lxdn_blksize = PAGESIZE; + ldn->lxdn_nblocks = 0; + gethrestime(&now); + ldn->lxdn_atime = now; + ldn->lxdn_mtime = now; + ldn->lxdn_ctime = now; + ldn->lxdn_seq = 0; + ldn->lxdn_dir = NULL; + + ldn->lxdn_real_vp = realvp; + + ldn->lxdn_vnode = vn_alloc(KM_SLEEP); + vp = LDNTOV(ldn); + vn_setops(vp, lxd_vnodeops); + vp->v_vfsp = lxdm->lxdm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)ldn; + + mutex_enter(&lxdm->lxdm_contents); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Add new lxd_node to end of linked list of lxd_nodes for this + * lxdevfs. Root directory is handled specially in lxd_mount. + */ + if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) { + ldn->lxdn_next = NULL; + ldn->lxdn_prev = lxdm->lxdm_rootnode->lxdn_prev; + ldn->lxdn_prev->lxdn_next = lxdm->lxdm_rootnode->lxdn_prev = + ldn; + } + mutex_exit(&lxdm->lxdm_contents); + vn_exists(vp); +} + +/* + * lxd_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. 
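+ *
+ * On return the new directory's entry list is simply:
+ *
+ *	dir->lxdn_dir -> "." -> ".." -> NULL
+ *
+ * with lxdn_dirents and lxdn_nlink both set to 2, and the parent's link
+ * count bumped to account for the new ".." entry referring back to it.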
+ */ +void +lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir) +{ + lxd_dirent_t *dot, *dotdot; + timestruc_t now; + lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode); + struct vattr nattr; + + ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + dir->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP); + dot->lddir_node = dir; + dot->lddir_offset = 0; + dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t); + dot->lddir_name[0] = '.'; + dot->lddir_parent = dir; + lxd_save_dirent(dot); + + dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP); + dotdot->lddir_node = parent; + dotdot->lddir_offset = 1; + dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t); + dotdot->lddir_name[0] = '.'; + dotdot->lddir_name[1] = '.'; + dotdot->lddir_parent = dir; + lxd_save_dirent(dotdot); + + /* + * Initialize directory entry list. + */ + dot->lddir_next = dotdot; + dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */ + dotdot->lddir_next = NULL; + dotdot->lddir_prev = dot; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + parent->lxdn_nlink++; + parent->lxdn_ctime = now; + + dir->lxdn_dir = dot; + dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5; /* dot and dotdot */ + dir->lxdn_dirents = 2; + dir->lxdn_nlink = 2; + dir->lxdn_parent = parent; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; +} + +/* + * lxd_dirtrunc is called to remove all directory entries under this directory. + */ +void +lxd_dirtrunc(lxd_node_t *dir) +{ + lxd_dirent_t *ldp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + for (ldp = dir->lxdn_dir; ldp; ldp = dir->lxdn_dir) { + size_t namelen; + lxd_node_t *ldn; + + ASSERT(ldp->lddir_next != ldp); + ASSERT(ldp->lddir_prev != ldp); + ASSERT(ldp->lddir_node); + + dir->lxdn_dir = ldp->lddir_next; + namelen = strlen(ldp->lddir_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + ldn = ldp->lddir_node; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + + lxd_rm_dirent(ldp); + kmem_free(ldp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + } + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + ASSERT(dir->lxdn_dir == NULL); + ASSERT(dir->lxdn_size == 0); + ASSERT(dir->lxdn_dirents == 0); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c new file mode 100644 index 0000000000..b2e2b9b9e3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c @@ -0,0 +1,860 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The lx devfs (lxd) file system is used within lx branded zones to provide + * the Linux view of /dev. 
+ * + * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev. + * lxd now provides the Linux /dev. + * + * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file + * system which is the special device and corresponds to the special device in + * a lofs mount. As with lofs, all files in the special device are accessible + * through the lxd mount. Because the zone's devfs is not directly modifiable + * within the zone (also mknod(2) is not generally allowed within a zone) it is + * impossible to create files in devfs. For lx, in some cases it's useful to be + * able to make new symlinks or new directories under /dev. lxd implements + * these operations by creating "files" in memory in the same way as tmpfs + * does. Within lxd these are referred to as "front" files. For operations such + * as lookup or readdir, lxd provides a merged view of both the front and back + * files. lxd does not support regular front files or simple I/O (read/write) + * to front files, since there is no need for that. For back files, all + * operations are simply passed through to the real vnode, as is done with + * lofs. Front files are not allowed to mask back files. + * + * The Linux /dev is now a lxd mount with the special file (i.e. the back + * file system) as /native/dev. + * + * In addition, lx has a need for some illumos/Linux translation for the + * various *stat(2) system calls when used on a device. This translation can + * be centralized within lxd's getattr vnode entry point. + * + * Because the front file system only exists in memory and the back file + * system is the zone's devfs, which is not persistent across reboots, we + * track any device uid/gid/mode changes in a per-zone /etc/.lxd_dev_attr + * file and re-apply those changes when the lx devfs file system is mounted. + * Currently only changes to block device nodes are persistent. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_ptm.h> +#include <sys/lx_impl.h> + +#include "lxd.h" + +/* Module level parameters */ +static int lxd_fstype; +static dev_t lxd_dev; + +/* + * lxd_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t lxd_mountcount; + +/* + * lxd_minfree is the minimum amount of swap space that lx devfs leaves for + * the rest of the zone. + */ +size_t lxd_minfree = 0; + +/* + * LXDMINFREE -- the value from which lxd_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for lxd in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow lxd to consume + * no more than ~10% of this, yielding a LXDMINFREE of 12MB. 
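+ *
+ * As a rough worked example (assuming the common 4K PAGESIZE), the
+ * btopr(LXDMINFREE) computation in lxd_init() below comes out to
+ * 12MB / 4KB = 3072 pages, which is the amount lxd_statvfs() holds back
+ * from the available swap it reports in f_bfree.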
+ */ +#define LXDMINFREE 12 * 1024 * 1024 /* 12 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *, + caller_context_t *, int); +extern int stat64(char *, struct stat64 *); + +/* + * lxd vfs operations. + */ +static int lxd_init(int, char *); +static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int lxd_unmount(vfs_t *, int, cred_t *); +static int lxd_root(vfs_t *, vnode_t **); +static int lxd_statvfs(vfs_t *, statvfs64_t *); +static void lxd_freevfs(vfs_t *vfsp); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_devfs", + lxd_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand devfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +/* + * Definitions and translators for devt's. + */ +static void lxd_pts_devt_translator(dev_t, dev_t *); +static void lxd_ptm_devt_translator(dev_t, dev_t *); + +static kmutex_t lxd_xlate_lock; +static boolean_t lxd_xlate_initialized = B_FALSE; + +static lxd_minor_translator_t lxd_mtranslator_mm[] = { + { "/dev/null", 0, 1, 3 }, + { "/dev/zero", 0, 1, 5 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_random[] = { + { "/dev/random", 0, 1, 8 }, + { "/dev/urandom", 0, 1, 9 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_sy[] = { + { "/dev/tty", 0, LX_TTY_MAJOR, 0 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_zcons[] = { + { "/dev/console", 0, LX_TTY_MAJOR, 1 }, + { NULL, 0, 0, 0 } +}; +lxd_devt_translator_t lxd_devt_translators[] = { + { "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm }, + { "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random }, + { "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy }, + { "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons }, + { LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator }, + { "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator }, + { NULL, 0, DTT_INVALID, NULL } +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (lxd_mountcount > 0) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(lxd_fstype); + vn_freevnodeops(lxd_vnodeops); + mutex_destroy(&lxd_xlate_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading lxd module. 
+ */ +static int +lxd_init(int fstype, char *name) +{ + static const fs_operation_def_t lxd_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxd_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxd_unmount }, + VFSNAME_ROOT, { .vfs_root = lxd_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxd_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = lxd_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def lxd_vnodeops_template[]; + int error; + major_t dev; + + lxd_fstype = fstype; + ASSERT(lxd_fstype != 0); + + error = vfs_setfsops(fstype, lxd_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxd_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxd_init: bad vnode ops template"); + return (error); + } + + /* + * lxd_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (lxd_minfree == 0) { + /* Set if not patched */ + lxd_minfree = btopr(LXDMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxd_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxd_dev = makedevice(dev, 0); + + mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +/* + * Initialize device translator mapping table. + * + * Note that we cannot do this in lxd_init since that can lead to a recursive + * rw_enter while we're doing lookupnameat (via sdev_lookup/prof_make_maps/ + * devi_attach_node/modload). Thus we do it in the mount path and keep track + * so that we only initialize the table once. + */ +static void +lxd_xlate_init() +{ + int i; + + mutex_enter(&lxd_xlate_lock); + if (lxd_xlate_initialized) { + mutex_exit(&lxd_xlate_lock); + return; + } + + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + lxd_minor_translator_t *mt; + int j; + + lxd_devt_translators[i].lxd_xl_major = + mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver); + + /* if this translator doesn't use a list mapping we're done. */ + if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST) + continue; + + /* for each device listed, lookup the minor node number */ + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + vnode_t *vp; + struct vattr va; + char *tpath; + char tnm[MAXPATHLEN]; + + /* + * The attach might be triggered in either the global + * zone or in a non-global zone, so we may need to + * adjust the path if we're in a NGZ. 
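+	 *
+	 * For instance, the "/dev/null" entry in lxd_mtranslator_mm is
+	 * looked up verbatim when this runs in the global zone, but as
+	 * "/native/dev/null" when it runs inside the non-global zone.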
+ */ + if (curproc->p_zone->zone_id == GLOBAL_ZONEUNIQID) { + tpath = mt[j].lxd_mt_path; + } else { + (void) snprintf(tnm, sizeof (tnm), "/native%s", + mt[j].lxd_mt_path); + tpath = tnm; + } + + if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL, + &vp, NULL) != 0) { + mt[j].lxd_mt_minor = UINT_MAX; + continue; + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) { + va.va_rdev = NODEV; + } else { + ASSERT(getmajor(va.va_rdev) == + lxd_devt_translators[i].lxd_xl_major); + ASSERT(mt[j].lxd_mt_lx_minor < LX_MAXMIN); + } + + mt[j].lxd_mt_minor = getminor(va.va_rdev); + + VN_RELE(vp); + } + } + + lxd_xlate_initialized = B_TRUE; + mutex_exit(&lxd_xlate_lock); +} + +static int +lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lxd_mnt_t *lxdm = NULL; + struct lxd_node *ldn; + struct pathname dpn; + int error; + int i; + int nodev; + struct vattr rattr; + vnode_t *realrootvp; + vnode_t *tvp; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + vattr_t vattr; + + nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + lxd_xlate_init(); + + /* + * This is the same behavior as with lofs. + * Loopback devices which get "nodevices" added can be done without + * "nodevices" set because we cannot import devices into a zone + * with loopback. Note that we have all zone privileges when + * this happens; if not, we'd have gotten "nosuid". + */ + if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); + + /* + * Only allow mounting within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* lxd doesn't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * Find real root + */ + if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? 
+ UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) { + pn_free(&dpn); + return (error); + } + + if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (error); + } + + /* If realroot is not a devfs, error out */ + if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (EINVAL); + } + + lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP); + + /* init but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_attrlck, NULL, MUTEX_DEFAULT, NULL); + + list_create(&lxdm->lxdm_devattrs, sizeof (lxd_dev_attr_t), + offsetof(lxd_dev_attr_t, lxda_link)); + + /* Initialize the hash table mutexes */ + for (i = 0; i < LXD_HASH_SZ; i++) { + mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT, + NULL); + } + + lxdm->lxdm_vfsp = vfsp; + lxdm->lxdm_gen = 1; /* start inode counter at 1 */ + + vfsp->vfs_data = (caddr_t)lxdm; + vfsp->vfs_fstype = lxd_fstype; + vfsp->vfs_dev = lxd_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype); + lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path); + + /* allocate and initialize root lxd_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + + tvp = lxd_make_back_node(realrootvp, lxdm); + ldn = VTOLDN(tvp); + + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + LDNTOV(ldn)->v_flag |= VROOT; + + /* + * initialize linked list of lxd_nodes so that the back pointer of + * the root lxd_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + ldn->lxdn_prev = ldn; + ldn->lxdn_next = NULL; + ldn->lxdn_nlink = 0; + lxdm->lxdm_rootnode = ldn; + + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + lxd_dirinit(ldn, ldn); + + rw_exit(&ldn->lxdn_rwlock); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&lxd_mountcount); + + lxzdata = ztolxzd(curproc->p_zone); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vattr.va_mask = AT_TYPE | AT_MODE; + vattr.va_type = VLNK; + vattr.va_mode = 0777; + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZVOL) { + char lnknm[MAXPATHLEN]; + + /* Create a symlink for the actual zvol. */ + (void) snprintf(lnknm, sizeof (lnknm), + "./zvol/dsk/%s", vd->lxvd_real_name); + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + lnknm, cr, NULL, 0); + } else if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Create a symlink for the root "disk" using /dev/zfs + * as the target device. + */ + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + "./zfs", cr, NULL, 0); + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Apply any persistent attribute changes. */ + lxd_apply_db(lxdm); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + + mutex_enter(&lxdm->lxdm_contents); + + /* + * In the normal unmount case only the root node would have a reference + * count. 
+ * + * With lxdm_contents held, nothing can be added or removed. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + ldn = lxdm->lxdm_rootnode; + + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EINVAL); + } + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) { + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = lxdm->lxdm_rootnode->lxdn_next; + while (cancel != ldn) { + vp = LDNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->lxdn_next; + } + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } else { + /* + * It may seem incorrect for us to have a vnode with + * a count of 0, but this is modeled on tmpfs and works + * the same way. See lxd_front_inactive. There we allow + * the v_count to go to 0 but rely on the link count to + * keep the vnode alive. Since we now want to cleanup + * these vnodes we manually add a VN_HOLD so that the + * VN_RELEs that occur in the lxd_freevfs() cleanup + * will take us down the lxd_inactive code path. We + * can directly add a VN_HOLD since we have the lock. + */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&lxdm->lxdm_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * lxd_unmount. + */ +void +lxd_freevfs(vfs_t *vfsp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn; + struct vnode *vp; + lxd_dev_attr_t *da; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the lxd_mnt_t that + * says we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries (this doesn't remove top-level dirs). + */ + for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if (ldn->lxdn_vnode->v_type == VDIR) + lxd_dirtrunc(ldn); + rw_exit(&ldn->lxdn_rwlock); + } + + ASSERT(lxdm->lxdm_rootnode != NULL); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a + * lxd_node from blowing it away (in lxd_inactive) while we're trying + * to get to it here. Once we have a HOLD on it we know it'll stick + * around. 
+ */ + mutex_enter(&lxdm->lxdm_contents); + + /* + * Remove all the files (except the rootnode) backwards. + */ + while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) { + mutex_exit(&lxdm->lxdm_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = LDNTOV(ldn); + ASSERT(vp->v_type == VLNK || vp->v_type == VDIR || + vp->v_type == VSOCK); + VN_RELE(vp); + mutex_enter(&lxdm->lxdm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. + */ + if (ldn == lxdm->lxdm_rootnode->lxdn_prev) { + VN_HOLD(vp); + mutex_exit(&lxdm->lxdm_contents); + delay(hz / 4); + mutex_enter(&lxdm->lxdm_contents); + } + } + mutex_exit(&lxdm->lxdm_contents); + + ASSERT(lxdm->lxdm_back_refcnt == 1); + ASSERT(lxdm->lxdm_dent_refcnt == 0); + + VN_RELE(LDNTOV(lxdm->lxdm_rootnode)); + + ASSERT(lxdm->lxdm_mntpath != NULL); + kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1); + + da = list_remove_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + kmem_free(da, sizeof (lxd_dev_attr_t)); + da = list_remove_head(&lxdm->lxdm_devattrs); + } + list_destroy(&lxdm->lxdm_devattrs); + + mutex_destroy(&lxdm->lxdm_contents); + mutex_destroy(&lxdm->lxdm_renamelck); + mutex_destroy(&lxdm->lxdm_attrlck); + kmem_free(lxdm, sizeof (lxd_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&lxd_mountcount); +} + +/* + * return root lxdnode for given vnode + */ +static int +lxd_root(struct vfs *vfsp, struct vnode **vpp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn = lxdm->lxdm_rootnode; + struct vnode *vp; + + ASSERT(ldn != NULL); + + vp = LDNTOV(ldn); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = lxdm->lxdm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > lxd_minfree) + sbp->f_bfree = blocks - lxd_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of lxd_nodes we can allocate from the remaining kernel memory + * available to lxdevfs in this zone. This is fairly inaccurate since + * it doesn't take into account the names stored in the directory + * entries. 
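+	 *
+	 * (Purely illustrative: if the two structures together came to
+	 * roughly 512 bytes, a zone with 1GB of available memory would
+	 * advertise on the order of two million inodes here; the value is a
+	 * plausible upper bound rather than an enforced limit.)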
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (lxd_node_t) + sizeof (lxd_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static void +lxd_pts_devt_translator(dev_t dev, dev_t *jdev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + /* + * Linux uses a range of major numbers for pts devices to address the + * relatively small minor number space (20 bits). + */ + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + if (lx_maj > LX_PTS_MAJOR_MAX) { + /* + * The major is outside the acceptable range but there's little + * we can presently do about it short of overhauling the + * translation logic. + */ + lx_unsupported("pts major out of translation range"); + } + + *jdev = LX_MAKEDEVICE(lx_maj, lx_min); +} + +/* ARGSUSED */ +static void +lxd_ptm_devt_translator(dev_t dev, dev_t *jdev) +{ + *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c new file mode 100644 index 0000000000..8088ba6174 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c @@ -0,0 +1,1520 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <sys/lx_brand.h> +#include <sys/brand.h> + +#include "lxd.h" + +static int +lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(*vpp); + vnode_t *vp = *vpp; + vnode_t *rvp; + vnode_t *oldvp; + int error; + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + oldvp = vp; + vp = rvp = REALVP(vp); + /* + * Need to hold new reference to vp since VOP_OPEN() may + * decide to release it. 
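+	 *
+	 * If the underlying driver substitutes a different vnode on open
+	 * (the rvp != vp case handled below, as cloning devices may do), the
+	 * new vnode is wrapped in a fresh back node and the original shadow
+	 * vnode is released.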
+ */ + VN_HOLD(vp); + error = VOP_OPEN(&rvp, flag, cr, ct); + + if (!error && rvp != vp) { + /* + * the FS which we called should have released the + * new reference on vp + */ + *vpp = lxd_make_back_node(rvp, VFSTOLXDM(oldvp->v_vfsp)); + + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + VN_RELE(oldvp); + } else { + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + } + + return (error); +} + +static int +lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_CLOSE(vp, flag, count, offset, cr, ct)); +} + +static int +lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READ(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_WRITE(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr, + int *rvalp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct)); +} + +static int +lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETFL(vp, oflags, nflags, cr, ct)); +} + +/* + * Translate SunOS devt to Linux devt. 
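+ *
+ * For example, using the translator tables defined above: a device backed
+ * by the illumos "mm" driver with the minor number of /dev/null is
+ * reported with the Linux devt (major 1, minor 3), and /dev/zero as
+ * (1, 5). Devices with no translator entry keep their illumos
+ * major/minor, repackaged via LX_MAKEDEVICE().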
+ */ +static void +lxd_s2l_devt(dev_t dev, dev_t *rdev) +{ + lxd_minor_translator_t *mt; + int i, j; + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + /* look for a devt translator for this major number */ + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + if (lxd_devt_translators[i].lxd_xl_major == maj) + break; + } + + if (lxd_devt_translators[i].lxd_xl_driver != NULL) { + /* try to translate the illumos devt to a linux devt */ + switch (lxd_devt_translators[i].lxd_xl_type) { + case DTT_INVALID: + ASSERT(0); + break; + + case DTT_LIST: + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + if (mt[j].lxd_mt_minor == min) { + ASSERT(mt[j].lxd_mt_minor < LX_MAXMIN); + + /* found a translation */ + *rdev = LX_MAKEDEVICE( + mt[j].lxd_mt_lx_major, + mt[j].lxd_mt_lx_minor); + return; + } + } + break; + + case DTT_CUSTOM: + lxd_devt_translators[i].xl_custom(dev, rdev); + return; + } + } + + /* we don't have a translator for this device */ + *rdev = LX_MAKEDEVICE(maj, min); +} + +static int +lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + int error; + vnode_t *rvp; + + if (ldn->lxdn_type == LXDNT_FRONT) { + mutex_enter(&ldn->lxdn_tlock); + + vap->va_type = vp->v_type; + vap->va_mode = ldn->lxdn_mode & MODEMASK; + vap->va_uid = ldn->lxdn_uid; + vap->va_gid = ldn->lxdn_gid; + vap->va_fsid = ldn->lxdn_fsid; + vap->va_nodeid = (ino64_t)ldn->lxdn_nodeid; + vap->va_nlink = ldn->lxdn_nlink; + vap->va_size = (u_offset_t)ldn->lxdn_size; + vap->va_atime = ldn->lxdn_atime; + vap->va_mtime = ldn->lxdn_mtime; + vap->va_ctime = ldn->lxdn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = 0; /* no devs in front */ + vap->va_seq = ldn->lxdn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr( + vap->va_size))); + mutex_exit(&ldn->lxdn_tlock); + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + rvp = REALVP(vp); + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct))) + return (error); + + /* Skip devt translation for native programs */ + if (curproc->p_brand != &lx_brand) { + return (0); + } else { + /* + * We also skip translation when called from the user-land + * emulation code. + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + if (lwpd == NULL || lwpd->br_stack_mode != LX_STACK_MODE_BRAND) + return (0); + } + + if (rvp->v_type == VCHR) { + dev_t ldev; + + lxd_s2l_devt(vap->va_rdev, &ldev); + DTRACE_PROBE3(lxd__devxl, void *, rvp, void *, vap, int, ldev); + vap->va_rdev = ldev; + } + + return (0); +} + +static int +lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + int res; + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error = 0; + struct vattr *set; + long mask = vap->va_mask; + + /* Cannot set these attributes */ + if ((mask & AT_NOSET) || (mask & AT_XVATTR) || + (mask & AT_MODE && vap->va_mode & (S_ISUID | S_ISGID)) || + (mask & AT_SIZE)) + return (EINVAL); + + mutex_enter(&ldn->lxdn_tlock); + + set = &ldn->lxdn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. 
+ */ + error = secpolicy_vnode_setattr(cr, vp, vap, set, flags, + lxd_naccess, ldn); + if (error) { + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + if (mask & AT_MODE) { + set->va_mode &= S_IFMT; + set->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + set->va_uid = vap->va_uid; + if (mask & AT_GID) + set->va_gid = vap->va_gid; + if (mask & AT_ATIME) + set->va_atime = vap->va_atime; + if (mask & AT_MTIME) + set->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&ldn->lxdn_ctime); + + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + res = VOP_SETATTR(vp, vap, flags, cr, ct); + if (res == 0 && (vap->va_mask & (AT_MODE | AT_UID | AT_GID))) { + lxd_save_attrs(lxdm, vp); + } + return (res); +} + +static int +lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + mutex_enter(&ldn->lxdn_tlock); + error = lxd_naccess(ldn, mode, cr); + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + vp = REALVP(vp); + return (VOP_ACCESS(vp, mode, flags, cr, ct)); +} + +static int +lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FSYNC(vp, syncflag, cr, ct)); +} + +/* ARGSUSED */ +static void +lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + mutex_enter(&ldn->lxdn_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. + */ + if (vp->v_count > 1 || ldn->lxdn_nlink != 0) { + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + return; + } + + /* + * We have the last hold *and* the link count is zero, so this node is + * dead from the filesystem's viewpoint. 
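+	 *
+	 * What follows tears it down completely: free any symlink target
+	 * buffer, unlink the node from the per-mount lxd_node list (under
+	 * lxdm_contents), and finally free both the vnode and the lxd_node
+	 * itself.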
+ */ + if (ldn->lxdn_size != 0) { + if (ldn->lxdn_vnode->v_type == VLNK) + kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1); + } + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + + vn_invalid(LDNTOV(ldn)); + + mutex_enter(&lxdm->lxdm_contents); + if (ldn->lxdn_next == NULL) + lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev; + else + ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev; + ldn->lxdn_prev->lxdn_next = ldn->lxdn_next; + + mutex_exit(&lxdm->lxdm_contents); + rw_exit(&ldn->lxdn_rwlock); + rw_destroy(&ldn->lxdn_rwlock); + mutex_destroy(&ldn->lxdn_tlock); + + vn_free(LDNTOV(ldn)); + kmem_free(ldn, sizeof (lxd_node_t)); +} + +/*ARGSUSED*/ +static void +lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + lxd_front_inactive(vp, cr, ct); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + lxd_free_back_node(ldn); +} + +/* ARGSUSED */ +static int +lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FID(vp, fidp, ct)); +} + +/* + * For a front node lookup in the dirent hash table and return a shadow vnode + * (lxd_node_t type) of type LXDNT_FRONT. + * + * For a back node, lookup nm name and return a shadow vnode (lxd_node_t type) + * of the real vnode found. + */ +static int +lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *vp = NULL; + int error; + vnode_t *realdvp; + lxd_mnt_t *lxdm = VTOLXDM(dvp); + int doingdotdot = 0; + lxd_node_t *ldn = VTOLDN(dvp); + lxd_node_t *nldn = NULL; + + /* + * First check for front file which could be instantiated on either a + * front or back node (e.g. the top-level moint point directory node is + * a back node which can have front files created in it). + */ + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for dir being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(ldn, nm, &nldn, cr); + rw_exit(&ldn->lxdn_rwlock); + + if (error == 0) { + /* found */ + ASSERT(nldn != NULL); + *vpp = LDNTOV(nldn); + return (0); + } + + /* At this point, if dir node is a front node, error */ + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOENT); + } + + realdvp = REALVP(dvp); + + if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { + doingdotdot++; + /* + * Handle ".." out of mounted filesystem + */ + while ((realdvp->v_flag & VROOT) && realdvp != rootdir) { + realdvp = realdvp->v_vfsp->vfs_vnodecovered; + ASSERT(realdvp != NULL); + } + } + + *vpp = NULL; /* default(error) case */ + + /* + * Do the normal lookup + */ + if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr, + ct, direntflags, realpnp)) != 0) { + vp = NULL; + goto out; + } + + /* + * We do this check here to avoid returning a stale file handle to the + * caller. + */ + if (nm[0] == '.' && nm[1] == '\0') { + ASSERT(vp == realdvp); + VN_HOLD(dvp); + VN_RELE(vp); + *vpp = dvp; + return (0); + } + + if (doingdotdot) { + *vpp = lxd_make_back_node(vp, lxdm); + return (0); + } + + /* + * If this vnode is mounted on, then we + * traverse to the vnode which is the root of + * the mounted file system. 
+ */ + if ((error = traverse(&vp)) != 0) + goto out; + + /* + * Make a lxd node for the real vnode. + */ + *vpp = lxd_make_back_node(vp, lxdm); + if (vp->v_type != VDIR) { + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + VN_RELE(vp); + error = ENOSYS; + } else { + *vpp = svp; + } + } + return (error); + } + +out: + if (error != 0 && vp != NULL) + VN_RELE(vp); + + return (error); +} + +/*ARGSUSED*/ +static int +lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, + int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + int error; + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *lnp = NULL; + + rw_enter(&parent->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(parent, nm, &lnp, cr); + rw_exit(&parent->lxdn_rwlock); + + /* + * If a back node already exists then there is no need to pass + * the create to native devfs -- just set the vpp to the back + * vnode. If the front node already exists then fail because + * it can't represent a regular file. In both cases, enforce + * open(2)'s EEXIST and EISDIR semantics. + */ + if (error == 0) { + if (exclusive == EXCL) { + error = EEXIST; + } else if (LDNTOV(lnp)->v_type == VDIR && + (mode & S_IWRITE)) { + error = EISDIR; + } else if (lnp->lxdn_type == LXDNT_FRONT) { + error = ENOTSUP; + } + + if (error != 0) { + ldnode_rele(lnp); + return (error); + } + + VERIFY3S(lnp->lxdn_type, ==, LXDNT_BACK); + *vpp = lnp->lxdn_vnode; + + return (error); + } + + /* + * We cannot create files in the back devfs but we want to allow for + * O_CREAT on existing files. Pass this through and let the back file + * system allow or deny it. + */ + if (parent->lxdn_type == LXDNT_BACK) { + vnode_t *vp = NULL; + + if (*nm == '\0') { + ASSERT(vpp && dvp == *vpp); + vp = REALVP(*vpp); + } + if ((error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode, + &vp, cr, flag, ct, vsecp)) == 0) { + *vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp)); + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, + (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + return (ENOSYS); + } + *vpp = svp; + } + return (0); + } + /* + * If we were unable to perform the VOP_CREATE for any reason + * other than sdev being read-only, we should bail. + */ + if (error != ENOTSUP && error != EROFS) { + return (error); + } + } + + /* + * While we don't allow creating data-containing files under + * lx devfs, we must allow VSOCK front nodes to be created so + * that paths such as /dev/log can be used as AF_UNIX sockets. + */ + if (va->va_type == VSOCK) { + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + lnp = NULL; + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + va, &lnp, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error == 0) { + *vpp = LDNTOV(lnp); + } else if (lnp != NULL) { + /* + * It's possible that a racing process created an entry + * at this name since we last performed the lookup. 
+ */ + ldnode_rele(lnp); + } + } else { + error = ENOTSUP; + } + + return (error); +} + +/* ARGSUSED */ +static int +lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct, + int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *ldn = NULL; + int error; + + /* can only remove existing front nodes */ + error = lxd_dirlookup(parent, nm, &ldn, cr); + if (error) { + return (error); + } + + ASSERT(ldn != NULL); + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(parent, ldn, nm, DR_REMOVE, cr); + + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + + ldnode_rele(ldn); + + return (error); +} + +/* ARGSUSED */ +static int +lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + lxd_node_t *oldparent = VTOLDN(odvp); + lxd_node_t *newparent; + lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode); + lxd_node_t *fromnode = NULL; + int error; + int samedir = 0; + + if (!vn_matchops(ndvp, lxd_vnodeops)) { + /* cannot rename out of this file system */ + return (EACCES); + } + + mutex_enter(&lxdm->lxdm_renamelck); + + newparent = VTOLDN(ndvp); + + /* + * We can only rename front nodes. + */ + error = lxd_dirlookup(oldparent, onm, &fromnode, cr); + if (error != 0) { + /* not found in front */ + mutex_exit(&lxdm->lxdm_renamelck); + return (error); + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. If + * that directory is "sticky" it requires further checks. + */ + if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromnode == oldparent + */ + if ((onm[0] == '.' && + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' && + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (oldparent == fromnode)) { + error = EINVAL; + goto done; + } + + samedir = (oldparent == newparent); + + /* + * Make sure we can search and rename into the destination directory. + */ + if (!samedir) { + if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0) + goto done; + } + + /* + * Link source to new target + */ + rw_enter(&newparent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME, + oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL, + cr); + rw_exit(&newparent->lxdn_rwlock); + + if (error) + goto done; + + /* + * Unlink from source. + */ + rw_enter(&oldparent->lxdn_rwlock, RW_WRITER); + rw_enter(&fromnode->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr); + + /* + * The following handles the case where our source node was + * removed before we got to it. 
+ */ + if (error == ENOENT) + error = 0; + + rw_exit(&fromnode->lxdn_rwlock); + rw_exit(&oldparent->lxdn_rwlock); + +done: + ldnode_rele(fromnode); + mutex_exit(&lxdm->lxdm_renamelck); + return (error); +} + +/* ARGSUSED */ +static int +lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, + struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + int error; + vnode_t *tvp; + lxd_node_t *ndir = NULL; + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + /* check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make front directory */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_MKDIR, NULL, NULL, + va, &ndir, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error != 0) { + if (ndir != NULL) + ldnode_rele(ndir); + } else { + *vpp = LDNTOV(ndir); + } + + return (error); +} + +static int +lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + *vpp = vp; + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + while (vn_matchops(vp, lxd_vnodeops)) + vp = REALVP(vp); + + if (VOP_REALVP(vp, vpp, ct) != 0) + *vpp = vp; + return (0); +} + +/* ARGSUSED */ +static int +lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr, + caller_context_t *ct, int flags) +{ + int error; + lxd_node_t *ldn; + struct vnode *vp; + lxd_node_t *parent = VTOLDN(dvp); + + /* + * Return error if trying to remove . or .. + */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); + + error = lxd_dirlookup(VTOLDN(dvp), nm, &ldn, cr); + if (error != 0) { + /* not found in front */ + return (error); + } + + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + vp = LDNTOV(ldn); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto err; + } + + if (ldn->lxdn_vnode->v_type != VDIR) { + error = ENOTDIR; + goto err; + } + + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink > 2) { + mutex_exit(&ldn->lxdn_tlock); + error = EEXIST; + goto err; + } + mutex_exit(&ldn->lxdn_tlock); + + /* Check for an empty directory */ + if (ldn->lxdn_dirents > 2) { + error = EEXIST; + gethrestime(&ldn->lxdn_atime); + goto err; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto err; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + vn_vfsunlock(vp); + goto err; + } + + error = lxd_dirdelete(parent, ldn, nm, DR_RMDIR, cr); + vn_vfsunlock(vp); + +err: + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + ldnode_rele(ldn); + + return (error); +} + +/* Not static so it can be used during mount. 
*/ +/* ARGSUSED */ +int +lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm, + struct cred *cr, caller_context_t *ct, int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + lxd_node_t *self = NULL; + vnode_t *tvp; + char *cp = NULL; + int error; + size_t len; + + /* this will check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make symlink in the front */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + tva, &self, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error) { + if (self != NULL) + ldnode_rele(self); + return (error); + } + + len = strlen(tnm) + 1; + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); + if (cp == NULL) { + ldnode_rele(self); + return (ENOSPC); + } + (void) strcpy(cp, tnm); + + self->lxdn_symlink = cp; + self->lxdn_size = len - 1; + ldnode_rele(self); + + return (error); +} + +static int +lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + if (vp->v_type != VLNK) + return (EINVAL); + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = uiomove(ldn->lxdn_symlink, ldn->lxdn_size, UIO_READ, + uiop); + gethrestime(&ldn->lxdn_atime); + rw_exit(&ldn->lxdn_rwlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READLINK(vp, uiop, cr, ct)); +} + +static int +lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp) +{ + lxd_node_t *ldn = VTOLDN(vp); + struct dirent *sd; + lxd_dirent_t *ldp; + enum lxd_node_type type = ldn->lxdn_type; + ssize_t uresid; + off_t front_off; + int error = 0; + int sdlen; + + /* skip the front entries if the back read was incomplete */ + if (*eofp == 0) + return (0); + + /* + * If this was a back node then reading that node has completed and we + * may have a partially full uio struct. eof should be set to true. + * Leave it set since we're likely to hit eof for the front nodes (if + * any). + */ + + front_off = uiop->uio_offset + 1; + sdlen = sizeof (struct dirent) + MAXPATHLEN; + /* zalloc to ensure we don't have anything in the d_name buffer */ + sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP); + ldp = ldn->lxdn_dir; + while (ldp != NULL && (uresid = uiop->uio_resid) > 0) { + int namelen; + int reclen; + + /* + * Skip dot and dotdot for back nodes since we have them + * already. + */ + if (type == LXDNT_BACK && + (strcmp(ldp->lddir_name, ".") == 0 || + strcmp(ldp->lddir_name, "..") == 0)) { + ldp = ldp->lddir_next; + continue; + } + + /* + * Might have previously had a partial readdir of the front + * nodes, and now we're back for more, or we may just be + * be doing a follow-up readdir after we've previously + * returned all front and back nodes. + */ + if (front_off > req_off) { + namelen = strlen(ldp->lddir_name); /* no +1 needed */ + reclen = (int)DIRENT64_RECLEN(namelen); + + /* + * If the size of the data to transfer is greater + * than that requested, then we can't do it this + * transfer. + */ + if (reclen > uresid) { + *eofp = 0; + /* Buffer too small for any entries. 
*/ + if (front_off == 0) + error = EINVAL; + break; + } + + (void) strncpy(sd->d_name, ldp->lddir_name, + DIRENT64_NAMELEN(reclen)); + sd->d_reclen = (ushort_t)reclen; + sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid; + sd->d_off = front_off; + + /* uiomove will adjust iov_base properly */ + if ((error = uiomove((caddr_t)sd, reclen, UIO_READ, + uiop)) != 0) { + *eofp = 0; + break; + } + } + + /* + * uiomove() above updates both uio_resid and uio_offset by the + * same amount but we want uio_offset to change in increments + * of 1, which is different from the number of bytes being + * returned to the caller, so we set uio_offset explicitly, + * ignoring what uiomove() did. + */ + uiop->uio_offset = front_off; + front_off++; + + ldp = ldp->lddir_next; + } + + kmem_free(sd, sdlen); + return (error); +} + +static int +lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxd_node_t *ldn = VTOLDN(vp); + vnode_t *rvp; + int res; + off_t req_off; + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + req_off = uiop->uio_offset; + + /* First read the back node (if it is one) */ + if (ldn->lxdn_type == LXDNT_BACK) { + rvp = REALVP(vp); + res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags); + if (res != 0) + return (res); + } else { + /* setup for merge_front */ + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + /* caller should have already called lxd_rwlock */ + ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock)); + + *eofp = 1; + /* + * The merge code starts the offset calculation from uio_offset, + * which is normally already set to the high value by the back + * code, but in this case we need to count up from 0. + */ + uiop->uio_offset = 0; + } + + /* + * Our back nodes can also have front entries hanging on them so we + * need to merge those in. Or, we may simply have a front node (i.e. a + * front subdir). + */ + res = lx_merge_front(vp, uiop, req_off, eofp); + return (res); +} + +static int +lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + if (write_lock) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + } else { + rw_enter(&ldn->lxdn_rwlock, RW_READER); + } + return (write_lock); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_RWLOCK(vp, write_lock, ct)); +} + +static void +lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + rw_exit(&ldn->lxdn_rwlock); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + VOP_RWUNLOCK(vp, write_lock, ct); +} + +static int +lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SEEK(vp, ooff, noffp, ct)); +} + +static int +lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + while (vn_matchops(vp1, lxd_vnodeops) && + VTOLDN(vp1)->lxdn_type == LXDNT_BACK) { + vp1 = REALVP(vp1); + } + while (vn_matchops(vp2, lxd_vnodeops) && + VTOLDN(vp2)->lxdn_type == LXDNT_BACK) { + vp2 = REALVP(vp2); + } + + if (vn_matchops(vp1, lxd_vnodeops) || vn_matchops(vp2, lxd_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +static int +lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, + struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); +} + +static int +lxd_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, + struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct)); +} + +static int +lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot, + struct page *parr[], size_t psz, struct seg *seg, caddr_t addr, + enum seg_rw rw, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr, + ct)); +} + +static int +lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PUTPAGE(vp, off, len, flags, cr, ct)); +} + +static int +lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct)); +} + +static int +lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, + uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp, + struct pollhead **phpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { 
+ return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct)); +} + +static int +lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DUMP(vp, addr, bn, count, ct)); +} + +static int +lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PATHCONF(vp, cmd, valp, cr, ct)); +} + +static int +lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len, + int flags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct)); +} + +static void +lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + if (vp != NULL && !VN_ISKAS(vp)) + VOP_DISPOSE(vp, pp, fl, dn, cr, ct); +} + +static int +lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (vn_is_readonly(vp)) + return (EROFS); + + vp = REALVP(vp); + return (VOP_SETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct)); +} + +/* + * Loopback vnode operations vector. 
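+ *
+ * The lxd_* functions collected here dispatch on the node type:
+ * LXDNT_FRONT nodes are served from the in-memory "front" state kept by
+ * this filesystem, while LXDNT_BACK nodes are passed through to the
+ * underlying vnode obtained via REALVP(). Operations with no meaningful
+ * front-node implementation (paging, locking, secattrs) simply return
+ * EINVAL or ENOSYS.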
+ */ + +struct vnodeops *lxd_vnodeops; + +const fs_operation_def_t lxd_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxd_open }, + VOPNAME_CLOSE, { .vop_close = lxd_close }, + VOPNAME_READ, { .vop_read = lxd_read }, + VOPNAME_WRITE, { .vop_write = lxd_write }, + VOPNAME_IOCTL, { .vop_ioctl = lxd_ioctl }, + VOPNAME_SETFL, { .vop_setfl = lxd_setfl }, + VOPNAME_GETATTR, { .vop_getattr = lxd_getattr }, + VOPNAME_SETATTR, { .vop_setattr = lxd_setattr }, + VOPNAME_ACCESS, { .vop_access = lxd_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxd_lookup }, + VOPNAME_CREATE, { .vop_create = lxd_create }, + VOPNAME_REMOVE, { .vop_remove = lxd_remove }, + VOPNAME_LINK, { .vop_link = lxd_link }, + VOPNAME_RENAME, { .vop_rename = lxd_rename }, + VOPNAME_MKDIR, { .vop_mkdir = lxd_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = lxd_rmdir }, + VOPNAME_READDIR, { .vop_readdir = lxd_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = lxd_symlink }, + VOPNAME_READLINK, { .vop_readlink = lxd_readlink }, + VOPNAME_FSYNC, { .vop_fsync = lxd_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = lxd_inactive }, + VOPNAME_FID, { .vop_fid = lxd_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = lxd_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = lxd_rwunlock }, + VOPNAME_SEEK, { .vop_seek = lxd_seek }, + VOPNAME_CMP, { .vop_cmp = lxd_cmp }, + VOPNAME_FRLOCK, { .vop_frlock = lxd_frlock }, + VOPNAME_SPACE, { .vop_space = lxd_space }, + VOPNAME_REALVP, { .vop_realvp = lxd_realvp }, + VOPNAME_GETPAGE, { .vop_getpage = lxd_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = lxd_putpage }, + VOPNAME_MAP, { .vop_map = lxd_map }, + VOPNAME_ADDMAP, { .vop_addmap = lxd_addmap }, + VOPNAME_DELMAP, { .vop_delmap = lxd_delmap }, + VOPNAME_POLL, { .vop_poll = lxd_poll }, + VOPNAME_DUMP, { .vop_dump = lxd_dump }, + VOPNAME_DUMPCTL, { .error = fs_error }, + VOPNAME_PATHCONF, { .vop_pathconf = lxd_pathconf }, + VOPNAME_PAGEIO, { .vop_pageio = lxd_pageio }, + VOPNAME_DISPOSE, { .vop_dispose = lxd_dispose }, + VOPNAME_SETSECATTR, { .vop_setsecattr = lxd_setsecattr }, + VOPNAME_GETSECATTR, { .vop_getsecattr = lxd_getsecattr }, + VOPNAME_SHRLOCK, { .vop_shrlock = lxd_shrlock }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c new file mode 100644 index 0000000000..de5a16c414 --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -0,0 +1,499 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
+ */ + + +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/frame.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + +/* + * We store the syscall number in the low 16 bits (which limits us to 64k + * syscalls). The next bit indicates entry/return probe and the next bit + * indicates 64bit/32bit syscall. + */ +#define SCALL_MASK 0xffff +#define ENTRY_FLAG 0x10000 +#define SYSC_64_BIT 0x100000 + +#define LX_SYSTRACE_IS64BIT(x) ((int)(x) & SYSC_64_BIT) +#define LX_SYSTRACE_ISENTRY(x) ((int)(x) & ENTRY_FLAG) +#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & SCALL_MASK) + +#define LX_SYSTRACE32_ENTRY(id) (ENTRY_FLAG | (id)) +#define LX_SYSTRACE32_RETURN(id) (id) + +#define LX_SYSTRACE64_ENTRY(id) (SYSC_64_BIT | ENTRY_FLAG | (id)) +#define LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | id) + +#define LX_SYSTRACE_ENTRY_AFRAMES 2 +#define LX_SYSTRACE_RETURN_AFRAMES 4 + +typedef struct lx_systrace_sysent { + const char *lss_name; + dtrace_id_t lss_entry; + dtrace_id_t lss_return; +} lx_systrace_sysent_t; + +static dev_info_t *lx_systrace_devi; +static dtrace_provider_id_t lx_systrace_id; +static kmutex_t lx_systrace_lock; +static uint_t lx_systrace_nenabled; + +static int lx_systrace_nsysent32; +static lx_systrace_sysent_t *lx_systrace_sysent32; + +#if defined(_LP64) +static int lx_systrace_nsysent64; +static lx_systrace_sysent_t *lx_systrace_sysent64; +#endif + +/*ARGSUSED*/ +static void +lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_entry; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_entry; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_return; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_return; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc) +{ + int i; + + if (desc != NULL) + return; + + for (i = 0; i < lx_systrace_nsysent32; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_RETURN(i))); + + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + for (i = 0; i < lx_systrace_nsysent64; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys64", + 
lx_systrace_sysent64[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_RETURN(i))); + + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif +} + +/*ARGSUSED*/ +static int +lx_systrace_enable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + mutex_enter(&lx_systrace_lock); + if (lx_systrace_nenabled++ == 0) + lx_brand_systrace_enable(); + mutex_exit(&lx_systrace_lock); + +#if defined(_LP64) + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = id; + } else { + lx_systrace_sysent64[sysnum].lss_return = id; + } + } else +#endif + { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = id; + } else { + lx_systrace_sysent32[sysnum].lss_return = id; + } + } + return (0); +} + +/*ARGSUSED*/ +static void +lx_systrace_disable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + +#if defined(_LP64) + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE; + } + } else +#endif + { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE; + } + } + + mutex_enter(&lx_systrace_lock); + if (--lx_systrace_nenabled == 0) + lx_brand_systrace_disable(); + mutex_exit(&lx_systrace_lock); +} + +/*ARGSUSED*/ +static void +lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg) +{ +} + +/*ARGSUSED*/ +static uint64_t +lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct frame *fp = (struct frame *)dtrace_getfp(); + uintptr_t *stack; + uint64_t val = 0; + int i; + + if (argno >= 6) + return (0); + + /* + * Walk the four frames down the stack to the entry or return callback. + * Our callback calls dtrace_probe() which calls dtrace_dif_variable() + * which invokes this function to get the extended arguments. We get + * the frame pointer in via call to dtrace_getfp() above which makes for + * four frames. + */ + for (i = 0; i < 4; i++) { + fp = (struct frame *)fp->fr_savfp; + } + + stack = (uintptr_t *)&fp[1]; + + /* + * Skip the first argument to the callback -- the system call number. + */ + argno++; + +#ifdef __amd64 + /* + * On amd64, the first 6 arguments are passed in registers while + * subsequent arguments are on the stack. 
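+ *
+ * For example, when DTrace asks for extended argument 5 (arg5), the
+ * increment above (skipping the syscall number) makes it 6, and the
+ * subtraction below makes it 0, so we fetch stack[0] -- the first (and
+ * only) stack-passed argument of the seven-argument callback.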
+ */ + argno -= 6; +#endif + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[argno]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + + +static const dtrace_pattr_t lx_systrace_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; + +static dtrace_pops_t lx_systrace_pops = { + lx_systrace_provide, + NULL, + lx_systrace_enable, + lx_systrace_disable, + NULL, + NULL, + NULL, + lx_systrace_getarg, + NULL, + lx_systrace_destroy +}; + +static int +lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int i; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR, + 0, DDI_PSEUDO, NULL) == DDI_FAILURE || + dtrace_register("lx-syscall", &lx_systrace_attr, + DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL, + &lx_systrace_id) != 0) { + ddi_remove_minor_node(devi, NULL); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + lx_systrace_devi = devi; + + /* + * Initialize the 32-bit table. + */ + VERIFY(lx_nsysent32 > 0); + lx_systrace_nsysent32 = lx_nsysent32; + lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent32; i++) { + lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name; + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + /* + * Initialize the 64-bit table. + */ + VERIFY(lx_nsysent64 > 0); + lx_systrace_nsysent64 = lx_nsysent64; + lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent64; i++) { + lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name; + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif + + /* + * Install probe triggers. + */ + lx_systrace_entry_ptr = lx_systrace_entry; + lx_systrace_return_ptr = lx_systrace_return; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (dtrace_unregister(lx_systrace_id) != 0) + return (DDI_FAILURE); + + /* + * Free tables. + */ + kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent32 = NULL; + lx_systrace_nsysent32 = 0; + +#if defined(_LP64) + kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent64 = NULL; + lx_systrace_nsysent64 = 0; +#endif + + /* + * Reset probe triggers. 
+ */ + lx_systrace_entry_ptr = NULL; + lx_systrace_return_ptr = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} + +static struct cb_ops lx_systrace_cb_ops = { + lx_systrace_open, /* open */ + nodev, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops lx_systrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_systrace_attach, /* attach */ + lx_systrace_detach, /* detach */ + nodev, /* reset */ + &lx_systrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +/* + * Module linkage information for the kernel. + */ +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Linux Brand System Call Tracing", /* name of module */ + &lx_systrace_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf new file mode 100644 index 0000000000..e4499c8a5b --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_systrace" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c new file mode 100644 index 0000000000..76d68f5921 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c @@ -0,0 +1,2232 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Compatibility for the Linux netlink(7) kernel/user transport, as well as + * for in-kernel netlink(7) providers like rtnetlink(7). See RFC 3549 for + * details of the protocol, and the Linux man pages for details of the Linux + * implementation that we're mimicking. + */ + +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/brand.h> +#include <sys/debug.h> +#include <sys/ucred.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_impl.h> +#include <inet/ip_ire.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/lx_impl.h> +#include <sys/lx_audit.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/policy.h> +#include <sys/ddi.h> + +/* + * Flags in netlink header + * See Linux include/uapi/linux/netlink.h + * Additional flags for "GET" requests + */ +#define LX_NETLINK_NLM_F_REQUEST 1 +#define LX_NETLINK_NLM_F_MULTI 2 +#define LX_NETLINK_NLM_F_ACK 4 +#define LX_NETLINK_NLM_F_ECHO 8 +#define LX_NETLINK_NLM_F_DUMP_INTR 16 +#define LX_NETLINK_NLM_F_ROOT 0x100 +#define LX_NETLINK_NLM_F_MATCH 0x200 +#define LX_NETLINK_NLM_F_ATOMIC 0x400 + +/* + * Generic message type constants + */ +#define LX_NETLINK_NLMSG_NONE 0 +#define LX_NETLINK_NLMSG_NOOP 1 +#define LX_NETLINK_NLMSG_ERROR 2 +#define LX_NETLINK_NLMSG_DONE 3 +#define LX_NETLINK_NLMSG_OVERRUN 4 + +/* + * Protocol constants. + */ +#define LX_NETLINK_ROUTE 0 +#define LX_NETLINK_UNUSED 1 +#define LX_NETLINK_USERSOCK 2 +#define LX_NETLINK_FIREWALL 3 +#define LX_NETLINK_SOCK_DIAG 4 +#define LX_NETLINK_NFLOG 5 +#define LX_NETLINK_XFRM 6 +#define LX_NETLINK_SELINUX 7 +#define LX_NETLINK_ISCSI 8 +#define LX_NETLINK_AUDIT 9 +#define LX_NETLINK_FIB_LOOKUP 10 +#define LX_NETLINK_CONNECTOR 11 +#define LX_NETLINK_NETFILTER 12 +#define LX_NETLINK_IP6_FW 13 +#define LX_NETLINK_DNRTMSG 14 +#define LX_NETLINK_KOBJECT_UEVENT 15 +#define LX_NETLINK_GENERIC 16 +#define LX_NETLINK_SCSITRANSPORT 18 +#define LX_NETLINK_ECRYPTFS 19 +#define LX_NETLINK_RDMA 20 +#define LX_NETLINK_CRYPTO 21 + +/* + * rtnetlink(7) attribute-related constants + */ +#define LX_NETLINK_NLA_ALIGNTO 4 + +#define LX_NETLINK_RTM_NEWLINK 16 +#define LX_NETLINK_RTM_DELLINK 17 +#define LX_NETLINK_RTM_GETLINK 18 +#define LX_NETLINK_RTM_SETLINK 19 +#define LX_NETLINK_RTM_NEWADDR 20 +#define LX_NETLINK_RTM_DELADDR 21 +#define LX_NETLINK_RTM_GETADDR 22 +#define LX_NETLINK_RTM_NEWROUTE 24 +#define LX_NETLINK_RTM_DELROUTE 25 +#define LX_NETLINK_RTM_GETROUTE 26 +#define LX_NETLINK_RTM_NEWNEIGH 28 +#define LX_NETLINK_RTM_DELNEIGH 29 +#define LX_NETLINK_RTM_GETNEIGH 30 +#define LX_NETLINK_RTM_NEWRULE 32 +#define LX_NETLINK_RTM_DELRULE 33 +#define LX_NETLINK_RTM_GETRULE 34 +#define LX_NETLINK_RTM_NEWQDISC 36 +#define LX_NETLINK_RTM_DELQDISC 37 +#define LX_NETLINK_RTM_GETQDISC 38 +#define LX_NETLINK_RTM_NEWTCLASS 40 +#define LX_NETLINK_RTM_DELTCLASS 41 +#define LX_NETLINK_RTM_GETTCLASS 42 +#define LX_NETLINK_RTM_NEWTFILTER 44 +#define LX_NETLINK_RTM_DELTFILTER 45 +#define LX_NETLINK_RTM_GETTFILTER 46 +#define LX_NETLINK_RTM_NEWACTION 48 +#define LX_NETLINK_RTM_DELACTION 49 +#define LX_NETLINK_RTM_GETACTION 50 +#define LX_NETLINK_RTM_NEWPREFIX 52 +#define LX_NETLINK_RTM_GETMULTICAST 58 +#define LX_NETLINK_RTM_GETANYCAST 62 +#define LX_NETLINK_RTM_NEWNEIGHTBL 64 +#define LX_NETLINK_RTM_GETNEIGHTBL 66 +#define 
LX_NETLINK_RTM_SETNEIGHTBL 67 +#define LX_NETLINK_RTM_NEWNDUSEROPT 68 +#define LX_NETLINK_RTM_NEWADDRLABEL 72 +#define LX_NETLINK_RTM_DELADDRLABEL 73 +#define LX_NETLINK_RTM_GETADDRLABEL 74 +#define LX_NETLINK_RTM_GETDCB 78 +#define LX_NETLINK_RTM_SETDCB 79 +#define LX_NETLINK_RTM_NEWNETCONF 80 +#define LX_NETLINK_RTM_GETNETCONF 82 +#define LX_NETLINK_RTM_NEWMDB 84 +#define LX_NETLINK_RTM_DELMDB 85 +#define LX_NETLINK_RTM_GETMDB 86 +#define LX_NETLINK_RTM_MAX 87 + +/* + * rtnetlink(7) attribute constants + */ +#define LX_NETLINK_RTA_UNSPEC 0 +#define LX_NETLINK_RTA_DST 1 +#define LX_NETLINK_RTA_SRC 2 +#define LX_NETLINK_RTA_IIF 3 +#define LX_NETLINK_RTA_OIF 4 +#define LX_NETLINK_RTA_GATEWAY 5 +#define LX_NETLINK_RTA_PRIORITY 6 +#define LX_NETLINK_RTA_PREFSRC 7 +#define LX_NETLINK_RTA_METRICS 8 +#define LX_NETLINK_RTA_MULTIPATH 9 +#define LX_NETLINK_RTA_PROTOINFO 10 +#define LX_NETLINK_RTA_FLOW 11 +#define LX_NETLINK_RTA_CACHEINFO 12 +#define LX_NETLINK_RTA_SESSION 13 +#define LX_NETLINK_RTA_MP_ALGO 14 +#define LX_NETLINK_RTA_TABLE 15 +#define LX_NETLINK_RTA_MARK 16 +#define LX_NETLINK_RTA_MFC_STATS 17 +#define LX_NETLINK_MAX_RTA LX_NETLINK_RTA_MFC_STATS + +/* + * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants + */ +#define LX_NETLINK_IFLA_UNSPEC 0 +#define LX_NETLINK_IFLA_ADDRESS 1 +#define LX_NETLINK_IFLA_BROADCAST 2 +#define LX_NETLINK_IFLA_IFNAME 3 +#define LX_NETLINK_IFLA_MTU 4 +#define LX_NETLINK_IFLA_LINK 5 +#define LX_NETLINK_IFLA_QDISC 6 +#define LX_NETLINK_IFLA_STATS 7 +#define LX_NETLINK_IFLA_COST 8 +#define LX_NETLINK_IFLA_PRIORITY 9 +#define LX_NETLINK_IFLA_MASTER 10 +#define LX_NETLINK_IFLA_WIRELESS 11 +#define LX_NETLINK_IFLA_PROTINFO 12 +#define LX_NETLINK_IFLA_TXQLEN 13 +#define LX_NETLINK_IFLA_MAP 14 +#define LX_NETLINK_IFLA_WEIGHT 15 +#define LX_NETLINK_IFLA_OPERSTATE 16 +#define LX_NETLINK_IFLA_LINKMODE 17 +#define LX_NETLINK_IFLA_LINKINFO 18 +#define LX_NETLINK_IFLA_NET_NS_PID 19 +#define LX_NETLINK_IFLA_IFALIAS 20 +#define LX_NETLINK_IFLA_NUM_VF 21 +#define LX_NETLINK_IFLA_VFINFO_LIST 22 +#define LX_NETLINK_IFLA_STATS64 23 +#define LX_NETLINK_IFLA_VF_PORTS 24 +#define LX_NETLINK_IFLA_PORT_SELF 25 +#define LX_NETLINK_IFLA_AF_SPEC 26 +#define LX_NETLINK_IFLA_GROUP 27 +#define LX_NETLINK_IFLA_NET_NS_FD 28 +#define LX_NETLINK_IFLA_EXT_MASK 29 +#define LX_NETLINK_IFLA_PROMISCUITY 30 +#define LX_NETLINK_IFLA_NUM_TX_QUEUES 31 +#define LX_NETLINK_IFLA_NUM_RX_QUEUES 32 +#define LX_NETLINK_IFLA_CARRIER 33 +#define LX_NETLINK_IFLA_PHYS_PORT_ID 34 +#define LX_NETLINK_IFLA_CARRIER_CHANGES 35 +#define LX_NETLINK_IFLA_MAX 36 + +/* + * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants + */ +#define LX_NETLINK_IFA_UNSPEC 0 +#define LX_NETLINK_IFA_ADDRESS 1 +#define LX_NETLINK_IFA_LOCAL 2 +#define LX_NETLINK_IFA_LABEL 3 +#define LX_NETLINK_IFA_BROADCAST 4 +#define LX_NETLINK_IFA_ANYCAST 5 +#define LX_NETLINK_IFA_CACHEINFO 6 +#define LX_NETLINK_IFA_MULTICAST 7 +#define LX_NETLINK_IFA_FLAGS 8 +#define LX_NETLINK_IFA_MAX 9 + +#define LX_NETLINK_IFA_F_SECONDARY 0x01 +#define LX_NETLINK_IFA_F_TEMPORARY LX_NETLINK_IFA_F_SECONDARY +#define LX_NETLINK_IFA_F_NODAD 0x02 +#define LX_NETLINK_IFA_F_OPTIMISTIC 0x04 +#define LX_NETLINK_IFA_F_DADFAILED 0x08 +#define LX_NETLINK_IFA_F_HOMEADDRESS 0x10 +#define LX_NETLINK_IFA_F_DEPRECATED 0x20 +#define LX_NETLINK_IFA_F_TENTATIVE 0x40 +#define LX_NETLINK_IFA_F_PERMANENT 0x80 +#define LX_NETLINK_IFA_F_MANAGETEMPADDR 0x100 +#define LX_NETLINK_IFA_F_NOPREFIXROUTE 0x200 + +/* + * Linux interface flags. 
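+ * These mirror the IFF_* values in the Linux uapi headers
+ * (include/uapi/linux/if.h), so the flag word built for an ifinfomsg
+ * reply can be handed to Linux userland without further translation.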
+ */
+#define LX_IFF_UP (1<<0)
+#define LX_IFF_BROADCAST (1<<1)
+#define LX_IFF_DEBUG (1<<2)
+#define LX_IFF_LOOPBACK (1<<3)
+#define LX_IFF_POINTOPOINT (1<<4)
+#define LX_IFF_NOTRAILERS (1<<5)
+#define LX_IFF_RUNNING (1<<6)
+#define LX_IFF_NOARP (1<<7)
+#define LX_IFF_PROMISC (1<<8)
+#define LX_IFF_ALLMULTI (1<<9)
+#define LX_IFF_MASTER (1<<10)
+#define LX_IFF_SLAVE (1<<11)
+#define LX_IFF_MULTICAST (1<<12)
+#define LX_IFF_PORTSEL (1<<13)
+#define LX_IFF_AUTOMEDIA (1<<14)
+#define LX_IFF_DYNAMIC (1<<15)
+#define LX_IFF_LOWER_UP (1<<16)
+#define LX_IFF_DORMANT (1<<17)
+#define LX_IFF_ECHO (1<<18)
+
+/* rtm_table */
+#define LX_ROUTE_TABLE_MAIN 254
+
+/* rtm_type */
+#define LX_RTN_UNSPEC 0
+#define LX_RTN_UNICAST 1
+#define LX_RTN_LOCAL 2
+#define LX_RTN_BROADCAST 3
+#define LX_RTN_ANYCAST 4
+#define LX_RTN_MULTICAST 5
+#define LX_RTN_BLACKHOLE 6
+#define LX_RTN_UNREACHABLE 7
+#define LX_RTN_PROHIBIT 8
+#define LX_RTN_THROW 9
+#define LX_RTN_NAT 10
+#define LX_RTN_XRESOLVE 11
+
+/* rtm_protocol */
+#define LX_RTPROT_UNSPEC 0
+#define LX_RTPROT_REDIRECT 1 /* From ICMP redir */
+#define LX_RTPROT_KERNEL 2 /* From kernel */
+#define LX_RTPROT_BOOT 3 /* From boot */
+#define LX_RTPROT_STATIC 4 /* From administrator */
+#define LX_RTPROT_NULL 0xff /* Uninitialized */
+
+/* rtm_scope */
+#define LX_RTSCOPE_UNIVERSE 0
+#define LX_RTSCOPE_SITE 200
+#define LX_RTSCOPE_LINK 253
+#define LX_RTSCOPE_HOST 254
+#define LX_RTSCOPE_NOWHERE 255
+
+/*
+ * Audit message types (lxnh_type in the lx_netlink_hdr_t msg header)
+ * See Linux include/uapi/linux/audit.h and user-level auditd source
+ * lib/libaudit.h.
+ *
+ * The types fall into range blocks:
+ * 1000-1099 is for audit system control commands
+ * 1100-2999 various messages, as detailed in include/uapi/linux/audit.h
+ */
+#define LX_AUDIT_GET 1000 /* get audit system status */
+#define LX_AUDIT_SET 1001 /* set audit system status */
+#define LX_AUDIT_WATCH_INS 1007 /* insert file watch */
+#define LX_AUDIT_WATCH_REM 1008 /* remove file watch */
+#define LX_AUDIT_WATCH_LIST 1009 /* list file watches */
+#define LX_AUDIT_ADD_RULE 1011 /* add syscall rule */
+#define LX_AUDIT_DEL_RULE 1012 /* del syscall rule */
+#define LX_AUDIT_LIST_RULES 1013 /* list syscall rules */
+#define LX_AUDIT_SET_FEATURE 1018
+#define LX_AUDIT_GET_FEATURE 1019
+#define LX_AUDIT_USER_MSG_START 1100
+
+/*
+ * Netlink sockopts
+ */
+#define SOL_LX_NETLINK 270
+
+/* See Linux include/uapi/linux/netlink.h */
+#define LX_NETLINK_SO_ADD_MEMBERSHIP 1
+#define LX_NETLINK_SO_DROP_MEMBERSHIP 2
+#define LX_NETLINK_SO_PKTINFO 3
+#define LX_NETLINK_SO_BROADCAST_ERROR 4
+#define LX_NETLINK_SO_NO_ENOBUFS 5
+#define LX_NETLINK_SO_RX_RING 6
+#define LX_NETLINK_SO_TX_RING 7
+#define LX_NETLINK_SO_LISTEN_ALL_NSID 8
+#define LX_NETLINK_SO_LIST_MEMBERSHIPS 9
+#define LX_NETLINK_SO_CAP_ACK 10
+
+/* Internal socket flags */
+#define LXNLF_RECVUCRED 0x1
+#define LXNLF_AUDITD 0x2
+
+/* nlmsg structure macros */
+#define LXNLMSG_ALIGNTO 4
+#define LXNLMSG_ALIGN(len) \
+ (((len) + LXNLMSG_ALIGNTO - 1) & ~(LXNLMSG_ALIGNTO - 1))
+#define LXNLMSG_HDRLEN \
+ ((int)LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)))
+#define LXNLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN)
+#define LXNLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
+#define LXNLMSG_DATA(nlh) ((void*)(((char *)nlh) + NLMSG_LENGTH(0)))
+#define LXNLMSG_PAYLOAD(nlh, len) \
+ ((nlh)->nlmsg_len - NLMSG_SPACE((len)))
+
+#define LXATTR_PAYLOAD(lxa) \
+ ((void*)((caddr_t)(lxa) + sizeof (lx_netlink_attr_t)))
+#define LXATTR_HDRLEN LXNLMSG_ALIGN(sizeof (lx_netlink_attr_t))
+#define LXATTR_LEN(len) (LXATTR_HDRLEN + LXNLMSG_ALIGN(len))
+
+typedef struct lx_netlink_hdr {
+ uint32_t lxnh_len; /* length of message */
+ uint16_t lxnh_type; /* type of message */
+ uint16_t lxnh_flags; /* flags */
+ uint32_t lxnh_seq; /* sequence number */
+ uint32_t lxnh_pid; /* sending pid */
+} lx_netlink_hdr_t;
+
+typedef struct lx_netlink_err {
+ lx_netlink_hdr_t lxne_hdr; /* header */
+ int32_t lxne_errno; /* errno */
+ lx_netlink_hdr_t lxne_failed; /* header of err */
+} lx_netlink_err_t;
+
+typedef struct lx_netlink_attr {
+ uint16_t lxna_len; /* length of attribute */
+ uint16_t lxna_type; /* type of attribute */
+} lx_netlink_attr_t;
+
+typedef struct lx_netlink_ifinfomsg {
+ uint8_t lxnl_ifi_family; /* family: AF_UNSPEC */
+ uint8_t lxnl_ifi__pad;
+ uint16_t lxnl_ifi_type; /* device type */
+ uint32_t lxnl_ifi_index; /* interface index */
+ uint32_t lxnl_ifi_flags; /* device flags */
+ uint32_t lxnl_ifi_change; /* unused; must be -1 */
+} lx_netlink_ifinfomsg_t;
+
+typedef struct lx_netlink_ifaddrmsg {
+ uint8_t lxnl_ifa_family; /* address type */
+ uint8_t lxnl_ifa_prefixlen; /* prefix length of address */
+ uint8_t lxnl_ifa_flags; /* address flags */
+ uint8_t lxnl_ifa_scope; /* address scope */
+ uint8_t lxnl_ifa_index; /* interface index */
+} lx_netlink_ifaddrmsg_t;
+
+typedef struct lx_netlink_rtmsg {
+ uint8_t rtm_family; /* route AF */
+ uint8_t rtm_dst_len; /* destination addr length */
+ uint8_t rtm_src_len; /* source addr length */
+ uint8_t rtm_tos; /* TOS filter */
+ uint8_t rtm_table; /* routing table ID */
+ uint8_t rtm_protocol; /* routing protocol */
+ uint8_t rtm_scope;
+ uint8_t rtm_type;
+ uint32_t rtm_flags;
+} lx_netlink_rtmsg_t;
+
+typedef struct lx_netlink_sockaddr {
+ sa_family_t lxnl_family; /* AF_LX_NETLINK */
+ uint16_t lxnl_pad; /* padding */
+ uint32_t lxnl_port; /* port id */
+ uint32_t lxnl_groups; /* multicast groups mask */
+} lx_netlink_sockaddr_t;
+
+typedef struct lx_netlink_sock {
+ struct lx_netlink_sock *lxns_next; /* list of lx_netlink sockets */
+ sock_upcalls_t *lxns_upcalls; /* pointer to socket upcalls */
+ sock_upper_handle_t lxns_uphandle; /* socket upcall handle */
+ ldi_handle_t lxns_iphandle; /* handle to /dev/ip */
+ ldi_handle_t lxns_ip6handle; /* handle to /dev/ip6 */
+ ldi_handle_t lxns_current; /* current ip handle */
+ int lxns_proto; /* protocol */
+ uint32_t lxns_port; /* port identifier */
+ uint32_t lxns_groups; /* group subscriptions */
+ uint32_t lxns_bufsize; /* buffer size */
+ uint32_t lxns_flags; /* socket flags */
+ kmutex_t lxns_flowctl_mtx; /* protects lxns_flowctrled */
+ boolean_t lxns_flowctrled; /* sock is flow-controlled */
+} lx_netlink_sock_t;
+
+typedef struct lx_netlink_reply {
+ lx_netlink_hdr_t lxnr_hdr; /* header that we're replying to */
+ lx_netlink_sock_t *lxnr_sock; /* socket */
+ uint32_t lxnr_seq; /* sequence number */
+ uint16_t lxnr_type; /* type of reply */
+ mblk_t *lxnr_mp; /* current mblk */
+ mblk_t *lxnr_err; /* error mblk */
+ mblk_t *lxnr_mp1; /* T_UNITDATA_IND mblk */
+ int lxnr_errno; /* errno, if any */
+} lx_netlink_reply_t;
+
+static lx_netlink_sock_t *lx_netlink_head; /* head of lx_netlink sockets */
+static uint_t lx_netlink_audit_cnt; /* prevent unload for audit */
+static kmutex_t lx_netlink_lock; /* lock to protect state */
+static ldi_ident_t lx_netlink_ldi; /* LDI handle */
+static int lx_netlink_bufsize = 4096; /* default buffer size */
+static int lx_netlink_flowctrld; /* # of times flow controlled */
+
+typedef enum {
+ LXNL_BIND,
+
LXNL_SENDMSG +} lx_netlink_action_t; + +#define LX_UNSUP_BUFSZ 64 + +/* + * On Linux, CAP_NET_ADMIN is required to take certain netlink actions. This + * restriction is loosened for certain protocol types, provided the activity is + * limited to communicating directly with the kernel (rather than transmitting + * to the various multicast groups) + */ +static int +lx_netlink_access(lx_netlink_sock_t *lns, cred_t *cr, lx_netlink_action_t act) +{ + /* Simple actions are allowed on these netlink protocols. */ + if (act != LXNL_SENDMSG) { + switch (lns->lxns_proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: + case LX_NETLINK_KOBJECT_UEVENT: + return (0); + default: + break; + } + } + + /* CAP_NET_ADMIN roughly maps to PRIV_SYS_IP_CONFIG. */ + if (secpolicy_ip_config(cr, B_FALSE) != 0) { + return (EACCES); + } + + return (0); +} + +/*ARGSUSED*/ +static void +lx_netlink_activate(sock_lower_handle_t handle, + sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, + int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ | + SOCKOPT_MAXBLK | SOCKOPT_MINPSZ; + sopp.sopp_wroff = 0; + sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; + sopp.sopp_rxlowat = SOCKET_RECVLOWATER; + sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl); + sopp.sopp_maxpsz = INFPSZ; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_minpsz = 0; + + lxsock->lxns_upcalls = sock_upcalls; + lxsock->lxns_uphandle = sock_handle; + + sock_upcalls->su_set_proto_props(sock_handle, &sopp); +} + +/*ARGSUSED*/ +static int +lx_netlink_setsockopt(sock_lower_handle_t handle, int level, + int option_name, const void *optval, socklen_t optlen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + + if (level == SOL_SOCKET && option_name == SO_RECVUCRED) { + int *ival; + if (optlen != sizeof (int)) { + return (EINVAL); + } + ival = (int *)optval; + if (*ival == 0) { + lxsock->lxns_flags &= ~LXNLF_RECVUCRED; + } else { + lxsock->lxns_flags |= LXNLF_RECVUCRED; + } + return (0); + } else if (level == SOL_SOCKET) { + /* Punt on the other SOL_SOCKET options */ + return (0); + } else if (level != SOL_LX_NETLINK) { + return (EOPNOTSUPP); + } + + switch (option_name) { + case LX_NETLINK_SO_ADD_MEMBERSHIP: + case LX_NETLINK_SO_DROP_MEMBERSHIP: + case LX_NETLINK_SO_PKTINFO: + case LX_NETLINK_SO_BROADCAST_ERROR: + case LX_NETLINK_SO_NO_ENOBUFS: + case LX_NETLINK_SO_RX_RING: + case LX_NETLINK_SO_TX_RING: + /* Blatant lie */ + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_getsockopt(sock_lower_handle_t handle, int level, + int option_name, void *optval, socklen_t *optlen, cred_t *cr) +{ + if (level != SOL_LX_NETLINK) { + return (EOPNOTSUPP); + } + + switch (option_name) { + case LX_NETLINK_SO_LIST_MEMBERSHIPS: + /* Report that we have 0 members to allow systemd to proceed. */ + *optlen = 0; + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name, + socklen_t namelen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)name; + + if (namelen != sizeof (lx_netlink_sockaddr_t) || + lxsa->lxnl_family != AF_LX_NETLINK) { + return (EINVAL); + } + + /* + * Perform access checks if attempting to bind on any multicast groups. 
+ */ + if (lxsa->lxnl_groups != 0) { + int err; + + if ((err = lx_netlink_access(lxsock, cr, LXNL_BIND)) != 0) { + return (err); + } + + /* Lie about group subscription for now */ + lxsock->lxns_groups = lxsa->lxnl_groups; + } + + /* + * Linux netlink uses nl_port to identify distinct netlink sockets. + * Binding to an address of nl_port=0 triggers the kernel to + * automatically assign a free nl_port identifier. Originally, + * consumers of lx_netlink were required to bind with that automatic + * address. We now support non-zero values for nl_port although strict + * checking to identify conflicts is not performed. Use of the + * id_space facility could be a convenient solution, if a need arose. + */ + if (lxsa->lxnl_port == 0) { + /* + * Because we are not doing conflict detection, there is no + * need to expend effort selecting a unique port for automatic + * addressing during bind. + */ + lxsock->lxns_port = curproc->p_pid; + } else { + lxsock->lxns_port = lxsa->lxnl_port; + } + + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa, + socklen_t *len, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)sa; + + if (*len < sizeof (lx_netlink_sockaddr_t)) + return (EINVAL); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_pad = 0; + lxsa->lxnl_port = lxsock->lxns_port; + lxsa->lxnl_groups = lxsock->lxns_groups; + + *len = sizeof (lx_netlink_sockaddr_t); + + return (0); +} + +static mblk_t * +lx_netlink_alloc_mp1(lx_netlink_sock_t *lxsock) +{ + mblk_t *mp; + size_t size; + struct T_unitdata_ind *tunit; + lx_netlink_sockaddr_t *lxsa; + boolean_t send_ucred; + + /* + * Certain netlink clients (such as systemd) will set SO_RECVUCRED + * (via the Linux SCM_CREDENTIALS) on the expectation that all replies + * will contain credentials passed via cmsg. They require this to + * authenticate those messages as having originated in the kernel by + * checking uc_pid == 0. 
+ */ + VERIFY(lxsock != NULL); + send_ucred = ((lxsock->lxns_flags & LXNLF_RECVUCRED) != 0); + + /* + * Message structure: + * +----------------------------+ + * | struct T_unit_data_ind | + * +----------------------------+ + * | lx_netlink_sockaddr_t | + * +----------------------------+ -+ + * | struct cmsghdr (SCM_UCRED) | | + * +----------------------------+ +-(optional) + * | struct ucred_s (cmsg data) | | + * +----------------------------+ -+ + */ + size = sizeof (*tunit) + sizeof (*lxsa); + if (send_ucred) { + size += sizeof (struct cmsghdr) + + ROUNDUP_cmsglen(sizeof (struct ucred_s)); + } + mp = allocb(size, 0); + if (mp == NULL) { + return (NULL); + } + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + tunit = (struct T_unitdata_ind *)mp->b_rptr; + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + lxsa = (lx_netlink_sockaddr_t *)((caddr_t)tunit + sizeof (*tunit)); + mp->b_wptr += size; + + mp->b_datap->db_type = M_PROTO; + tunit->PRIM_type = T_UNITDATA_IND; + tunit->SRC_length = sizeof (*lxsa); + tunit->SRC_offset = sizeof (*tunit); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_port = 0; + lxsa->lxnl_groups = 0; + lxsa->lxnl_pad = 0; + + if (send_ucred) { + struct cmsghdr *cmsg; + struct ucred_s *ucred; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + cmsg = (struct cmsghdr *)((caddr_t)lxsa + sizeof (*lxsa)); + ucred = (struct ucred_s *)CMSG_CONTENT(cmsg); + cmsg->cmsg_len = sizeof (*cmsg) + sizeof (*ucred); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + bzero(ucred, sizeof (*ucred)); + ucred->uc_size = sizeof (*ucred); + ucred->uc_zoneid = getzoneid(); + + tunit->OPT_length = sizeof (*cmsg) + + ROUNDUP_cmsglen(sizeof (*ucred)); + tunit->OPT_offset = tunit->SRC_offset + tunit->SRC_length; + } else { + tunit->OPT_length = 0; + tunit->OPT_offset = 0; + } + + return (mp); +} + +static lx_netlink_reply_t * +lx_netlink_reply(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, uint16_t type) +{ + lx_netlink_reply_t *reply; + mblk_t *err, *mp1; + + /* + * We always allocate an error block to assure that even if subsequent + * allocations fail, we can return an error. 
+ */ + if ((err = allocb(sizeof (lx_netlink_err_t), 0)) == NULL) + return (NULL); + + if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) { + freeb(err); + return (NULL); + } + + reply = kmem_zalloc(sizeof (lx_netlink_reply_t), KM_SLEEP); + reply->lxnr_err = err; + reply->lxnr_sock = lxsock; + reply->lxnr_hdr = *hdr; + reply->lxnr_type = type; + reply->lxnr_mp1 = mp1; + + return (reply); +} + +static void +lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + uint32_t aligned; + mblk_t *mp = reply->lxnr_mp; + + if (reply->lxnr_errno) + return; + + aligned = LXNLMSG_ALIGN(size); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + + if (hdr->lxnh_len + aligned > lxsock->lxns_bufsize) { + reply->lxnr_errno = E2BIG; + return; + } + + bcopy(payload, mp->b_wptr, size); + hdr->lxnh_len += aligned; + mp->b_wptr += aligned; +} + +static void +lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + if (reply->lxnr_errno) + return; + + VERIFY(reply->lxnr_mp == NULL); + + if ((reply->lxnr_mp = mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + + mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + + if (payload == NULL) { + /* + * A NULL payload denotes a "done" message. + */ + hdr->lxnh_type = LX_NETLINK_NLMSG_DONE; + } else { + hdr->lxnh_type = reply->lxnr_type; + lx_netlink_reply_add(reply, payload, size); + } +} + +static void +lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type, + void *payload, uint32_t size) +{ + lx_netlink_attr_t attr; + + attr.lxna_len = size + sizeof (lx_netlink_attr_t); + attr.lxna_type = type; + + lx_netlink_reply_add(reply, &attr, sizeof (attr)); + lx_netlink_reply_add(reply, payload, size); +} + +static void +lx_netlink_reply_attr_string(lx_netlink_reply_t *reply, + uint16_t type, const char *str) +{ + lx_netlink_reply_attr(reply, type, (void *)str, strlen(str) + 1); +} + +static void +lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply, + uint16_t type, int32_t val) +{ + int32_t v = val; + + lx_netlink_reply_attr(reply, type, &v, sizeof (int32_t)); +} + +static int +lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg) +{ + int rval; + + if (reply->lxnr_errno != 0) + return (reply->lxnr_errno); + + if ((rval = ldi_ioctl(reply->lxnr_sock->lxns_current, + cmd, (intptr_t)arg, FKIOCTL, kcred, NULL)) != 0) { + reply->lxnr_errno = rval; + } + + return (rval); +} + +static void +lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + int error; + + /* + * To prevent the stream head from coalescing messages and to indicate + * their origin, we send them as T_UNITDATA_IND messages, not as raw + * M_DATA. 
+ */ + mp1->b_cont = mp; + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1, + msgdsize(mp1), 0, &error, NULL); + + if (error != 0) + lx_netlink_flowctrld++; +} + +static void +lx_netlink_reply_send(lx_netlink_reply_t *reply) +{ + mblk_t *mp1; + + if (reply->lxnr_errno) + return; + + if ((mp1 = lx_netlink_alloc_mp1(reply->lxnr_sock)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + lx_netlink_reply_sendup(reply, reply->lxnr_mp, mp1); + reply->lxnr_mp = NULL; +} + +static void +lx_netlink_reply_done(lx_netlink_reply_t *reply) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + /* + * Denote that we're done via a message with a NULL payload. + */ + lx_netlink_reply_msg(reply, NULL, 0); + + if (reply->lxnr_errno) { + /* + * If anything failed, we'll send up an error message. + */ + lx_netlink_hdr_t *hdr; + lx_netlink_err_t *err; + + if (reply->lxnr_mp != NULL) { + freeb(reply->lxnr_mp); + reply->lxnr_mp = NULL; + } + + mp = reply->lxnr_err; + VERIFY(mp != NULL); + reply->lxnr_err = NULL; + err = (lx_netlink_err_t *)mp->b_rptr; + hdr = &err->lxne_hdr; + mp->b_wptr += sizeof (lx_netlink_err_t); + + err->lxne_failed = reply->lxnr_hdr; + err->lxne_errno = reply->lxnr_errno; + hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR; + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_len = sizeof (lx_netlink_err_t); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + } else { + uint32_t status = 0; + + /* + * More recent versions of the iproute2 utils expect a status + * value after the header, even in the absence of errors. + */ + lx_netlink_reply_add(reply, &status, sizeof (status)); + + /* + * "done" is also the most minimal response possible. If + * lx_netlink_reply_msg() does not set lxnr_errno, we should + * be guaranteed enough room to hold this (i.e. our + * lx_netlink_reply_add() call should never end up setting + * lxnr_errno). + */ + VERIFY0(reply->lxnr_errno); + + mp = reply->lxnr_mp; + VERIFY(mp != NULL); + reply->lxnr_mp = NULL; + } + + lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1); + + if (reply->lxnr_mp != NULL) + freeb(reply->lxnr_mp); + + if (reply->lxnr_err != NULL) + freeb(reply->lxnr_err); + + kmem_free(reply, sizeof (lx_netlink_reply_t)); +} + +static int +lx_netlink_reply_error(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, int errno) +{ + /* + * The type of the message doesn't matter, as we're going to explicitly + * set lxnr_errno and therefore send only an error message. + */ + lx_netlink_reply_t *reply = lx_netlink_reply(lxsock, hdr, 0); + + if (reply == NULL) + return (ENOMEM); + + reply->lxnr_errno = errno; + lx_netlink_reply_done(reply); + + return (0); +} + +/* + * Send an ack message with an explicit errno of 0. 
+ * TODO: this needs more work + */ +/* + * static void + * lx_netlink_reply_ack(lx_netlink_reply_t *reply) + * { + * lx_netlink_sock_t *lxsock = reply->lxnr_sock; + * mblk_t *mp; + * lx_netlink_hdr_t *hdr; + * lx_netlink_err_t *err; + * + * lx_netlink_reply_msg(reply, NULL, 0); + * + * mp = reply->lxnr_err; + * VERIFY(mp != NULL); + * reply->lxnr_err = NULL; + * err = (lx_netlink_err_t *)mp->b_rptr; + * hdr = &err->lxne_hdr; + * + * err->lxne_failed = reply->lxnr_hdr; + * err->lxne_errno = 0; + * hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR; + * hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + * hdr->lxnh_len = sizeof (lx_netlink_err_t); + * hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + * hdr->lxnh_pid = lxsock->lxns_port; + * + * lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1); + * + * kmem_free(reply, sizeof (lx_netlink_reply_t)); + * } + */ + +static int +lx_netlink_parse_msg_attrs(mblk_t *mp, void **msgp, unsigned int msg_size, + lx_netlink_attr_t **attrp, unsigned int *attr_max) +{ + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + lx_netlink_attr_t *lxa; + unsigned char *buf = mp->b_rptr + LXNLMSG_HDRLEN; + unsigned int i; + uint32_t buf_left = MBLKL(mp) - LXNLMSG_HDRLEN; + uint32_t msg_left = hdr->lxnh_len; + + msg_size = LXNLMSG_ALIGN(msg_size); + if (msg_size > buf_left || msg_size > msg_left) { + return (-1); + } + + *msgp = (void *)buf; + buf += msg_size; + buf_left -= msg_size; + msg_left -= msg_size; + + /* Do not bother with attr parsing if not requested */ + if (attrp == NULL || *attr_max == 0) { + return (0); + } + + for (i = 0; i < *attr_max; i++) { + if (buf_left < LXATTR_HDRLEN || msg_left < LXATTR_HDRLEN) { + break; + } + + lxa = (lx_netlink_attr_t *)buf; + if (lxa->lxna_len > buf_left || lxa->lxna_len > msg_left) { + return (-1); + } + + attrp[i] = lxa; + buf += lxa->lxna_len; + buf_left -= lxa->lxna_len; + msg_left -= lxa->lxna_len; + } + *attr_max = i; + + return (0); +} + +/* + * Takes an IPv4 address (in network byte order) and returns the address scope. + */ +static uint8_t +lx_ipv4_rtscope(in_addr_t nbo_addr) +{ + in_addr_t addr = ntohl(nbo_addr); + if ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + return (LX_RTSCOPE_HOST); + } else if ((addr & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET) { + return (LX_RTSCOPE_LINK); + } else if ((addr & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET || + (addr & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET || + (addr & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +/* + * Takes an IPv6 address and returns the address scope. 
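+ * (::1 yields host scope, fe80::/10 link scope, the deprecated
+ * site-local fec0::/10 prefix site scope, and anything else universe
+ * scope.)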
+ */ +static uint8_t +lx_ipv6_rtscope(const in6_addr_t *addr) +{ + if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) { + return (LX_RTSCOPE_HOST); + } else if (IN6_IS_ADDR_LINKLOCAL(addr)) { + return (LX_RTSCOPE_LINK); + } else if (IN6_IS_ADDR_SITELOCAL(addr)) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +static void +lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifinfomsg_t ifi; + int i; + char if_name[IFNAMSIZ]; + struct sockaddr_dl *sdl; + struct sockaddr hwaddr; + int hwaddr_size; + boolean_t is_loopback; + + struct { + int native; + int lx; + } flags[] = { + { IFF_UP, LX_IFF_UP }, + { IFF_BROADCAST, LX_IFF_BROADCAST }, + { IFF_DEBUG, LX_IFF_DEBUG }, + { IFF_LOOPBACK, LX_IFF_LOOPBACK }, + { IFF_POINTOPOINT, LX_IFF_POINTOPOINT }, + { IFF_NOTRAILERS, LX_IFF_NOTRAILERS }, + { IFF_RUNNING, LX_IFF_RUNNING }, + { IFF_NOARP, LX_IFF_NOARP }, + { IFF_PROMISC, LX_IFF_PROMISC }, + { IFF_ALLMULTI, LX_IFF_ALLMULTI }, + { IFF_MULTICAST, LX_IFF_MULTICAST }, + { 0 } + }; + + /* + * illumos interfaces that contain a ':' are non-zero logical + * interfaces. We should only emit the name of the zeroth logical + * interface, since RTM_GETLINK only expects to see the name of + * devices. The addresses of all logical devices will be + * returned via an RTM_GETADDR. + */ + if (strchr(lifr->lifr_name, ':') != NULL) + return; + + /* + * Most of the lx_netlink module is architected to emit information in + * an illumos-native manner. Socket syscalls such as getsockname will + * not translate fields to values Linux programs would expect since + * that conversion is performed by the generic socket emulation. + * + * This is _not_ true of the actual protocol output from lx_netlink. + * Since translating it at the socket layer would be onerous, all + * output (including constants and names) is pre-translated to values + * valid for Linux. + */ + + bzero(&ifi, sizeof (ifi)); + ifi.lxnl_ifi_family = AF_UNSPEC; + ifi.lxnl_ifi_change = (uint32_t)-1; + + /* Convert the name to be Linux-friendly */ + (void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ); + lx_ifname_convert(if_name, LX_IF_FROMNATIVE); + is_loopback = (strncmp(if_name, "lo", 2) == 0); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifi.lxnl_ifi_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + for (i = 0; flags[i].native; i++) { + if (lifr->lifr_flags & flags[i].native) + ifi.lxnl_ifi_flags |= flags[i].lx; + } + + /* + * Query the datalink address. + * The interface type will be included in the outgoing infomsg while + * the address itself will be output separately. + */ + sdl = (struct sockaddr_dl *)&lifr->lifr_addr; + bzero(sdl, sizeof (*sdl)); + if (!is_loopback) { + (void) lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr); + } else { + /* Simulate an empty hwaddr for loopback */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + } + lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size); + + ifi.lxnl_ifi_type = hwaddr.sa_family; + lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t)); + + lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0) + return; + + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu); + + if (hwaddr_size != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS, + hwaddr.sa_data, hwaddr_size); + } + + /* Emulate a txqlen of 1. 
(0 for loopbacks) */ + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN, + (is_loopback) ? 0 : 1); + + lx_netlink_reply_send(reply); +} + +static void +lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply, + void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct) +{ + lx_netlink_sock_t *sock = reply->lxnr_sock; + int nlifr, i; + + struct { + int family; + ldi_handle_t handle; + struct lifconf lifc; + struct lifnum lifn; + } families[] = { + { AF_INET, sock->lxns_iphandle }, + { AF_INET6, sock->lxns_ip6handle }, + { AF_UNSPEC } + }, *family, *check; + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + struct lifnum *lifn = &family->lifn; + + lifn->lifn_family = family->family; + sock->lxns_current = family->handle; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0) + break; + + lifc->lifc_family = lifn->lifn_family; + lifc->lifc_flags = 0; + lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq); + if (lifn->lifn_count == 0) { + lifc->lifc_buf = NULL; + continue; + } + lifc->lifc_buf = kmem_alloc(lifc->lifc_len, KM_SLEEP); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0) + break; + + nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]); + + for (i = 0; i < nlifr; i++) { + if (!distinct) { + func(reply, &lifc->lifc_req[i]); + continue; + } + + /* + * If we have been asked to provide each interface + * exactly once, we need to (annoyingly) check this + * name against others that we've already processed for + * other families. Yes, this is quadratic time -- but + * the number of interfaces per family is expected to + * be very small. + */ + for (check = families; check != family; check++) { + struct lifconf *clifc = &check->lifc; + int cnlifr = clifc->lifc_len / + sizeof (clifc->lifc_req[0]), j; + char *nm = lifc->lifc_req[i].lifr_name, *cnm; + + for (j = 0; j < cnlifr; j++) { + cnm = clifc->lifc_req[j].lifr_name; + + if (strcmp(nm, cnm) == 0) + break; + } + + if (j != cnlifr) + break; + } + + if (check != family) + continue; + + func(reply, &lifc->lifc_req[i]); + } + } + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + + if (lifc->lifc_buf != NULL) + kmem_free(lifc->lifc_buf, lifc->lifc_len); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE); + lx_netlink_reply_done(reply); + + return (0); +} + +static void +lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifaddrmsg_t ifa; + + bzero(&ifa, sizeof (ifa)); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifa.lxnl_ifa_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + /* + * Don't report on-link subnets + */ + if ((lifr->lifr_flags & IFF_NOLOCAL) != 0) + return; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0) + return; + + ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0) + return; + + if (lifr->lifr_addr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + ifa.lxnl_ifa_family = LX_AF_INET; + + sin = (struct sockaddr_in *)&lifr->lifr_addr; + ifa.lxnl_ifa_scope = lx_ipv4_rtscope( + 
sin->sin_addr.s_addr); + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + lx_netlink_reply_attr_int32(reply, + LX_NETLINK_IFA_ADDRESS, sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin; + + ifa.lxnl_ifa_family = LX_AF_INET6; + + sin = (struct sockaddr_in6 *)&lifr->lifr_addr; + ifa.lxnl_ifa_scope = lx_ipv6_rtscope(&sin->sin6_addr); + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS, + &sin->sin6_addr, sizeof (sin->sin6_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE); + lx_netlink_reply_done(reply); + + return (0); +} + +struct lx_getroute_ctx { + lx_netlink_reply_t *lgrtctx_reply; + lx_netlink_rtmsg_t *lgrtctx_rtmsg; + lx_netlink_attr_t *lgrtctx_attrs[LX_NETLINK_MAX_RTA]; + unsigned int lgrtctx_max_attr; + lx_netlink_attr_t *lgrtctx_rtadst; +}; + +static void +lx_netlink_getroute_ipv4(ire_t *ire, struct lx_getroute_ctx *ctx) +{ + lx_netlink_reply_t *reply = ctx->lgrtctx_reply; + lx_netlink_rtmsg_t *rtmsg = ctx->lgrtctx_rtmsg; + lx_netlink_attr_t *rtadst = ctx->lgrtctx_rtadst; + lx_netlink_rtmsg_t res; + ill_t *ill = NULL; + + /* Certain IREs are too specific for netlink */ + if ((ire->ire_type & (IRE_BROADCAST | IRE_MULTICAST | IRE_NOROUTE | + IRE_LOOPBACK | IRE_LOCAL)) != 0 || ire->ire_testhidden != 0) { + return; + } + /* + * When listing routes, CLONE entries are undesired. + * They are required for 'ip route get' on a local address. + */ + if (rtmsg->rtm_dst_len == 0 && (ire->ire_type & IRE_IF_CLONE) != 0) { + return; + } + + bzero(&res, sizeof (res)); + res.rtm_family = LX_AF_INET; + res.rtm_table = LX_ROUTE_TABLE_MAIN; + res.rtm_type = LX_RTN_UNICAST; + res.rtm_dst_len = ire->ire_masklen; + + if (ire->ire_type & (IRE_IF_NORESOLVER|IRE_IF_RESOLVER)) { + /* Interface-local networks considered kernel-created */ + res.rtm_protocol = LX_RTPROT_KERNEL; + res.rtm_scope = LX_RTSCOPE_LINK; + } else if (ire->ire_flags & RTF_STATIC) { + res.rtm_protocol = LX_RTPROT_STATIC; + } + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* + * SpecifY single-destination route. + * RTA_DST details will be added later + */ + res.rtm_dst_len = rtmsg->rtm_dst_len; + } + + + lx_netlink_reply_msg(reply, &res, sizeof (res)); + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* Add RTA_DST details for single-destination route. 
*/ + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + LXATTR_PAYLOAD(rtadst), sizeof (ipaddr_t)); + } else if (ire->ire_masklen != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + &ire->ire_addr, sizeof (ire->ire_addr)); + } + + if (ire->ire_ill != NULL) { + ill = ire->ire_ill; + } else if (ire->ire_dep_parent != NULL) { + ill = ire->ire_dep_parent->ire_ill; + } + + if (ill != NULL) { + uint32_t ifindex, addr_src; + + ifindex = ill->ill_phyint->phyint_ifindex; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_OIF, + &ifindex, sizeof (ifindex)); + + addr_src = ill->ill_ipif->ipif_lcl_addr; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_PREFSRC, + &addr_src, sizeof (addr_src)); + } + + if (ire->ire_flags & RTF_GATEWAY) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_GATEWAY, + &ire->ire_gateway_addr, sizeof (ire->ire_gateway_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getroute(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, + mblk_t *mp) +{ + struct lx_getroute_ctx ctx; + lx_netlink_reply_t *reply; + lx_netlink_rtmsg_t rtmsg, *rtmsgp; + int rtmsg_size = sizeof (rtmsg); + netstack_t *ns; + int i; + + bzero(&ctx, sizeof (ctx)); + ctx.lgrtctx_max_attr = LX_NETLINK_MAX_RTA; + + if (lx_netlink_parse_msg_attrs(mp, (void **)&rtmsgp, + rtmsg_size, ctx.lgrtctx_attrs, &ctx.lgrtctx_max_attr) != 0) { + return (EPROTO); + } + + /* + * Older version of libnetlink send a truncated rtmsg struct for + * certain RTM_GETROUTE queries. We must detect this condition and + * truncate our input to prevent later confusion. + */ + if (curproc->p_zone->zone_brand == &lx_brand && + lx_kern_release_cmp(curproc->p_zone, "2.6.32") <= 0 && + rtmsgp->rtm_dst_len == 0) { + rtmsg_size = sizeof (rtmsg.rtm_family); + } + bzero(&rtmsg, sizeof (rtmsg)); + bcopy(rtmsgp, &rtmsg, rtmsg_size); + ctx.lgrtctx_rtmsg = &rtmsg; + + /* If RTA_DST was passed, it effects later decisions */ + for (i = 0; i < ctx.lgrtctx_max_attr; i++) { + lx_netlink_attr_t *attr = ctx.lgrtctx_attrs[i]; + + if (attr->lxna_type == LX_NETLINK_RTA_DST && + attr->lxna_len == LXATTR_LEN(sizeof (ipaddr_t))) { + ctx.lgrtctx_rtadst = attr; + break; + } + } + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWROUTE); + if (reply == NULL) { + return (ENOMEM); + } + ctx.lgrtctx_reply = reply; + + /* Do not report anything outside the main table */ + if (rtmsg.rtm_table != LX_ROUTE_TABLE_MAIN && + rtmsg.rtm_table != 0) { + lx_netlink_reply_done(reply); + return (0); + } + + ns = netstack_get_current(); + if (ns == NULL) { + lx_netlink_reply_done(reply); + return (0); + } + if (rtmsg.rtm_family == LX_AF_INET || rtmsg.rtm_family == 0) { + if (rtmsg.rtm_dst_len == 0x20 && ctx.lgrtctx_rtadst != NULL) { + /* resolve route for host */ + ipaddr_t *dst = LXATTR_PAYLOAD(ctx.lgrtctx_rtadst); + ire_t *ire_dst; + + ire_dst = ire_route_recursive_dstonly_v4(*dst, 0, 0, + ns->netstack_ip); + lx_netlink_getroute_ipv4(ire_dst, &ctx); + ire_refrele(ire_dst); + } else { + /* get route listing */ + ire_walk_v4(&lx_netlink_getroute_ipv4, &ctx, ALL_ZONES, + ns->netstack_ip); + } + } + if (rtmsg.rtm_family == LX_AF_INET6) { + /* punt on ipv6 for now */ + netstack_rele(ns); + lx_netlink_reply_done(reply); + return (EPROTO); + } + netstack_rele(ns); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Auditing callback to emit response. 
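The single-destination branch handled above corresponds to the request that "ip route get" issues: an rtmsg with a 32-bit destination prefix plus an RTA_DST attribute. A hedged sketch of how such a request is laid out on the Linux side, using the standard rtnetlink macros (the helper name is made up and the code is illustrative only):

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/* Build an RTM_GETROUTE request for a single host destination. */
static size_t
build_route_get(void *buf, size_t buflen, const char *dst)
{
	struct nlmsghdr *nh = buf;
	struct rtmsg *rtm;
	struct rtattr *rta;
	struct in_addr addr;

	memset(buf, 0, buflen);
	(void) inet_pton(AF_INET, dst, &addr);

	nh->nlmsg_type = RTM_GETROUTE;
	nh->nlmsg_flags = NLM_F_REQUEST;
	nh->nlmsg_len = NLMSG_LENGTH(sizeof (struct rtmsg));

	rtm = NLMSG_DATA(nh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = 32;	/* host route: selects the RTA_DST path */

	/* Append the RTA_DST attribute carrying the IPv4 address. */
	rta = (struct rtattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof (addr));
	memcpy(RTA_DATA(rta), &addr, sizeof (addr));
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_LENGTH(sizeof (addr));

	return (nh->nlmsg_len);
}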
+ */ +static void +lx_netlink_au_cb(void *r, void *b, uint_t blen) +{ + lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r; + + lx_netlink_reply_msg(reply, b, blen); +} + +/* + * Audit get + */ +static int +lx_netlink_au_get(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET); + if (reply == NULL) + return (ENOMEM); + + lx_audit_get(reply, lx_netlink_au_cb); + lx_netlink_reply_send(reply); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Set or clear flag indicating socket is being used to communicate with the + * user-level auditd. Also update the counter which prevents this module + * from unloading while auditing is using the socket to the auditd. + */ +static void +lx_netlink_au_sock_cb(void *s, boolean_t set) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s; + + if (set) { + lxsock->lxns_flags |= LXNLF_AUDITD; + mutex_enter(&lx_netlink_lock); + lx_netlink_audit_cnt++; + mutex_exit(&lx_netlink_lock); + } else { + lxsock->lxns_flags &= ~LXNLF_AUDITD; + mutex_enter(&lx_netlink_lock); + VERIFY(lx_netlink_audit_cnt > 0); + lx_netlink_audit_cnt--; + mutex_exit(&lx_netlink_lock); + } +} + +static int +lx_netlink_au_set(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + err = lx_audit_set(lxsock, datap, datalen, lx_netlink_au_sock_cb); + if (err != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_SET); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit append rule + */ +static int +lx_netlink_au_ar(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but as we add additional field support, eventually we might need + * to handle an mblk chain for really long string data in the + * rulep->lxar_buf. + */ + if (mp->b_cont != NULL) + return (EINVAL); + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + if ((err = lx_audit_append_rule(datap, datalen)) != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_ADD_RULE); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit delete rule + */ +static int +lx_netlink_au_dr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but as we add additional field support, eventually we might need + * to handle an mblk chain for really long string data in the + * rulep->lxar_buf. + */ + if (mp->b_cont != NULL) + return (EINVAL); + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + if ((err = lx_audit_delete_rule(datap, datalen)) != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_DEL_RULE); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Auditing callback to emit rule list. 
+ */ +static void +lx_netlink_au_lr_cb(void *r, void *b0, uint_t b0_len, void *b1, uint_t b1_len) +{ + lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r; + + lx_netlink_reply_msg(reply, b0, b0_len); + lx_netlink_reply_add(reply, b1, b1_len); + lx_netlink_reply_send(reply); +} + +/* + * Audit list rules + */ +static int +lx_netlink_au_lr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_LIST_RULES); + if (reply == NULL) + return (ENOMEM); + + lx_audit_list_rules(reply, lx_netlink_au_lr_cb); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit get feature + */ +static int +lx_netlink_au_gf(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET_FEATURE); + if (reply == NULL) + return (ENOMEM); + + lx_audit_get_feature(reply, lx_netlink_au_cb); + lx_netlink_reply_send(reply); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit user message + * User messages are submitted as free-form messages which need to get sent + * back up to the auditd. This includes informative messages such as starting + * or stopping auditing. + */ +static int +lx_netlink_au_um(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + size_t datalen; + void *bp; + + bp = mp->b_rptr + sizeof (lx_netlink_hdr_t); + datalen = MBLKL(mp) - (sizeof (lx_netlink_hdr_t)); + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but eventually we might need to handle an mblk chain for a really + * long user message. + */ + if (mp->b_cont != NULL) + return (EINVAL); + + lx_audit_emit_user_msg(hdr->lxnh_type, datalen, bp); + + if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) { + reply = lx_netlink_reply(lxsock, hdr, hdr->lxnh_type); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + } + return (0); +} + +static int +lx_netlink_au_emit_cb(void *s, uint_t type, const char *msg, uint_t size) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s; + lx_netlink_hdr_t *hdr; + mblk_t *mp, *mp1; + int error; + uint32_t len; + + len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + if (msg != NULL) { + len += LXNLMSG_ALIGN(size); + if (len > lxsock->lxns_bufsize) + return (E2BIG); + } + + if ((mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + return (ENOMEM); + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = len; + hdr->lxnh_type = (msg == NULL ? LX_NETLINK_NLMSG_DONE : type); + hdr->lxnh_seq = 0; + hdr->lxnh_pid = 0; + + mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + if (msg != NULL) { + bcopy(msg, mp->b_wptr, size); + mp->b_wptr += LXNLMSG_ALIGN(size); + } + + /* As in lx_netlink_reply_sendup, send as T_UNITDATA_IND message. */ + if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) { + freeb(mp); + return (ENOMEM); + } + mp1->b_cont = mp; + + /* + * If the socket is currently flow-controlled, do not allow further + * data to be sent out. Messages of the NLMSG_DONE type, triggered by + * passing msg == NULL, are excempt from this restriction. 
+ */ + mutex_enter(&lxsock->lxns_flowctl_mtx); + if (lxsock->lxns_flowctrled && msg != NULL) { + mutex_exit(&lxsock->lxns_flowctl_mtx); + freemsg(mp1); + return (ENOSPC); + } + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1, + msgdsize(mp1), 0, &error, NULL); + + /* + * The socket indicated that it is now flow-controlled. That said, it + * still queued the last message, so indicated success (but track the + * flow-controlled state). + */ + if (error == ENOSPC) { + lxsock->lxns_flowctrled = B_TRUE; + lx_netlink_flowctrld++; + error = 0; + } + mutex_exit(&lxsock->lxns_flowctl_mtx); + + return (error); +} + +static int +lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * This is paranoia, in case our socket somehow escaped the zone. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (ECONNREFUSED); + + if (MBLKL(mp) < sizeof (lx_netlink_hdr_t)) + return (EINVAL); + + /* + * Ensure audit state is setup whenever we get an audit control msg. + * However, we skip initialization for user messages since some apps + * (e.g. systemd) blindly send audit messages, even though auditing + * is not installed or in use. Uninitialized state is handled in + * lx_audit_user_msg(). + */ + if (hdr->lxnh_type < LX_AUDIT_USER_MSG_START) + lx_audit_init(lx_netlink_au_emit_cb); + + /* + * Within Linux, when a netlink message requests an ack, the code + * first sends the ack as an error response (NLMSG_ERROR) with an + * error code of 0. + * + * TODO: this needs more work, but is unnecessary for now. + * if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) { + * reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_NLMSG_ERROR); + * if (reply == NULL) + * return (ENOMEM); + * lx_netlink_reply_ack(reply); + * } + */ + + if (hdr->lxnh_type >= LX_AUDIT_USER_MSG_START) { + return (lx_netlink_au_um(lxsock, hdr, mp)); + } + + switch (hdr->lxnh_type) { + case LX_AUDIT_GET: + return (lx_netlink_au_get(lxsock, hdr)); + case LX_AUDIT_SET: + return (lx_netlink_au_set(lxsock, hdr, mp)); + case LX_AUDIT_ADD_RULE: + return (lx_netlink_au_ar(lxsock, hdr, mp)); + case LX_AUDIT_DEL_RULE: + return (lx_netlink_au_dr(lxsock, hdr, mp)); + case LX_AUDIT_LIST_RULES: + return (lx_netlink_au_lr(lxsock, hdr)); + case LX_AUDIT_GET_FEATURE: + return (lx_netlink_au_gf(lxsock, hdr)); + } + + /* + * For all other auditing messages (i.e. one we don't yet support), we + * return ECONNREFUSED. + */ + return (ECONNREFUSED); +} + +/*ARGSUSED*/ +static int +lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For udev, we just silently accept all writes and never actually + * reply with anything -- which appears to be sufficient for things + * to work. 
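The acknowledgment convention referenced in the TODO above can be pictured with the sketch below: Linux answers a request carrying NLM_F_ACK with an NLMSG_ERROR message whose error field is zero and which echoes the original header. The helper name is hypothetical and this is a layout illustration only, not code present in this module.

#include <stddef.h>
#include <linux/netlink.h>

static size_t
build_netlink_ack(void *buf, const struct nlmsghdr *req)
{
	struct nlmsghdr *nh = buf;
	struct nlmsgerr *err = NLMSG_DATA(nh);

	nh->nlmsg_type = NLMSG_ERROR;
	nh->nlmsg_flags = 0;
	nh->nlmsg_seq = req->nlmsg_seq;
	nh->nlmsg_pid = req->nlmsg_pid;
	nh->nlmsg_len = NLMSG_LENGTH(sizeof (struct nlmsgerr));

	err->error = 0;		/* 0 means ACK; a negative errno otherwise */
	err->msg = *req;	/* echo the request header */

	return (nh->nlmsg_len);
}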
+ */ + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp, + struct nmsghdr *msg, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + int i, rval; + + static struct { + int proto; + uint16_t type; + int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *); + } handlers[] = { + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETLINK, lx_netlink_getlink }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETROUTE, lx_netlink_getroute }, + { LX_NETLINK_AUDIT, + LX_NETLINK_NLMSG_NONE, lx_netlink_audit }, + { LX_NETLINK_KOBJECT_UEVENT, + LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent }, + { LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL } + }; + + if (msg->msg_name != NULL) { + lx_netlink_sockaddr_t *lxsa = + (lx_netlink_sockaddr_t *)msg->msg_name; + + if (msg->msg_namelen != sizeof (lx_netlink_sockaddr_t) || + lxsa->lxnl_family != AF_LX_NETLINK) { + return (EINVAL); + } + + /* + * If this message is targeted beyond just the OS kernel, an + * access check must be made. + */ + if (lxsa->lxnl_port != 0 || lxsa->lxnl_groups != 0) { + int err; + char buf[LX_UNSUP_BUFSZ]; + + err = lx_netlink_access(lxsock, cr, LXNL_SENDMSG); + if (err != 0) { + return (err); + } + + /* + * Support for netlink messages beyond rtnetlink(7) is + * non-existent at this time. These messages are + * tolerated, rather than tossing a potentially fatal + * error to the application. + */ + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "netlink sendmsg addr port:%X groups:%08X", + lxsa->lxnl_port, lxsa->lxnl_groups); + lx_unsupported(buf); + } + } + + if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) { + freemsg(mp); + return (EPROTO); + } + + for (i = 0; handlers[i].func != NULL; i++) { + if (lxsock->lxns_proto != handlers[i].proto) + continue; + + if (handlers[i].type != LX_NETLINK_NLMSG_NONE && + hdr->lxnh_type != handlers[i].type) + continue; + + rval = handlers[i].func(lxsock, hdr, mp); + freemsg(mp); + + return (rval); + } + + /* + * An unrecognized message. We will bounce up an EOPNOTSUPP reply. 
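For reference, the addressing accepted without complaint by the checks above is what an ordinary Linux client supplies when it targets the kernel: nl_pid and nl_groups both zero. A minimal, hypothetical helper showing that setup with the standard Linux socket API (not part of this module):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

static int
open_rtnetlink(void)
{
	struct sockaddr_nl snl;
	int fd;

	if ((fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) < 0)
		return (-1);

	memset(&snl, 0, sizeof (snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_pid = 0;		/* kernel assigns our port id */
	snl.nl_groups = 0;	/* no multicast group subscriptions */

	if (bind(fd, (struct sockaddr *)&snl, sizeof (snl)) < 0) {
		(void) close(fd);
		return (-1);
	}
	return (fd);
}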
+ */ + rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP); + freemsg(mp); + + return (rval); +} + +static void +lx_netlink_clr_flowctrl(sock_lower_handle_t handle) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + + mutex_enter(&lxsock->lxns_flowctl_mtx); + lxsock->lxns_flowctrled = B_FALSE; + mutex_exit(&lxsock->lxns_flowctl_mtx); +} + +/*ARGSUSED*/ +static int +lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev; + + if (lxsock->lxns_flags & LXNLF_AUDITD) + lx_audit_stop_worker(lxsock, lx_netlink_au_sock_cb); + + mutex_enter(&lx_netlink_lock); + + prev = &lx_netlink_head; + + for (sock = *prev; sock != lxsock; sock = sock->lxns_next) + prev = &sock->lxns_next; + + *prev = sock->lxns_next; + + mutex_exit(&lx_netlink_lock); + + (void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred); + (void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred); + mutex_destroy(&lxsock->lxns_flowctl_mtx); + kmem_free(lxsock, sizeof (lx_netlink_sock_t)); + + return (0); +} + +static sock_downcalls_t sock_lx_netlink_downcalls = { + lx_netlink_activate, /* sd_activate */ + sock_accept_notsupp, /* sd_accept */ + lx_netlink_bind, /* sd_bind */ + sock_listen_notsupp, /* sd_listen */ + sock_connect_notsupp, /* sd_connect */ + sock_getpeername_notsupp, /* sd_getpeername */ + lx_netlink_getsockname, /* sd_getsockname */ + lx_netlink_getsockopt, /* sd_getsockopt */ + lx_netlink_setsockopt, /* sd_setsockopt */ + lx_netlink_send, /* sd_send */ + NULL, /* sd_send_uio */ + NULL, /* sd_recv_uio */ + NULL, /* sd_poll */ + sock_shutdown_notsupp, /* sd_shutdown */ + lx_netlink_clr_flowctrl, /* sd_clr_flowctrl */ + sock_ioctl_notsupp, /* sd_ioctl */ + lx_netlink_close /* sd_close */ +}; + +/*ARGSUSED*/ +static sock_lower_handle_t +lx_netlink_create(int family, int type, int proto, + sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, + int flags, cred_t *credp) +{ + lx_netlink_sock_t *lxsock; + ldi_handle_t handle, handle6; + cred_t *kcred = zone_kcred(); + int err; + + if (family != AF_LX_NETLINK || + (type != SOCK_DGRAM && type != SOCK_RAW)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + switch (proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: + case LX_NETLINK_KOBJECT_UEVENT: + break; + + default: + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred, + &handle, lx_netlink_ldi)) != 0) { + *errorp = err; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred, + &handle6, lx_netlink_ldi)) != 0) { + (void) ldi_close(handle, FREAD, kcred); + *errorp = err; + return (NULL); + } + + *sock_downcalls = &sock_lx_netlink_downcalls; + *smodep = SM_ATOMIC; + + lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP); + lxsock->lxns_iphandle = handle; + lxsock->lxns_ip6handle = handle6; + lxsock->lxns_bufsize = lx_netlink_bufsize; + lxsock->lxns_proto = proto; + mutex_init(&lxsock->lxns_flowctl_mtx, NULL, MUTEX_DEFAULT, NULL); + + mutex_enter(&lx_netlink_lock); + + lxsock->lxns_next = lx_netlink_head; + lx_netlink_head = lxsock; + + mutex_exit(&lx_netlink_lock); + + return ((sock_lower_handle_t)lxsock); +} + +static void +lx_netlink_init(void) +{ + major_t major = mod_name_to_major("ip"); + int err; + + VERIFY(major != DDI_MAJOR_T_NONE); + + err = ldi_ident_from_major(major, &lx_netlink_ldi); + VERIFY(err == 0); +} + +static void +lx_netlink_fini(void) +{ + ldi_ident_release(lx_netlink_ldi); +} + +static smod_reg_t sinfo = { + 
SOCKMOD_VERSION, + "lx_netlink", + SOCK_UC_VERSION, + SOCK_DC_VERSION, + lx_netlink_create, + NULL +}; + +/* modldrv structure */ +static struct modlsockmod sockmod = { + &mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo +}; + +/* modlinkage structure */ +static struct modlinkage ml = { + MODREV_1, + &sockmod, + NULL +}; + +int +_init(void) +{ + int err; + + lx_netlink_init(); + + if ((err = mod_install(&ml)) != 0) + lx_netlink_fini(); + + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&ml, modinfop)); +} + +int +_fini(void) +{ + int err = 0; + + mutex_enter(&lx_netlink_lock); + + if (lx_netlink_head != NULL || lx_netlink_audit_cnt != 0) + err = EBUSY; + + mutex_exit(&lx_netlink_lock); + + if (err == 0) { + lx_audit_cleanup(); + if ((err = mod_remove(&ml)) == 0) + lx_netlink_fini(); + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c new file mode 100644 index 0000000000..23e0c6f459 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c @@ -0,0 +1,1188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + + +/* + * This driver attempts to emulate some of the the behaviors of + * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris + * + * It does this by layering over the /dev/ptmx device and intercepting + * opens to it. + * + * This driver makes the following assumptions about the way the ptm/pts + * drivers on Solaris work: + * + * - all opens of the /dev/ptmx device node return a unique dev_t. + * + * - the dev_t minor node value for each open ptm instance corrospondes + * to it's associated slave terminal device number. ie. the path to + * the slave terminal device associated with an open ptm instance + * who's dev_t minor node vaue is 5, is /dev/pts/5. + * + * - the ptm driver always allocates the lowest numbered slave terminal + * device possible. 
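These assumptions mirror the way terminal allocation normally proceeds from user space. A small illustrative program, using only standard POSIX pty calls and not part of this driver, that exercises the master/slave pairing described above:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>

int
main(void)
{
	int mfd;
	char *sname;

	/* Open the cloning master; each open yields a new pty pair. */
	if ((mfd = posix_openpt(O_RDWR | O_NOCTTY)) < 0)
		return (1);

	/* Prepare the slave side and look up its /dev/pts/N name. */
	if (grantpt(mfd) != 0 || unlockpt(mfd) != 0)
		return (1);
	if ((sname = ptsname(mfd)) == NULL)
		return (1);

	(void) printf("slave is %s\n", sname);
	return (0);
}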
+ */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/devops.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/kstr.h> +#include <sys/lx_ptm.h> +#include <sys/modctl.h> +#include <sys/pathname.h> +#include <sys/ptms.h> +#include <sys/ptyvar.h> +#include <sys/stat.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/sdt.h> + +#define LP_PTM_PATH "/dev/ptmx" +#define LP_PTS_PATH "/dev/pts/" +#define LP_PTS_DRV_NAME "pts" +#define LP_PTS_USEC_DELAY (5 * 1000) /* 5 ms */ +#define LP_PTS_USEC_DELAY_MAX (5 * MILLISEC) /* 5 ms */ + +/* + * this driver is layered on top of the ptm driver. we'd like to + * make this drivers minor name space a mirror of the ptm drivers + * namespace, but we can't actually do this. the reason is that the + * ptm driver is opened via the clone driver. there for no minor nodes + * of the ptm driver are actually accessible via the filesystem. + * since we're not a streams device we can't be opened by the clone + * driver. there for we need to have at least minor node accessible + * via the filesystem so that consumers can open it. we use the device + * node with a minor number of 0 for this purpose. what this means is + * that minor node 0 can't be used to map ptm minor node 0. since this + * minor node is now reserved we need to shift our ptm minor node + * mappings by one. ie. a ptm minor node with a value of 0 will + * corrospond to our minor node with a value of 1. these mappings are + * managed with the following macros. + */ +#define DEVT_TO_INDEX(x) LX_PTM_DEV_TO_PTS(x) +#define INDEX_TO_MINOR(x) ((x) + 1) + +/* + * grow our layered handle array by the same size increment that the ptm + * driver uses to grow the pty device space - PTY_MAXDELTA + */ +#define LP_PTY_INC 128 + +/* + * lx_ptm_ops contains state information about outstanding operations on the + * underlying master terminal device. Currently we only track information + * for read operations. + * + * Note that this data has not been rolled directly into the lx_ptm_handle + * structure because we can't put mutex's of condition variables into + * lx_ptm_handle structure. The reason is that the array of lx_ptm_handle + * structures linked to from the global lx_ptm state can be resized + * dynamically, and when it's resized, the new array is at a different + * memory location and the old array memory is discarded. Mutexs and cvs + * are accessed based off their address, so if this array was re-sized while + * there were outstanding operations on any mutexs or cvs in the array + * then the system would tip over. In the future the lx_ptm_handle structure + * array should probably be replaced with either an array of pointers to + * lx_ptm_handle structures or some other kind of data structure containing + * pointers to lx_ptm_handle structures. Then the lx_ptm_ops structure + * could be folded directly into the lx_ptm_handle structures. (This will + * also require the definition of a new locking mechanism to protect the + * contents of lx_ptm_handle structures.) + */ +typedef struct lx_ptm_ops { + int lpo_rops; + kcondvar_t lpo_rops_cv; + kmutex_t lpo_rops_lock; +} lx_ptm_ops_t; + +/* + * Every open of the master terminal device in a zone results in a new + * lx_ptm_handle handle allocation. These handles are stored in an array + * hanging off the lx_ptm_state structure. + */ +typedef struct lx_ptm_handle { + /* Device handle to the underlying real /dev/ptmx master terminal. 
*/ + ldi_handle_t lph_handle; + + /* Flag to indicate if TIOCPKT mode has been enabled. */ + int lph_pktio; + + /* Number of times the slave device has been opened/closed. */ + int lph_eofed; + + /* Callback handler in the ptm driver to check if slave is open. */ + ptmptsopencb_t lph_ppocb; + + /* Pointer to state for operations on underlying device. */ + lx_ptm_ops_t *lph_lpo; +} lx_ptm_handle_t; + +/* + * Global state for the lx_ptm driver. + */ +typedef struct lx_ptm_state { + /* lx_ptm device devinfo pointer */ + dev_info_t *lps_dip; + + /* LDI ident used to open underlying real /dev/ptmx master terminals. */ + ldi_ident_t lps_li; + + /* pts drivers major number */ + major_t lps_pts_major; + + /* rw lock used to manage access and growth of lps_lh_array */ + krwlock_t lps_lh_rwlock; + + /* number of elements in lps_lh_array */ + uint_t lps_lh_count; + + /* Array of handles to underlying real /dev/ptmx master terminals. */ + lx_ptm_handle_t *lps_lh_array; +} lx_ptm_state_t; + +/* Pointer to the lx_ptm global state structure. */ +static lx_ptm_state_t lps; + +/* + * List of modules to be autopushed onto slave terminal devices when they + * are opened in an lx branded zone. + */ +static char *lx_pts_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +static void +lx_ptm_lh_grow(uint_t index) +{ + uint_t new_lh_count, old_lh_count; + lx_ptm_handle_t *new_lh_array, *old_lh_array; + + /* + * allocate a new array. we drop the rw lock on the array so that + * readers can still access devices in case our memory allocation + * blocks. + */ + new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1); + new_lh_array = + kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP); + + /* + * double check that we still actually need to increase the size + * of the array + */ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + if (index < lps.lps_lh_count) { + /* someone beat us to it so there's nothing more to do */ + rw_exit(&lps.lps_lh_rwlock); + kmem_free(new_lh_array, + sizeof (lx_ptm_handle_t) * new_lh_count); + return; + } + + /* copy the existing data into the new array */ + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_count != 0) { + bcopy(lps.lps_lh_array, new_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + } + + /* save info on the old array */ + old_lh_array = lps.lps_lh_array; + old_lh_count = lps.lps_lh_count; + + /* install the new array */ + lps.lps_lh_array = new_lh_array; + lps.lps_lh_count = new_lh_count; + + rw_exit(&lps.lps_lh_rwlock); + + /* free the old array */ + if (old_lh_array != NULL) { + kmem_free(old_lh_array, + sizeof (lx_ptm_handle_t) * old_lh_count); + } +} + +static void +lx_ptm_lh_insert(uint_t index, ldi_handle_t lh) +{ + lx_ptm_ops_t *lpo; + + ASSERT(lh != NULL); + + /* Allocate and initialize the ops structure */ + lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP); + mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + /* check if we need to grow the size of the layered handle array */ + if (index >= lps.lps_lh_count) { + rw_exit(&lps.lps_lh_rwlock); + lx_ptm_lh_grow(index); + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + } + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle == NULL); + ASSERT(lps.lps_lh_array[index].lph_pktio == 0); + ASSERT(lps.lps_lh_array[index].lph_eofed == 0); + 
ASSERT(lps.lps_lh_array[index].lph_lpo == NULL); + + /* insert the new handle and return */ + lps.lps_lh_array[index].lph_handle = lh; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + lps.lps_lh_array[index].lph_lpo = lpo; + + rw_exit(&lps.lps_lh_rwlock); +} + +static ldi_handle_t +lx_ptm_lh_remove(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0); + ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock)); + + /* free the write handle */ + kmem_free(lps.lps_lh_array[index].lph_lpo, sizeof (lx_ptm_ops_t)); + lps.lps_lh_array[index].lph_lpo = NULL; + + /* remove the handle and return it */ + lh = lps.lps_lh_array[index].lph_handle; + lps.lps_lh_array[index].lph_handle = NULL; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +static void +lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + *ppocb = lps.lps_lh_array[index].lph_ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static void +lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + lps.lps_lh_array[index].lph_ppocb = *ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static ldi_handle_t +lx_ptm_lh_lookup(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the handle */ + lh = lps.lps_lh_array[index].lph_handle; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +static lx_ptm_ops_t * +lx_ptm_lpo_lookup(uint_t index) +{ + lx_ptm_ops_t *lpo; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_lpo != NULL); + + /* return the handle */ + lpo = lps.lps_lh_array[index].lph_lpo; + rw_exit(&lps.lps_lh_rwlock); + return (lpo); +} + +static int +lx_ptm_lh_pktio_get(uint_t index) +{ + int pktio; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the pktio state */ + pktio = lps.lps_lh_array[index].lph_pktio; + rw_exit(&lps.lps_lh_rwlock); + return (pktio); +} + +static void +lx_ptm_lh_pktio_set(uint_t index, int pktio) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the pktio state */ + lps.lps_lh_array[index].lph_pktio = pktio; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_lh_eofed_get(uint_t index) +{ + int eofed; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the eofed state */ + eofed = lps.lps_lh_array[index].lph_eofed; + rw_exit(&lps.lps_lh_rwlock); + return (eofed); +} + +static void +lx_ptm_lh_eofed_set(uint_t index) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the eofed state */ + lps.lps_lh_array[index].lph_eofed++; + 
rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_read_start(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* Wait for other read operations to finish */ + while (lpo->lpo_rops != 0) { + if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) { + mutex_exit(&lpo->lpo_rops_lock); + return (-1); + } + } + + /* Start a read operation */ + VERIFY(++lpo->lpo_rops == 1); + mutex_exit(&lpo->lpo_rops_lock); + return (0); +} + +static void +lx_ptm_read_end(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* End a read operation */ + VERIFY(--lpo->lpo_rops == 0); + cv_signal(&lpo->lpo_rops_cv); + + mutex_exit(&lpo->lpo_rops_lock); +} + +static int +lx_ptm_pts_isopen(dev_t dev) +{ + ptmptsopencb_t ppocb; + + lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb); + return (ppocb.ppocb_func(ppocb.ppocb_arg)); +} + +static void +lx_ptm_eof_read(ldi_handle_t lh) +{ + struct uio uio; + iovec_t iov; + char junk[1]; + + /* + * We can remove any EOF message from the head of the stream by + * doing a zero byte read from the stream. + */ + iov.iov_len = 0; + iov.iov_base = junk; + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_resid = iov.iov_len; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + (void) ldi_read(lh, &uio, kcred); +} + +static int +lx_ptm_eof_drop_1(dev_t dev, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, msg_size, msg_count; + + *rvalp = 0; + + /* + * Check if there is an EOF message (represented by a zero length + * data message) at the head of the stream. Note that the + * I_NREAD ioctl is a streams framework ioctl so it will succeed + * even if there have been previous write errors on this stream. + */ + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + + if ((msg_count == 0) || (msg_size != 0)) { + /* No EOF message found */ + return (0); + } + + /* Record the fact that the slave device has been closed. 
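The same detect-and-consume sequence can be written against a user-level STREAMS descriptor on illumos. The helper below is a hypothetical illustration of the I_NREAD plus zero-byte read trick used here, not code from this driver: I_NREAD reports the byte count of the first queued message through its argument and the number of queued messages through its return value, so a queued zero-length message is an EOF, and a zero-byte read() consumes exactly that message.

#include <unistd.h>
#include <stropts.h>
#include <sys/ioctl.h>

static int
drop_leading_eof(int fd)
{
	int size, count;
	char junk[1];

	if ((count = ioctl(fd, I_NREAD, &size)) < 0)
		return (-1);
	if (count == 0 || size != 0)
		return (0);		/* nothing queued, or real data */

	(void) read(fd, junk, 0);	/* consume the zero-length message */
	return (1);
}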
*/ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + + /* drop the EOF */ + lx_ptm_eof_read(lh); + *rvalp = 1; + return (0); +} + +static int +lx_ptm_eof_drop(dev_t dev, int *rvalp) +{ + int rval, err; + + if (rvalp != NULL) + *rvalp = 0; + for (;;) { + if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0) + return (err); + if (rval == 0) + return (0); + if (rvalp != NULL) + *rvalp = 1; + } +} + +static int +lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + *rvalp = 0; + if (ignore_eof) { + int size, rval; + + if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size, + FKIOCTL, kcred, &rval)) != 0) + return (err); + if (size != 0) + *rvalp = 1; + } else { + int msg_size, msg_count; + + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + if (msg_count != 0) + *rvalp = 1; + } + return (0); +} + +static int +lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int err; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + err = ldi_ident_from_dip(dip, &lps.lps_li); + if (err != 0) { + ddi_remove_minor_node(dip, ddi_get_name(dip)); + return (DDI_FAILURE); + } + + lps.lps_dip = dip; + lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME); + + rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL); + lps.lps_lh_count = 0; + lps.lps_lh_array = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ldi_ident_release(lps.lps_li); + lps.lps_dip = NULL; + + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_array != NULL) { + kmem_free(lps.lps_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + lps.lps_lh_array = NULL; + lps.lps_lh_count = 0; + } + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + struct strioctl iocb; + ptmptsopencb_t ppocb = { NULL, NULL }; + ldi_handle_t lh; + major_t maj, our_major = getmajor(*devp); + minor_t min, lastmin; + uint_t index, anchor = 1; + dev_t ptm_dev; + int err, rval = 0; + + /* + * Don't support the FNDELAY flag and FNONBLOCK until we either + * find a Linux app that opens /dev/ptmx with the O_NDELAY + * or O_NONBLOCK flags explicitly, or until we create test cases + * to determine how reads of master terminal devices opened with + * these flags behave in different situations on Linux. Supporting + * these flags will involve enhancing our read implementation + * and changing the way it deals with EOF notifications. + */ + if (flag & (FNDELAY | FNONBLOCK)) + return (ENOTSUP); + + /* + * we're layered on top of the ptm driver so open that driver + * first. (note that we're opening /dev/ptmx in the global + * zone, not ourselves in the lx zone.) + */ + err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li); + if (err != 0) + return (err); + + /* get the devt returned by the ptmx open */ + err = ldi_get_dev(lh, &ptm_dev); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (err); + } + + /* + * we're a cloning driver so here's where we'll change the devt that we + * return. 
the ptmx is also a cloning driver so we'll just use + * it's minor number as our minor number (it already manages it's + * minor name space so no reason to duplicate the effort.) + */ + index = getminor(ptm_dev); + *devp = makedevice(our_major, INDEX_TO_MINOR(index)); + + /* Get a callback function to query if the pts device is open. */ + iocb.ic_cmd = PTMPTSOPENCB; + iocb.ic_timout = 0; + iocb.ic_len = sizeof (ppocb); + iocb.ic_dp = (char *)&ppocb; + + err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval); + if ((err != 0) || (rval != 0)) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + ASSERT(ppocb.ppocb_func != NULL); + + /* + * now setup autopush for the terminal slave device. this is + * necessary so that when a Linux program opens the device we + * can push required strmod modules onto the stream. in Solaris + * this is normally done by the application that actually + * allocates the terminal. + */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin, + &anchor, lx_pts_mods); + if (err != 0 && err != EEXIST) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + + /* save off this layered handle for future accesses */ + lx_ptm_lh_insert(index, lh); + lx_ptm_lh_set_ppocb(index, &ppocb); + return (0); +} + +/*ARGSUSED*/ +static int +lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + ldi_handle_t lh; + major_t maj; + minor_t min, lastmin; + uint_t index; + int err; + int i; + + index = DEVT_TO_INDEX(dev); + + /* + * we must cleanup all the state associated with this major/minor + * terminal pair before actually closing the ptm master device. + * this is required because once the close of the ptm device is + * complete major/minor terminal pair is immediatly available for + * re-use in any zone. + */ + + /* free up our saved reference for this layered handle */ + lh = lx_ptm_lh_remove(index); + + /* unconfigure autopush for the associated terminal slave device */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + for (i = 0; i < 5; i++) { + /* + * we loop here because we don't want to release this ptm + * node if autopush can't be disabled on the associated + * slave device because then bad things could happen if + * another brand were to get this terminal allocated + * to them. If we keep failing we eventually drive on so that + * things don't hang. + */ + err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin, + 0, NULL); + if (err == 0) + break; + + cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush", + getzoneid(), err); + + /* wait one second and try again */ + delay(drv_usectohz(1000000)); + } + + err = ldi_close(lh, flag, credp); + + /* + * note that we don't have to bother with changing the permissions + * on the associated slave device here. the reason is that no one + * can actually open the device untill it's associated master + * device is re-opened, which will result in the permissions on + * it being reset. + */ + return (err); +} + +static int +lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, rval; + struct uio uio = *uiop; + + *loop = 0; + + /* + * Here's another way that Linux master terminals behave differently + * from Solaris master terminals. 
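By way of comparison, the work that the autopush configuration above takes over looks roughly like the following when done by hand from a native application (a hypothetical helper using standard STREAMS calls; Linux programs only ever open() the slave, so the modules must be pushed for them automatically):

#include <fcntl.h>
#include <stropts.h>

static int
setup_slave(const char *spath)
{
	int sfd;

	if ((sfd = open(spath, O_RDWR | O_NOCTTY)) < 0)
		return (-1);

	if (ioctl(sfd, I_PUSH, "ptem") < 0 ||	/* pseudo-terminal emulation */
	    ioctl(sfd, I_PUSH, "ldterm") < 0 ||	/* line discipline */
	    ioctl(sfd, I_PUSH, "ttcompat") < 0)	/* BSD compatibility */
		return (-1);

	return (sfd);
}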
If you do a read on a Linux + * master terminal (that was opened witout NDELAY and NONBLOCK) + * who's corrosponding slave terminal is currently closed and + * has been opened and closed at least once, Linux return -1 and + * set errno to EIO where as Solaris blocks. + */ + if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) { + /* Slave has been opened and closed at least once. */ + if (lx_ptm_pts_isopen(dev) == 0) { + /* + * Slave is closed. Make sure that data is avaliable + * before attempting a read. + */ + if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0) + return (err); + + /* If there is no data available then return. */ + if (rval == 0) + return (EIO); + } + } + + /* Actually do the read operation. */ + if ((err = ldi_read(lh, uiop, credp)) != 0) + return (err); + + /* If read returned actual data then return. */ + if (uio.uio_resid != uiop->uio_resid) + return (0); + + /* + * This was a zero byte read (ie, an EOF). This indicates + * that the slave terinal device has been closed. Record + * the fact that the slave device has been closed and retry + * the read operation. + */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + *loop = 1; + return (0); +} + +static int +lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev)); + int err, loop; + struct uio uio; + struct iovec iovp; + + ASSERT(uiop->uio_iovcnt > 0); + + /* + * If packet mode has been enabled (via TIOCPKT) we need to pad + * all read requests with a leading byte that indicates any + * relevant control status information. + */ + if (pktio != 0) { + /* + * We'd like to write the control information into + * the current buffer but we can't yet. We don't + * want to modify userspace memory here only to have + * the read operation fail later. So instead + * what we'll do here is read one character from the + * beginning of the memory pointed to by the uio + * structure. This will advance the output pointer + * by one. Then when the read completes successfully + * we can update the byte that we passed over. Before + * we do the read make a copy of the current uiop and + * iovec structs so we can write to them later. + */ + uio = *uiop; + iovp = *uiop->uio_iov; + uio.uio_iov = &iovp; + + if (uwritec(uiop) == -1) + return (EFAULT); + } + + do { + /* + * Before we actually attempt a read operation we need + * to make sure there's some buffer space to actually + * read in some data. We do this because if we're in + * pktio mode and the caller only requested one byte, + * then we've already used up that one byte and we + * don't want to pass this read request. Doing a 0 + * byte read (unless there is a problem with the stream + * head) always returns succcess. Normally when a streams + * read returns 0 bytes we interpret that as an EOF on + * the stream (ie, the slave side has been opened and + * closed) and we ignore it and re-try the read operation. + * So if we pass on a 0 byte read here lx_ptm_read_loop() + * will tell us to loop around and we'll end up in an + * infinite loop. + */ + if (uiop->uio_resid == 0) + break; + + /* + * Serialize all reads. We need to do this so that we can + * properly emulate the behavior of master terminals on Linux. + * In reality this serializaion should not pose any kind of + * performance problem since it would be very strange to have + * multiple threads trying to read from the same master + * terminal device concurrently. 
+ */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_read_loop(dev, uiop, credp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + + if (pktio != 0) { + uint8_t pktio_data = TIOCPKT_DATA; + + /* + * Note that the control status information we + * pass back is faked up in the sense that we + * don't actually report any events, we always + * report a status of 0. + */ + if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0) + return (EFAULT); + } + + return (0); +} + +static int +lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + err = ldi_write(lh, uiop, credp); + + return (err); +} + +static int +lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + /* + * here we need to make sure that we never allow the + * I_SETSIG and I_ESETSIG ioctls to pass through. we + * do this because we can't support them. + * + * the native Solaris ptm device supports these ioctls because + * they are streams framework ioctls and all streams devices + * support them by default. these ioctls cause the current + * process to be registered with a stream and receive signals + * when certain stream events occur. + * + * a problem arises with cleanup of these registrations + * for layered drivers. + * + * normally the streams framework is notified whenever a + * process closes any reference to a stream and it goes ahead + * and cleans up these registrations. but actual device drivers + * are not notified when a process performs a close operation + * unless the process is closing the last opened reference to + * the device on the entire system. + * + * so while we could pass these ioctls on and allow processes + * to register for signal delivery, we would never receive + * any notification when those processes exit (or close a + * stream) and we wouldn't be able to unregister them. + * + * luckily these operations are streams specific and Linux + * doesn't support streams devices. so it doesn't actually + * seem like we need to support these ioctls. if it turns + * out that we do need to support them for some reason in + * the future, the current driver model will have to be + * enhanced to better support streams device layering. + */ + if ((cmd == I_SETSIG) || (cmd == I_ESETSIG)) + return (EINVAL); + + /* + * here we fake up support for TIOCPKT. Linux applications expect + * /etc/ptmx to support this ioctl, but on Solaris it doesn't. + * (it is supported on older bsd style ptys.) so we'll fake + * up support for it here. + * + * the reason that this ioctl is emulated here instead of in + * userland is that this ioctl affects the results returned + * from read() operations. if this ioctl was emulated in + * userland the brand library would need to intercept all + * read operations and check to see if pktio was enabled + * for the fd being read from. since this ioctl only needs + * to be supported on the ptmx device it makes more sense + * to support it here where we can easily update the results + * returned for read() operations performed on ourselves. 
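From the Linux application's point of view, the emulated packet mode behaves as sketched below: enable TIOCPKT on the master and every subsequent read is prefixed with one status byte, which this emulation always reports as TIOCPKT_DATA (0). The helper is illustrative only and not part of this driver.

#include <unistd.h>
#include <sys/ioctl.h>

static ssize_t
read_with_pktmode(int mfd, char *buf, size_t len)
{
	int on = 1;
	ssize_t n;

	/* Turn on packet mode; later reads carry a leading control byte. */
	if (ioctl(mfd, TIOCPKT, &on) != 0)
		return (-1);

	if ((n = read(mfd, buf, len)) <= 0)
		return (n);

	/* buf[0] is the status byte (0 == TIOCPKT_DATA); data follows. */
	return (n);
}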
+ */ + if (cmd == TIOCPKT) { + int pktio; + + if (ddi_copyin((void *)arg, &pktio, sizeof (pktio), + mode) != DDI_SUCCESS) + return (EFAULT); + + if (pktio == 0) + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0); + else + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1); + + return (0); + } + + err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp); + + /* + * On recent versions of Linux some apps issue the following ioctls to + * the master side of the ptm before opening the slave side. Because + * our streams modules (specifically ptem) aren't autopushed until the + * slave side has been opened, these ioctls will fail. To alleviate the + * issue we simply pretend that these ioctls have succeeded. + * + * We could push our own "lx_ptem" module onto the master side of the + * stream in lx_ptm_open if we need better emulation, but that would + * require an "lx_ptem" module which duplicates most of ptem. ptem + * doesn't work properly when pushed on the master side. + */ + if (err == EINVAL && (cmd == TIOCSWINSZ || cmd == TCSETS) && + lx_ptm_pts_isopen(dev) == 0) { + /* slave side not open, assume we need to succeed */ + DTRACE_PROBE1(lx_ptm_ioctl__override, int, cmd); + return (0); + } + + return (err); +} + +static int +lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + short reventsp2; + int err, rval; + + *loop = 0; + + /* + * If the slave device has been opened and closed at least + * once and the slave device is currently closed, then poll + * always needs to returns immediatly. + */ + if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) && + (lx_ptm_pts_isopen(dev) == 0)) { + /* In this case always return POLLHUP */ + *reventsp = POLLHUP; + + /* + * Check if there really is data on the stream. + * If so set the correct return flags. + */ + if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) { + /* Something went wrong. */ + return (err); + } + if (rval != 0) + *reventsp |= (events & (POLLIN | POLLRDNORM)); + + /* + * Is the user checking for writability? Note that for ptm + * devices Linux seems to ignore the POLLWRBAND write flag. + */ + if ((events & POLLWRNORM) == 0) + return (0); + + /* + * To check if the stream is writable we have to actually + * call poll, but make sure to set anyyet to 1 to prevent + * the streams framework from setting up callbacks. + */ + if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0) + return (err); + + *reventsp |= (reventsp2 & POLLWRNORM); + } else { + int lockstate; + + /* The slave device is open, do the poll */ + if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0) + return (err); + + /* + * Drop any leading EOFs on the stream. + * + * Note that we have to use pollunlock() here to avoid + * recursive mutex enters in the poll framework. The + * reason is that if there is an EOF message on the stream + * then the act of reading from the queue to remove the + * message can cause the ptm drivers event service + * routine to be invoked, and if there is no open + * slave device then the ptm driver may generate + * error messages and put them on the stream. This + * in turn will generate a poll event and the poll + * framework will try to invoke any poll callbacks + * associated with the stream. In the process of + * doing that the poll framework will try to aquire + * locks that we are already holding. So we need to + * drop those locks here before we do our read. 
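The visible effect for a Linux program is sketched below: once the slave has been opened and closed, poll() on the master returns immediately with POLLHUP, with POLLIN added only if unread output is still queued. This is a hypothetical user-space helper, not part of this driver.

#include <poll.h>
#include <stdio.h>

static void
wait_for_master(int mfd)
{
	struct pollfd pfd;

	pfd.fd = mfd;
	pfd.events = POLLIN | POLLWRNORM;

	if (poll(&pfd, 1, -1) < 0)
		return;

	if (pfd.revents & POLLHUP)
		(void) printf("slave side closed\n");
	if (pfd.revents & POLLIN)
		(void) printf("data still readable\n");
}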
+ */ + if (pollunlock(&lockstate) != 0) { + *reventsp = POLLNVAL; + return (0); + } + err = lx_ptm_eof_drop(dev, &rval); + pollrelock(lockstate); + if (err) + return (err); + + /* If no EOF was dropped then return */ + if (rval == 0) + return (0); + + /* + * An EOF was removed from the stream. Retry the entire + * poll operation from the top because polls on the ptm + * device should behave differently now. + */ + *loop = 1; + } + return (0); +} + +static int +lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int loop, err; + + do { + /* Serialize ourself wrt read operations. */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_poll_loop(dev, + events, anyyet, reventsp, phpp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + return (0); +} + +static struct cb_ops lx_ptm_cb_ops = { + lx_ptm_open, /* open */ + lx_ptm_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lx_ptm_read, /* read */ + lx_ptm_write, /* write */ + lx_ptm_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + lx_ptm_poll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops lx_ptm_ops = { + DEVO_REV, + 0, + ddi_getinfo_1to1, + nulldev, + nulldev, + lx_ptm_attach, + lx_ptm_detach, + nodev, + &lx_ptm_cb_ops, + NULL, + NULL, + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "Linux master terminal driver", /* description of module */ + &lx_ptm_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf new file mode 100644 index 0000000000..481b4e3c74 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_ptm" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/os/lx_acct.c b/usr/src/uts/common/brand/lx/os/lx_acct.c new file mode 100644 index 0000000000..7f38a240ab --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_acct.c @@ -0,0 +1,198 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/session.h> +#include <sys/wait.h> +#include <sys/ddi.h> +#include <sys/zone.h> +#include <sys/lx_types.h> + +/* + * Based on the Linux acct(5) man page, their comp_t definition is the same + * as ours. lxac_etime is encoded as a float for v3 accounting records. + */ + +#define LX_ACCT_VERSION 3 + +/* + * Bit flags in lxac_flag. The Linux AFORK and ASU match native. The rest of + * the flags diverge. + */ +#define LX_AFORK 0x01 /* executed fork, but no exec */ +#define LX_ASU 0x02 /* used superuser privileges */ +#define LX_ACORE 0x08 /* dumped core */ +#define LX_AXSIG 0x10 /* killed by a signal */ + +typedef struct lx_acct { + char lxac_flag; + char lxac_version; + uint16_t lxac_tty; + uint32_t lxac_exitcode; + uint32_t lxac_uid; + uint32_t lxac_gid; + uint32_t lxac_pid; + uint32_t lxac_ppid; + uint32_t lxac_btime; /* seconds since the epoch */ + uint32_t lxac_etime; /* float representation of ticks */ + comp_t lxac_utime; + comp_t lxac_stime; + comp_t lxac_mem; /* kb */ + comp_t lxac_io; /* unused */ + comp_t lxac_rw; /* unused */ + comp_t lxac_minflt; + comp_t lxac_majflt; + comp_t lxac_swaps; /* unused */ + char lxac_comm[16]; +} lx_acct_t; + +/* + * Same functionality as acct_compress(). Produce a pseudo-floating point + * representation with 3 bits base-8 exponent, 13 bits fraction. + */ +static comp_t +lx_acct_compt(ulong_t t) +{ + int exp = 0, round = 0; + + while (t >= 8192) { + exp++; + round = t & 04; + t >>= 3; + } + if (round) { + t++; + if (t >= 8192) { + t >>= 3; + exp++; + } + } +#ifdef _LP64 + if (exp > 7) { + /* prevent wraparound */ + t = 8191; + exp = 7; + } +#endif + return ((exp << 13) + t); +} + +/* + * 32-bit IEEE float encoding as-per Linux. + */ +static uint32_t +lx_acct_float(int64_t t) +{ + uint32_t val, exp = 190; + + if (t == 0) + return (0); + + while (t > 0) { + t <<= 1; + exp--; + } + val = (uint32_t)(t >> 40) & 0x7fffffu; + + return (val | (exp << 23)); +} + +/* + * Write a Linux-formatted record to the accounting file. 
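To make the comp_t layout used by lx_acct_compt() above concrete, a user-space decoder for the 3-bit base-8 exponent / 13-bit mantissa format might look like the following sketch (illustrative helper only, not part of this file):

#include <stdio.h>
#include <stdint.h>

/* Expand a comp_t (3-bit base-8 exponent, 13-bit mantissa) back into ticks. */
static unsigned long
comp_t_expand(uint16_t c)
{
	unsigned long v = c & 0x1fff;		/* low 13 bits: mantissa */
	unsigned int exp = (c >> 13) & 07;	/* high 3 bits: exponent */

	while (exp-- > 0)
		v <<= 3;			/* each exponent step is a factor of 8 */
	return (v);
}

int
main(void)
{
	/* 16384 ticks encodes exactly as exponent 1, mantissa 2048. */
	(void) printf("%lu\n", comp_t_expand((1 << 13) | 2048));
	return (0);
}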
+ */ +void +lx_acct_out(vnode_t *vp, int exit_status) +{ + struct proc *p; + user_t *ua; + struct cred *cr; + dev_t d; + pid_t pid, ppid; + struct vattr va; + ssize_t resid = 0; + int err; + lx_acct_t a; + + p = curproc; + ua = PTOU(p); + cr = CRED(); + + bzero(&a, sizeof (a)); + + a.lxac_flag = ua->u_acflag & (LX_AFORK | LX_ASU); + a.lxac_version = LX_ACCT_VERSION; + d = cttydev(p); + a.lxac_tty = LX_MAKEDEVICE(getmajor(d), getminor(d)); + if (WIFEXITED(exit_status)) { + a.lxac_exitcode = WEXITSTATUS(exit_status); + } else if (WIFSIGNALED(exit_status)) { + a.lxac_flag |= LX_AXSIG; + if (WCOREDUMP(exit_status)) { + a.lxac_flag |= LX_ACORE; + } + } + a.lxac_uid = crgetruid(cr); + a.lxac_gid = crgetrgid(cr); + pid = p->p_pid; + ppid = p->p_ppid; + /* Perform pid translation ala lxpr_fixpid(). */ + if (pid == curzone->zone_proc_initpid) { + pid = 1; + ppid = 0; + } else { + if (ppid == curzone->zone_proc_initpid) { + ppid = 1; + } else if (ppid == curzone->zone_zsched->p_pid || + (p->p_flag & SZONETOP) != 0) { + ppid = 1; + } + } + a.lxac_pid = pid; + a.lxac_ppid = ppid; + a.lxac_btime = ua->u_start.tv_sec; + /* For Linux v3 accounting record, this is an encoded float. */ + a.lxac_etime = lx_acct_float(ddi_get_lbolt() - ua->u_ticks); + a.lxac_utime = lx_acct_compt(NSEC_TO_TICK(p->p_acct[LMS_USER])); + a.lxac_stime = lx_acct_compt( + NSEC_TO_TICK(p->p_acct[LMS_SYSTEM] + p->p_acct[LMS_TRAP])); + a.lxac_mem = lx_acct_compt((ulong_t)(ptob(ua->u_mem) / 1024)); + /* a.lxac_io unused */ + /* a.lxac_rw unused */ + a.lxac_minflt = lx_acct_compt((ulong_t)p->p_ru.minflt); + a.lxac_majflt = lx_acct_compt((ulong_t)p->p_ru.majflt); + /* a.lxac_swaps unused */ + bcopy(ua->u_comm, a.lxac_comm, sizeof (a.lxac_comm)); + + /* + * As with the native acct() handling, we save the size so that if the + * write fails, we can reset the size to avoid corrupting the accounting + * file. + */ + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) == 0) { + err = vn_rdwr(UIO_WRITE, vp, (caddr_t)&a, sizeof (a), 0LL, + UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFF_T, kcred, &resid); + if (err != 0 || resid != 0) + (void) VOP_SETATTR(vp, &va, 0, kcred, NULL); + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_acl.c b/usr/src/uts/common/brand/lx/os/lx_acl.c new file mode 100644 index 0000000000..184f05b6ed --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_acl.c @@ -0,0 +1,213 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/sunddi.h> +#include <sys/pathname.h> +#include <sys/acl.h> +#include <acl/acl_common.h> +#include <sys/lx_acl.h> + + +typedef struct { + uint16_t lpaxe_tag; + uint16_t lpaxe_perm; + uint32_t lpaxe_id; +} lx_posix_acl_xattr_entry_t; + +typedef struct { + uint32_t lpaxh_version; + lx_posix_acl_xattr_entry_t lpaxh_entries[]; +} lx_posix_acl_xattr_header_t; + +#define LX_POSIX_ACL_XATTR_VERSION 0x0002 + +/* e_tag entry in struct posix_acl_entry */ +#define LX_ACL_USER_OBJ 0x01 /* USER_OBJ */ +#define LX_ACL_USER 0x02 /* USER */ +#define LX_ACL_GROUP_OBJ 0x04 /* GROUP_OBJ */ +#define LX_ACL_GROUP 0x08 /* GROUP */ +#define LX_ACL_MASK 0x10 /* CLASS_OBJ */ +#define LX_ACL_OTHER 0x20 /* OTHER_OBJ */ + + +static int +lx_acl_from_xattr(enum lx_acl_type atype, void *xattr, uint_t xlen, + acl_t **aclpp) +{ + lx_posix_acl_xattr_header_t *head = xattr; + lx_posix_acl_xattr_entry_t *entry; + int err = 0; + uint_t count, sz = xlen; + const uint_t mask = (atype == LX_ACL_DEFAULT) ? ACL_DEFAULT : 0; + acl_t *acl; + aclent_t *acle; + + if (xattr == NULL) { + /* Handle zero-length set operations */ + acl = acl_alloc(ACLENT_T); + *aclpp = acl; + return (0); + } + + if (xlen < sizeof (*head)) { + return (EINVAL); + } else if (head->lpaxh_version != LX_POSIX_ACL_XATTR_VERSION) { + return (EOPNOTSUPP); + } + + sz -= sizeof (lx_posix_acl_xattr_header_t); + if (sz % sizeof (lx_posix_acl_xattr_entry_t) != 0) { + return (EINVAL); + } + count = sz / sizeof (lx_posix_acl_xattr_entry_t); + + acl = acl_alloc(ACLENT_T); + if (count == 0) { + *aclpp = acl; + return (0); + } + + acle = kmem_alloc(count * sizeof (aclent_t), KM_SLEEP); + acl->acl_cnt = count; + acl->acl_aclp = acle; + entry = head->lpaxh_entries; + for (uint_t i = 0; i < count && err == 0; i++, entry++, acle++) { + switch (entry->lpaxe_tag) { + case LX_ACL_USER_OBJ: + case LX_ACL_GROUP_OBJ: + case LX_ACL_OTHER: + case LX_ACL_MASK: + break; + case LX_ACL_USER: + case LX_ACL_GROUP: + if (entry->lpaxe_id > MAXUID) { + err = EINVAL; + } + break; + default: + err = EINVAL; + break; + } + acle->a_id = entry->lpaxe_id | mask; + acle->a_type = entry->lpaxe_tag; + acle->a_perm = entry->lpaxe_perm; + } + if (err != 0) { + acl_free(acl); + return (err); + } + + *aclpp = acl; + return (0); +} + +/* ARGSUSED */ +int +lx_acl_setxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t len) +{ + const boolean_t is_dir = (vp->v_type == VDIR); + acl_t *acl = NULL; + cred_t *cr = CRED(); + int err; + + if (vp->v_type == VLNK) { + return (ENOTSUP); + } else if (atype == LX_ACL_DEFAULT && !is_dir) { + return (EACCES); + } + + /* + * Copyin and verify the input, even through there is little to be done + * with the result. + */ + if ((err = lx_acl_from_xattr(atype, data, len, &acl)) != 0) { + return (err); + } + + /* + * Because systemd has decided to scope-creep its way into a position + * of moribund domination over all things system software, there exist + * work-arounds which are required to address its numerous bugs and + * shortcomings. One such case involves the FreeIPA installer needing + * to perform setfacl(3) on /run/systemd/ask-password. + * + * Between the fact that meaningful ACL translation can be challenging + * and that the path in question resides on tmpfs (which doesn't yet + * support ACLs at all on illumos), faked success is the only palatable + * course of action for now. Atonement will follow. 
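For reference, the xattr blob parsed by lx_acl_from_xattr() above is simply the version header followed by packed tag/perm/id entries. A sketch of a minimal three-entry ACL (the equivalent of "u::rw-,g::r--,o::r--") follows; the structures mirror the definitions above, and the use of 0xffffffff as the "no id" value for the owner/group/other entries is an assumption about the Linux encoding:

#include <stdint.h>
#include <string.h>

typedef struct {
	uint16_t lpaxe_tag;
	uint16_t lpaxe_perm;
	uint32_t lpaxe_id;
} acl_xattr_entry_t;

typedef struct {
	uint32_t lpaxh_version;
	acl_xattr_entry_t lpaxh_entries[3];
} acl_xattr_3_t;

/*
 * Build the 28-byte blob for "u::rw-,g::r--,o::r--".  Permission bits are
 * r=4, w=2, x=1; 0xffffffff marks "no id" on the owner/group/other entries.
 */
static size_t
build_simple_acl(void *buf)
{
	acl_xattr_3_t a = {
		.lpaxh_version = 0x0002,		/* LX_POSIX_ACL_XATTR_VERSION */
		.lpaxh_entries = {
			{ 0x01, 6, 0xffffffff },	/* LX_ACL_USER_OBJ, rw- */
			{ 0x04, 4, 0xffffffff },	/* LX_ACL_GROUP_OBJ, r-- */
			{ 0x20, 4, 0xffffffff },	/* LX_ACL_OTHER, r-- */
		}
	};

	(void) memcpy(buf, &a, sizeof (a));
	return (sizeof (a));
}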
+ * + * See also: https://bugzilla.redhat.com/show_bug.cgi?id=1322167 + */ + err = ENOTSUP; + if (crgetuid(cr) == 0) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if (vnodetopath(NULL, vp, path, MAXPATHLEN, cr) == 0 && + strncmp(path, "/run/systemd/", 13) == 0) { + /* Saccharin-sweet fake success */ + err = 0; + } + kmem_free(path, MAXPATHLEN); + } + acl_free(acl); + + return (err); +} + +/* ARGSUSED */ +int +lx_acl_getxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t slen, + ssize_t *solen) +{ + const boolean_t is_dir = (vp->v_type == VDIR); + vsecattr_t vsattr; + int err; + + if (vp->v_type == VLNK) { + return (ENOTSUP); + } else if (atype == LX_ACL_DEFAULT && !is_dir) { + return (ENODATA); + } + + bzero(&vsattr, sizeof (vsattr)); + vsattr.vsa_mask = VSA_ACECNT; + if ((err = VOP_GETSECATTR(vp, &vsattr, 0, CRED(), NULL)) != 0) { + err = (err == ENOENT) ? ENODATA : err; + return (err); + } + + if (vsattr.vsa_aclentp != NULL) + kmem_free(vsattr.vsa_aclentp, vsattr.vsa_aclentsz); + + return (ENODATA); +} + +/* ARGSUSED */ +int +lx_acl_removexattr(vnode_t *vp, enum lx_acl_type atype) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +lx_acl_listxattr(vnode_t *vp, uio_t *uio) +{ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_audit.c b/usr/src/uts/common/brand/lx/os/lx_audit.c new file mode 100644 index 0000000000..6e522e6d8d --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_audit.c @@ -0,0 +1,1604 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * The Linux auditing system provides a fairly complex rule-based syntax + * for configuring what actions are to be audited. The user-level details + * are generally described in the Linux audit.rules(7), auditctl(8), and + * auditd(8) man pages. The user/kernel netlink API does not seem to be + * documented. The Linux kernel source and the user-level auditd source must + * be used to understand the interface we have to emulate. The relevant Linux + * source files are: + * include/uapi/linux/audit.h + * include/linux/audit.h + * kernel/audit.c + * + * The lx_netlink module implements the API used for getting or changing the + * audit configuration. For rule-oriented operations (list, append, delete), + * an lx_audit_rule_t structure (or sequence when listing) is passed in/out of + * the kernel. The netlink code calls into the lx_audit_append_rule or + * lx_audit_delete_rule functions here to perform the relevant operation. + * Within the lx_audit_rule_t structure, each member has the following + * meaning: + * lxar_flag: corresponds to user-level list (e.g. "exit" for syscall return) + * lxar_action: user-level action (e.g. "always") + * lxar_fld_cnt: number of fields specified in lxar_fields, lxar_values, and + * lxar_flg_flag arrays + * lxar_mask: syscall number bitmask the rule applies to (bit position in + * the array corresponds to the syscall number) + * laxr_fields: array of fields in the rule (i.e. each -F on user-level rule). + * A numeric code (e.g. LX_RF_AUDIT_ARCH) is assigned to each + * possible field. + * lxar_values: array of numeric field values (e.g. 
the internal b64 value on
+ * the -F AUDIT_ARCH=b64 rule)
+ * lxar_fld_flag: array of field operators (e.g. the '=' operator on the
+ * -F AUDIT_ARCH=b64 rule)
+ * lxar_buflen: length of the buffer data immediately following
+ * lxar_buf: A variable amount of additional field string data. Non-numeric
+ * field values are passed here. For example, the string associated
+ * with the '-F key=...' or '-F path=...' rules. For string values,
+ * the corresponding lxar_values entry is the length of the string.
+ * The strings in lxar_buf are not C strings because they are not
+ * NULL terminated. The character data is pulled out of lxar_buf
+ * in chunks specified by the value and the pointer into the buf
+ * is advanced accordingly.
+ *
+ * There are two primary kinds of actions which we are currently interested in
+ * auditing:
+ * 1) system call return
+ * this corresponds to user-level "exit" rule actions
+ * 2) file system related actions
+ * this corresponds to user-level file system watch rules (-w)
+ *
+ * Only system call return is currently implemented, and only a very limited
+ * subset of all of the possible rule selection behavior.
+ *
+ * The Linux audit rule syntax defines that all selection criteria within a
+ * rule are ANDed together before an audit record is created. However, multiple
+ * rules can be defined for a specific syscall. For example, this user-level
+ * syntax defines two different rules for the "open" syscall:
+ * -a always,exit -F arch=b64 -S open -F auid>=1000 -F key=user-open
+ * -a always,exit -F arch=b64 -S open -F auid=0 -F key=priv-open
+ * The first rule would cause an audit record to be created when an "open"
+ * syscall returns and the syscall was performed by a process with a
+ * loginuid >= 1000. The key added to that audit record would be "user-open".
+ * The second rule would create an audit record if the loginuid was 0 and the
+ * record's key would be "priv-open".
+ *
+ * When auditing is enabled for a syscall return, we have to look at multiple
+ * rules and create an audit record for each rule that matches the selection
+ * criteria.
+ *
+ * Although the current implementation is limited, the overall structure is
+ * designed to be enhanced as more auditing support is added over time.
+ *
+ * By default, auditing is not enabled for a zone and no internal audit data
+ * exists. When the first netlink audit msg is received, the zone's audit state
+ * (lx_audit_state_t) is allocated (via lx_audit_init) and attached to the
+ * zone's lx brand-specific data (lxzd_audit_state). Once allocated, the audit
+ * data will persist until the zone halts.
+ *
+ * Audit records are enqueued onto the lxast_ev_queue and a worker thread
+ * (lx_audit_worker) is responsible for dequeueing the audit records and
+ * sending them up to the user-level auditd.
+ *
+ * Audit rules are stored in the lxast_rules list. This is an internal list
+ * consisting of elements of type lx_audit_rule_ent_t. Each element contains
+ * the input rule (lxare_rule) along with some additional data parsed out of
+ * the rule when it is appended (currently only the arch and key).
+ *
+ * When auditing is enabled for a syscall, the appropriate entry in the
+ * lxast_sys64_rulep (or lxast_sys32_rulep) array will point to the first
+ * rule that is applicable to the syscall. When that syscall returns, rule
+ * matching proceeds from that rule to the end of the rule list.
+ *
+ * New rules are always appended at the end of the list and Linux expects that
+ * rules are matched in order.
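As a concrete illustration of the structure described above, the first example rule could arrive from auditctl packed roughly as in the sketch below (illustrative values only; the assumption that 2 is the x86_64 "open" syscall number, and the helper that fills a caller-supplied rule, are not part of this file):

/*
 * Roughly what "-a always,exit -F arch=b64 -S open -F auid>=1000
 * -F key=user-open" looks like once packed into an lx_audit_rule_t.
 */
static void
fill_example_rule(lx_audit_rule_t *r, char *buf)
{
	r->lxar_flag = LX_AUDIT_FILTER_EXIT;		/* "exit" list */
	r->lxar_action = LX_AUDIT_ACT_ALWAYS;		/* "always" */
	r->lxar_mask[0] = 1u << 2;			/* -S open (syscall 2) */

	r->lxar_fields[0] = LX_RF_AUDIT_ARCH;		/* -F arch=b64 */
	r->lxar_fld_flag[0] = LX_OF_AUDIT_EQ;
	r->lxar_values[0] = LX_AUDIT_ARCH64;

	r->lxar_fields[1] = LX_RF_AUDIT_LOGINUID;	/* -F auid>=1000 */
	r->lxar_fld_flag[1] = LX_OF_AUDIT_GE;
	r->lxar_values[1] = 1000;

	r->lxar_fields[2] = LX_RF_AUDIT_FILTERKEY;	/* -F key=user-open */
	r->lxar_fld_flag[2] = LX_OF_AUDIT_EQ;
	r->lxar_values[2] = 9;				/* strlen("user-open") */
	bcopy("user-open", buf, 9);			/* string data, no NUL */

	r->lxar_fld_cnt = 3;
	r->lxar_buflen = 9;
}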
+ * + * If the rule list ever gets large enough that a linear search, anchored off + * the syscall pointer, becomes a performance bottleneck, then we'll have to + * explore alternate implementations. However, use of auditing is not that + * common to begin with, and most syscalls are typically not audited, so as + * long as the number of rules is in the order of tens, then the current + * implementation should be fine. + * + * When a rule is deleted, all associated syscall entries (lxast_sys64_rulep or + * lxast_sys32_rulep) are cleared, then the rule list is searched to see if + * there are any remaining rules which are applicable to the syscall(s). If so, + * pointers are reestablished in the relevant lxast_sys64_rulep (or 32) array. + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/ddi.h> +#include <sys/zone.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/brand.h> +#include <sys/debug.h> +#include <sys/ucred.h> +#include <sys/session.h> +#include <sys/lx_types.h> +#include <sys/lx_audit.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/bitmap.h> +#include <sockcommon.h> + +#define LX_AUDIT_FEATURE_VERSION 1 + +/* + * Audit status mask values (lxas_mask in structure defined below) + * See Linux include/uapi/linux/audit.h + */ +#define LX_AUDIT_STATUS_ENABLED 0x001 +#define LX_AUDIT_STATUS_FAILURE 0x002 +#define LX_AUDIT_STATUS_PID 0x004 +#define LX_AUDIT_STATUS_RATE_LIMIT 0x008 +#define LX_AUDIT_STATUS_BACKLOG_LIMIT 0x010 +#define LX_AUDIT_STATUS_BACKLOG_WAIT_TIME 0x020 +#define LX_AUDIT_STATUS_LOST 0x040 + +/* + * Audit features + * See Linux include/uapi/linux/audit.h + */ +#define LX_AUDIT_F_BACKLOG_LIMIT 0x001 +#define LX_AUDIT_F_BACKLOG_WAIT_TIME 0x002 +#define LX_AUDIT_F_EXECUTABLE_PATH 0x004 +#define LX_AUDIT_F_EXCLUDE_EXTEND 0x008 +#define LX_AUDIT_F_SESSIONID_FILTER 0x010 +#define LX_AUDIT_F_LOST_RESET 0x020 +#define LX_AUDIT_F_FILTER_FS 0x040 + +#define LX_AUDIT_FEATURE_ALL (LX_AUDIT_F_BACKLOG_LIMIT | \ + LX_AUDIT_F_BACKLOG_WAIT_TIME | LX_AUDIT_F_EXECUTABLE_PATH | \ + LX_AUDIT_F_EXCLUDE_EXTEND | LX_AUDIT_F_SESSIONID_FILTER | \ + LX_AUDIT_F_LOST_RESET | LX_AUDIT_F_FILTER_FS) + + +/* Audit events */ +#define LX_AUDIT_SYSCALL 1300 /* syscall */ +#define LX_AUDIT_PATH 1302 /* file path */ +#define LX_AUDIT_CONFIG_CHANGE 1305 /* configuration change */ +#define LX_AUDIT_CWD 1307 /* current working directory */ +#define LX_AUDIT_EXECVE 1309 /* exec args */ +#define LX_AUDIT_EOE 1320 /* end of multi-record event */ + +#define LX_AUDIT_BITMASK_SIZE 64 +#define LX_AUDIT_MAX_KEY_LEN 256 + +/* Audit rule filter type */ +#define LX_AUDIT_FILTER_USER 0 /* user generated msgs */ +#define LX_AUDIT_FILTER_TASK 1 /* task creation */ +#define LX_AUDIT_FILTER_ENTRY 2 /* syscall entry - obsolete */ +#define LX_AUDIT_FILTER_WATCH 3 /* fs watch */ +#define LX_AUDIT_FILTER_EXIT 4 /* syscall return */ +#define LX_AUDIT_FILTER_TYPE 5 /* audit log start */ +#define LX_AUDIT_FILTER_FS 6 /* audit inode child */ + +/* Audit rule action type */ +#define LX_AUDIT_ACT_NEVER 0 +#define LX_AUDIT_ACT_POSSIBLE 1 +#define LX_AUDIT_ACT_ALWAYS 2 /* the common case */ + +#define LX_AUDIT_RULE_MAX_FIELDS 64 + +/* Linux defaults */ +#define LX_AUDIT_DEF_BACKLOG_LIMIT 64 +#define LX_AUDIT_DEF_WAIT_TIME (60 * HZ_TO_LX_USERHZ(hz)) + +/* + * Audit rule field 
types + * Linux defines a lot of Rule Field values in include/uapi/linux/audit.h. + * We currently only handle a few. + */ +#define LX_RF_AUDIT_LOGINUID 9 /* e.g. auid>=1000 */ +#define LX_RF_AUDIT_ARCH 11 /* e.g. -F arch=b64 */ +#define LX_RF_AUDIT_WATCH 105 /* user-level -w rule */ +#define LX_RF_AUDIT_PERM 106 /* user-level -p option */ +#define LX_RF_AUDIT_FILTERKEY 210 /* user-level -k key option */ + +/* + * Audit rule field operators + * Linux defines the operator values in include/uapi/linux/audit.h. + * These 4 bits are combined in various ways for additional operators. + */ +#define LX_OF_AUDIT_BM 0x08000000 /* bit mask (&) */ +#define LX_OF_AUDIT_LT 0x10000000 +#define LX_OF_AUDIT_GT 0x20000000 +#define LX_OF_AUDIT_EQ 0x40000000 +#define LX_OF_AUDIT_NE (LX_OF_AUDIT_LT | LX_OF_AUDIT_GT) +#define LX_OF_AUDIT_BT (LX_OF_AUDIT_BM | LX_OF_AUDIT_EQ) /* bit test (&=) */ +#define LX_OF_AUDIT_LE (LX_OF_AUDIT_LT | LX_OF_AUDIT_EQ) +#define LX_OF_AUDIT_GE (LX_OF_AUDIT_GT | LX_OF_AUDIT_EQ) +#define LX_OF_AUDIT_ALL (LX_OF_AUDIT_EQ | LX_OF_AUDIT_NE | LX_OF_AUDIT_BM) + +/* + * Audit rule arch specification + * See Linux EM_X86_64 and EM_386 defs. + * -F arch=b64 looks like: 0xc000003e + * -F arch=b32 looks like: 0x40000003 + * If no arch is specified (possible with '-S syslog', '-S all', or '-w <file>') + * the rule applies to both architectures and LX_RF_AUDIT_ARCH is not passed. + */ +#define LX_AUDIT_ARCH64 0xc000003e +#define LX_AUDIT_ARCH32 0x40000003 + +/* + * See Linux include/uapi/linux/audit.h, AUDIT_MESSAGE_TEXT_MAX is 8560. + * The auditd src has MAX_AUDIT_MESSAGE_LENGTH as 8970. + * Until necessary, we'll limit ourselves to a smaller length. + */ +#define LX_AUDIT_MESSAGE_TEXT_MAX 1024 + +typedef struct lx_audit_features { + uint32_t lxaf_version; + uint32_t lxaf_mask; + uint32_t lxaf_features; + uint32_t lxaf_lock; +} lx_audit_features_t; + +typedef struct lx_audit_status { + uint32_t lxas_mask; + uint32_t lxas_enabled; + uint32_t lxas_failure; + uint32_t lxas_pid; + uint32_t lxas_rate_limit; + uint32_t lxas_backlog_limit; + uint32_t lxas_lost; + uint32_t lxas_backlog; + /* LINTED: E_ANONYMOUS_UNION_DECL */ + union { + uint32_t lxas_version; + uint32_t lxas_feature_bitmap; + }; + uint32_t lxas_backlog_wait_time; +} lx_audit_status_t; + +typedef struct lx_audit_rule { + uint32_t lxar_flag; + uint32_t lxar_action; + uint32_t lxar_fld_cnt; + uint32_t lxar_mask[LX_AUDIT_BITMASK_SIZE]; + uint32_t lxar_fields[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_values[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_fld_flag[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_buflen; + /* LINTED: E_ZERO_OR_NEGATIVE_SUBSCRIPT */ + char lxar_buf[0]; +} lx_audit_rule_t; + +/* + * Internal structure for an audit rule. + * Each rule is on the zone's top-level list of all rules (lxast_rules). + * This structure also holds the parsed character string fields from the + * original input rule (lxar_buf) so that we don't need to re-parse that + * data on every match. + */ +typedef struct lx_audit_rule_ent { + list_node_t lxare_link; + lx_audit_rule_t lxare_rule; + char *lxare_buf; + boolean_t lxare_is32bit; + boolean_t lxare_is64bit; + char *lxare_key; +} lx_audit_rule_ent_t; + +typedef enum lx_audit_fail { + LXAE_SILENT, + LXAE_PRINT, /* default */ + LXAE_PANIC /* reboot the zone */ +} lx_audit_fail_t; + +typedef struct lx_audit_record { + list_node_t lxar_link; + uint32_t lxar_type; + char *lxar_msg; +} lx_audit_record_t; + +/* + * Per-zone audit state + * Lazy allocated when first needed. 
+ * + * lxast_rate_limit + * Currently unused, but can be get/set. Linux default is 0. + * lxast_backlog_limit + * The maximum number of outstanding audit events allowed (the Linux kernel + * default is 64). If the limit is reached, lxast_failure determines what + * to do. + * lxast_backlog_wait_time + * Currently unused, but can be get/set. Linux default is 60HZ. + */ +typedef struct lx_audit_state { + lx_audit_fail_t lxast_failure; /* failure behavior */ + uint32_t lxast_rate_limit; + uint32_t lxast_backlog_limit; + uint32_t lxast_backlog_wait_time; + lx_audit_rule_ent_t *lxast_sys32_rulep[LX_NSYSCALLS]; + lx_audit_rule_ent_t *lxast_sys64_rulep[LX_NSYSCALLS]; + kcondvar_t lxast_worker_cv; + kmutex_t lxast_lock; /* protects members below */ + pid_t lxast_pid; /* auditd pid */ + uint64_t lxast_seq; /* event sequence num */ + uint32_t lxast_backlog; /* num of queued events */ + uint32_t lxast_lost; /* num of lost events */ + void *lxast_sock; /* auditd lx_netlink_sock_t */ + boolean_t lxast_exit; /* taskq worker should quit */ + boolean_t lxast_panicing; /* audit forcing reboot? */ + kthread_t *lxast_worker; + list_t lxast_ev_queue; /* audit record queue */ + list_t lxast_rules; /* the list of rules */ +} lx_audit_state_t; + +/* + * Function pointer to netlink function used by audit worker threads to send + * audit messages up to the user-level auditd. + */ +static int (*lx_audit_emit_msg)(void *, uint_t, const char *, uint_t); +static kmutex_t lx_audit_em_lock; /* protects emit_msg above */ + +/* From uts/common/brand/lx/syscall/lx_socket.c */ +extern long lx_socket(int, int, int); +/* From uts/common/syscall/close.c */ +extern int close(int); + +static int +lx_audit_emit_syscall_event(uint_t mtype, void *lxsock, const char *msg) +{ + int err; + + err = lx_audit_emit_msg(lxsock, mtype, msg, LX_AUDIT_MESSAGE_TEXT_MAX); + if (err != 0) + return (err); + err = lx_audit_emit_msg(lxsock, 0, NULL, 0); + return (err); +} + +/* + * Worker thread for audit record output up to user-level auditd. + */ +static void +lx_audit_worker(void *a) +{ + lx_audit_state_t *asp = (lx_audit_state_t *)a; + lx_audit_record_t *rp; + int err; + + VERIFY(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + while (!asp->lxast_exit) { + + if (asp->lxast_backlog == 0 || asp->lxast_sock == NULL || + asp->lxast_pid == 0) { + cv_wait(&asp->lxast_worker_cv, &asp->lxast_lock); + continue; + } + + rp = list_remove_head(&asp->lxast_ev_queue); + asp->lxast_backlog--; + + err = lx_audit_emit_syscall_event(rp->lxar_type, + asp->lxast_sock, rp->lxar_msg); + if (err != ENOMEM && err != ENOSPC) { + kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX); + kmem_free(rp, sizeof (lx_audit_record_t)); + } else { + /* + * Put it back on the list, drop the mutex so that + * any other audit-related action could occur (such as + * socket deletion), then wait briefly before retry. 
+ */ + list_insert_head(&asp->lxast_ev_queue, rp); + asp->lxast_backlog++; + mutex_exit(&asp->lxast_lock); + /* wait 1/10th second and try again */ + delay(drv_usectohz(100000)); + mutex_enter(&asp->lxast_lock); + } + } + + /* Leave state ready for new worker when auditing restarted */ + asp->lxast_exit = B_FALSE; + mutex_exit(&asp->lxast_lock); + + thread_exit(); +} + +static void +lx_audit_set_worker(uint32_t pid, void *lxsock, + void (*cb)(void *, boolean_t)) +{ + lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state; + + ASSERT(asp != NULL); + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + /* First, stop any existing worker thread */ + while (asp->lxast_sock != NULL) { + mutex_exit(&asp->lxast_lock); + lx_audit_stop_worker(NULL, cb); + mutex_enter(&asp->lxast_lock); + /* unlikely we loop, but handle racing setters */ + } + + VERIFY(asp->lxast_pid == 0); + VERIFY(asp->lxast_sock == NULL); + VERIFY(asp->lxast_exit == B_FALSE); + VERIFY(asp->lxast_worker == NULL); + if (pid != 0) { + /* Start a worker with the new socket */ + asp->lxast_sock = lxsock; + cb(asp->lxast_sock, B_TRUE); + asp->lxast_pid = pid; + asp->lxast_worker = thread_create(NULL, 0, lx_audit_worker, + asp, 0, curzone->zone_zsched, TS_RUN, minclsyspri); + } +} + +static boolean_t +lx_audit_match_val(uint32_t op, uint32_t ruleval, uint32_t curval) +{ + switch (op) { + case LX_OF_AUDIT_LT: + return (curval < ruleval); + case LX_OF_AUDIT_GT: + return (curval > ruleval); + case LX_OF_AUDIT_EQ: + return (curval == ruleval); + case LX_OF_AUDIT_NE: + return (curval != ruleval); + case LX_OF_AUDIT_LE: + return (curval <= ruleval); + case LX_OF_AUDIT_GE: + return (curval >= ruleval); + case LX_OF_AUDIT_BM: /* bit mask - any bit is set? */ + return ((curval & ruleval) != 0); + case LX_OF_AUDIT_BT: /* bit test - all bits must be set */ + return ((curval & ruleval) == ruleval); + default: + break; + } + return (B_FALSE); +} + +/* + * Per the Linux audit.rules(7) man page, a rule with an auid of -1 means the + * process does not have a loginuid. We'll use the absence of a session on the + * process to mimic this behavior. + */ +static uint32_t +lx_audit_get_auid() +{ + sess_t *s; + uint32_t v; + + /* + * A process with no session has: + * s_dev == 0xffffffffffffffff + * s_vp == NULL + * s_cred == NULL + */ + s = curproc->p_sessp; + if (s != NULL && s->s_vp != NULL) { + v = crgetsuid(CRED()); + } else { + v = UINT32_MAX; /* emulate auid of -1 */ + } + + return (v); +} + +/* + * Determine if the rule matches. + * Currently, we're actually just checking LX_RF_AUDIT_LOGINUID (-F auid) + * fields, but as we add support for additional field matching, this function + * should be enhanced. 
+ */ +static boolean_t +lx_audit_syscall_rule_match(lx_audit_rule_ent_t *erp) +{ + uint32_t i, v; + lx_audit_rule_t *rp = &erp->lxare_rule; + + for (i = 0; i < rp->lxar_fld_cnt; i++) { + uint32_t ftype, fval, fop; + + ftype = rp->lxar_fields[i]; + if (ftype != LX_RF_AUDIT_LOGINUID) + continue; + + fop = rp->lxar_fld_flag[i]; + fval = rp->lxar_values[i]; + v = lx_audit_get_auid(); + + if (!lx_audit_match_val(fop, fval, v)) + return (B_FALSE); + } + return (B_TRUE); +} + +static int +lx_audit_write(file_t *fp, const char *msg) +{ + int fflag; + ssize_t count; + size_t nwrite = 0; + struct uio auio; + struct iovec aiov; + + count = strlen(msg); + fflag = fp->f_flag; + + aiov.iov_base = (void *) msg; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + return (lx_write_common(fp, &auio, &nwrite, B_FALSE)); +} + +/* + * We first try to send the msg out to the zone's logging service, then + * fallback to the zone's console, although in practice, that is unlikely to + * be useful to most users. + */ +static void +lx_audit_log_msg(const char *msg) +{ + int fd; + struct sockaddr_un addr; + struct sonode *so; + uint_t alen; + uint_t sizediff = (sizeof (addr) - sizeof (addr.sun_path)); + file_t *fp; + int err; + vnode_t *vp; + + ttolwp(curthread)->lwp_errno = 0; + fd = lx_socket(LX_AF_UNIX, LX_SOCK_DGRAM, 0); + if (ttolwp(curthread)->lwp_errno != 0) + goto trycons; + + bzero((char *)&addr, sizeof (addr)); + addr.sun_family = AF_UNIX; + (void) strncpy(addr.sun_path, "/dev/log", sizeof (addr.sun_path) - 1); + alen = strlen(addr.sun_path) + 1 + sizediff; + + /* + * We can't use lx_connect here since that expects to be called from + * user-land, so we do the (streamlined) connect ourselves. + */ + if ((so = getsonode(fd, &err, &fp)) == NULL) { + (void) close(fd); + goto trycons; + } + + err = socket_connect(so, (struct sockaddr *)&addr, alen, fp->f_flag, + _SOCONNECT_XPG4_2, CRED()); + + if (err == 0) + err = lx_audit_write(fp, msg); + + releasef(fd); /* release getsonode hold */ + (void) close(fd); + + if (err == 0) + return; + +trycons: + /* "open" the console device */ + if (lookupnameatcred("/dev/console", UIO_SYSSPACE, FOLLOW, NULLVPP, + &vp, NULL, CRED()) != 0) + return; + + if (falloc(vp, FWRITE, &fp, &fd) != 0) { + VN_RELE(vp); + return; + } + mutex_exit(&fp->f_tlock); + setf(fd, fp); + + /* nothing left to do if console write fails */ + (void) lx_audit_write(fp, msg); + close(fd); +} + +static void +lx_audit_fail(lx_audit_state_t *asp, const char *msg) +{ + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + if (asp->lxast_failure == LXAE_PRINT || + asp->lxast_failure == LXAE_PANIC) { + /* + * Linux can ratelimit the amount of log spam here, so we'll + * do something similar, especially since this could be called + * on many syscall returns if the audit daemon is down or + * not consuming audit records for some other reason. + */ + if (asp->lxast_lost % 100 == 0) + lx_audit_log_msg(msg); + if (asp->lxast_failure == LXAE_PANIC && + !asp->lxast_panicing) { + /* + * Reboot the zone so that no audit records are lost. + * We delay a second to give the zone's logger a chance + * to handle the log message. We have to drop the lock + * here in case the zone's logger itself is making + * syscalls which would be audited, although that + * wouldn't be the ideal configuration. 
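The /dev/log path used by lx_audit_log_msg() above is the in-kernel analogue of the usual user-space pattern, which as a rough sketch (not part of this file) looks like the following; a real syslog client would normally also prepend a "<priority>" tag to the message:

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

/* Send one datagram to the logging service listening on /dev/log. */
static int
send_to_syslog(const char *msg)
{
	struct sockaddr_un addr;
	int fd, err = -1;

	if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0)
		return (-1);

	(void) memset(&addr, 0, sizeof (addr));
	addr.sun_family = AF_UNIX;
	(void) strncpy(addr.sun_path, "/dev/log", sizeof (addr.sun_path) - 1);

	if (connect(fd, (struct sockaddr *)&addr, sizeof (addr)) == 0 &&
	    write(fd, msg, strlen(msg)) >= 0)
		err = 0;

	(void) close(fd);
	return (err);
}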
+ */ + asp->lxast_panicing = B_TRUE; + mutex_exit(&asp->lxast_lock); + lx_audit_log_msg("audit: panic"); + delay(drv_usectohz(1000000)); + zone_kadmin(A_SHUTDOWN, AD_BOOT, NULL, kcred); + mutex_enter(&asp->lxast_lock); + } + } + asp->lxast_lost++; +} + +/* + * This formats the input string into a format that matches Linux. The input + * strings are small right now (<= PSARGSZ) so for simpicity we're using + * a temporary buffer of adequate size. + */ +static void +lx_audit_fmt_str(char *dst, char *str, uint_t dlen) +{ + char *sp, tmp[100]; + + (void) strlcpy(tmp, str, sizeof (tmp)); + if ((sp = strchr(tmp, ' ')) != NULL) + *sp = '\0'; + + if ((sp = strchr(tmp, '"')) == NULL) { + (void) snprintf(dst, dlen, "\"%s\"", tmp); + } else { + char *p, *dp; + uint_t olen = 0; + + ASSERT(dlen > 2); + dlen -= 2; /* leave room for terminating nul */ + dp = dst; + for (p = str; *p != '\0' && olen < dlen; p++) { + (void) sprintf(dp, "%02x", *p); + dp += 2; + olen += 2; + } + *dp = '\0'; + } +} + +/* + * Format and enqueue a syscall audit record. + */ +static void +lx_audit_syscall_fmt_rcd(int sysnum, uint32_t arch, long ret, + lx_audit_state_t *asp, lx_audit_rule_ent_t *erp, uint64_t seq, + timestruc_t *tsp) +{ + klwp_t *lwp; + proc_t *p; + uint32_t items, sessid; + lx_lwp_data_t *lwpd; + lx_audit_record_t *rp; + cred_t *cr = CRED(); + minor_t minor; + char key[LX_AUDIT_MAX_KEY_LEN + 6]; /* for key="%s" formatting */ + char exe[PSARGSZ * 2 + 8], comm[MAXCOMLEN * 2 + 8]; + + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + if (asp->lxast_backlog >= asp->lxast_backlog_limit) { + lx_audit_fail(asp, "audit: backlog limit exceeded"); + return; + } + + if (arch == LX_AUDIT_ARCH32) { + items = MIN(4, lx_sysent32[sysnum].sy_narg); + } else { + ASSERT3U(arch, ==, LX_AUDIT_ARCH64); + items = MIN(4, lx_sysent64[sysnum].sy_narg); + } + + lwp = ttolwp(curthread); + lwpd = lwptolxlwp(lwp); + p = curproc; + + /* + * For the key, if no key has been set on the rule, Linux formats the + * string "(null)" (with no quotes - i.e. key=(null)). + */ + if (erp->lxare_key != NULL) { + (void) snprintf(key, sizeof (key), "key=\"%s\"", + erp->lxare_key); + } else { + (void) snprintf(key, sizeof (key), "key=(null)"); + } + + rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP); + if (rp == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + return; + } + rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP); + if (rp->lxar_msg == NULL) { + kmem_free(rp, sizeof (lx_audit_record_t)); + lx_audit_fail(asp, "audit: no kernel memory"); + return; + } + rp->lxar_type = LX_AUDIT_SYSCALL; + + mutex_enter(&p->p_splock); + sessid = p->p_sessp->s_sid; + minor = getminor(p->p_sessp->s_dev); + mutex_exit(&p->p_splock); + + mutex_enter(&p->p_lock); + lx_audit_fmt_str(exe, p->p_user.u_psargs, sizeof (exe)); + lx_audit_fmt_str(comm, p->p_user.u_comm, sizeof (comm)); + mutex_exit(&p->p_lock); + + /* + * See Linux audit_log_exit() for how a syscall exit record is + * formatted. + * + * For "arch" value, see Linux AUDIT_ARCH_IA64, AUDIT_ARCH_I386, + * __AUDIT_ARCH_64BIT and __AUDIT_ARCH_LE definitions. + * + * For fsuid/fsgid, see lx_setfsuid/lx_setfsgid for how we handle that. 
+ */ + (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX, + "audit(%lu.%03lu:%lu): arch=%x syscall=%u " + "success=%s exit=%ld a0=%lu a1=%lu a2=%lu a3=%lu items=%u " + "ppid=%u pid=%u auid=%u uid=%u gid=%u euid=%u suid=%u " + "fsuid=%u egid=%u sgid=%u fsgid=%u tty=pts%u ses=%u " + "comm=%s exe=%s %s", + (uint64_t)tsp->tv_sec, /* zone's timestamp */ + (uint64_t)tsp->tv_nsec / 1000000, + seq, /* serial number */ + arch, /* arch */ + sysnum, /* syscall */ + (lwp->lwp_errno == 0 ? "yes" : "no"), /* success */ + ret, /* exit */ + lwpd->br_syscall_args[0], /* a0 */ + lwpd->br_syscall_args[1], /* a1 */ + lwpd->br_syscall_args[2], /* a2 */ + lwpd->br_syscall_args[3], /* a3 */ + items, /* items */ + lx_lwp_ppid(lwp, NULL, NULL), /* ppid */ + (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid), + lx_audit_get_auid(), /* auid */ + crgetruid(cr), /* uid */ + crgetrgid(cr), /* gid */ + crgetuid(cr), /* euid */ + crgetsuid(cr), /* saved uid */ + crgetuid(cr), /* fsuid */ + crgetgid(cr), /* egid */ + crgetsgid(cr), /* saved gid */ + crgetgid(cr), /* fsgid */ + minor, /* tty */ + sessid, /* ses */ + comm, /* comm */ + exe, /* exe */ + key); /* key="VAL" */ + + list_insert_tail(&asp->lxast_ev_queue, rp); + if (asp->lxast_backlog == 0) + cv_signal(&asp->lxast_worker_cv); + asp->lxast_backlog++; +} + +/* + * Get the next rule in the list that is generally applicable to the given + * syscall. + */ +static lx_audit_rule_ent_t * +lx_audit_next_applicable_rule(int sysnum, uint32_t arch, lx_audit_state_t *asp, + lx_audit_rule_ent_t *erp) +{ + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + for (erp = list_next(&asp->lxast_rules, erp); + erp != NULL; + erp = list_next(&asp->lxast_rules, erp)) { + lx_audit_rule_t *r = &erp->lxare_rule; + + /* Determine if the rule in the list has the same ARCH. */ + if (arch == LX_AUDIT_ARCH32 && !erp->lxare_is32bit) + continue; + if (arch == LX_AUDIT_ARCH64 && !erp->lxare_is64bit) + continue; + + /* Determine if this rule applies to the relevant syscall. */ + if (BT_TEST32(r->lxar_mask, sysnum)) + return (erp); + } + + return (NULL); +} + +void +lx_audit_syscall_exit(int sysnum, long ret) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + uint64_t seq; + lx_audit_rule_ent_t *erp; + timestruc_t ts; + uint32_t arch; + + if (lxzd->lxzd_audit_enabled == LXAE_DISABLED) + return; + + if (sysnum >= LX_NSYSCALLS) + return; + + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + if (get_udatamodel() == DATAMODEL_ILP32) { + arch = LX_AUDIT_ARCH32; + } else { + ASSERT(get_udatamodel() == DATAMODEL_LP64); + arch = LX_AUDIT_ARCH64; + } + + /* + * Fast top-level check to see if we're auditing this syscall. + * We don't take the mutex for this since there is no need. + */ + if (arch == LX_AUDIT_ARCH32) { + if (asp->lxast_sys32_rulep[sysnum] == NULL) + return; + } else { + if (asp->lxast_sys64_rulep[sysnum] == NULL) + return; + } + + mutex_enter(&asp->lxast_lock); + if (arch == LX_AUDIT_ARCH32) { + erp = asp->lxast_sys32_rulep[sysnum]; + } else { + erp = asp->lxast_sys64_rulep[sysnum]; + } + + if (erp == NULL) { + /* Hit a race and the syscall is no longer being audited */ + mutex_exit(&asp->lxast_lock); + return; + } + + /* + * All of the records in the set (i.e. same serial number) have + * the same timestamp. + */ + seq = asp->lxast_seq++; + gethrestime(&ts); + ts.tv_sec -= curzone->zone_boot_time; + + /* + * We have to determine if the first rule associated with the syscall, + * or any subsequent applicable rules, match. 
+ * + * The first rule associated with the syscall may (or may not) match, + * but there can be additional rules which might also match. The first + * possible rule is always the one that enables the syscall auditing, + * but we also have to iterate to the end of the list to see if any + * other rules are applicable to this syscall. + */ + for (; erp != NULL; + erp = lx_audit_next_applicable_rule(sysnum, arch, asp, erp)) { + if (!lx_audit_syscall_rule_match(erp)) + continue; + + lx_audit_syscall_fmt_rcd(sysnum, arch, ret, asp, erp, seq, &ts); + } + + /* + * TODO: Currently we only output a single SYSCALL record. + * Real Linux emits a set of audit records for a syscall exit event + * (e.g. for an unlink syscall): + * type=SYSCALL + * type=CWD + * type=PATH - one for the parent dir + * type=PATH - one for the actual file unlinked + * type=PROCTITLE - (this one seems worthless) + * followed by an AUDIT_EOE message (which seems to be ignored). + * + * For syscalls that don't change files in the file system (e.g. ioctl) + * there are no PATH records. + */ + mutex_exit(&asp->lxast_lock); +} + +/* + * Determine which syscalls this rule applies to and setup a fast pointer for + * the syscall to enable it's rule match. + * + * We have to look at each bit and translate the external syscall bits into the + * internal syscall number. + */ +static void +lx_enable_syscall_rule(lx_audit_state_t *asp, lx_audit_rule_t *rulep, + lx_audit_rule_ent_t *rp) +{ + uint_t sysnum; + + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) { + if (BT_TEST32(rulep->lxar_mask, sysnum)) { + if (rp->lxare_is32bit) { + if (asp->lxast_sys32_rulep[sysnum] == NULL) + asp->lxast_sys32_rulep[sysnum] = rp; + } + if (rp->lxare_is64bit) { + if (asp->lxast_sys64_rulep[sysnum] == NULL) + asp->lxast_sys64_rulep[sysnum] = rp; + } + } + } +} + +int +lx_audit_append_rule(void *r, uint_t datalen) +{ + lx_audit_rule_t *rulep = (lx_audit_rule_t *)r; + char *datap; + uint_t i; + lx_audit_rule_ent_t *rp; + lx_audit_state_t *asp; + boolean_t is_32bit = B_TRUE, is_64bit = B_TRUE, sys_found = B_FALSE; + char *tdp; + char key[LX_AUDIT_MAX_KEY_LEN + 1]; + uint32_t tlen; + + if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED) + return (EPERM); + + if (datalen < sizeof (lx_audit_rule_t)) + return (EINVAL); + datalen -= sizeof (lx_audit_rule_t); + + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + + if (rulep->lxar_buflen > datalen) + return (EINVAL); + + datap = rulep->lxar_buf; + + /* + * First check the rule to determine if we support the flag, actions, + * and all of the fields specified (since currently, our rule support + * is incomplete). + * + * NOTE: We currently only handle syscall exit rules. 
+ */ + if (rulep->lxar_flag != LX_AUDIT_FILTER_EXIT || + rulep->lxar_action != LX_AUDIT_ACT_ALWAYS) + return (ENOTSUP); + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + tdp = datap; + tlen = rulep->lxar_buflen; + key[0] = '\0'; + for (i = 0; i < rulep->lxar_fld_cnt; i++) { + uint32_t ftype, fval, fop; + + fop = rulep->lxar_fld_flag[i]; + ftype = rulep->lxar_fields[i]; + fval = rulep->lxar_values[i]; + DTRACE_PROBE3(lx__audit__field, uint32_t, fop, + uint32_t, ftype, uint32_t, fval); + + if (ftype == LX_RF_AUDIT_ARCH) { + if (fop != LX_OF_AUDIT_EQ) + return (ENOTSUP); + if (!is_32bit || !is_64bit) + return (EINVAL); + if (fval == LX_AUDIT_ARCH64) { + is_32bit = B_FALSE; + } else if (fval == LX_AUDIT_ARCH32) { + is_64bit = B_FALSE; + } else { + return (ENOTSUP); + } + } else if (ftype == LX_RF_AUDIT_LOGINUID) { + if ((fop & LX_OF_AUDIT_ALL) == 0) + return (ENOTSUP); + } else if (ftype == LX_RF_AUDIT_FILTERKEY) { + if (fop != LX_OF_AUDIT_EQ) + return (ENOTSUP); + if (tlen < fval || fval > LX_AUDIT_MAX_KEY_LEN) + return (EINVAL); + if (key[0] != '\0') + return (EINVAL); + /* while we're here, save the parsed key */ + bcopy(tdp, key, fval); + key[fval] = '\0'; + tdp += fval; + tlen -= fval; + } else { + /* + * TODO: expand the support for additional Linux field + * options. + */ + return (ENOTSUP); + } + } + for (i = 0; i < LX_NSYSCALLS; i++) { + if (BT_TEST32(rulep->lxar_mask, i)) { + /* At least one syscall enabled in this mask entry */ + sys_found = B_TRUE; + break; + } + } + if (!sys_found) + return (ENOTSUP); + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + /* + * We have confirmed that we can handle the rule specified. + * Before taking the lock, allocate and setup the internal rule struct. + */ + rp = kmem_alloc(sizeof (lx_audit_rule_ent_t), KM_SLEEP); + bcopy(rulep, &rp->lxare_rule, sizeof (lx_audit_rule_t)); + rp->lxare_buf = kmem_alloc(rulep->lxar_buflen, KM_SLEEP); + bcopy(datap, rp->lxare_buf, rulep->lxar_buflen); + rp->lxare_is32bit = is_32bit; + rp->lxare_is64bit = is_64bit; + if (key[0] == '\0') { + rp->lxare_key = NULL; + } else { + int slen = strlen(key); + rp->lxare_key = kmem_alloc(slen + 1, KM_SLEEP); + (void) strlcpy(rp->lxare_key, key, slen + 1); + } + + mutex_enter(&asp->lxast_lock); + /* Save the rule on our top-level list. */ + list_insert_tail(&asp->lxast_rules, rp); + /* Enable tracing on the relevant syscalls. 
*/ + lx_enable_syscall_rule(asp, rulep, rp); + mutex_exit(&asp->lxast_lock); + + return (0); +} + +int +lx_audit_delete_rule(void *r, uint_t datalen) +{ + lx_audit_rule_t *rulep = (lx_audit_rule_t *)r; + char *datap; + uint_t sysnum; + lx_audit_state_t *asp; + lx_audit_rule_ent_t *erp; + + if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED) + return (EPERM); + + if (datalen < sizeof (lx_audit_rule_t)) + return (EINVAL); + datalen -= sizeof (lx_audit_rule_t); + + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + + if (rulep->lxar_buflen > datalen) + return (EINVAL); + + datap = rulep->lxar_buf; + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + /* Find the matching rule from the rule list */ + for (erp = list_head(&asp->lxast_rules); + erp != NULL; + erp = list_next(&asp->lxast_rules, erp)) { + lx_audit_rule_t *r; + uint_t i; + boolean_t mtch; + + r = &erp->lxare_rule; + if (rulep->lxar_flag != r->lxar_flag) + continue; + if (rulep->lxar_action != r->lxar_action) + continue; + if (rulep->lxar_fld_cnt != r->lxar_fld_cnt) + continue; + for (i = 0, mtch = B_TRUE; i < LX_AUDIT_BITMASK_SIZE; i++) { + if (rulep->lxar_mask[i] != r->lxar_mask[i]) { + mtch = B_FALSE; + break; + } + } + if (!mtch) + continue; + + for (i = 0, mtch = B_TRUE; i < rulep->lxar_fld_cnt; i++) { + if (rulep->lxar_fields[i] != r->lxar_fields[i] || + rulep->lxar_values[i] != r->lxar_values[i] || + rulep->lxar_fld_flag[i] != r->lxar_fld_flag[i]) { + mtch = B_FALSE; + break; + } + } + if (!mtch) + continue; + if (rulep->lxar_buflen != r->lxar_buflen) + continue; + if (bcmp(datap, erp->lxare_buf, r->lxar_buflen) == 0) + break; + } + + /* There is no matching rule */ + if (erp == NULL) { + mutex_exit(&asp->lxast_lock); + return (ENOENT); + } + + /* + * Disable each relevant syscall enabling. + */ + for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) { + if (BT_TEST32(rulep->lxar_mask, sysnum)) { + /* + * If this was the first rule on the list for the + * given syscall (likely, since usually only one rule + * per syscall) then either disable tracing for that + * syscall, or point to the next applicable rule in the + * list. + */ + if (erp->lxare_is32bit) { + if (asp->lxast_sys32_rulep[sysnum] == erp) { + asp->lxast_sys32_rulep[sysnum] = + lx_audit_next_applicable_rule( + sysnum, LX_AUDIT_ARCH32, asp, erp); + } + } + if (erp->lxare_is64bit) { + if (asp->lxast_sys64_rulep[sysnum] == erp) { + asp->lxast_sys64_rulep[sysnum] = + lx_audit_next_applicable_rule( + sysnum, LX_AUDIT_ARCH64, asp, erp); + } + } + } + } + + /* Remove the rule from the top-level list */ + list_remove(&asp->lxast_rules, erp); + + kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen); + if (erp->lxare_key != NULL) + kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1); + kmem_free(erp, sizeof (lx_audit_rule_ent_t)); + + mutex_exit(&asp->lxast_lock); + return (0); +} + +void +lx_audit_emit_user_msg(uint_t mtype, uint_t len, char *datap) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + lx_audit_record_t *rp; + timestruc_t ts; + uint_t sessid; + proc_t *p = curproc; + lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread)); + uint_t prelen, alen; + char msg[LX_AUDIT_MESSAGE_TEXT_MAX]; + + /* + * For user messages, auditing may not actually be initialized. If not, + * just return. 
+ */ + if (lxzd->lxzd_audit_enabled == LXAE_DISABLED || + lxzd->lxzd_audit_state == NULL) + return; + + if (len >= sizeof (msg)) + len = sizeof (msg) - 1; + + mutex_enter(&p->p_splock); + sessid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + if (asp->lxast_backlog >= asp->lxast_backlog_limit) { + lx_audit_fail(asp, "audit: backlog limit exceeded"); + mutex_exit(&asp->lxast_lock); + return; + } + + rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP); + if (rp == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + mutex_exit(&asp->lxast_lock); + return; + } + rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP); + if (rp->lxar_msg == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + mutex_exit(&asp->lxast_lock); + kmem_free(rp, sizeof (lx_audit_record_t)); + return; + } + rp->lxar_type = mtype; + bcopy(datap, msg, len); + msg[len] = '\0'; + + gethrestime(&ts); + ts.tv_sec -= curzone->zone_boot_time; + + (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX, + "audit(%lu.%03lu:%lu): pid=%u uid=%u auid=%u ses=%u msg=\'", + (uint64_t)ts.tv_sec, /* zone's timestamp */ + (uint64_t)ts.tv_nsec / 1000000, + asp->lxast_seq++, /* serial number */ + (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid), + crgetruid(CRED()), /* uid */ + lx_audit_get_auid(), /* auid */ + sessid); /* ses */ + + prelen = strlen(rp->lxar_msg); + alen = LX_AUDIT_MESSAGE_TEXT_MAX - prelen - 2; + (void) strlcat(rp->lxar_msg + prelen, msg, alen); + (void) strlcat(rp->lxar_msg, "\'", LX_AUDIT_MESSAGE_TEXT_MAX); + + list_insert_tail(&asp->lxast_ev_queue, rp); + if (asp->lxast_backlog == 0) + cv_signal(&asp->lxast_worker_cv); + asp->lxast_backlog++; + mutex_exit(&asp->lxast_lock); +} + +void +lx_audit_list_rules(void *reply, + void (*cb)(void *, void *, uint_t, void *, uint_t)) +{ + lx_audit_state_t *asp; + lx_audit_rule_ent_t *rp; + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + /* + * Output the rule list + */ + mutex_enter(&asp->lxast_lock); + for (rp = list_head(&asp->lxast_rules); rp != NULL; + rp = list_next(&asp->lxast_rules, rp)) { + cb(reply, &rp->lxare_rule, sizeof (lx_audit_rule_t), + rp->lxare_buf, rp->lxare_rule.lxar_buflen); + } + mutex_exit(&asp->lxast_lock); +} + +void +lx_audit_get_feature(void *reply, void (*cb)(void *, void *, uint_t)) +{ + lx_audit_features_t af; + + af.lxaf_version = LX_AUDIT_FEATURE_VERSION; + af.lxaf_mask = 0xffffffff; + af.lxaf_features = 0; + af.lxaf_lock = 0; + + cb(reply, &af, sizeof (af)); +} + +void +lx_audit_get(void *reply, void (*cb)(void *, void *, uint_t)) +{ + lx_audit_status_t status; + lx_zone_data_t *lxzd; + lx_audit_state_t *asp; + + lxzd = ztolxzd(curproc->p_zone); + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + bzero(&status, sizeof (status)); + + mutex_enter(&asp->lxast_lock); + status.lxas_enabled = lxzd->lxzd_audit_enabled; + status.lxas_failure = asp->lxast_failure; + status.lxas_pid = asp->lxast_pid; + status.lxas_rate_limit = asp->lxast_rate_limit; + status.lxas_backlog_limit = asp->lxast_backlog_limit; + status.lxas_lost = asp->lxast_lost; + status.lxas_backlog = asp->lxast_backlog; + status.lxas_backlog_wait_time = asp->lxast_backlog_wait_time; + status.lxas_feature_bitmap = LX_AUDIT_FEATURE_ALL; + mutex_exit(&asp->lxast_lock); + + cb(reply, &status, sizeof (status)); +} + +int +lx_audit_set(void *lxsock, void *s, uint_t datalen, + void (*cb)(void *, boolean_t)) +{ + lx_audit_status_t 
*statusp = (lx_audit_status_t *)s; + lx_zone_data_t *lxzd; + lx_audit_state_t *asp; + + /* + * Unfortunately, some user-level code does not send down a full + * lx_audit_status_t structure in the message (e.g. this occurs on + * CentOS7). Only the structure up to, but not including, the embedded + * union is being sent in. This appears to be a result of the user-level + * code being built for older versions of the kernel. To handle this, + * we have to subtract the last 8 bytes from the size in order to + * accomodate this code. We'll revalidate with the full size if + * LX_AUDIT_STATUS_BACKLOG_WAIT_TIME were to be set in the mask. + */ + if (datalen < sizeof (lx_audit_status_t) - 8) + return (EINVAL); + + lxzd = ztolxzd(curproc->p_zone); + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + /* Once the config is locked, we only allow changing the auditd pid */ + mutex_enter(&asp->lxast_lock); + if (lxzd->lxzd_audit_enabled == LXAE_LOCKED && + (statusp->lxas_mask & ~LX_AUDIT_STATUS_PID)) { + mutex_exit(&asp->lxast_lock); + return (EPERM); + } + + if (statusp->lxas_mask & LX_AUDIT_STATUS_FAILURE) { + switch (statusp->lxas_failure) { + case LXAE_SILENT: + case LXAE_PRINT: + case LXAE_PANIC: + asp->lxast_failure = statusp->lxas_failure; + break; + default: + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_PID) { + /* + * The process that sets the pid is the daemon, so this is the + * socket we'll write audit records out to. + */ + lx_audit_set_worker(statusp->lxas_pid, lxsock, cb); + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_RATE_LIMIT) { + asp->lxast_rate_limit = statusp->lxas_rate_limit; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_LIMIT) { + asp->lxast_backlog_limit = statusp->lxas_backlog_limit; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_WAIT_TIME) { + /* + * See the comment above. We have to revalidate the full struct + * size since we previously only validated for a shorter struct. + */ + if (datalen < sizeof (lx_audit_status_t)) { + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + asp->lxast_backlog_wait_time = statusp->lxas_backlog_wait_time; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_LOST) { + asp->lxast_lost = statusp->lxas_lost; + } + + if (statusp->lxas_mask & LX_AUDIT_STATUS_ENABLED) { + switch (statusp->lxas_enabled) { + case 0: + lxzd->lxzd_audit_enabled = LXAE_DISABLED; + break; + case 1: + lxzd->lxzd_audit_enabled = LXAE_ENABLED; + break; + case 2: + lxzd->lxzd_audit_enabled = LXAE_LOCKED; + break; + default: + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + } + mutex_exit(&asp->lxast_lock); + + return (0); +} + +void +lx_audit_stop_worker(void *s, void (*cb)(void *, boolean_t)) +{ + lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state; + kt_did_t tid = 0; + + ASSERT(asp != NULL); + mutex_enter(&asp->lxast_lock); + if (s == NULL) { + s = asp->lxast_sock; + } else { + VERIFY(s == asp->lxast_sock); + } + asp->lxast_sock = NULL; + asp->lxast_pid = 0; + if (asp->lxast_worker != NULL) { + tid = asp->lxast_worker->t_did; + asp->lxast_worker = NULL; + asp->lxast_exit = B_TRUE; + cv_signal(&asp->lxast_worker_cv); + } + if (s != NULL) + cb(s, B_FALSE); + mutex_exit(&asp->lxast_lock); + + if (tid != 0) + thread_join(tid); +} + +/* + * Called when audit netlink message received, in order to perform lazy + * allocation of audit state for the zone. We also perform the one-time step to + * cache the netlink callback used by the audit worker thread to send messages + * up to the auditd. 
+ */ +void +lx_audit_init(int (*cb)(void *, uint_t, const char *, uint_t)) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + + mutex_enter(&lxzd->lxzd_lock); + + if (lxzd->lxzd_audit_state != NULL) { + mutex_exit(&lxzd->lxzd_lock); + return; + } + + asp = kmem_zalloc(sizeof (lx_audit_state_t), KM_SLEEP); + + mutex_init(&asp->lxast_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&asp->lxast_worker_cv, NULL, CV_DEFAULT, NULL); + list_create(&asp->lxast_ev_queue, sizeof (lx_audit_record_t), + offsetof(lx_audit_record_t, lxar_link)); + list_create(&asp->lxast_rules, sizeof (lx_audit_rule_ent_t), + offsetof(lx_audit_rule_ent_t, lxare_link)); + asp->lxast_failure = LXAE_PRINT; + asp->lxast_backlog_limit = LX_AUDIT_DEF_BACKLOG_LIMIT; + asp->lxast_backlog_wait_time = LX_AUDIT_DEF_WAIT_TIME; + + lxzd->lxzd_audit_state = asp; + + mutex_exit(&lxzd->lxzd_lock); + + mutex_enter(&lx_audit_em_lock); + if (lx_audit_emit_msg == NULL) + lx_audit_emit_msg = cb; + mutex_exit(&lx_audit_em_lock); +} + +/* + * Called when netlink module is unloading so that we can clear the cached + * netlink callback used by the audit worker thread to send messages up to the + * auditd. + */ +void +lx_audit_cleanup(void) +{ + mutex_enter(&lx_audit_em_lock); + lx_audit_emit_msg = NULL; + mutex_exit(&lx_audit_em_lock); +} + +/* + * Called when the zone is being destroyed, not when auditing is being disabled. + * Note that zsched has already exited and any lxast_worker thread has exited. + */ +void +lx_audit_fini(zone_t *zone) +{ + lx_zone_data_t *lxzd = ztolxzd(zone); + lx_audit_state_t *asp; + lx_audit_record_t *rp; + lx_audit_rule_ent_t *erp; + + ASSERT(MUTEX_HELD(&lxzd->lxzd_lock)); + + if ((asp = lxzd->lxzd_audit_state) == NULL) + return; + + mutex_enter(&asp->lxast_lock); + + VERIFY(asp->lxast_worker == NULL); + + rp = list_remove_head(&asp->lxast_ev_queue); + while (rp != NULL) { + kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX); + kmem_free(rp, sizeof (lx_audit_record_t)); + rp = list_remove_head(&asp->lxast_ev_queue); + } + + list_destroy(&asp->lxast_ev_queue); + asp->lxast_backlog = 0; + asp->lxast_pid = 0; + + erp = list_remove_head(&asp->lxast_rules); + while (erp != NULL) { + kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen); + if (erp->lxare_key != NULL) + kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1); + kmem_free(erp, sizeof (lx_audit_rule_ent_t)); + erp = list_remove_head(&asp->lxast_rules); + } + list_destroy(&asp->lxast_rules); + + mutex_exit(&asp->lxast_lock); + + cv_destroy(&asp->lxast_worker_cv); + mutex_destroy(&asp->lxast_lock); + lxzd->lxzd_audit_state = NULL; + kmem_free(asp, sizeof (lx_audit_state_t)); +} + +/* + * Audit initialization/cleanup when lx brand module is loaded and + * unloaded. + */ +void +lx_audit_ld() +{ + mutex_init(&lx_audit_em_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +lx_audit_unld() +{ + mutex_destroy(&lx_audit_em_lock); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c new file mode 100644 index 0000000000..0f78bca605 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -0,0 +1,2701 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * The LX Brand: emulation of a Linux operating environment within a zone. + * + * OVERVIEW + * + * The LX brand enables a full Linux userland -- including a C library, + * init(1) framework, and some set of applications -- to run unmodified + * within an illumos zone. Unlike illumos, where applications are expected + * to link against and consume functions exported from libraries, the + * supported Linux binary compatibility boundary is the system call + * interface. By accurately emulating the behaviour of Linux system calls, + * Linux software can be executed in this environment as if it were running + * on a native Linux system. + * + * EMULATING LINUX SYSTEM CALLS + * + * Linux system calls are made in 32-bit processes via the "int 0x80" + * instruction; in 64-bit processes the "syscall" instruction is used, as it + * is with native illumos processes. In both cases, arguments to system + * calls are generally passed in registers and the usermode stack is not + * interpreted or modified by the Linux kernel. + * + * When the emulated Linux process makes a system call, it traps into the + * illumos kernel. The in-kernel brand module contains various emulation + * routines, and can fully service some emulated system calls; e.g. read(2) + * and write(2). Other system calls require assistance from the illumos + * libc, bouncing back out to the brand library ("lx_brand.so.1") for + * emulation. + * + * The brand mechanism allows for the provision of an alternative trap + * handler for the various system call mechanisms. Traditionally this was + * used to immediately revector execution to the usermode emulation library, + * which was responsible for handling all system calls. In the interests of + * more accurate emulation and increased performance, much of the regular + * illumos system call path is now invoked. Only the argument processing and + * handler dispatch are replaced by the brand, via the per-LWP + * "lwp_brand_syscall" interposition function pointer. + * + * THE NATIVE AND BRAND STACKS + * + * Some runtime environments (e.g. the Go language) allocate very small + * thread stacks, preferring to grow or split the stack as necessary. The + * Linux kernel generally does not use the usermode stack when servicing + * system calls, so this is not a problem. In order for our emulation to + * have the same zero stack impact, we must execute usermode emulation + * routines on an _alternate_ stack. This is similar, in principle, to the + * use of sigaltstack(3C) to run signal handlers off the main thread stack. + * + * To this end, the brand library allocates and installs an alternate stack + * (called the "native" stack) for each LWP. The in-kernel brand code uses + * this stack for usermode emulation calls and interposed signal delivery, + * while the emulated Linux process sees only the data on the main thread + * stack, known as the "brand" stack. 
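For readers less familiar with the convention described under EMULATING LINUX SYSTEM CALLS above, a minimal user-space sketch (x86-64 Linux ABI, not brand code) shows what a raw Linux system call looks like: the call number goes in %rax, arguments go in registers, and the result comes back in %rax, with the user stack untouched.

static long
raw_write(int fd, const void *buf, unsigned long len)
{
	long ret;

	__asm__ __volatile__ ("syscall"
	    : "=a" (ret)
	    : "0" (1L),			/* __NR_write on x86-64 */
	      "D" ((long)fd), "S" (buf), "d" (len)
	    : "rcx", "r11", "memory");
	return (ret);
}

int
main(void)
{
	(void) raw_write(1, "hello from syscall\n", 19);
	return (0);
}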
The stack mode is tracked in the + * per-LWP brand-private data, using the LX_STACK_MODE_* enum. + * + * The stack mode doubles as a system call "mode bit". When in the + * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux + * system calls. In other modes, system calls are assumed to be native + * illumos system calls as made during brand library initialisation and + * usermode emulation. + * + * USERMODE EMULATION + * + * When a Linux system call cannot be emulated within the kernel, we preserve + * the register state of the Linux process and revector the LWP to the brand + * library usermode emulation handler: the "lx_emulate()" function in + * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, + * and is performed in "lx_emulate_user()". + * + * First, the emulated process state is written out to the usermode stack of + * the process as a "ucontext_t" object. Arguments to the emulation routine + * are passed on the stack or in registers, depending on the ABI. When the + * usermode emulation is complete, the result is passed back to the kernel + * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context + * for restoration. + * + * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT + * + * When servicing emulated system calls in the usermode brand library, or + * during signal delivery, various state is preserved by the kernel so that + * the running LWP may be revectored to a handling routine. The context + * allows the kernel to restart the program at the point of interruption, + * either at the return of the signal handler, via setcontext(3C); or after + * the usermode emulation request has been serviced, via B_EMULATION_DONE. + * + * In illumos native processes, the saved context (a "ucontext_t" object) + * includes the state of registers and the current signal mask at the point + * of interruption. The context also includes a link to the most recently + * saved context, forming a chain to be unwound as requests complete. The LX + * brand requires additional book-keeping to describe the machine state: in + * particular, the current stack mode and the occupied extent of the native + * stack. + * + * The brand code is able to interpose on the context save and restore + * operations in the kernel -- see "lx_savecontext()" and + * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to + * function correctly in the face of a dual stack LWP. The brand also + * interposes on the signal delivery mechanism -- see "lx_sendsig()" and + * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand + * library interposer on the native stack, regardless of the interrupted + * execution mode. Linux sigaltstack(2) emulation is performed entirely by + * the usermode brand library during signal handler interposition. 
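The sigaltstack(3C) analogy drawn above can be made concrete with a short, standalone user-space sketch (ordinary signal handling, not brand code): an alternate stack is registered once per thread, and handlers installed with SA_ONSTACK then run on it instead of the main stack.

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void
on_usr1(int sig)
{
	static const char msg[] = "handler ran on the alternate stack\n";

	(void) write(STDOUT_FILENO, msg, sizeof (msg) - 1);
}

int
main(void)
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;
	ss.ss_flags = 0;
	if (ss.ss_sp == NULL || sigaltstack(&ss, NULL) != 0)
		return (1);

	(void) memset(&sa, 0, sizeof (sa));
	sa.sa_handler = on_usr1;
	sa.sa_flags = SA_ONSTACK;	/* deliver on the alternate stack */
	(void) sigemptyset(&sa.sa_mask);
	(void) sigaction(SIGUSR1, &sa, NULL);

	(void) raise(SIGUSR1);
	return (0);
}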
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/exec.h> +#include <sys/lx_impl.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <sys/lx_futex.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_userhz.h> +#include <sys/param.h> +#include <sys/termios.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/auxv.h> +#include <sys/priv.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/archsystm.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sdt.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <sys/core.h> +#include <sys/stack.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <lx_signum.h> +#include <util/sscanf.h> +#include <sys/lx_brand.h> +#include <sys/zfs_ioctl.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> + +int lx_debug = 0; +uint_t lx_hz_scale = 0; + +void lx_init_brand_data(zone_t *, kmutex_t *); +void lx_free_brand_data(zone_t *); +void lx_setbrand(proc_t *); +int lx_getattr(zone_t *, int, void *, size_t *); +int lx_setattr(zone_t *, int, void *, size_t); +int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, uintptr_t); +void lx_set_kern_version(zone_t *, char *); +void lx_copy_procdata(proc_t *, proc_t *); + +extern int getsetcontext(int, void *); +extern int waitsys(idtype_t, id_t, siginfo_t *, int); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +extern int waitsys32(idtype_t, id_t, siginfo_t *, int); +#endif + +extern int zvol_name2minor(const char *, minor_t *); +extern int zvol_create_minor(const char *); + +extern void lx_proc_exit(proc_t *); +extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); + +extern void lx_io_clear(lx_proc_data_t *); +extern void lx_io_cleanup(proc_t *); + +extern void lx_ioctl_init(); +extern void lx_ioctl_fini(); +extern void lx_socket_init(); +extern void lx_socket_fini(); + +extern int lx_start_nfs_lockd(); +extern void lx_upcall_statd(); + +lx_systrace_f *lx_systrace_entry_ptr; +lx_systrace_f *lx_systrace_return_ptr; + +static int lx_systrace_enabled; + +/* + * cgroup file system maintenance functions which are set when cgroups loads. + */ +void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +/* + * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly + * want an MMU dependency here (and should there be a microprocessor without + * a hole, we don't want to start allocating from the top of the VA range). 
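For concreteness, assuming the usual 48-bit x86-64 canonical layout in which the VA hole begins at 0x0000800000000000: 0x800000000000 - 0x7ffffff00000 = 0x100000, so the limit defined just below leaves about 1 MiB of headroom beneath the start of the hole.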
+ */ +#define LX_MAXSTACK64 0x7ffffff00000 + +uint64_t lx_maxstack64 = LX_MAXSTACK64; + +static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, size_t *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action); + +static boolean_t lx_native_exec(uint8_t, const char **); +static uint32_t lx_map32limit(proc_t *); + +static void lx_savecontext(ucontext_t *); +static void lx_restorecontext(ucontext_t *); +static caddr_t lx_sendsig_stack(int); +static void lx_sendsig(int); +#if defined(_SYSCALL32_IMPL) +static void lx_savecontext32(ucontext32_t *); +#endif +static int lx_setid_clear(vattr_t *, cred_t *); +#if defined(_LP64) +static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); +#endif +static void lx_clearbrand(proc_t *, boolean_t); + +typedef struct lx_zfs_ds { + list_node_t ds_link; + char ds_name[MAXPATHLEN]; + uint64_t ds_cookie; +} lx_zfs_ds_t; + +/* lx brand */ +struct brand_ops lx_brops = { + lx_init_brand_data, /* b_init_brand_data */ + lx_free_brand_data, /* b_free_brand_data */ + lx_brandsys, /* b_brandsys */ + lx_setbrand, /* b_setbrand */ + lx_getattr, /* b_getattr */ + lx_setattr, /* b_setattr */ + lx_copy_procdata, /* b_copy_procdata */ + lx_proc_exit, /* b_proc_exit */ + lx_exec, /* b_exec */ + lx_setrval, /* b_lwp_setrval */ + lx_lwpdata_alloc, /* b_lwpdata_alloc */ + lx_lwpdata_free, /* b_lwpdata_free */ + lx_initlwp, /* b_initlwp */ + lx_initlwp_post, /* b_initlwp_post */ + lx_forklwp, /* b_forklwp */ + lx_freelwp, /* b_freelwp */ + lx_exitlwp, /* b_lwpexit */ + lx_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + lx_sigfd_translate, /* b_sigfd_translate */ + NSIG, /* b_nsig */ + lx_exit_with_sig, /* b_exit_with_sig */ + lx_wait_filter, /* b_wait_filter */ + lx_native_exec, /* b_native_exec */ + lx_map32limit, /* b_map32limit */ + lx_stop_notify, /* b_stop_notify */ + lx_waitid_helper, /* b_waitid_helper */ + lx_sigcld_repost, /* b_sigcld_repost */ + lx_ptrace_issig_stop, /* b_issig_stop */ + lx_ptrace_sig_ignorable, /* b_sig_ignorable */ + lx_savecontext, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + lx_savecontext32, /* b_savecontext32 */ +#endif + lx_restorecontext, /* b_restorecontext */ + lx_sendsig_stack, /* b_sendsig_stack */ + lx_sendsig, /* b_sendsig */ + lx_setid_clear, /* b_setid_clear */ +#if defined(_LP64) + lx_pagefault, /* b_pagefault */ +#else + NULL, +#endif + B_FALSE, /* b_intp_parse_arg */ + lx_clearbrand, /* b_clearbrand */ + lx_upcall_statd, /* b_rpc_statd */ + lx_acct_out /* b_acct_out */ +}; + +struct brand_mach_ops lx_mops = { + NULL, + NULL, + NULL, + NULL, + NULL, + lx_fixsegreg, + lx_fsbase +}; + +struct brand lx_brand = { + BRAND_VER_1, + "lx", + &lx_brops, + &lx_mops, + sizeof (struct lx_proc_data) +}; + +static struct modlbrand modlbrand = { + &mod_brandops, "lx brand", &lx_brand +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +lx_proc_exit(proc_t *p) +{ + lx_proc_data_t *lxpd; + proc_t *cp; + + lx_clone_grp_exit(p, B_FALSE); + /* Cleanup any outstanding aio contexts */ + lx_io_cleanup(p); + + mutex_enter(&p->p_lock); + VERIFY((lxpd = ptolxproc(p)) != NULL); + VERIFY(lxpd->l_ptrace == 0); + if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) { + mutex_exit(&p->p_lock); + return; + } + mutex_exit(&p->p_lock); + + /* Check for children which desire notification of parental death. 
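This notification path corresponds to the Linux parent-death signal configured with prctl(PR_SET_PDEATHSIG). A minimal Linux-side sketch of the behaviour being emulated:

#include <sys/types.h>
#include <sys/prctl.h>
#include <signal.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: ask to be sent SIGTERM when the parent exits. */
		(void) prctl(PR_SET_PDEATHSIG, SIGTERM);
		(void) pause();	/* terminated by SIGTERM once the parent is gone */
		_exit(0);
	}

	/* Parent exits immediately; the child then receives SIGTERM. */
	return (0);
}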
*/ + mutex_enter(&pidlock); + for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) { + mutex_enter(&cp->p_lock); + if ((lxpd = ptolxproc(cp)) == NULL) { + mutex_exit(&cp->p_lock); + continue; + } + if (lxpd->l_parent_deathsig != 0) { + sigtoproc(cp, NULL, lxpd->l_parent_deathsig); + } + mutex_exit(&cp->p_lock); + } + mutex_exit(&pidlock); +} + +void +lx_setbrand(proc_t *p) +{ + /* Send SIGCHLD to parent by default when child exits */ + ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; + + lx_read_argv_bounds(p); +} + +/* ARGSUSED */ +int +lx_setattr(zone_t *zone, int attr, void *ubuf, size_t ubufsz) +{ + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + + switch (attr) { + case LX_ATTR_KERN_RELEASE: { + char buf[LX_KERN_RELEASE_MAX]; + bzero(buf, LX_KERN_RELEASE_MAX); + if (ubufsz >= LX_KERN_RELEASE_MAX) { + return (ERANGE); + } + if (copyin(ubuf, buf, ubufsz) != 0) { + return (EFAULT); + } + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(lxzd->lxzd_kernel_release, buf, + LX_KERN_RELEASE_MAX); + mutex_exit(&lxzd->lxzd_lock); + return (0); + } + case LX_ATTR_KERN_VERSION: { + char buf[LX_KERN_VERSION_MAX]; + bzero(buf, LX_KERN_VERSION_MAX); + if (ubufsz >= LX_KERN_VERSION_MAX) { + return (ERANGE); + } + if (copyin(ubuf, buf, ubufsz) != 0) { + return (EFAULT); + } + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(lxzd->lxzd_kernel_version, buf, + LX_KERN_VERSION_MAX); + mutex_exit(&lxzd->lxzd_lock); + return (0); + } + case LX_ATTR_TTY_GID: { + gid_t gid; + if (ubufsz != sizeof (gid)) { + return (ERANGE); + } + if (copyin(ubuf, &gid, ubufsz) != 0) { + return (EFAULT); + } + mutex_enter(&lxzd->lxzd_lock); + lxzd->lxzd_ttygrp = gid; + mutex_exit(&lxzd->lxzd_lock); + return (0); + } + default: + return (EINVAL); + } +} + +/* ARGSUSED */ +int +lx_getattr(zone_t *zone, int attr, void *ubuf, size_t *ubufsz) +{ + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + int len; + + switch (attr) { + case LX_ATTR_KERN_RELEASE: { + char buf[LX_KERN_RELEASE_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + len = strnlen(lxzd->lxzd_kernel_release, LX_KERN_RELEASE_MAX); + len++; + if (*ubufsz < len) { + mutex_exit(&lxzd->lxzd_lock); + return (ERANGE); + } + bzero(buf, sizeof (buf)); + (void) strncpy(buf, lxzd->lxzd_kernel_release, sizeof (buf)); + mutex_exit(&lxzd->lxzd_lock); + if (copyout(buf, ubuf, len) != 0) { + return (EFAULT); + } + *ubufsz = len; + return (0); + } + case LX_ATTR_KERN_VERSION: { + char buf[LX_KERN_VERSION_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + len = strnlen(lxzd->lxzd_kernel_version, LX_KERN_VERSION_MAX); + len++; + if (*ubufsz < len) { + mutex_exit(&lxzd->lxzd_lock); + return (ERANGE); + } + bzero(buf, sizeof (buf)); + (void) strncpy(buf, lxzd->lxzd_kernel_version, sizeof (buf)); + mutex_exit(&lxzd->lxzd_lock); + if (copyout(buf, ubuf, len) != 0) { + return (EFAULT); + } + *ubufsz = len; + return (0); + } + default: + return (EINVAL); + } +} + +uint32_t +lx_map32limit(proc_t *p) +{ + /* + * To be bug-for-bug compatible with Linux, we have MAP_32BIT only + * allow mappings in the first 31 bits. This was a nuance in the + * original Linux implementation circa 2002, and applications have + * come to depend on its behavior. + * + * This is only relevant for 64-bit processes. 
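From a Linux program's point of view, the limit computed below is what constrains MAP_32BIT mappings. A minimal sketch (Linux/glibc, 64-bit process):

#define	_GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* The mapping is expected to land below 2GB, i.e. (1 << 31). */
	(void) printf("%p below 2GB: %s\n", p,
	    (uintptr_t)p < ((uintptr_t)1 << 31) ? "yes" : "no");
	return (0);
}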
+ */ + if (p->p_model == DATAMODEL_LP64) + return ((uint32_t)1 << 31); + + return ((uint32_t)USERLIMIT32); +} + +void +lx_brand_systrace_enable(void) +{ + VERIFY(!lx_systrace_enabled); + + lx_systrace_enabled = 1; +} + +void +lx_brand_systrace_disable(void) +{ + VERIFY(lx_systrace_enabled); + + lx_systrace_enabled = 0; +} + +void +lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) +{ + VERIFY(lwpd->br_ntv_stack != 0); + + /* + * The "brand-lx-set-ntv-stack-current" probe has arguments: + * arg0: stack pointer before change + * arg1: stack pointer after change + * arg2: current stack base + */ + DTRACE_PROBE3(brand__lx__set__ntv__stack__current, + uintptr_t, lwpd->br_ntv_stack_current, + uintptr_t, new_sp, + uintptr_t, lwpd->br_ntv_stack); + + lwpd->br_ntv_stack_current = new_sp; +} + +#if defined(_LP64) +static int +lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, + enum seg_rw rw) +{ + int syscall_num; + + /* + * We only want to handle a very specific set of circumstances. + * Namely: this is a 64-bit LX-branded process attempting to execute an + * address in a page for which it does not have a valid mapping. If + * this is not the case, we bail out as fast as possible. + */ + VERIFY(PROC_IS_BRANDED(p)); + if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != + DATAMODEL_NATIVE) { + return (-1); + } + + if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { + return (-1); + } + + /* + * This is a valid vsyscall address. We service the system call and + * return 0 to signal that the pagefault has been handled completely. + */ + lx_vsyscall_enter(p, lwp, syscall_num); + return (0); +} +#endif + +static void +lx_clearbrand(proc_t *p, boolean_t lwps_ok) +{ + lx_clone_grp_exit(p, lwps_ok); +} + +/* + * This hook runs prior to sendsig() processing and allows us to nominate + * an alternative stack pointer for delivery of the signal handling frame. + * Critically, this routine should _not_ modify any LWP state as the + * savecontext() does not run until after this hook. + */ +/* ARGSUSED */ +static caddr_t +lx_sendsig_stack(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We want to take signal delivery on the native stack, but only if + * one has been allocated and installed for this LWP. + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The program is not running on the native stack. Return + * the native stack pointer from our brand-private data so + * that we may switch to it for signal handling. + */ + return ((caddr_t)lwpd->br_ntv_stack_current); + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * Either the program is already running on the native stack, + * or one has not yet been allocated for this LWP. Use the + * current stack pointer value. + */ + return ((caddr_t)rp->r_sp); + } +} + +/* + * This hook runs after sendsig() processing and allows us to update the + * per-LWP mode flags for system calls and stacks. The pre-signal + * context has already been saved and delivered to the user at this point. + */ +/* ARGSUSED */ +static void +lx_sendsig(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * In lx_sendsig_stack(), we nominated a stack pointer from the + * native stack. 
Update the stack mode, and the current in-use + * extent of the native stack, accordingly: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, rp->r_sp); + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + break; + + default: + /* + * Otherwise, the brand library has not yet installed the + * alternate stack for this LWP. Signals will be handled on + * the regular stack thread. + */ + return; + } +} + +/* + * This hook runs prior to the context restoration, allowing us to take action + * or modify the context before it is loaded. + */ +static void +lx_restorecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; + caddr_t sp = ucp->uc_brand_data[1]; + + if (lwpd->br_stack_mode == LX_STACK_MODE_PREINIT) { + /* + * Since we're here with stack_mode as LX_STACK_MODE_PREINIT, + * that can only mean we took a signal really early in this + * thread's lifetime, before we had a chance to setup a native + * stack and start running the thread's code. Since we're still + * handling everything on the single stack, we can't do any of + * the usual work below. Note: this means we cannot look at + * "flags" since the uc_brand_data may not have been properly + * set, depending on where we were when we took the signal. + */ + return; + } + + /* + * We have a saved native stack pointer value that we must restore + * into the per-LWP data. + */ + if (flags & LX_UC_RESTORE_NATIVE_SP) { + lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); + } + + /* + * We do not wish to restore the value of uc_link in this context, + * so replace it with the value currently in the LWP. + */ + if (flags & LX_UC_IGNORE_LINK) { + ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; + } + + /* + * Set or restore the stack mode. Usually this restores the mode, but + * the lx_runexe code flow also uses this to set the mode from + * LX_STACK_MODE_INIT to LX_UC_STACK_BRAND. + */ + if (flags & LX_UC_STACK_NATIVE) { + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + } else if (flags & LX_UC_STACK_BRAND) { + lwpd->br_stack_mode = LX_STACK_MODE_BRAND; + } + +#if defined(__amd64) + /* + * Override the fs/gsbase in the context with the value provided + * through the Linux arch_prctl(2) system call. + */ + if (flags & LX_UC_STACK_BRAND) { + if (lwpd->br_lx_fsbase != 0) { + ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; + } + if (lwpd->br_lx_gsbase != 0) { + ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; + } + } +#endif +} + +static void +lx_savecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". 
+ */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = + (void *)(uintptr_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = (void *)flags; +} + +#if defined(_SYSCALL32_IMPL) +static void +lx_savecontext32(ucontext32_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + unsigned int flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". + */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = flags; +} +#endif + +static int +lx_zfs_ioctl(ldi_handle_t lh, int cmd, zfs_cmd_t *zc, size_t *dst_alloc_size) +{ + uint64_t cookie; + size_t dstsize; + int rc, unused; + + cookie = zc->zc_cookie; + + dstsize = (dst_alloc_size == NULL ? 0 : 8192); + +again: + if (dst_alloc_size != NULL) { + zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(dstsize, + KM_SLEEP); + zc->zc_nvlist_dst_size = dstsize; + } + + rc = ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred, &unused); + if (rc == ENOMEM && dst_alloc_size != NULL) { + /* + * Our nvlist_dst buffer was too small, retry with a bigger + * buffer. ZFS will tell us the exact needed size. 
+ */ + size_t newsize = zc->zc_nvlist_dst_size; + ASSERT(newsize > dstsize); + + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, dstsize); + dstsize = newsize; + zc->zc_cookie = cookie; + + goto again; + } + + if (dst_alloc_size != NULL) { + *dst_alloc_size = dstsize; + } + + return (rc); +} + +static int +lx_zone_zfs_open(ldi_handle_t *lh, dev_t *zfs_dev) +{ + ldi_ident_t li; + + if (ldi_ident_from_mod(&modlinkage, &li) != 0) { + return (-1); + } + if (ldi_open_by_name("/dev/zfs", FREAD|FWRITE, kcred, lh, li) != 0) { + ldi_ident_release(li); + return (-1); + } + ldi_ident_release(li); + if (ldi_get_dev(*lh, zfs_dev) != 0) { + (void) ldi_close(*lh, FREAD|FWRITE, kcred); + return (-1); + } + return (0); +} + +/* + * We only get the relevant properties for zvols. This is because we're + * essentially iterating all of the ZFS datasets/zvols on the entire system + * when we boot the zone and there is a significant performance penalty if we + * have to retrieve all of the properties for everything. Especially since we + * don't care about any of them except the zvols actually in our delegated + * datasets. + * + * Note that the two properties we care about, volsize & volblocksize, are + * mandatory for zvols and should always be present. Also, note that the + * blocksize property value cannot change after the zvol has been created. + */ +static void +lx_zvol_props(ldi_handle_t lh, zfs_cmd_t *zc, uint64_t *vsz, uint64_t *bsz) +{ + int rc; + size_t size; + nvlist_t *nv = NULL, *nv2; + + rc = lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size); + if (rc != 0) + return; + + rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, &nv, 0); + ASSERT(rc == 0); + + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); + zc->zc_nvlist_dst = NULL; + zc->zc_nvlist_dst_size = 0; + + if ((rc = nvlist_lookup_nvlist(nv, "volsize", &nv2)) == 0) { + uint64_t val; + + rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val); + if (rc == 0) { + *vsz = val; + } + } + + if ((rc = nvlist_lookup_nvlist(nv, "volblocksize", &nv2)) == 0) { + uint64_t val; + + rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val); + if (rc == 0) { + *bsz = val; + } + } + + nvlist_free(nv); +} + +/* + * Unlike ZFS proper, which does dynamic zvols, we currently only generate the + * zone's "disk" list once at zone boot time and use that consistently in all + * of the various subsystems (devfs, sysfs, procfs). This allows us to avoid + * re-iterating the datasets every time one of those subsystems accesses a + * "disk" and allows us to keep the view consistent across all subsystems, but + * it does mean a reboot is required to see new "disks". This is somewhat + * mitigated by its similarity to actual disk drives on a real system. 
+ */ +static void +lx_zone_get_zvols(zone_t *zone, ldi_handle_t lh, minor_t *emul_minor) +{ + lx_zone_data_t *lxzd; + list_t *zvol_lst, ds_lst; + int rc; + unsigned int devnum = 0; + size_t size; + zfs_cmd_t *zc; + nvpair_t *elem = NULL; + nvlist_t *pnv = NULL; + + lxzd = ztolxzd(zone); + ASSERT(lxzd != NULL); + zvol_lst = lxzd->lxzd_vdisks; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + if (lx_zfs_ioctl(lh, ZFS_IOC_POOL_CONFIGS, zc, &size) != 0) { + goto out; + } + ASSERT(zc->zc_cookie > 0); + + rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, &pnv, 0); + kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size); + if (rc != 0) + goto out; + + /* + * We use a dataset list to process all of the datasets in the pool + * without doing recursion so that we don't risk blowing the kernel + * stack. + */ + list_create(&ds_lst, sizeof (lx_zfs_ds_t), + offsetof(lx_zfs_ds_t, ds_link)); + + while ((elem = nvlist_next_nvpair(pnv, elem)) != NULL) { + lx_zfs_ds_t *ds; + + ds = kmem_zalloc(sizeof (lx_zfs_ds_t), KM_SLEEP); + (void) strcpy(ds->ds_name, nvpair_name(elem)); + list_insert_head(&ds_lst, ds); + + while (ds != NULL) { + int w; /* dummy variable */ + + bzero(zc, sizeof (zfs_cmd_t)); + zc->zc_cookie = ds->ds_cookie; + (void) strcpy(zc->zc_name, ds->ds_name); + + rc = lx_zfs_ioctl(lh, ZFS_IOC_DATASET_LIST_NEXT, + zc, NULL); + /* Update the cookie before doing anything else. */ + ds->ds_cookie = zc->zc_cookie; + + if (rc != 0) { + list_remove(&ds_lst, ds); + kmem_free(ds, sizeof (lx_zfs_ds_t)); + ds = list_tail(&ds_lst); + continue; + } + + /* Reserved internal names, skip over these. */ + if (strchr(zc->zc_name, '$') != NULL || + strchr(zc->zc_name, '%') != NULL) + continue; + + if (!zone_dataset_visible_inzone(zone, zc->zc_name, &w)) + continue; + + if (zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) { + lx_virt_disk_t *vd; + minor_t m = 0; + char *znm = zc->zc_name; + + /* Create a virtual disk entry for the zvol */ + vd = kmem_zalloc(sizeof (lx_virt_disk_t), + KM_SLEEP); + vd->lxvd_type = LXVD_ZVOL; + (void) snprintf(vd->lxvd_name, + sizeof (vd->lxvd_name), + "zvol%u", devnum++); + (void) strlcpy(vd->lxvd_real_name, + zc->zc_name, + sizeof (vd->lxvd_real_name)); + + /* Record emulated and real dev_t values */ + vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, + (*emul_minor)++); + if (zvol_name2minor(znm, &m) != 0) { + (void) zvol_create_minor(znm); + VERIFY(zvol_name2minor(znm, &m) == 0); + } + if (m != 0) { + vd->lxvd_real_dev = makedevice( + getmajor(lxzd->lxzd_zfs_dev), m); + } + + /* Query volume size properties */ + lx_zvol_props(lh, zc, &vd->lxvd_volsize, + &vd->lxvd_blksize); + + list_insert_tail(zvol_lst, vd); + } else { + lx_zfs_ds_t *nds; + + /* Create a new ds_t for the child. */ + nds = kmem_zalloc(sizeof (lx_zfs_ds_t), + KM_SLEEP); + (void) strcpy(nds->ds_name, zc->zc_name); + list_insert_after(&ds_lst, ds, nds); + + /* Depth-first, so do the one just created. */ + ds = nds; + } + } + + ASSERT(list_is_empty(&ds_lst)); + } + + list_destroy(&ds_lst); + +out: + nvlist_free(pnv); + kmem_free(zc, sizeof (zfs_cmd_t)); +} + +static void +lx_zone_get_zfsds(zone_t *zone, minor_t *emul_minor) +{ + lx_zone_data_t *lxzd = ztolxzd(zone); + vfs_t *vfsp = zone->zone_rootvp->v_vfsp; + + /* + * Only the root will be mounted at zone init time. + * Finding means of discovering other datasets mounted in the zone + * would be a good enhancement later. 
+ */ + if (getmajor(vfsp->vfs_dev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t *vd; + + vd = kmem_zalloc(sizeof (lx_virt_disk_t), KM_SLEEP); + vd->lxvd_type = LXVD_ZFS_DS; + vd->lxvd_real_dev = vfsp->vfs_dev; + vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, (*emul_minor)++); + (void) snprintf(vd->lxvd_name, sizeof (vd->lxvd_name), + "zfsds%u", 0); + (void) strlcpy(vd->lxvd_real_name, + refstr_value(vfsp->vfs_resource), + sizeof (vd->lxvd_real_name)); + + list_insert_tail(lxzd->lxzd_vdisks, vd); + } +} + +/* Cleanup virtual disk list */ +static void +lx_zone_cleanup_vdisks(lx_zone_data_t *lxzd) +{ + lx_virt_disk_t *vd; + + ASSERT(lxzd->lxzd_vdisks != NULL); + vd = (list_remove_head(lxzd->lxzd_vdisks)); + while (vd != NULL) { + kmem_free(vd, sizeof (lx_virt_disk_t)); + vd = list_remove_head(lxzd->lxzd_vdisks); + } + + list_destroy(lxzd->lxzd_vdisks); + kmem_free(lxzd->lxzd_vdisks, sizeof (list_t)); + lxzd->lxzd_vdisks = NULL; +} + +/* + * By default illumos restricts access to ULP_DEF_EPRIV_PORT1 and + * ULP_DEF_EPRIV_PORT2 for TCP and UDP, even though these ports are outside of + * the privileged port range. Linux does not do this, so we need to remove + * these defaults. + * + * See also: mod_set_extra_privports + */ +static void +lx_fix_ns_eports(netstack_t *ns) +{ + tcp_stack_t *tcps; + udp_stack_t *udps; + in_port_t *ports; + uint_t i, nports; + kmutex_t *lock; + + tcps = ns->netstack_tcp; + ports = tcps->tcps_g_epriv_ports; + nports = tcps->tcps_g_num_epriv_ports; + lock = &tcps->tcps_epriv_port_lock; + mutex_enter(lock); + for (i = 0; i < nports; i++) + ports[i] = 0; + mutex_exit(lock); + + udps = ns->netstack_udp; + ports = udps->us_epriv_ports; + nports = udps->us_num_epriv_ports; + lock = &udps->us_epriv_port_lock; + mutex_enter(lock); + for (i = 0; i < nports; i++) + ports[i] = 0; + mutex_exit(lock); +} + +/* + * The default limit for TCP buffer sizing on illumos is smaller than its + * counterparts on Linux. Adjust it to meet minimum expectations. + */ +static void +lx_fix_ns_buffers(netstack_t *ns) +{ + mod_prop_info_t *pinfo; + ulong_t target, parsed; + char buf[16]; + + /* + * Prior to kernel 3.4, Linux defaulted to a max of 4MB for both the + * tcp_rmem and tcp_wmem tunables. Kernels since then increase the + * tcp_rmem default max to 6MB. Since illumos lacks separate tunables + * to cap sizing for read and write buffers, the higher value is + * selected for compatibility. + */ + if (lx_kern_release_cmp(curzone, "3.4.0") < 0) { + target = 4*1024*1024; + } else { + target = 6*1024*1024; + } + + pinfo = mod_prop_lookup(ns->netstack_tcp->tcps_propinfo_tbl, + "max_buf", MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_getf(ns, pinfo, NULL, buf, sizeof (buf), 0) != 0 || + ddi_strtoul(buf, NULL, 10, &parsed) != 0 || + parsed >= target) { + return; + } + + (void) snprintf(buf, sizeof (buf), "%lu", target); + (void) pinfo->mpi_setf(ns, CRED(), pinfo, NULL, buf, 0); +} + +static void +lx_bootup_hooks() +{ + netstack_t *ns; + + ns = netstack_get_current(); + if (ns == NULL) + return; + + lx_fix_ns_eports(ns); + lx_fix_ns_buffers(ns); + + netstack_rele(ns); +} + +void +lx_init_brand_data(zone_t *zone, kmutex_t *zsl) +{ + lx_zone_data_t *data; + ldi_handle_t lh; + + ASSERT(MUTEX_HELD(zsl)); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(zone->zone_brand_data == NULL); + + data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); + mutex_init(&data->lxzd_lock, NULL, MUTEX_DEFAULT, NULL); + + /* No need to hold mutex now since zone_brand_data is not set yet. 
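For context on the lx_fix_ns_eports() adjustment above: the illumos defaults being cleared are extra privileged ports above 1023 (historically 2049 and 4045), which an unprivileged Linux process expects to be able to bind. A standalone Linux-side sketch of that expectation:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;

	if (fd < 0)
		return (1);

	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(2049);	/* above 1023, so not privileged on Linux */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) != 0)
		perror("bind");
	else
		(void) printf("bound to port 2049 without privilege\n");

	(void) close(fd);
	return (0);
}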
*/ + + /* + * Set the default lxzd_kernel_version to 2.4. + * This can be changed by a call to setattr() during zone boot. + */ + (void) strlcpy(data->lxzd_kernel_release, "2.4.21", + LX_KERN_RELEASE_MAX); + (void) strlcpy(data->lxzd_kernel_version, "BrandZ virtual linux", + LX_KERN_VERSION_MAX); + data->lxzd_pipe_max_sz = lx_pipe_max_default; + + zone->zone_brand_data = data; + + /* + * In Linux, if the init(1) process terminates the system panics. + * The zone must reboot to simulate this behaviour. + */ + zone->zone_reboot_on_init_exit = B_TRUE; + + /* + * We cannot hold the zone_status_lock while performing zfs operations + * so we drop the lock, get the zfs devs as the last step in this + * function, then reaquire the lock. Don't add any code after this + * which requires that the zone_status_lock was continuously held. + */ + mutex_exit(zsl); + + data->lxzd_vdisks = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(data->lxzd_vdisks, sizeof (lx_virt_disk_t), + offsetof(lx_virt_disk_t, lxvd_link)); + + if (lx_zone_zfs_open(&lh, &data->lxzd_zfs_dev) == 0) { + minor_t emul_minor = 1; + + lx_zone_get_zfsds(zone, &emul_minor); + lx_zone_get_zvols(zone, lh, &emul_minor); + (void) ldi_close(lh, FREAD|FWRITE, kcred); + } else { + /* Avoid matching any devices */ + data->lxzd_zfs_dev = makedevice(-1, 0); + } + mutex_enter(zsl); +} + +void +lx_free_brand_data(zone_t *zone) +{ + lx_zone_data_t *data = ztolxzd(zone); + ASSERT(data != NULL); + mutex_enter(&data->lxzd_lock); + lx_audit_fini(zone); + if (data->lxzd_ioctl_sock != NULL) { + /* + * Since zone_kcred has been cleaned up already, close the + * socket using the global kcred. + */ + (void) ksocket_close(data->lxzd_ioctl_sock, kcred); + data->lxzd_ioctl_sock = NULL; + } + ASSERT(data->lxzd_cgroup == NULL); + + lx_zone_cleanup_vdisks(data); + + mutex_exit(&data->lxzd_lock); + zone->zone_brand_data = NULL; + mutex_destroy(&data->lxzd_lock); + kmem_free(data, sizeof (*data)); +} + +void +lx_unsupported(char *dmsg) +{ + lx_proc_data_t *pd = ttolxproc(curthread); + + DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); + + if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { + /* + * If this process was run with strict mode enabled + * (via LX_STRICT in the environment), we mark this + * LWP as having triggered an unsupported behaviour. + * This flag will be checked at an appropriate point + * by lx_check_strict_failure(). + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + lwpd->br_strict_failure = B_TRUE; + } +} + +void +lx_check_strict_failure(lx_lwp_data_t *lwpd) +{ + proc_t *p; + + if (!lwpd->br_strict_failure) { + return; + } + + lwpd->br_strict_failure = B_FALSE; + + /* + * If this process is operating in strict mode (via LX_STRICT in + * the environment), and has triggered a call to + * lx_unsupported(), we drop SIGSYS on it as we return. + */ + p = curproc; + mutex_enter(&p->p_lock); + sigtoproc(p, curthread, SIGSYS); + mutex_exit(&p->p_lock); +} + +void +lx_trace_sysenter(int syscall_num, uintptr_t *args) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_entry_ptr != NULL); + + (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], + args[2], args[3], args[4], args[5]); + } +} + +void +lx_trace_sysreturn(int syscall_num, long ret) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); + } +} + +/* + * Get the addresses of the user-space system call handler and attach it to + * the proc structure. 
Returning 0 indicates success; the value returned + * by the system call is the value stored in rval. Returning a non-zero + * value indicates a failure; the value returned is used to set errno, -1 + * is returned from the syscall and the contents of rval are ignored. To + * set errno and have the syscall return a value other than -1 we can + * manually set errno and rval and return 0. + */ +int +lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lx_proc_data_t *pd; + struct termios *termios; + uint_t termios_len; + int error; + int code; + int sig; + lx_brand_registration_t reg; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * There is one operation that is suppored for non-branded + * process. B_EXEC_BRAND. This is the equilivant of an + * exec call, but the new process that is created will be + * a branded process. + */ + if (cmd == B_EXEC_BRAND) { + VERIFY(p->p_zone != NULL); + VERIFY(p->p_zone->zone_brand == &lx_brand); + return (exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_BRAND)); + } + + /* For all other operations this must be a branded process. */ + if (p->p_brand == NULL) + return (ENOSYS); + + VERIFY(p->p_brand == &lx_brand); + VERIFY(p->p_brand_data != NULL); + + switch (cmd) { + case B_REGISTER: + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("stack mode was not PREINIT during " + "REGISTER\n"); + return (EINVAL); + } + + if (p->p_model == DATAMODEL_NATIVE) { + if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + } +#ifdef _LP64 + else { + /* 32-bit userland on 64-bit kernel */ + lx_brand_registration32_t reg32; + + if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + + reg.lxbr_version = (uint_t)reg32.lxbr_version; + reg.lxbr_handler = + (void *)(uintptr_t)reg32.lxbr_handler; + reg.lxbr_flags = reg32.lxbr_flags; + } +#endif + + if (reg.lxbr_version != LX_VERSION_1) { + lx_print("Invalid brand library version (%u)\n", + reg.lxbr_version); + return (EINVAL); + } + + if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { + lx_print("Invalid brand flags (%u)\n", + reg.lxbr_flags); + return (EINVAL); + } + + lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", + (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); + pd = p->p_brand_data; + pd->l_handler = (uintptr_t)reg.lxbr_handler; + pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; + + /* + * There are certain setup tasks which cannot be performed + * during the lx_init_brand_data hook due to the calling + * context from zoneadmd (in the GZ). This work is instead + * delayed until the init process starts inside the zone. + */ + if (p->p_pid == p->p_zone->zone_proc_initpid) { + lx_bootup_hooks(); + } + + return (0); + + case B_TTYMODES: + /* This is necessary for emulating TCGETS ioctls. 
*/ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, + &termios_len) != DDI_SUCCESS) + return (EIO); + + ASSERT(termios_len == sizeof (*termios)); + + if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { + ddi_prop_free(termios); + return (EFAULT); + } + + ddi_prop_free(termios); + return (0); + + case B_ELFDATA: { + mutex_enter(&p->p_lock); + pd = curproc->p_brand_data; + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_elf_data_t led; + + bcopy(&pd->l_elf_data, &led, sizeof (led)); + mutex_exit(&p->p_lock); + + if (copyout(&led, (void *)arg1, + sizeof (lx_elf_data_t)) != 0) { + return (EFAULT); + } + } +#if defined(_LP64) + else { + /* 32-bit userland on 64-bit kernel */ + lx_elf_data32_t led32; + + led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; + led32.ed_phent = (int)pd->l_elf_data.ed_phent; + led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; + led32.ed_entry = (int)pd->l_elf_data.ed_entry; + led32.ed_base = (int)pd->l_elf_data.ed_base; + led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; + mutex_exit(&p->p_lock); + + if (copyout(&led32, (void *)arg1, + sizeof (led32)) != 0) { + return (EFAULT); + } + } +#endif + return (0); + } + + case B_EXEC_NATIVE: + return (exec_common((char *)arg1, (const char **)arg2, + (const char **)arg3, EBA_NATIVE)); + + /* + * The B_TRUSS_POINT subcommand is used so that we can make a no-op + * syscall for debugging purposes (dtracing) from within the user-level + * emulation. + */ + case B_TRUSS_POINT: + return (0); + + case B_LPID_TO_SPAIR: { + /* + * Given a Linux pid as arg1, return the Solaris pid in arg2 and + * the Solaris LWP in arg3. We also translate pid 1 (which is + * hardcoded in many applications) to the zone's init process. + */ + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { + return (ESRCH); + } + + if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { + return (EFAULT); + } + + return (0); + } + + case B_PTRACE_STOP_FOR_OPT: + return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); + + case B_PTRACE_CLONE_BEGIN: + /* + * Leverage ptrace brand call to create a clone group for this + * proc if necessary. + */ + lx_clone_grp_create((uint_t)arg3); + + return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE)); + + case B_PTRACE_SIG_RETURN: { + /* + * Our ptrace emulation must emit PR_SYSEXIT for rt_sigreturn. + * Since that syscall does not pass through the normal + * emulation, which would call lx_syscall_return, the event is + * emitted manually. A successful result of the syscall is + * assumed since there is little to be done in the face of + * failure. 
+ */ + struct regs *rp = lwptoregs(lwp); + + rp->r_r0 = 0; + (void) lx_ptrace_stop(LX_PR_SYSEXIT); + return (0); + } + + case B_UNSUPPORTED: { + char dmsg[256]; + + if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { + lx_print("Failed to copyin unsupported msg " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + dmsg[255] = '\0'; + lx_unsupported(dmsg); + + lx_check_strict_failure(lwpd); + + return (0); + } + + case B_STORE_ARGS: { + /* + * B_STORE_ARGS subcommand + * arg1 = address of struct to be copied in + * arg2 = size of the struct being copied in + * arg3-arg6 ignored + * rval = the amount of data copied. + */ + void *buf; + + /* only have upper limit because arg2 is unsigned */ + if (arg2 > LX_BR_ARGS_SIZE_MAX) { + return (EINVAL); + } + + buf = kmem_alloc(arg2, KM_SLEEP); + if (copyin((void *)arg1, buf, arg2) != 0) { + lx_print("Failed to copyin scall arg at 0x%p\n", + (void *) arg1); + kmem_free(buf, arg2); + /* + * Purposely not setting br_scall_args to NULL + * to preserve data for debugging. + */ + return (EFAULT); + } + + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, + lwpd->br_args_size); + } + + lwpd->br_scall_args = buf; + lwpd->br_args_size = arg2; + *rval = arg2; + return (0); + } + + case B_HELPER_CLONE: + return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, + (void *)arg4)); + + case B_HELPER_SETGROUPS: + return (lx_helper_setgroups(arg1, (gid_t *)arg2)); + + case B_HELPER_SIGQUEUE: + return (lx_helper_rt_sigqueueinfo(arg1, arg2, + (siginfo_t *)arg3)); + + case B_HELPER_TGSIGQUEUE: + return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, + (siginfo_t *)arg4)); + + case B_GETPID: + /* + * The usermode clone(2) code needs to be able to call + * lx_getpid() from native code: + */ + *rval = lx_getpid(); + return (0); + + case B_SET_NATIVE_STACK: + /* + * B_SET_NATIVE_STACK subcommand + * arg1 = the base of the stack to use for emulation + */ + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("B_SET_NATIVE_STACK when stack was already " + "set to %p\n", (void *)arg1); + return (EEXIST); + } + + /* + * We move from the PREINIT state, where we have no brand + * emulation stack, to the INIT state. Here, we are still + * running on what will become the BRAND stack, but are running + * emulation (i.e. native) code. Once the initialisation + * process for this thread has finished, we will jump to + * brand-specific code, while moving to the BRAND mode. + * + * When a new LWP is created, lx_initlwp() will clear the + * stack data. If that LWP is actually being duplicated + * into a child process by fork(2), lx_forklwp() will copy + * it so that the cloned thread will keep using the same + * alternate stack. + */ + lwpd->br_ntv_stack = arg1; + lwpd->br_stack_mode = LX_STACK_MODE_INIT; + lx_lwp_set_native_stack_current(lwpd, arg1); + + return (0); + + case B_GET_CURRENT_CONTEXT: + /* + * B_GET_CURRENT_CONTEXT subcommand: + * arg1 = address for pointer to current ucontext_t + */ + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; + + error = copyout(&addr, (void *)arg1, sizeof (addr)); + } else +#endif + { + error = copyout(&lwp->lwp_oldcontext, (void *)arg1, + sizeof (lwp->lwp_oldcontext)); + } + + return (error != 0 ? 
EFAULT : 0); + + case B_JUMP_TO_LINUX: + /* + * B_JUMP_TO_LINUX subcommand: + * arg1 = ucontext_t pointer for jump state + */ + + if (arg1 == NULL) + return (EINVAL); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_NATIVE: { + struct regs *rp = lwptoregs(lwp); + + /* + * We are on the NATIVE stack, so we must preserve + * the extent of that stack. The pointer will be + * reset by a future setcontext(). + */ + lx_lwp_set_native_stack_current(lwpd, + (uintptr_t)rp->r_sp); + break; + } + + case LX_STACK_MODE_INIT: + /* + * The LWP is transitioning to Linux code for the first + * time. + */ + break; + + case LX_STACK_MODE_PREINIT: + /* + * This LWP has not installed an alternate stack for + * usermode emulation handling. + */ + return (ENOENT); + + case LX_STACK_MODE_BRAND: + /* + * The LWP should not be on the BRAND stack. + */ + exit(CLD_KILLED, SIGSYS); + return (0); + } + + /* + * Transfer control to Linux: + */ + return (lx_runexe(lwp, (void *)arg1)); + + case B_EMULATION_DONE: + /* + * B_EMULATION_DONE subcommand: + * arg1 = ucontext_t * to restore + * arg2 = system call number + * arg3 = return code + * arg4 = if operation failed, the errno value + */ + + /* + * The first part of this operation is a setcontext() to + * restore the register state to the copy we preserved + * before vectoring to the usermode emulation routine. + * If that fails, we return (hopefully) to the emulation + * routine and it will handle the error. + */ +#if (_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + error = getsetcontext32(SETCONTEXT, (void *)arg1); + } else +#endif + { + error = getsetcontext(SETCONTEXT, (void *)arg1); + } + + if (error != 0) { + return (error); + } + + /* + * The saved Linux context has been restored. We handle the + * return value or errno with code common to the in-kernel + * system call emulation. 
+ */ + if ((error = (int)arg4) != 0) { + /* + * lx_syscall_return() looks at the errno in the LWP, + * so set it here: + */ + (void) set_errno(error); + } + lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); + + return (0); + + case B_EXIT_AS_SIG: + code = CLD_KILLED; + sig = (int)arg1; + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + ttolwp(curthread)->lwp_cursig = sig; + if (sig == SIGSEGV) { + if (core(sig, 0) == 0) + code = CLD_DUMPED; + } + exit(code, sig); + /* NOTREACHED */ + break; + + case B_OVERRIDE_KERN_VER: { + void *urel = (void *)arg1; + void *uver = (void *)arg2; + size_t len; + + pd = ptolxproc(p); + if (urel != NULL) { + if (copyinstr(urel, pd->l_uname_release, + LX_KERN_RELEASE_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_release[LX_KERN_RELEASE_MAX - 1] = '\0'; + } + if (uver != NULL) { + if (copyinstr(uver, pd->l_uname_version, + LX_KERN_VERSION_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_version[LX_KERN_VERSION_MAX - 1] = '\0'; + } + + return (0); + } + + case B_GET_PERSONALITY: { + unsigned int result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + result = pd->l_personality; + mutex_exit(&p->p_lock); + return (result); + } + + case B_START_NFS_LOCKD: + (void) lx_start_nfs_lockd(); + return (0); + + case B_BLOCK_ALL_SIGS: + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + pd->l_block_all_signals++; + mutex_exit(&p->p_lock); + return (0); + + case B_UNBLOCK_ALL_SIGS: { + uint_t result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + if (pd->l_block_all_signals == 0) { + result = set_errno(EINVAL); + } else { + pd->l_block_all_signals--; + result = 0; + } + mutex_exit(&p->p_lock); + return (result); + } + + case B_ALL_SIGS_BLOCKED: { + uint_t result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + result = pd->l_block_all_signals; + mutex_exit(&p->p_lock); + return (result); + } + } + + return (EINVAL); +} + +/* + * Compare linux kernel version to the one set for the zone. + * Returns greater than 0 if zone version is higher, less than 0 if the zone + * version is lower, and 0 if the versions are equal. + */ +int +lx_kern_release_cmp(zone_t *zone, const char *vers) +{ + int zvers[3] = {0, 0, 0}; + int cvers[3] = {0, 0, 0}; + int i; + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + + VERIFY(zone->zone_brand == &lx_brand); + + mutex_enter(&lxzd->lxzd_lock); + (void) sscanf(lxzd->lxzd_kernel_release, "%d.%d.%d", &zvers[0], + &zvers[1], &zvers[2]); + mutex_exit(&lxzd->lxzd_lock); + (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]); + + for (i = 0; i < 3; i++) { + if (zvers[i] > cvers[i]) { + return (1); + } else if (zvers[i] < cvers[i]) { + return (-1); + } + } + return (0); +} + +/* + * Linux unconditionally removes the setuid and setgid bits when changing + * file ownership. This brand hook overrides the illumos native behaviour, + * which is based on the PRIV_FILE_SETID privilege. + */ +/* ARGSUSED */ +static int +lx_setid_clear(vattr_t *vap, cred_t *cr) +{ + if (S_ISDIR(vap->va_mode)) { + return (0); + } + + if (vap->va_mode & S_ISUID) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISUID; + } + if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Copy the per-process brand data from a parent proc to a child. 
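A standalone rendition of the comparison performed by lx_kern_release_cmp() above may help: only the first three numeric components are considered, so a release string such as "3.10.0-693.el7" compares as 3.10.0 (the suffix is ignored by the sscanf parse).

#include <stdio.h>

static int
release_cmp(const char *a, const char *b)
{
	int av[3] = { 0, 0, 0 }, bv[3] = { 0, 0, 0 };
	int i;

	(void) sscanf(a, "%d.%d.%d", &av[0], &av[1], &av[2]);
	(void) sscanf(b, "%d.%d.%d", &bv[0], &bv[1], &bv[2]);

	for (i = 0; i < 3; i++) {
		if (av[i] != bv[i])
			return (av[i] > bv[i] ? 1 : -1);
	}
	return (0);
}

int
main(void)
{
	(void) printf("%d\n", release_cmp("3.10.0-693.el7", "3.4.0"));	/* 1 */
	(void) printf("%d\n", release_cmp("2.6.32", "3.4.0"));		/* -1 */
	return (0);
}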
+ */ +void +lx_copy_procdata(proc_t *cp, proc_t *pp) +{ + lx_proc_data_t *cpd, *ppd; + + /* + * Since b_copy_procdata is called during getproc(), while the child + * process is still being initialized, acquiring cp->p_lock should not + * be required. + */ + VERIFY(cp->p_brand == &lx_brand); + VERIFY((cpd = cp->p_brand_data) != NULL); + + mutex_enter(&pp->p_lock); + VERIFY(pp->p_brand == &lx_brand); + VERIFY((ppd = pp->p_brand_data) != NULL); + + bcopy(ppd, cpd, sizeof (lx_proc_data_t)); + mutex_exit(&pp->p_lock); + + /* Clear any aio contexts from child */ + lx_io_clear(cpd); + + /* + * The l_ptrace count is normally manipulated only while under holding + * p_lock. Since this is a freshly created process, it's safe to zero + * out. If it is to be inherited, the attach will occur later. + */ + cpd->l_ptrace = 0; + + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20; + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20; + + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; + + bzero(cpd->l_clone_grps, sizeof (cpd->l_clone_grps)); +} + +#if defined(_LP64) +static void +Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst) +{ + bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident)); + dst->e_type = src->e_type; + dst->e_machine = src->e_machine; + dst->e_version = src->e_version; + dst->e_entry = src->e_entry; + dst->e_phoff = src->e_phoff; + dst->e_shoff = src->e_shoff; + dst->e_flags = src->e_flags; + dst->e_ehsize = src->e_ehsize; + dst->e_phentsize = src->e_phentsize; + dst->e_phnum = src->e_phnum; + dst->e_shentsize = src->e_shentsize; + dst->e_shnum = src->e_shnum; + dst->e_shstrndx = src->e_shstrndx; +} +#endif /* _LP64 */ + +static void +restoreexecenv(struct execenv *ep, stack_t *sp) +{ + klwp_t *lwp = ttolwp(curthread); + + setexecenv(ep); + lwp->lwp_sigaltstack.ss_sp = sp->ss_sp; + lwp->lwp_sigaltstack.ss_size = sp->ss_size; + lwp->lwp_sigaltstack.ss_flags = sp->ss_flags; +} + +static uintptr_t +lx_map_vdso(struct uarg *args, struct cred *cred) +{ + int err; + char *fpath = LX_VDSO_PATH; + vnode_t *vp; + vattr_t attr; + caddr_t addr; + +#if defined(_LP64) + if (args->to_model != DATAMODEL_NATIVE) { + fpath = LX_VDSO_PATH32; + } +#endif + + /* + * The comm page should have been mapped in already. + */ + if (args->commpage == NULL) { + return (NULL); + } + + /* + * Ensure the VDSO library is present and appropriately sized. + * This lookup is started at the zone root to avoid complications for + * processes which have chrooted. For the specified lookup root to be + * used, the leading slash must be dropped from the path. + */ + ASSERT(fpath[0] == '/'); + fpath++; + if (lookupnameat(fpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, + curzone->zone_rootvp) != 0) { + return (NULL); + } + + /* + * The VDSO requires data exposed via the comm page in order to + * function properly. The VDSO is always mapped in at a fixed known + * offset from the comm page, providing an easy means to locate it. 
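On the consuming side, a Linux process finds the vDSO through its auxiliary vector rather than at a hard-coded address. Assuming the emulated auxv carries AT_SYSINFO_EHDR as native Linux does, a glibc sketch looks like:

#include <sys/auxv.h>
#include <stdio.h>

int
main(void)
{
	unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

	if (vdso == 0)
		(void) printf("no vDSO advertised\n");
	else
		(void) printf("vDSO ELF header mapped at %#lx\n", vdso);
	return (0);
}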
+ */ + addr = (caddr_t)(args->commpage - LX_VDSO_SIZE); + attr.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &attr, 0, cred, NULL) != 0 || + attr.va_size > LX_VDSO_SIZE) { + VN_RELE(vp); + return (NULL); + } + + err = execmap(vp, addr, attr.va_size, 0, 0, + PROT_USER|PROT_READ|PROT_EXEC, 1, 0); + VN_RELE(vp); + if (err != 0) { + return (NULL); + } + return ((uintptr_t)addr); +} + +/* + * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux + * binaries. + */ +/* ARGSUSED4 */ +static int +lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, size_t *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action) +{ + int error; + vnode_t *nvp; + Ehdr ehdr; + Addr uphdr_vaddr; + intptr_t voffset; + char *interp = NULL; + uintptr_t ldaddr = NULL; + proc_t *p = ttoproc(curthread); + klwp_t *lwp = ttolwp(curthread); + lx_proc_data_t *lxpd = ptolxproc(p); + struct execenv env, origenv; + stack_t orig_sigaltstack; + struct user *up = PTOU(ttoproc(curthread)); + lx_elf_data_t edp; + char *lib_path = LX_LIB_PATH; + boolean_t execstk = B_TRUE; + unsigned int personality; + + ASSERT(p->p_brand == &lx_brand); + ASSERT(lxpd != NULL); + + /* + * Start with a separate struct for ELF data instead of inheriting + * values from the currently running binary. This ensures that fields + * such as ed_base are cleared if the new binary does not utilize an + * interpreter. + */ + bzero(&edp, sizeof (edp)); + +#if defined(_LP64) + if (args->to_model != DATAMODEL_NATIVE) { + lib_path = LX_LIB_PATH32; + } +#endif + + /* + * Set the brandname and library name for the new process so that + * elfexec() puts them onto the stack. + */ + args->brandname = LX_BRANDNAME; + args->emulator = lib_path; + +#if defined(_LP64) + /* + * To conform with the way Linux lays out the address space, we clamp + * the stack to be the top of the lower region of the x86-64 canonical + * form address space -- which has the side-effect of laying out the + * entire address space in that lower region. Note that this only + * matters on 64-bit processes (this value will always be greater than + * the size of a 32-bit address space) and doesn't actually affect + * USERLIMIT: if a Linux-branded processes wishes to map something + * into the top half of the address space, it can do so -- but with + * the user stack starting at the top of the bottom region, those high + * virtual addresses won't be used unless explicitly directed. + */ + args->maxstack = lx_maxstack64; +#endif + + /* + * Search the binary for a PT_GNU_STACK header. The PF_X bit contained + * within is used to dictate protection defaults for the stack, among + * other things. 
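
The in-kernel PT_GNU_STACK scan follows below; as a rough user-space analogue, the sketch here reads the program headers of a 64-bit ELF file and reports whether it requests an executable stack, i.e. whether PF_X is set on its PT_GNU_STACK entry. It assumes a system <elf.h> that defines PT_GNU_STACK, and the file-reading helper code is illustrative only; objects with no PT_GNU_STACK entry default to an executable stack, matching the execstk = B_TRUE default used by lx_elfexec().

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	Elf64_Ehdr ehdr;
	Elf64_Phdr phdr;
	int fd, i;
	int execstack = 1;	/* default: executable, as in lx_elfexec() */

	if (argc != 2) {
		fprintf(stderr, "usage: %s <64-bit ELF file>\n", argv[0]);
		return (2);
	}
	if ((fd = open(argv[1], O_RDONLY)) < 0 ||
	    pread(fd, &ehdr, sizeof (ehdr), 0) != sizeof (ehdr)) {
		perror(argv[1]);
		return (2);
	}

	/* Walk the program headers looking for a PT_GNU_STACK entry. */
	for (i = 0; i < ehdr.e_phnum; i++) {
		off_t off = ehdr.e_phoff + (off_t)i * ehdr.e_phentsize;

		if (pread(fd, &phdr, sizeof (phdr), off) != sizeof (phdr))
			break;
		if (phdr.p_type == PT_GNU_STACK) {
			/* PF_X clear means a non-executable stack. */
			execstack = (phdr.p_flags & PF_X) != 0;
			break;
		}
	}
	(void) close(fd);

	printf("%s: %sexecutable stack\n", argv[1], execstack ? "" : "non-");
	return (0);
}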
+ */ + if (args->to_model == DATAMODEL_NATIVE) { + Ehdr ehdr; + Phdr *phdrp; + caddr_t phdrbase = NULL; + size_t phdrsize = 0; + uint_t nphdrs, hsize; + + if ((error = elfreadhdr(vp, cred, &ehdr, &nphdrs, &phdrbase, + &phdrsize)) != 0) { + return (error); + } + + hsize = ehdr.e_phentsize; + /* LINTED: alignment */ + phdrp = (Phdr *)phdrbase; + for (uint_t i = nphdrs; i > 0; i--) { + switch (phdrp->p_type) { + case PT_GNU_STACK: + if ((phdrp->p_flags & PF_X) == 0) { + execstk = B_FALSE; + } + break; + } + /* LINTED: alignment */ + phdrp = (Phdr *)((caddr_t)phdrp + hsize); + } + kmem_free(phdrbase, phdrsize); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr; + Elf32_Phdr *phdrp; + caddr_t phdrbase = NULL; + size_t phdrsize = 0; + uint_t nphdrs, hsize; + + if ((error = elf32readhdr(vp, cred, &ehdr, &nphdrs, &phdrbase, + &phdrsize)) != 0) { + return (error); + } + + hsize = ehdr.e_phentsize; + /* LINTED: alignment */ + phdrp = (Elf32_Phdr *)phdrbase; + for (uint_t i = nphdrs; i > 0; i--) { + switch (phdrp->p_type) { + case PT_GNU_STACK: + if ((phdrp->p_flags & PF_X) == 0) { + execstk = B_FALSE; + } + break; + } + /* LINTED: alignment */ + phdrp = (Elf32_Phdr *)((caddr_t)phdrp + hsize); + } + kmem_free(phdrbase, phdrsize); + } +#endif + + /* + * Revert the base personality while maintaining any existing flags. + */ + personality = LX_PER_LINUX | (lxpd->l_personality & ~LX_PER_MASK); + + /* + * Linux defaults to an executable stack unless the aformentioned + * PT_GNU_STACK entry in the elf header dictates otherwise. Enabling + * the READ_IMPLIES_EXEC personality flag is also implied in this case. + */ + if (execstk) { + args->stk_prot |= PROT_EXEC; + args->stk_prot_override = B_TRUE; + personality |= LX_PER_READ_IMPLIES_EXEC; + } + + /* + * We will first exec the brand library, then map in the linux + * executable and the linux linker. + */ + if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP, + &nvp))) { + uprintf("%s: not found.", lib_path); + return (error); + } + + /* + * We will eventually set the p_exec member to be the vnode for the new + * executable when we call setexecenv(). However, if we get an error + * before that call we need to restore the execenv to its original + * values so that when we return to the caller fop_close() works + * properly while cleaning up from the failed exec(). Restoring the + * original value will also properly decrement the 2nd VN_RELE that we + * took on the brand library. + */ + origenv.ex_bssbase = p->p_bssbase; + origenv.ex_brkbase = p->p_brkbase; + origenv.ex_brksize = p->p_brksize; + origenv.ex_vp = p->p_exec; + orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp; + orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size; + orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; + + if (args->to_model == DATAMODEL_NATIVE) { + error = elfexec(nvp, uap, args, idata, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); + } +#if defined(_LP64) + else { + error = elf32exec(nvp, uap, args, idata, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); + } +#endif + VN_RELE(nvp); + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + return (error); + } + + /* + * exec-ed in the brand library above. + * The u_auxv vectors are now setup by elfexec to point to the + * brand emulation library and its linker. + */ + + /* + * After execing the brand library (which should have implicitly mapped + * in the comm page), map the VDSO into the approprate place in the AS. 
+ */ + lxpd->l_vdso = lx_map_vdso(args, cred); + + bzero(&env, sizeof (env)); + + /* + * map in the the Linux executable + */ + if (args->to_model == DATAMODEL_NATIVE) { + error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr32; + Elf32_Addr uphdr_vaddr32; + + error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + + Ehdr32to64(&ehdr32, &ehdr); + + if (uphdr_vaddr32 == (Elf32_Addr)-1) + uphdr_vaddr = (Addr)-1; + else + uphdr_vaddr = uphdr_vaddr32; + } +#endif + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + + return (error); + } + + /* + * Save off the important properties of the lx executable. The brand + * library will ask us for this data later, when it is ready to set + * things up for the lx executable. + */ + edp.ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : + voffset + uphdr_vaddr; + edp.ed_entry = voffset + ehdr.e_entry; + edp.ed_phent = ehdr.e_phentsize; + edp.ed_phnum = ehdr.e_phnum; + + if (interp != NULL) { + if (ehdr.e_type == ET_DYN) { + /* + * This is a shared object executable, so we need to + * pick a reasonable place to put the heap. Just don't + * use the first page. + */ + env.ex_brkbase = (caddr_t)PAGESIZE; + env.ex_bssbase = (caddr_t)PAGESIZE; + } + + /* + * If the program needs an interpreter (most do), map it in and + * store relevant information about it in the aux vector, where + * the brand library can find it. + */ + if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW, + NULLVPP, &nvp))) { + uprintf("%s: not found.", interp); + restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); + return (error); + } + + kmem_free(interp, MAXPATHLEN); + interp = NULL; + + /* + * map in the Linux linker + */ + if (args->to_model == DATAMODEL_NATIVE) { + error = mapexec_brand(nvp, args, &ehdr, + &uphdr_vaddr, &voffset, exec_file, NULL, NULL, + NULL, NULL, NULL, &ldaddr); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr32; + Elf32_Addr uphdr_vaddr32; + + error = mapexec32_brand(nvp, args, &ehdr32, + &uphdr_vaddr32, &voffset, exec_file, NULL, NULL, + NULL, NULL, NULL, &ldaddr); + + Ehdr32to64(&ehdr32, &ehdr); + + if (uphdr_vaddr32 == (Elf32_Addr)-1) + uphdr_vaddr = (Addr)-1; + else + uphdr_vaddr = uphdr_vaddr32; + } +#endif + + VN_RELE(nvp); + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + return (error); + } + + /* + * Now that we know the base address of the brand's linker, + * we also save this for later use by the brand library. + */ + edp.ed_base = voffset; + edp.ed_ldentry = voffset + ehdr.e_entry; + } else { + /* + * This program has no interpreter. The lx brand library will + * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, + * so in this case, put the entry point of the main executable + * there. + */ + if (ehdr.e_type == ET_EXEC) { + /* + * An executable with no interpreter, this must be a + * statically linked executable, which means we loaded + * it at the address specified in the elf header, in + * which case the e_entry field of the elf header is an + * absolute address. + */ + edp.ed_ldentry = ehdr.e_entry; + edp.ed_entry = ehdr.e_entry; + } else { + /* + * A shared object with no interpreter, we use the + * calculated address from above. 
+ */
+ edp.ed_ldentry = edp.ed_entry;
+
+ /*
+ * In all situations except an ET_DYN elf object with no
+ * interpreter, we want to leave the brk and base
+ * values set by mapexec_brand alone. Normally when
+ * running ET_DYN objects on Solaris (most likely
+ * /lib/ld.so.1) the kernel sets brk and base to 0 since
+ * it doesn't know where to put the heap, and later the
+ * linker will call brk() to initialize the heap in:
+ * usr/src/cmd/sgs/rtld/common/setup.c:setup()
+ * after it has determined where to put it. (This
+ * decision is made after the linker loads and inspects
+ * elf properties of the target executable being run.)
+ *
+ * So for ET_DYN Linux executables, we also don't know
+ * where the heap should go, so we'll set the brk and
+ * base to 0. But in this case the Solaris linker will
+ * not initialize the heap, so when the Linux linker
+ * starts running there is no heap allocated. This
+ * seems to be ok on Linux 2.4 based systems because the
+ * Linux linker/libc fall back to using mmap() to
+ * allocate memory. But on 2.6 systems, running
+ * applications by specifying them as command line
+ * arguments to the linker results in segfaults for an
+ * as yet undetermined reason (which seems to indicate
+ * that a more permanent fix for heap initialization in
+ * these cases may be necessary).
+ */
+ if (ehdr.e_type == ET_DYN) {
+ env.ex_bssbase = (caddr_t)0;
+ env.ex_brkbase = (caddr_t)0;
+ env.ex_brksize = 0;
+ }
+ }
+ }
+
+ env.ex_vp = vp;
+ setexecenv(&env);
+
+ /*
+ * We try to keep /proc's view of the aux vector consistent with
+ * what's on the process stack. See the comment on the lx_times
+ * syscall for an explanation of the hardcoded LX_USERHZ.
+ */
+ if (args->to_model == DATAMODEL_NATIVE) {
+ auxv_t phdr_auxv[4] = {
+ { AT_SUN_BRAND_LX_PHDR, 0 },
+ { AT_SUN_BRAND_LX_INTERP, 0 },
+ { AT_SUN_BRAND_LX_CLKTCK, 0 },
+ { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+ };
+ phdr_auxv[0].a_un.a_val = edp.ed_phdr;
+ phdr_auxv[1].a_un.a_val = ldaddr;
+ phdr_auxv[2].a_un.a_val = LX_USERHZ;
+ phdr_auxv[3].a_un.a_val = lxpd->l_vdso;
+
+ if (copyout(&phdr_auxv, args->auxp_brand,
+ sizeof (phdr_auxv)) == -1)
+ return (EFAULT);
+ }
+#if defined(_LP64)
+ else {
+ auxv32_t phdr_auxv32[4] = {
+ { AT_SUN_BRAND_LX_PHDR, 0 },
+ { AT_SUN_BRAND_LX_INTERP, 0 },
+ { AT_SUN_BRAND_LX_CLKTCK, 0 },
+ { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }
+ };
+ phdr_auxv32[0].a_un.a_val = edp.ed_phdr;
+ phdr_auxv32[1].a_un.a_val = ldaddr;
+ phdr_auxv32[2].a_un.a_val = hz;
+ phdr_auxv32[3].a_un.a_val = lxpd->l_vdso;
+
+ if (copyout(&phdr_auxv32, args->auxp_brand,
+ sizeof (phdr_auxv32)) == -1)
+ return (EFAULT);
+ }
+#endif
+
+ /*
+ * /proc uses the AT_ENTRY aux vector entry to deduce
+ * the location of the executable in the address space. The user
+ * structure contains a copy of the aux vector that needs to have those
+ * entries patched with the values of the real lx executable (they
+ * currently contain the values from the lx brand library that was
+ * elfexec'd, above).
+ *
+ * For live processes, AT_BASE is used to locate the linker segment,
+ * which /proc and friends will later use to find Solaris symbols
+ * (such as rtld_db_preinit). However, for core files, /proc uses
+ * AT_ENTRY to find the right segment to label as the executable.
+ * So we set AT_ENTRY to be the entry point of the linux executable,
+ * but leave AT_BASE to be the address of the Solaris linker.
+ */ + for (uint_t i = 0; i < __KERN_NAUXV_IMPL; i++) { + switch (up->u_auxv[i].a_type) { + case AT_ENTRY: + up->u_auxv[i].a_un.a_val = edp.ed_entry; + break; + + case AT_SUN_BRAND_LX_PHDR: + up->u_auxv[i].a_un.a_val = edp.ed_phdr; + break; + + case AT_SUN_BRAND_LX_INTERP: + up->u_auxv[i].a_un.a_val = ldaddr; + break; + + case AT_SUN_BRAND_LX_CLKTCK: + up->u_auxv[i].a_un.a_val = hz; + break; + + default: + break; + } + } + + /* + * Record the brand ELF data and new personality now that the exec has + * proceeded successfully. + */ + bcopy(&edp, &lxpd->l_elf_data, sizeof (edp)); + lxpd->l_personality = personality; + + return (0); +} + +boolean_t +lx_native_exec(uint8_t osabi, const char **interp) +{ + if (osabi != ELFOSABI_SOLARIS) + return (B_FALSE); + + /* + * If the process root matches the zone root, prepend /native to the + * interpreter path for native executables. Absolute precision from + * VN_CMP is not necessary since any change of process root is likely + * to make native binaries inaccessible via /native. + * + * Processes which chroot directly into /native will be able to + * function as expected with no need for the prefix. + */ + mutex_enter(&curproc->p_lock); + if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) { + *interp = "/native"; + } + mutex_exit(&curproc->p_lock); + + return (B_TRUE); +} + +static void +lx_syscall_init(void) +{ + int i; + + /* + * Count up the 32-bit Linux system calls. Note that lx_sysent32 + * has (LX_NSYSCALLS + 1) entries. + */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) + continue; + lx_nsysent32 = i; + +#if defined(_LP64) + /* + * Count up the 64-bit Linux system calls. Note that lx_sysent64 + * has (LX_NSYSCALLS + 1) entries. + */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) + continue; + lx_nsysent64 = i; +#endif +} + +int +_init(void) +{ + int err = 0; + + /* Initialize USER_HZ scaling factor */ + ASSERT(hz >= LX_USERHZ); + lx_hz_scale = hz / LX_USERHZ; + + lx_syscall_init(); + lx_pid_init(); + lx_ioctl_init(); + lx_futex_init(); + lx_ptrace_init(); + lx_socket_init(); + lx_audit_ld(); + + err = mod_install(&modlinkage); + if (err != 0) { + cmn_err(CE_WARN, "Couldn't install lx brand module"); + + /* + * This looks drastic, but it should never happen. These + * two data structures should be completely free-able until + * they are used by Linux processes. Since the brand + * wasn't loaded there should be no Linux processes, and + * thus no way for these data structures to be modified. + */ + lx_pid_fini(); + lx_ioctl_fini(); + if (lx_futex_fini()) + panic("lx brand module cannot be loaded or unloaded."); + } + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + int futex_done = 0; + + /* + * If there are any zones using this brand, we can't allow it to be + * unloaded. + */ + if (brand_zone_count(&lx_brand)) + return (EBUSY); + + lx_ptrace_fini(); + lx_pid_fini(); + lx_ioctl_fini(); + lx_socket_fini(); + lx_audit_unld(); + + if ((err = lx_futex_fini()) != 0) { + goto done; + } + futex_done = 1; + + err = mod_remove(&modlinkage); + +done: + if (err) { + /* + * If we can't unload the module, then we have to get it + * back into a sane state. 
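
The CLKTCK and USER_HZ handling above reduces to a simple integer scaling between the native tick rate (hz) and the fixed Linux USER_HZ of 100. A minimal sketch of that conversion is below; the names mirror lx_hz_scale and the HZ_TO_LX_USERHZ() idea from the brand headers, but the constants and the helper shown here are assumptions for illustration only.

#include <stdio.h>

#define	LX_USERHZ	100		/* Linux USER_HZ assumed by the brand */

static int hz = 1000;			/* illustrative native tick rate */
static int lx_hz_scale;			/* computed once, as in _init() */

/* Convert native clock ticks to Linux USER_HZ ticks. */
static long
hz_to_lx_userhz(long ticks)
{
	return (ticks / lx_hz_scale);
}

int
main(void)
{
	lx_hz_scale = hz / LX_USERHZ;	/* 10 when hz is 1000 */

	/* 2500 native ticks are reported to Linux as 250 USER_HZ ticks. */
	printf("%ld\n", hz_to_lx_userhz(2500));
	return (0);
}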
+ */ + lx_ptrace_init(); + lx_pid_init(); + lx_ioctl_init(); + lx_socket_init(); + + if (futex_done) { + lx_futex_init(); + } + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_lockd.c b/usr/src/uts/common/brand/lx/os/lx_lockd.c new file mode 100644 index 0000000000..d6d965398a --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_lockd.c @@ -0,0 +1,338 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone. + * This uses the same technique as used in our lx cgroupfs to launch a release + * agent process. This is called implicitly when an NFS mount syscall occurs + * within the zone. See the user-level lx_lockd source for the "big theory" + * comment behind this. + * + * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC + * handling so that we can interface to a Linux rpc.statd that must run + * when NFSv3 locking is in use. The rpc.statd handles server or client reboots + * and interacts with the lockd to reclaim locks after the server reboots. The + * rcp.statd also informs the server when we reboot, so the server can release + * the locks we held. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/policy.h> +#include <sys/vmparam.h> +#include <sys/contract_impl.h> +#include <sys/pool.h> +#include <sys/stack.h> +#include <sys/var.h> +#include <sys/rt.h> +#include <sys/fx.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/pathname.h> +#include <rpcsvc/nlm_prot.h> +#include <rpcsvc/sm_inter.h> +#include <klm/nlm_impl.h> + +#define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd" + +/* Linux lockd RPC called by statd when it detects an NFS server reboot */ +#define LX_NLMPROC_NSM_NOTIFY 16 + +/* From uts/common/klm/nlm_impl.c */ +extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); +extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); + +/* + * Check if the current lockd is still running. 
+ */ +static boolean_t +lx_lockd_alive(pid_t lockd_pid) +{ + boolean_t ret = B_FALSE; + proc_t *p; + vnode_t *vp; + char path[MAXPATHLEN]; + + mutex_enter(&pidlock); + p = prfind(lockd_pid); + if (p == NULL) { + mutex_exit(&pidlock); + return (B_FALSE); + } + + mutex_enter(&p->p_lock); + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + return (B_FALSE); + } + vp = p->p_exec; + VN_HOLD(vp); + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + + if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 && + strcmp(path, LX_LOCKD_PATH) == 0) { + ret = B_TRUE; + } + + VN_RELE(vp); + return (ret); +} + +static void +lx_run_lockd(void *a) +{ + proc_t *p = curproc; + zone_t *z = curzone; + struct core_globals *cg; + lx_zone_data_t *lxzd = ztolxzd(z); + int res; + + ASSERT(!INGLOBALZONE(p)); + VERIFY(lxzd != NULL); + + /* The following block is derived from start_init_common */ + ASSERT_STACK_ALIGNED(); + + p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; + p->p_usrstack = (caddr_t)USRSTACK32; + p->p_model = DATAMODEL_ILP32; + p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; + p->p_datprot = PROT_ZFOD & ~PROT_EXEC; + p->p_stk_ctl = INT32_MAX; + + p->p_as = as_alloc(); + p->p_as->a_proc = p; + p->p_as->a_userlimit = (caddr_t)USERLIMIT32; + (void) hat_setup(p->p_as->a_hat, HAT_INIT); + + VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + p->p_corefile = cg->core_default_path; + p->p_content = cg->core_default_content; + + init_mstate(curthread, LMS_SYSTEM); + res = exec_init(LX_LOCKD_PATH, NULL); + + /* End of code derived from start_init_common */ + + /* The following is derived from zone_start_init - see comments there */ + if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + if (proc_exit(CLD_EXITED, res) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + } else { + id_t cid = curthread->t_cid; + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + (void) parmsset(&pcparms, curthread); + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + + /* + * Set our pid as the lockd pid in the zone data, or exit + * if another process raced and already did so. + */ + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != 0) { + /* another mount raced and created a new lockd */ + mutex_exit(&lxzd->lxzd_lock); + if (proc_exit(CLD_EXITED, 0) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + return; + } + lxzd->lxzd_lockd_pid = p->p_pid; + mutex_exit(&lxzd->lxzd_lock); + + /* cause the process to return to userland. */ + lwp_rtt(); + } +} + +/* + * Launch the user-level, native, lx_lockd process. 
+ */ +int +lx_start_nfs_lockd() +{ + id_t cid; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + lx_zone_data_t *lxzd = ztolxzd(z); + + ASSERT(!INGLOBALZONE(p)); + ASSERT(lxzd != NULL); + + /* + * This should only be called by the mount emulation, which must have + * 'root' privileges in order to have performed a mount, but + * double-check. + */ + if (crgetuid(CRED()) != 0) + return (EPERM); + + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != 0) { + /* verify lockd is still alive */ + pid_t lockd_pid; + + lockd_pid = lxzd->lxzd_lockd_pid; + mutex_exit(&lxzd->lxzd_lock); + + if (lx_lockd_alive(lockd_pid)) + return (EEXIST); + + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != lockd_pid) { + /* another mount raced and created a new lockd */ + mutex_exit(&lxzd->lxzd_lock); + return (EEXIST); + } + + /* old lockd is dead, launch a new one */ + lxzd->lxzd_lockd_pid = 0; + } + mutex_exit(&lxzd->lxzd_lock); + + if (z->zone_defaultcid > 0) { + cid = z->zone_defaultcid; + } else { + pool_lock(); + cid = pool_get_class(z->zone_pool); + pool_unlock(); + } + if (cid == -1) + cid = defaultcid; + + /* + * There's nothing to do here if creating the proc fails, but we + * return the result to make it obvious while DTracing. + */ + return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1)); +} + +void +lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host) +{ + struct nlm_nsm *nsm; + struct mon args; + struct mon_id *mip = &args.mon_id; + int family; + netobj obj; + enum clnt_stat stat; + + /* + * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and + * NSMPROC_UNMON RPC upcalls correspond almost directly to the native + * SM_MON and SM_UNMON RPC upcalls. The key differences with the native + * registration is that in our nlm_host_monitor function we make two + * RPC calls: + * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr' + * RPC protocol to register the lockd RPC information that statd + * should call when it detects that the remote server rebooted + * - the second RPC (sm_mon_1) tells statd the information about the + * remote server to be monitored + * For Linux, there is only a single RPC from the kernel to the local + * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the + * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc' + * RPC parameter. This corresponds to our private 'nsm_addr' code, and + * tells statd which lockd RPC to call when it detects a server reboot. + * + * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use + * that directly and simply set the expected value in the 'my_proc' + * argument. + * + * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch + * table has an entry for each lockd RPC function. Thus, this table also + * contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure. That + * procedure number is unused by the native lockd code, so there is no + * conflict with dispatching that procedure. The implementation of the + * procedure corresponds to the native, private NLM_SM_NOTIFY1 + * procedure which is called by the native rpc.statd. + * + * The Linux RPC call to "unmonitor" a host expects the same arguments + * as we pass to monitor, so that is also handled here by this same + * brand hook. 
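
The lxzd_lockd_pid handling in lx_run_lockd() and lx_start_nfs_lockd() is a claim-under-lock pattern: check the slot, drop the lock for the slow liveness test, then re-check before clearing or claiming it, so two racing mounts cannot both end up launching a lockd. The following is a simplified pthreads sketch of the same idea (lockd_pid, zone_lock, lockd_is_alive and spawn_lockd are illustrative stand-ins, not the kernel interfaces, and the claim is collapsed into one function rather than split between parent and child as in the patch).

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static int lockd_pid;			/* 0 means no lockd is registered */

static int
lockd_is_alive(int pid)
{
	(void) pid;
	return (0);			/* pretend the old lockd has died */
}

static int
spawn_lockd(void)
{
	return (12345);			/* pretend we created a new process */
}

/* Returns 0 if this caller launched a lockd, EEXIST if one already runs. */
static int
start_lockd_once(void)
{
	pthread_mutex_lock(&zone_lock);
	if (lockd_pid != 0) {
		int old = lockd_pid;

		/* Drop the lock across the (potentially slow) liveness check. */
		pthread_mutex_unlock(&zone_lock);
		if (lockd_is_alive(old))
			return (EEXIST);

		pthread_mutex_lock(&zone_lock);
		if (lockd_pid != old) {
			/* A racing caller already replaced the dead lockd. */
			pthread_mutex_unlock(&zone_lock);
			return (EEXIST);
		}
		lockd_pid = 0;		/* old lockd is gone; free the slot */
	}
	lockd_pid = spawn_lockd();
	pthread_mutex_unlock(&zone_lock);
	return (0);
}

int
main(void)
{
	printf("%d\n", start_lockd_once());	/* 0: we launched it */
	printf("%d\n", start_lockd_once());	/* 0 again: stub reports it dead */
	return (0);
}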
+ */ + nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj); + nsm = &g->nlm_nsm; + + bzero(&args, sizeof (args)); + + mip->mon_name = host->nh_name; + mip->my_id.my_name = uts_nodename(); + mip->my_id.my_prog = NLM_PROG; + mip->my_id.my_vers = NLM_SM; + mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY; + if (op == SM_MON) { + bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t)); + } + + sema_p(&nsm->ns_sem); + nlm_nsm_clnt_init(nsm->ns_handle, nsm); + if (op == SM_MON) { + struct sm_stat_res mres; + + bzero(&mres, sizeof (mres)); + stat = sm_mon_1(&args, &mres, nsm->ns_handle); + } else { + struct sm_stat ures; + + ASSERT(op == SM_UNMON); + bzero(&ures, sizeof (ures)); + stat = sm_unmon_1(mip, &ures, nsm->ns_handle); + } + sema_v(&nsm->ns_sem); + + if (stat != RPC_SUCCESS) { + NLM_WARN("Failed to contact local statd, stat=%d", stat); + if (op == SM_MON) { + mutex_enter(&g->lock); + host->nh_flags &= ~NLM_NH_MONITORED; + mutex_exit(&g->lock); + } + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c new file mode 100644 index 0000000000..35e42edaa3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -0,0 +1,1196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/privregs.h> +#include <sys/exec.h> +#include <sys/lwp.h> +#include <sys/sem.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_siginfo.h> +#include <sys/lx_futex.h> +#include <lx_errno.h> +#include <sys/lx_userhz.h> +#include <sys/cmn_err.h> +#include <sys/siginfo.h> +#include <sys/contract/process_impl.h> +#include <sys/x86_archext.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <net/if.h> +#include <inet/ip6.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/sysmacros.h> + +/* Linux specific functions and definitions */ +static void lx_save(klwp_t *); +static void lx_restore(klwp_t *); + +/* + * Set the return code for the forked child, always zero + */ +/*ARGSUSED*/ +void +lx_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_r0 = 0; +} + +/* + * Reset process state on exec(2) + */ +void +lx_exec() +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = ttoproc(curthread); + lx_proc_data_t *pd = ptolxproc(p); + struct regs *rp = lwptoregs(lwp); + + /* b_exec is called without p_lock held */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Any l_handler handlers set as a result of B_REGISTER are now + * invalid; clear them. + */ + pd->l_handler = NULL; + + /* + * If this was a multi-threaded Linux process and this lwp wasn't the + * main lwp, then we need to make its Illumos and Linux PIDs match. + */ + if (curthread->t_tid != 1) { + lx_pid_reassign(curthread); + } + + /* + * Inform ptrace(2) that we are processing an execve(2) call so that if + * we are traced we can post either the PTRACE_EVENT_EXEC event or the + * legacy SIGTRAP. + */ + (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0); + + /* clear the fs/gsbase values until the app. can reinitialize them */ + lwpd->br_lx_fsbase = NULL; + lwpd->br_ntv_fsbase = NULL; + lwpd->br_lx_gsbase = NULL; + lwpd->br_ntv_gsbase = NULL; + + /* + * Clear the native stack flags. This will be reinitialised by + * lx_init() in the new process image. + */ + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + lwpd->br_ntv_stack = 0; + lwpd->br_ntv_stack_current = 0; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, + NULL); + + /* + * clear out the tls array + */ + bzero(lwpd->br_tls, sizeof (lwpd->br_tls)); + + /* + * reset the tls entries in the gdt + */ + kpreempt_disable(); + lx_restore(lwp); + kpreempt_enable(); + + /* Grab the updated argv bounds */ + mutex_enter(&p->p_lock); + lx_read_argv_bounds(p); + mutex_exit(&p->p_lock); + + /* + * The exec syscall doesn't return (so we don't call lx_syscall_return) + * but for our ptrace emulation we need to do this so that a tracer + * does not get out of sync. We know that by the time this lx_exec + * function is called that the exec has succeeded. + */ + rp->r_r0 = 0; + (void) lx_ptrace_stop(LX_PR_SYSEXIT); +} + +static void +lx_cleanlwp(klwp_t *lwp, proc_t *p) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + void *rb_list = NULL; + + VERIFY(lwpd != NULL); + + mutex_enter(&p->p_lock); + if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) { + lx_ptrace_exit(p, lwp); + } + + /* + * While we have p_lock, clear the TP_KTHREAD flag. This is needed + * to prevent races within lx procfs. 
It's fine for prchoose() to pick + * this thread now since it is exiting and no longer blocked in the + * kernel. + */ + lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD; + + /* + * While we have p_lock, safely grab any robust_list references and + * clear the lwp field. + */ + sprlock_proc(p); + rb_list = lwpd->br_robust_list; + lwpd->br_robust_list = NULL; + sprunlock(p); + + if (rb_list != NULL) { + lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid); + } + + /* + * We need to run our context exit operation (lx_save) here to ensure + * we don't leave any garbage around. This is necessary to handle the + * following calling sequence: + * exit -> proc_exit -> lx_freelwp -> removectx + * That is, when our branded process exits, proc_exit will call our + * lx_freelwp brand hook which does call this function (lx_cleanlwp), + * but lx_freelwp also removes our context exit operation. The context + * exit functions are run by exitctx, which is called by either + * lwp_exit or thread_exit. The thread_exit function is called at the + * end of proc_exit when we'll swtch() to another thread, but by then + * our context exit function has been removed. + * + * It's ok if this function happens to be called more than once (for + * example, if we exec a native binary). + */ + kpreempt_disable(); + lx_save(lwp); + kpreempt_enable(); +} + +void +lx_exitlwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + kthread_t *t; + sigqueue_t *sqp = NULL; + pid_t ppid; + id_t ptid; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + if (lwpd == NULL) { + /* second time thru' */ + return; + } + + lx_cleanlwp(lwp, p); + + if (lwpd->br_clear_ctidp != NULL) { + (void) suword32(lwpd->br_clear_ctidp, 0); + (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1, + NULL, NULL, 0); + lwpd->br_clear_ctidp = NULL; + } + + if (lwpd->br_signal != 0) { + /* + * The first thread in a process doesn't cause a signal to + * be sent when it exits. It was created by a fork(), not + * a clone(), so the parent should get signalled when the + * process exits. + */ + if (lwpd->br_ptid == -1) + goto free; + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * If br_ppid is 0, it means this is a CLONE_PARENT thread, + * so the signal goes to the parent process - not to a + * specific thread in this process. + */ + p = lwptoproc(lwp); + if (lwpd->br_ppid == 0) { + mutex_enter(&p->p_lock); + ppid = p->p_ppid; + t = NULL; + } else { + /* + * If we have been reparented to init or if our + * parent thread is gone, then nobody gets + * signaled. 
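
The signal routing in the remainder of lx_exitlwp() below boils down to a small decision: the first thread in a process signals nobody here (normal process exit handles the parent), a CLONE_PARENT thread signals the parent process as a whole, and any other thread signals the specific thread that created it. A hedged sketch of that decision as a pure function (the names and the lx_exit_target_t type are illustrative only):

#include <stdio.h>

typedef enum {
	LX_EXIT_SIG_NONE,	/* first thread: handled by process exit */
	LX_EXIT_SIG_PARENT,	/* CLONE_PARENT: signal the parent process */
	LX_EXIT_SIG_THREAD	/* signal the specific creator thread */
} lx_exit_target_t;

/*
 * Decide where an exiting thread's exit signal should be delivered,
 * mirroring the br_ptid/br_ppid checks in lx_exitlwp().
 */
static lx_exit_target_t
exit_sig_target(int br_ptid, int br_ppid)
{
	if (br_ptid == -1)
		return (LX_EXIT_SIG_NONE);
	if (br_ppid == 0)
		return (LX_EXIT_SIG_PARENT);
	return (LX_EXIT_SIG_THREAD);
}

int
main(void)
{
	printf("%d %d %d\n",
	    exit_sig_target(-1, 100),	/* 0: first thread in the process */
	    exit_sig_target(7, 0),	/* 1: CLONE_PARENT thread */
	    exit_sig_target(7, 100));	/* 2: ordinary clone()d thread */
	return (0);
}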
+ */ + if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) || + (ptid == -1)) + goto free; + + mutex_enter(&pidlock); + if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + goto free; + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if ((t = idtot(p, ptid)) == NULL) { + mutex_exit(&p->p_lock); + goto free; + } + } + + sqp->sq_info.si_signo = lwpd->br_signal; + sqp->sq_info.si_code = lwpd->br_exitwhy; + sqp->sq_info.si_status = lwpd->br_exitwhat; + sqp->sq_info.si_pid = lwpd->br_pid; + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + mutex_exit(&p->p_lock); + sqp = NULL; + } + +free: + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, lwpd->br_args_size); + } + if (sqp) + kmem_free(sqp, sizeof (sigqueue_t)); +} + +void +lx_freelwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; + vfs_t *cgrp; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + if (lwpd == NULL) { + /* + * There is one case where an LX branded process will possess + * LWPs which lack their own brand data. During the course of + * executing native binary, the process will be preemptively + * branded to allow hooks such as b_native_exec to function. + * If that process possesses multiple LWPS, they will _not_ be + * branded since they will exit if the exec succeeds. It's + * during this LWP exit that lx_freelwp would be called on an + * unbranded LWP. When that is the case, it is acceptable to + * bypass the hook. + */ + return; + } + + /* cgroup integration */ + lxzdata = ztolxzd(p->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + cgrp = lxzdata->lxzd_cgroup; + if (cgrp != NULL) { + VFS_HOLD(cgrp); + mutex_exit(&lxzdata->lxzd_lock); + ASSERT(lx_cgrp_freelwp != NULL); + (*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid, + lwpd->br_pid); + VFS_RELE(cgrp); + } else { + mutex_exit(&lxzdata->lxzd_lock); + } + + /* + * It is possible for the lx_freelwp hook to be called without a prior + * call to lx_exitlwp being made. This happens as part of lwp + * de-branding when a native binary is executed from a branded process. + * + * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well + * here in lx_freelwp. When the second call is redundant, the + * resources will already be freed and no work will be needed. + */ + lx_cleanlwp(lwp, p); + + /* + * Remove our system call interposer. + */ + lwp->lwp_brand_syscall = NULL; + + (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + if (lwpd->br_pid != 0) { + lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid); + } + + /* + * Discard the affinity mask. + */ + VERIFY(lwpd->br_affinitymask != NULL); + cpuset_free(lwpd->br_affinitymask); + lwpd->br_affinitymask = NULL; + + /* + * Ensure that lx_ptrace_exit() has been called to detach + * ptrace(2) tracers and tracees. + */ + VERIFY(lwpd->br_ptrace_tracer == NULL); + VERIFY(lwpd->br_ptrace_accord == NULL); + + lwp->lwp_brand = NULL; + kmem_free(lwpd, sizeof (struct lx_lwp_data)); +} + +void * +lx_lwpdata_alloc(proc_t *p) +{ + lx_lwp_data_t *lwpd; + struct lx_pid *lpidp; + cpuset_t *affmask; + pid_t newpid = 0; + struct pid *pidp = NULL; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * LWPs beyond the first will require a pid to be allocated to emulate + * Linux's goofy thread model. 
While this allocation may be + * unnecessary when a single-lwp process undergoes branding, it cannot + * be performed during b_initlwp due to p_lock being held. + */ + if (p->p_lwpcnt > 0) { + if ((newpid = pid_allocate(p, 0, 0)) < 0) { + return (NULL); + } + pidp = pid_find(newpid); + } + + lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP); + lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP); + affmask = cpuset_alloc(KM_SLEEP); + + lpidp->lxp_lpid = newpid; + lpidp->lxp_pidp = pidp; + lwpd->br_lpid = lpidp; + lwpd->br_affinitymask = affmask; + + return (lwpd); +} + +/* + * Free lwp brand data if an error occurred during lwp_create. + * Otherwise, lx_freelwp will be used to free the resources after they're + * associated with the lwp via lx_initlwp. + */ +void +lx_lwpdata_free(void *lwpbd) +{ + lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd; + VERIFY(lwpd != NULL); + VERIFY(lwpd->br_lpid != NULL); + VERIFY(lwpd->br_affinitymask != NULL); + + cpuset_free(lwpd->br_affinitymask); + if (lwpd->br_lpid->lxp_pidp != NULL) { + (void) pid_rele(lwpd->br_lpid->lxp_pidp); + } + kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid)); + kmem_free(lwpd, sizeof (*lwpd)); +} + +void +lx_initlwp(klwp_t *lwp, void *lwpbd) +{ + lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd; + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + kthread_t *tp = lwptot(lwp); + proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; + vfs_t *cgrp; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwp->lwp_brand == NULL); + + lwpd->br_exitwhy = CLD_EXITED; + lwpd->br_lwp = lwp; + lwpd->br_clear_ctidp = NULL; + lwpd->br_set_ctidp = NULL; + lwpd->br_signal = 0; + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + cpuset_all(lwpd->br_affinitymask); + + /* + * The first thread in a process has ppid set to the parent + * process's pid, and ptid set to -1. Subsequent threads in the + * process have their ppid set to the pid of the thread that + * created them, and their ptid to that thread's tid. + */ + if (tp->t_next == tp) { + lwpd->br_ppid = tp->t_procp->p_ppid; + lwpd->br_ptid = -1; + } else if (plwpd != NULL) { + bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls)); + lwpd->br_ppid = plwpd->br_pid; + lwpd->br_ptid = curthread->t_tid; + /* The child inherits the fs/gsbase values from the parent */ + lwpd->br_lx_fsbase = plwpd->br_lx_fsbase; + lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase; + lwpd->br_lx_gsbase = plwpd->br_lx_gsbase; + lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase; + } else { + /* + * Oddball case: the parent thread isn't a Linux process. + */ + lwpd->br_ppid = 0; + lwpd->br_ptid = -1; + } + lwp->lwp_brand = lwpd; + + /* + * When during lx_lwpdata_alloc, we must decide whether or not to + * allocate a new pid to associate with the lwp. Since p_lock is not + * held at that point, the only time we can guarantee a new pid isn't + * needed is when p_lwpcnt == 0. This is because other lwps won't be + * present to race with us with regards to pid allocation. + * + * This means that in all other cases (where p_lwpcnt > 0), we expect + * that lx_lwpdata_alloc will allocate a pid for us to use here, even + * if it is uneeded. If this process is undergoing an exec, for + * example, the single existing lwp will not need a new pid when it is + * rebranded. In that case, lx_pid_assign will free the uneeded pid. + */ + VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0); + + lx_pid_assign(tp, lwpd->br_lpid); + lwpd->br_tgid = lwpd->br_pid; + /* + * Having performed the lx pid assignement, the lpid reference is no + * longer needed. 
The underlying data will be freed during lx_freelwp. + */ + lwpd->br_lpid = NULL; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + + /* + * Install branded system call hooks for this LWP: + */ + lwp->lwp_brand_syscall = lx_syscall_enter; + + /* + * The new LWP inherits the parent LWP cgroup ID. + */ + if (plwpd != NULL) { + lwpd->br_cgroupid = plwpd->br_cgroupid; + } + /* + * The new LWP inherits the parent LWP emulated scheduling info. + */ + if (plwpd != NULL) { + lwpd->br_schd_class = plwpd->br_schd_class; + lwpd->br_schd_pri = plwpd->br_schd_pri; + lwpd->br_schd_flags = plwpd->br_schd_flags; + lwpd->br_schd_runtime = plwpd->br_schd_runtime; + lwpd->br_schd_deadline = plwpd->br_schd_deadline; + lwpd->br_schd_period = plwpd->br_schd_period; + } + lxzdata = ztolxzd(p->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + cgrp = lxzdata->lxzd_cgroup; + if (cgrp != NULL) { + VFS_HOLD(cgrp); + mutex_exit(&lxzdata->lxzd_lock); + ASSERT(lx_cgrp_initlwp != NULL); + (*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid, + lwpd->br_pid); + VFS_RELE(cgrp); + } else { + mutex_exit(&lxzdata->lxzd_lock); + } +} + +void +lx_initlwp_post(klwp_t *lwp) +{ + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + /* + * If the parent LWP has a ptrace(2) tracer, the new LWP may + * need to inherit that same tracer. + */ + if (plwpd != NULL) { + lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp)); + } +} + +/* + * There is no need to have any locking for either the source or + * destination struct lx_lwp_data structs. This is always run in the + * thread context of the source thread, and the destination thread is + * always newly created and not referred to from anywhere else. + */ +void +lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + struct lx_lwp_data *src = srclwp->lwp_brand; + struct lx_lwp_data *dst = dstlwp->lwp_brand; + + dst->br_ppid = src->br_pid; + dst->br_ptid = lwptot(srclwp)->t_tid; + bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls)); + + switch (src->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * The parent LWP has an alternate stack installed. + * The child LWP should have the same stack base and extent. + */ + dst->br_stack_mode = src->br_stack_mode; + dst->br_ntv_stack = src->br_ntv_stack; + dst->br_ntv_stack_current = src->br_ntv_stack_current; + break; + + default: + /* + * Otherwise, clear the stack data for this LWP. + */ + dst->br_stack_mode = LX_STACK_MODE_PREINIT; + dst->br_ntv_stack = 0; + dst->br_ntv_stack_current = 0; + } + + /* + * copy only these flags + */ + dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND; + dst->br_scall_args = NULL; + lx_affinity_forklwp(srclwp, dstlwp); + + /* + * Flag so child doesn't ptrace-stop on syscall exit. + */ + dst->br_ptrace_flags |= LX_PTF_NOSTOP; + + if (src->br_clone_grp_flags != 0) { + lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp), + lwptoproc(dstlwp)); + /* clone group no longer pending on this thread */ + src->br_clone_grp_flags = 0; + } +} + +/* + * When switching a Linux process off the CPU, clear its GDT entries. + */ +/* ARGSUSED */ +static void +lx_save(klwp_t *t) +{ + int i; + +#if defined(__amd64) + reset_sregs(); +#endif + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &null_udesc); +} + +/* + * When switching a Linux process on the CPU, set its GDT entries. + * + * For 64-bit code we don't have to worry about explicitly setting the + * %fsbase via wrmsr(MSR_AMD_FSBASE) here. 
Instead, that should happen + * automatically in update_sregs if we are executing in user-land. If this + * is the case then pcb_rupdate should be set. + */ +static void +lx_restore(klwp_t *t) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(t); + user_desc_t *tls; + int i; + + ASSERT(lwpd); + + tls = lwpd->br_tls; + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &tls[i]); +} + +void +lx_set_gdt(int entry, user_desc_t *descrp) +{ + + gdt_update_usegd(entry, descrp); +} + +void +lx_clear_gdt(int entry) +{ + gdt_update_usegd(entry, &null_udesc); +} + +longlong_t +lx_nosys() +{ + return (set_errno(ENOSYS)); +} + +/* + * Brand-specific routine to check if given non-Solaris standard segment + * register values should be modified to other values. + */ +/*ARGSUSED*/ +greg_t +lx_fixsegreg(greg_t sr, model_t datamodel) +{ + uint16_t idx = SELTOIDX(sr); + + ASSERT(sr == (sr & 0xffff)); + + /* + * If the segment selector is a valid TLS selector, just return it. + */ + if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX) + return (sr | SEL_UPL); + + /* + * Force the SR into the LDT in ring 3 for 32-bit processes. + * + * 64-bit processes get the null GDT selector since they are not + * allowed to have a private LDT. + */ +#if defined(__amd64) + return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0); +#elif defined(__i386) + datamodel = datamodel; /* datamodel currently unused for 32-bit */ + return (sr | SEL_TI_LDT | SEL_UPL); +#endif /* __amd64 */ +} + +/* + * Brand-specific function to convert the fsbase as pulled from the register + * into a native fsbase suitable for locating the ulwp_t from the kernel. + */ +uintptr_t +lx_fsbase(klwp_t *lwp, uintptr_t fsbase) +{ + lx_lwp_data_t *lwpd = lwp->lwp_brand; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND || + lwpd->br_ntv_fsbase == NULL) { + return (fsbase); + } + + return (lwpd->br_ntv_fsbase); +} + +/* + * These two functions simulate winfo and post_sigcld for the lx brand. The + * difference is delivering a designated signal as opposed to always SIGCLD. + */ +static void +lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat) +{ + ASSERT(MUTEX_HELD(&pidlock)); + bzero(ip, sizeof (k_siginfo_t)); + ip->si_signo = ltos_signo[dat->l_signal]; + ip->si_code = pp->p_wcode; + ip->si_pid = pp->p_pid; + ip->si_ctid = PRCTID(pp); + ip->si_zoneid = pp->p_zone->zone_id; + ip->si_status = pp->p_wdata; + /* + * These siginfo values are converted to USER_HZ in the user-land + * brand signal code. + */ + ip->si_stime = pp->p_stime; + ip->si_utime = pp->p_utime; +} + +static void +lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat) +{ + proc_t *pp = cp->p_parent; + + ASSERT(MUTEX_HELD(&pidlock)); + mutex_enter(&pp->p_lock); + /* + * Since Linux doesn't queue SIGCHLD, or any other non RT + * signals, we just blindly deliver whatever signal we can. + */ + ASSERT(sqp != NULL); + lx_winfo(cp, &sqp->sq_info, dat); + sigaddqa(pp, NULL, sqp); + sqp = NULL; + mutex_exit(&pp->p_lock); +} + + +/* + * Brand specific code for exiting and sending a signal to the parent, as + * opposed to sigcld(). + */ +void +lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp) +{ + proc_t *pp = cp->p_parent; + lx_proc_data_t *lx_brand_data = ptolxproc(cp); + ASSERT(MUTEX_HELD(&pidlock)); + + switch (cp->p_wcode) { + case CLD_EXITED: + case CLD_DUMPED: + case CLD_KILLED: + ASSERT(cp->p_stat == SZOMB); + /* + * The broadcast on p_srwchan_cv is a kludge to + * wakeup a possible thread in uadmin(A_SHUTDOWN). 
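
The selector fix-up in lx_fixsegreg() above is plain bit manipulation on an x86 segment selector: bits 3 and up hold the descriptor-table index, bit 2 selects the LDT, and the low two bits carry the requested privilege level. A small sketch of the same encoding follows; the SEL_* constants are written out literally here for illustration (in the kernel they come from the segment headers).

#include <stdio.h>

#define	SEL_UPL		3	/* user privilege level (ring 3) */
#define	SEL_TI_LDT	4	/* table-indicator bit: LDT rather than GDT */

/* Extract the descriptor index from a selector, as SELTOIDX() does. */
static unsigned int
sel_to_idx(unsigned int sel)
{
	return (sel >> 3);
}

/* Force a selector into the LDT at ring 3, as lx_fixsegreg() does for ILP32. */
static unsigned int
fix_segreg_ilp32(unsigned int sel)
{
	return (sel | SEL_TI_LDT | SEL_UPL);
}

int
main(void)
{
	unsigned int sel = 6 << 3;	/* GDT selector for descriptor 6 */

	/* Prints "index=6 fixed=0x37". */
	printf("index=%u fixed=0x%x\n", sel_to_idx(sel), fix_segreg_ilp32(sel));
	return (0);
}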
+ */ + cv_broadcast(&cp->p_srwchan_cv); + + /* + * Add to newstate list of the parent + */ + add_ns(pp, cp); + + cv_broadcast(&pp->p_cv); + if ((pp->p_flag & SNOWAIT) || + PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) { + if (!(cp->p_pidflag & CLDWAITPID)) + freeproc(cp); + } else if (!(cp->p_pidflag & CLDNOSIGCHLD) && + lx_brand_data->l_signal != 0) { + lx_post_exit_sig(cp, sqp, lx_brand_data); + sqp = NULL; + } + break; + + case CLD_STOPPED: + case CLD_CONTINUED: + case CLD_TRAPPED: + panic("Should not be called in this case"); + } + + if (sqp) + siginfofree(sqp); +} + +/* + * Filters based on arguments that have been passed in by a separate syscall + * using the B_STORE_ARGS mechanism. if the __WALL flag is set, no filter is + * applied, otherwise we look at the difference between a clone and non-clone + * process. + * The definition of a clone process in Linux is a thread that does not deliver + * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone + * processes. Without that option, a process should only wait on normal + * children. The following table shows the cases. + * + * default __WCLONE + * no SIGCHLD - X + * SIGCHLD X - + * + * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on + * process exit. + * + * More information on wait in lx brands can be found at + * usr/src/lib/brand/lx/lx_brand/common/wait.c. + */ +/* ARGSUSED */ +boolean_t +lx_wait_filter(proc_t *pp, proc_t *cp) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int flags = lwpd->br_waitid_flags; + boolean_t ret; + + if (!lwpd->br_waitid_emulate) { + return (B_TRUE); + } + + mutex_enter(&cp->p_lock); + if (flags & LX_WALL) { + ret = B_TRUE; + } else { + lx_proc_data_t *pd = ptolxproc(cp); + boolean_t is_sigchld = B_TRUE; + boolean_t match_wclone = B_FALSE; + + /* + * When calling clone, an alternate signal can be chosen to + * deliver to the parent when the child exits. + */ + if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) { + is_sigchld = B_FALSE; + } + if ((flags & LX_WCLONE) != 0) { + match_wclone = B_TRUE; + } + + ret = (match_wclone ^ is_sigchld) ? 
B_TRUE : B_FALSE; + } + mutex_exit(&cp->p_lock); + + return (ret); +} + +void +lx_ifname_convert(char *ifname, lx_if_action_t act) +{ + if (act == LX_IF_TONATIVE) { + if (strncmp(ifname, "lo", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo0", IFNAMSIZ); + } else { + if (strncmp(ifname, "lo0", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo", IFNAMSIZ); + } +} + +void +lx_ifflags_convert(uint64_t *flags, lx_if_action_t act) +{ + uint64_t buf; + + buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG | + IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS | + IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI); + + /* Linux has different shift for multicast flag */ + if (act == LX_IF_TONATIVE) { + if (*flags & 0x1000) + buf |= IFF_MULTICAST; + } else { + if (*flags & IFF_MULTICAST) + buf |= 0x1000; + } + *flags = buf; +} + +/* + * Convert an IPv6 address into the numbers used by /proc/net/if_inet6 + */ +unsigned int +lx_ipv6_scope_convert(const in6_addr_t *addr) +{ + if (IN6_IS_ADDR_V4COMPAT(addr)) { + return (LX_IPV6_ADDR_COMPATv4); + } else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) { + return (LX_IPV6_ADDR_LOOPBACK); + } else if (IN6_IS_ADDR_LINKLOCAL(addr)) { + return (LX_IPV6_ADDR_LINKLOCAL); + } else if (IN6_IS_ADDR_SITELOCAL(addr)) { + return (LX_IPV6_ADDR_SITELOCAL); + } else { + return (0x0000U); + } +} + + +void +lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size) +{ + int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data)); + + switch (src->sdl_type) { + case DL_ETHER: + dst->sa_family = LX_ARPHRD_ETHER; + break; + case DL_LOOP: + dst->sa_family = LX_ARPHRD_LOOPBACK; + break; + default: + dst->sa_family = LX_ARPHRD_VOID; + } + + bcopy(LLADDR(src), dst->sa_data, copy_size); + *size = copy_size; +} + +/* + * Brand hook to convert native kernel siginfo signal number, errno, code, pid + * and si_status to Linux values. Similar to the stol_ksiginfo function but + * this one converts in-place, converts the pid, and does not copyout. 
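
The table in the lx_wait_filter() comment above reduces to an exclusive-or: under the default rules a child is waitable when it delivers SIGCHLD, under __WCLONE when it does not, and __WALL matches everything. A compact sketch of that predicate (the flag values and names here are illustrative; the real flags come from the Linux wait headers):

#include <signal.h>
#include <stdio.h>

#define	LX_WCLONE	0x80000000u	/* illustrative flag values */
#define	LX_WALL		0x40000000u

/*
 * Return 1 if a child whose exit signal is 'exit_sig' should be matched by
 * a waiter using 'flags', mirroring the XOR described in lx_wait_filter().
 */
static int
wait_matches(unsigned int flags, int exit_sig)
{
	int is_sigchld = (exit_sig == SIGCHLD);
	int want_clone = (flags & LX_WCLONE) != 0;

	if (flags & LX_WALL)
		return (1);
	return (want_clone ^ is_sigchld);
}

int
main(void)
{
	/* Default waiter: matches the SIGCHLD child (1), not the clone (0). */
	printf("%d %d\n", wait_matches(0, SIGCHLD), wait_matches(0, 0));
	/* __WCLONE waiter: matches only the clone. */
	printf("%d %d\n", wait_matches(LX_WCLONE, SIGCHLD),
	    wait_matches(LX_WCLONE, 0));
	return (0);
}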
+ */ +void +lx_sigfd_translate(k_siginfo_t *infop) +{ + zone_t *zone = curproc->p_zone; + + infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL); + infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL); + infop->si_code = lx_stol_sigcode(infop->si_code); + infop->si_errno = lx_errno(infop->si_errno, EINVAL); + + /* Map zsched and zone init to pid 1 */ + if (infop->si_pid == zone->zone_proc_initpid || + infop->si_pid == zone->zone_zsched->p_pid) { + infop->si_pid = 1; + } +} + +int +stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip) +{ + lx_siginfo_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime); + lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime); + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +int +stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip) +{ + lx_siginfo32_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime); + lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime); + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif + +/* + * Linux uses the original bounds of the argv array when determining the + * contents of /proc/<pid/cmdline. We mimic those bounds using argv[0] and + * envp[0] as the beginning and end, respectively. + */ +void +lx_read_argv_bounds(proc_t *p) +{ + user_t *up = PTOU(p); + lx_proc_data_t *pd = ptolxproc(p); + uintptr_t addr_arg = up->u_argv; + uintptr_t addr_env = up->u_envp; + uintptr_t arg_start = 0, env_start = 0, env_end = 0; + int i = 0; + + VERIFY(pd != NULL); + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Use AT_SUN_PLATFORM in the aux vector to find the end of the envp + * strings. + */ + for (i = 0; i < __KERN_NAUXV_IMPL; i++) { + if (up->u_auxv[i].a_type == AT_SUN_PLATFORM) { + env_end = (uintptr_t)up->u_auxv[i].a_un.a_val; + } + } + + /* + * If we come through here for a kernel process (zsched), which happens + * with our cgroupfs when we fork the release agent, then u_argv and + * u_envp will be NULL. 
While this won't cause a failure, it does + * cause a lot of overhead when the fuword causes a fault, which leads + * to a large amount of stack growth and anonymous memory allocation, + * all of which is pointless since the first page can't be mapped. + */ + if (addr_arg != NULL || addr_env != NULL) { + mutex_exit(&p->p_lock); +#if defined(_LP64) + if (p->p_model != DATAMODEL_NATIVE) { + uint32_t buf32; + if (fuword32((void *)addr_arg, &buf32) == 0) { + arg_start = (uintptr_t)buf32; + } + if (fuword32((void *)addr_env, &buf32) == 0) { + env_start = (uintptr_t)buf32; + } + } else +#endif /* defined(_LP64) */ + { + ulong_t buf; + if (fulword((void *)addr_arg, &buf) == 0) { + arg_start = (uintptr_t)buf; + } + if (fulword((void *)addr_env, &buf) == 0) { + env_start = (uintptr_t)buf; + } + } + mutex_enter(&p->p_lock); + } + + pd->l_args_start = arg_start; + pd->l_envs_start = env_start; + pd->l_envs_end = env_end; +} + +/* Given an LX LWP, determine where user register state is stored. */ +lx_regs_location_t +lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write) +{ + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + /* + * The LWP was stopped with the brand stack and register state + * loaded, e.g. during a syscall emulated within the kernel. + */ + return (LX_REG_LOC_LWP); + + case LX_STACK_MODE_PREINIT: + if (for_write) { + /* setting registers not allowed in this state */ + break; + } + if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED || + lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) { + /* The LWP was stopped by tracing on exec. */ + return (LX_REG_LOC_LWP); + } + break; + + case LX_STACK_MODE_NATIVE: + if (for_write) { + /* setting registers not allowed in this state */ + break; + } + if (lwpd->br_ptrace_whystop == PR_BRAND) { + /* Called while ptrace-event-stopped by lx_exec. */ + if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) { + return (LX_REG_LOC_LWP); + } + + /* Called while ptrace-event-stopped after clone. */ + if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED && + lwpd->br_ptrace_stopsig == LX_SIGSTOP && + (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) { + return (LX_REG_LOC_LWP); + } + + /* + * Called to obtain syscall exit for other cases + * (e.g. pseudo return from rt_sigreturn). + */ + if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT && + (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) { + return (LX_REG_LOC_LWP); + } + } + break; + default: + break; + } + + if (lwpd->br_ptrace_stopucp != NULL) { + /* + * The LWP was stopped in the usermode emulation library + * but a ucontext_t for the preserved brand stack and + * register state was provided. Return the register state + * from that ucontext_t. + */ + VERIFY(ucp != NULL); + *ucp = (void *)lwpd->br_ptrace_stopucp; + return (LX_REG_LOC_UCP); + } + + return (LX_REG_LOC_UNAVAIL); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c new file mode 100644 index 0000000000..8439a23e58 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_pid.c @@ -0,0 +1,499 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> + +#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */ +static int hash_len = 4; /* desired average hash chain length */ +static int hash_size; /* no of buckets in the hash table */ + +static struct lx_pid **stol_pid_hash; +static struct lx_pid **ltos_pid_hash; + +#define LTOS_HASH(pid) ((pid) & (hash_size - 1)) +#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1)) + +static kmutex_t hash_lock; + +static void +lx_pid_insert_hash(struct lx_pid *lpidp) +{ + int shash = STOL_HASH(lpidp->lxp_spid, lpidp->lxp_stid); + int lhash = LTOS_HASH(lpidp->lxp_lpid); + + ASSERT(MUTEX_HELD(&hash_lock)); + + lpidp->lxp_stol_next = stol_pid_hash[shash]; + stol_pid_hash[shash] = lpidp; + + lpidp->lxp_ltos_next = ltos_pid_hash[lhash]; + ltos_pid_hash[lhash] = lpidp; +} + +static struct lx_pid * +lx_pid_remove_hash(pid_t pid, id_t tid) +{ + struct lx_pid **hpp; + struct lx_pid *lpidp = NULL; + + ASSERT(MUTEX_HELD(&hash_lock)); + + hpp = &stol_pid_hash[STOL_HASH(pid, tid)]; + while (*hpp) { + if ((*hpp)->lxp_spid == pid && (*hpp)->lxp_stid == tid) { + lpidp = *hpp; + *hpp = (*hpp)->lxp_stol_next; + break; + } + hpp = &(*hpp)->lxp_stol_next; + } + + /* + * when called during error recovery the pid may already + * be released + */ + if (lpidp == NULL) + return (NULL); + + hpp = &ltos_pid_hash[LTOS_HASH(lpidp->lxp_lpid)]; + while (*hpp) { + if (*hpp == lpidp) { + *hpp = lpidp->lxp_ltos_next; + break; + } + hpp = &(*hpp)->lxp_ltos_next; + } + + return (lpidp); +} + +/* + * given a solaris pid/tid pair, create a linux pid + */ +void +lx_pid_assign(kthread_t *t, struct lx_pid *lpidp) +{ + proc_t *p = ttoproc(t); + lx_lwp_data_t *lwpd = ttolxlwp(t); + pid_t spid = p->p_pid; + id_t stid = t->t_tid; + + /* + * When lx_initlwp is called from lx_setbrand, p_lwpcnt will already be + * equal to 1. Since lx_initlwp is being called against an lwp that + * already exists, an additional pid allocation is not necessary. + * + * We check for this by testing br_ppid == 0. + */ + if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) { + /* + * Assign allocated pid to any thread other than the first. + * The lpid and pidp fields should be populated. + */ + VERIFY(lpidp->lxp_pidp != NULL); + VERIFY(lpidp->lxp_lpid != 0); + } else { + /* + * There are cases where a pid is speculatively allocated but + * is not needed. We are obligated to free it here. + */ + if (lpidp->lxp_pidp != NULL) { + (void) pid_rele(lpidp->lxp_pidp); + } + lpidp->lxp_pidp = NULL; + lpidp->lxp_lpid = spid; + } + + lpidp->lxp_spid = spid; + lpidp->lxp_stid = stid; + lpidp->lxp_start = t->t_start; + lpidp->lxp_procp = p; + + /* + * Now place the pid into the Linux-SunOS and SunOS-Linux conversion + * hash tables.
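+ * Each table can then be walked under hash_lock to convert in either
+ * direction; as an illustrative sketch (mirroring lx_lpid_to_spair()
+ * below), a Linux-to-SunOS lookup amounts to:
+ *
+ *        struct lx_pid *hp;
+ *
+ *        mutex_enter(&hash_lock);
+ *        for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL;
+ *            hp = hp->lxp_ltos_next) {
+ *                if (hp->lxp_lpid == lpid)
+ *                        break;  /* found (hp->lxp_spid, hp->lxp_stid) */
+ *        }
+ *        mutex_exit(&hash_lock);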
+ */ + mutex_enter(&hash_lock); + lx_pid_insert_hash(lpidp); + mutex_exit(&hash_lock); + + lwpd->br_pid = lpidp->lxp_lpid; +} + +/* + * If we are exec()ing the process, this thread's tid is about to be reset + * to 1. Make sure the Linux PID bookkeeping reflects that change. + */ +void +lx_pid_reassign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + struct pid *old_pidp; + struct lx_pid *lpidp; + + ASSERT(p->p_lwpcnt == 1); + + mutex_enter(&hash_lock); + + /* + * Clean up all the traces of this thread's 'fake' Linux PID. + */ + lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid); + ASSERT(lpidp != NULL); + old_pidp = lpidp->lxp_pidp; + lpidp->lxp_pidp = NULL; + + /* + * Now register this thread as (pid, 1). + */ + lpidp->lxp_lpid = p->p_pid; + lpidp->lxp_spid = p->p_pid; + lpidp->lxp_stid = 1; + lx_pid_insert_hash(lpidp); + + mutex_exit(&hash_lock); + + if (old_pidp) + (void) pid_rele(old_pidp); +} + +/* + * release a solaris pid/tid pair + */ +void +lx_pid_rele(pid_t pid, id_t tid) +{ + struct lx_pid *lpidp; + + mutex_enter(&hash_lock); + lpidp = lx_pid_remove_hash(pid, tid); + mutex_exit(&hash_lock); + + if (lpidp) { + if (lpidp->lxp_pidp) + (void) pid_rele(lpidp->lxp_pidp); + + kmem_free(lpidp, sizeof (*lpidp)); + } +} + +/* + * given a linux pid, return the solaris pid/tid pair + */ +int +lx_lpid_to_spair(pid_t lpid, pid_t *spid, id_t *stid) +{ + struct lx_pid *hp; + + if (lpid == 1) { + pid_t initpid; + + /* + * We are trying to look up the Linux init process for the + * current zone, which we pretend has pid 1. + */ + if ((initpid = curzone->zone_proc_initpid) == -1) { + /* + * We could not find the init process for this zone. + */ + return (-1); + } + + if (spid != NULL) + *spid = initpid; + if (stid != NULL) + *stid = 1; + + return (0); + } + + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL; + hp = hp->lxp_ltos_next) { + if (hp->lxp_lpid == lpid) { + if (spid) + *spid = hp->lxp_spid; + if (stid) + *stid = hp->lxp_stid; + break; + } + } + mutex_exit(&hash_lock); + if (hp != NULL) + return (0); + + /* + * We didn't find this pid in our translation table. + * But this still could be the pid of a native process + * running in the current zone so check for that here. + * + * Note that prfind() only searches for processes in the current zone. + */ + mutex_enter(&pidlock); + if (prfind(lpid) != NULL) { + mutex_exit(&pidlock); + if (spid) + *spid = lpid; + if (stid) + *stid = 0; + return (0); + } + mutex_exit(&pidlock); + + return (-1); +} + +/* + * Given a Linux pid, locate the proc_t and optionally acquire P_PR_LOCK. + * Returns 0 on success with p_lock held for the proc_t in question. + */ +int +lx_lpid_lock(pid_t lpid, zone_t *zone, lx_pid_flag_t flag, proc_t **pp, + kthread_t **tp) +{ + proc_t *p; + kthread_t *t; + id_t tid = 0; + + ASSERT(MUTEX_NOT_HELD(&pidlock)); + ASSERT(pp != NULL); + ASSERT(zone != NULL && zone->zone_brand == &lx_brand); + +retry: + p = NULL; + if (lpid == 1) { + pid_t initpid; + + /* + * Look up the init process for the zone. 
+ */ + if ((initpid = zone->zone_proc_initpid) <= 0) { + return (-1); + } + mutex_enter(&pidlock); + p = prfind_zone(initpid, zone->zone_id); + tid = 0; + } else { + struct lx_pid *hp; + + mutex_enter(&pidlock); + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL; + hp = hp->lxp_ltos_next) { + if (hp->lxp_lpid == lpid) { + tid = hp->lxp_stid; + p = hp->lxp_procp; + break; + } + } + mutex_exit(&hash_lock); + /* + * If the pid wasn't listed in the ltos hash, it may correspond + * to an native process in the zone. + */ + if (p == NULL) { + p = prfind_zone(lpid, zone->zone_id); + tid = 0; + } + } + + if (p == NULL) { + mutex_exit(&pidlock); + return (-1); + } + + /* + * Bail on processes belonging to the system, those which are not yet + * complete and zombies (unless explicitly allowed via the flags). + */ + if (p->p_stat == SIDL || (p->p_flag & SSYS) != 0 || + (p->p_stat == SZOMB && (flag & LXP_ZOMBOK) == 0)) { + mutex_exit(&pidlock); + return (-1); + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (flag & LXP_PRLOCK) { + /* + * It would be convenient to call sprtrylock_proc() for this + * task. Unfortunately, its behavior of filtering zombies is + * excessive for some lx_proc use cases. Instead, when the + * provided flags do not indicate that zombies are allowed, + * exiting processes are filtered out (as would be performed by + * sprtrylock_proc). + */ + if ((p->p_flag & (SEXITING|SEXITLWPS)) != 0 && + (flag & LXP_ZOMBOK) == 0) { + mutex_exit(&p->p_lock); + return (-1); + } + if (p->p_proc_flag & P_PR_LOCK) { + sprwaitlock_proc(p); + goto retry; + } else { + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + } + } + + if (tid == 0) { + t = p->p_tlist; + } else { + lwpdir_t *ld; + + ld = lwp_hash_lookup(p, tid); + if (ld == NULL) { + if (flag & LXP_PRLOCK) { + sprunprlock(p); + } + mutex_exit(&p->p_lock); + return (-1); + } + t = ld->ld_entry->le_thread; + } + *pp = p; + if (tp != NULL) { + *tp = t; + } + return (0); +} + + +/* + * Given an lwp, return the Linux pid of its parent. If the caller + * wants them, we return the SunOS (pid, tid) as well. + */ +pid_t +lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + const pid_t zoneinit = p->p_zone->zone_proc_initpid; + const pid_t ppid = p->p_ppid; + + /* + * Report a ppid of 1 for processes which are children to either init + * or a process outside the zone. + */ + if (ppid == zoneinit || (p->p_flag & SZONETOP) != 0) { + goto ppid_is_zinit; + } + + /* + * Our native concept of a 'parent pid' matches Linux in two cases: + * + * - TGID and PID are equal: This is either the first thread in the + * process or one created with CLONE_THREAD. + * + * - The brand lwp value for PPID is 0: This is either the child of a + * differently-branded process or was created with the CLONE_PARENT. + */ + if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) { + if (ppidp != NULL) + *ppidp = ppid; + if (ptidp != NULL) + *ptidp = -1; + return (ppid); + } + + /* + * In all other cases, we are looking for the parent of this specific + * thread, which in Linux refers to the thread that clone(2)d it. We + * stashed that thread's PID away when this thread was created. 
+ */ + mutex_enter(&hash_lock); + for (struct lx_pid *hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; + hp != NULL; hp = hp->lxp_ltos_next) { + if (lwpd->br_ppid == hp->lxp_lpid) { + /* + * The PID matches, but there are a couple cases when + * the translation is not suitable: + * + * - The cached start time is too young, indicating + * that the thread exited and the PID was reused by + * another process. + * - The parent is zoneinit + * + * In both cases, a result of ppid=1 is yielded. + */ + if (hp->lxp_start > lwptot(lwp)->t_start || + lwpd->br_ppid == zoneinit) { + break; + } + + /* Good match, yield the result */ + if (ppidp != NULL) + *ppidp = hp->lxp_spid; + if (ptidp != NULL) + *ptidp = hp->lxp_stid; + mutex_exit(&hash_lock); + return (lwpd->br_ppid); + } + } + mutex_exit(&hash_lock); + /* + * If no match is found in the Linux->SunOS translation hash, fall back + * to assuming the zone init process as the parent. + */ + +ppid_is_zinit: + if (ppidp != NULL) + *ppidp = 1; + if (ptidp != NULL) + *ptidp = -1; + return (1); +} + +void +lx_pid_init(void) +{ + hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR)); + + stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + + mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +lx_pid_fini(void) +{ + kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size); + kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c new file mode 100644 index 0000000000..252f83fd3f --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c @@ -0,0 +1,2710 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Emulation of the Linux ptrace(2) interface. + * + * OVERVIEW + * + * The Linux process model is somewhat different from the illumos native + * model. One critical difference is that each Linux thread has a unique + * identifier in the pid namespace. The lx brand assigns a pid to each LWP + * within the emulated process, giving the pid of the process itself to the + * first LWP. + * + * The Linux ptrace(2) interface allows for any LWP in a branded process to + * exert control over any other LWP within the same zone. Control is exerted + * by the use of the ptrace(2) system call itself, which accepts a number of + * request codes. Feedback on traced events is primarily received by the + * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system + * calls. Many of the possible ptrace(2) requests will only succeed if the + * target LWP is in a "ptrace-stop" condition. + * + * HISTORY + * + * The brand support for ptrace(2) was originally built on top of the rich + * support for debugging and tracing provided through the illumos /proc + * interfaces, mounted at /native/proc within the zone. The native legacy + * ptrace(3C) functionality was used as a starting point, but was generally + * insufficient for complete and precise emulation. 
The extant legacy + * interface, and indeed our native SIGCLD and waitid(2) facilities, are + * focused on _process_ level concerns -- the Linux interface has been + * extended to be aware of LWPs as well. + * + * In order to allow us to focus on providing more complete and accurate + * emulation without extensive and undesirable changes to the native + * facilities, this second generation ptrace(2) emulation is mostly separate + * from any other tracing or debugging framework in the system. + * + * ATTACHING TRACERS TO TRACEES + * + * There are several ways that a child LWP may becomed traced by a tracer. + * To determine which attach method caused a tracee to become attached, one + * may inspect the "br_ptrace_attach" member of the LWP-specific brand data + * with the debugger. + * + * The first attach methods to consider are the attaching ptrace(2) requests: + * + * PTRACE_TRACEME + * + * If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee + * to its parent LWP (br_ppid). Using PTRACE_TRACEME does _not_ cause the + * tracee to be held in a stop condition. It is common practice for + * consumers to raise(SIGSTOP) immediately afterward. + * + * PTRACE_ATTACH + * + * An LWP may attempt to trace any other LWP in this, or another, process. + * We currently allow any attach where the process containing the tracer + * LWP has permission to write to /proc for the process containing the + * intended tracer. This action also sends a SIGSTOP to the newly attached + * tracee. + * + * The second class of attach methods are the clone(2)/fork(2) inheritance + * options that may be set on a tracee with PTRACE_SETOPTIONS: + * + * PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE + * + * If these options have been set on a tracee, then a fork(2), vfork(2) or + * clone(2) respectively will cause the newly created LWP to be traced by + * the same tracer. The same set of ptrace(2) options will also be set on + * the new child. + * + * The third class of attach method is the PTRACE_CLONE flag to clone(2). + * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is + * passed by the tracee as an argument to clone(2). + * + * DETACHING TRACEES + * + * Tracees can be detached by the tracer with the PTRACE_DETACH request. + * This request is only valid when the tracee is in a ptrace(2) stop + * condition, and is itself a restarting action. + * + * If the tracer exits without detaching all of its tracees, then all of the + * tracees are automatically detached and restarted. If a tracee was in + * "signal-delivery-stop" at the time the tracer exited, the signal will be + * released to the child unless it is a SIGSTOP. We drop this instance of + * SIGSTOP in order to prevent the child from becoming stopped by job + * control. + * + * ACCORD ALLOCATION AND MANAGEMENT + * + * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP + * and zero or more tracee LWPs. It is explicitly illegal for a tracee to + * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME. + * + * An LWP starts out without an accord. If a child of that LWP calls + * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses + * PTRACE_ATTACH, an accord will be allocated and stored on that LWP. The + * accord structure is not released from that LWP until it arrives in + * lx_exitlwp(), as called by lwp_exit(). A new accord will not be + * allocated, even if one does not exist, once an LWP arrives in lx_exitlwp() + * and sets the LX_PTF_EXITING flag. 
An LWP will have at most one accord + * structure throughout its entire lifecycle; once it has one, it has the + * same one until death. + * + * The accord is reference counted (lxpa_refcnt), starting at a count of one + * at creation to represent the link from the tracer LWP to its accord. The + * accord is not freed until the reference count falls to zero. + * + * To make mutual exclusion between a detaching tracer and various notifying + * tracees simpler, the tracer will hold "pidlock" while it clears the + * accord members that point back to the tracer LWP and CV. + * + * SIGNALS AND JOB CONTROL + * + * Various actions, either directly ptrace(2) related or commonly associated + * with tracing, cause process- or thread-directed SIGSTOP signals to be sent + * to tracees (a "signal-delivery-stop"). These signals, and indeed any signal + * other than SIGKILL, can be suppressed by the tracer when using a restarting + * request (including PTRACE_DETACH) on a child. The signal may also be + * substituted for a different signal. + * + * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer, + * it will induce the regular illumos native job control stop of the entire + * traced process. This is at least passingly similar to the Linux "group + * stop" ptrace(2) condition. + * + * SYSTEM CALL TRACING + * + * The ptrace(2) interface enables the tracer to hold the tracee on entry and + * exit from system calls. When a stopped tracee is restarted through the + * PTRACE_SYSCALL request, the LX_PTF_SYSCALL flag is set until the next + * system call boundary. Whether this is a "syscall-entry-stop" or + * "syscall-exit-stop", the tracee is held and the tracer is notified via + * SIGCLD/waitpid(2) in the usual way. The flag LX_PTF_SYSCALL flag is + * cleared after each stop; for ongoing system call tracing the tracee must + * be continuously restarted with PTRACE_SYSCALL. + * + * SPECIAL CASES FOR STOP EVENTS + * + * The strace command is one of the primary consumers of ptrace. In order for + * strace to properly understand what is actually happening when it receives a + * signal associated with a stop event, these signals must match Linux behavior + * exactly or the strace consumer will get out of sync and report incorrect + * state. There are a couple of special cases we have to handle to provide + * proper interaction of the syscall-entry-stop, syscall-exit-stop, and + * signal-delivery-stop events: + * 1) The child process of a clone/fork does not emit a syscall-exit-stop event. + * 2) A signal that arrives between syscall-enter-stop & syscall-exit-stop must + * not immediately emit signal-delivery-stop. This event must be emitted + * after the syscall is interrupted and syscall-exit-stop has been emitted. + * + * EVENT STOPS + * + * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are + * enabled by the tracer through PTRACE_SETOPTIONS. Once enabled, the tracee + * will be stopped at the nominated points of interest and the tracer + * notified. The tracer may request additional information about the event, + * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG. + * + * LOCK ORDERING RULES + * + * It is not safe, in general, to hold p_lock for two different processes at + * the same time. This constraint is the primary reason for the existence + * (and complexity) of the ptrace(2) accord mechanism. 
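+ *
+ * As a point of reference for the sections above, a consumer such as
+ * strace(1) drives this interface along roughly the following lines
+ * (an illustrative userland sketch only; headers, declarations and
+ * error handling are omitted):
+ *
+ *        pid_t pid = fork();
+ *        if (pid == 0) {
+ *                (void) ptrace(PTRACE_TRACEME, 0, NULL, NULL);
+ *                raise(SIGSTOP);         /* see PTRACE_TRACEME, above */
+ *                execlp("true", "true", (char *)NULL);
+ *                _exit(127);
+ *        }
+ *        (void) waitpid(pid, &status, 0);        /* initial SIGSTOP */
+ *        (void) ptrace(PTRACE_SETOPTIONS, pid, NULL,
+ *            PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACECLONE);
+ *        for (;;) {
+ *                /* restart the tracee to the next syscall boundary */
+ *                (void) ptrace(PTRACE_SYSCALL, pid, NULL, 0);
+ *                if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
+ *                        break;
+ *                if (WIFSTOPPED(status) &&
+ *                    WSTOPSIG(status) == (SIGTRAP | 0x80)) {
+ *                        /* syscall-entry-stop or syscall-exit-stop */
+ *                }
+ *        }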
+ * + * In order to facilitate looking up accords by the "pid" of a tracer LWP, + * p_lock for the tracer process may be held while entering the accord mutex + * (lxpa_lock). This mutex protects the accord flags and reference count. + * The reference count is manipulated through lx_ptrace_accord_hold() and + * lx_ptrace_accord_rele(). + * + * DO NOT interact with the accord mutex (lxpa_lock) directly. The + * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various + * book-keeping and lock ordering enforcement and MUST be used. + * + * It is NOT legal to take ANY p_lock while holding the accord mutex + * (lxpa_lock). If the lxpa_tracees_lock is to be held concurrently with + * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock + * of any processes from the tracee list. + * + * It is NOT legal to take a tracee p_lock and then attempt to enter the + * accord mutex (or tracee list mutex) of its tracer. When running as the + * tracee LWP, the tracee's hold will prevent the accord from being freed. + * Use of the LX_PTF_STOPPING or LX_PTF_CLONING flag in the LWP-specific brand + * data prevents an exiting tracer from altering the tracee until the tracee + * has come to an orderly stop, without requiring the tracee to hold its own + * p_lock the entire time it is stopping. + * + * It is not safe, in general, to enter "pidlock" while holding the p_lock of + * any process. It is similarly illegal to hold any accord locks (lxpa_lock + * or lxpa_sublock) while attempting to enter "pidlock". As "pidlock" is a + * global mutex, it should be held for the shortest possible time. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/procfs.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/wait.h> +#include <sys/prsystm.h> +#include <sys/note.h> + +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> +#include <lx_signum.h> + + +typedef enum lx_ptrace_cont_flags_t { + LX_PTC_NONE = 0x00, + LX_PTC_SYSCALL = 0x01, + LX_PTC_SINGLESTEP = 0x02 +} lx_ptrace_cont_flags_t; + + +extern int lx_user_regs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_regs_copyout(lx_lwp_data_t *, void *); +extern int lx_ptrace_peekuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_ptrace_pokeuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_user_fpregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpregs_copyout(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyout(lx_lwp_data_t *, void *); + +/* + * Macros for checking the state of an LWP via "br_ptrace_flags": + */ +#define LX_PTRACE_BUSY \ + (LX_PTF_EXITING | LX_PTF_STOPPING | LX_PTF_CLONING) + +#define VISIBLE(a) (((a)->br_ptrace_flags & LX_PTF_EXITING) == 0) +#define TRACEE_BUSY(a) (((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0) + +#define ACCORD_HELD(a) MUTEX_HELD(&(a)->lxpa_lock) + +#define LX_PID_TO_INIT(x) ((x) == curproc->p_zone->zone_proc_initpid ? \ + 1 : (x)) +#define LX_INIT_TO_PID(x) ((x) == 1 ? \ + curproc->p_zone->zone_proc_initpid : (x)) + +static kcondvar_t lx_ptrace_busy_cv; +static kmem_cache_t *lx_ptrace_accord_cache; + +/* + * Enter the accord mutex. + */ +static void +lx_ptrace_accord_enter(lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + + mutex_enter(&accord->lxpa_lock); +} + +/* + * Exit the accord mutex. 
If the reference count has dropped to zero, + * free the accord. + */ +static void +lx_ptrace_accord_exit(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + if (accord->lxpa_refcnt > 0) { + mutex_exit(&accord->lxpa_lock); + return; + } + + /* + * When the reference count drops to zero we must free the accord. + */ + VERIFY(accord->lxpa_tracer == NULL); + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + VERIFY(list_is_empty(&accord->lxpa_tracees)); + VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE); + + mutex_destroy(&accord->lxpa_lock); + mutex_destroy(&accord->lxpa_tracees_lock); + + kmem_cache_free(lx_ptrace_accord_cache, accord); +} + +/* + * Drop our reference to this accord. If this drops the reference count + * to zero, the next lx_ptrace_accord_exit() will free the accord. + */ +static void +lx_ptrace_accord_rele(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + VERIFY(accord->lxpa_refcnt > 0); + accord->lxpa_refcnt--; +} + +/* + * Place an additional hold on an accord. + */ +static void +lx_ptrace_accord_hold(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + accord->lxpa_refcnt++; +} + +/* + * Fetch the accord for this LWP. If one has not yet been created, and the + * process is not exiting, allocate it now. Must be called with p_lock held + * for the process containing the target LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). + */ +static int +lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp, + boolean_t allocate_one) +{ + lx_ptrace_accord_t *lxpa; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * If this LWP does not have an accord, we wish to allocate + * and install one. + */ + if ((lxpa = lwpd->br_ptrace_accord) == NULL) { + if (!allocate_one || !VISIBLE(lwpd)) { + /* + * Either we do not wish to allocate an accord, or this + * LWP has already begun exiting from a ptrace + * perspective. + */ + *accordp = NULL; + return (ESRCH); + } + + lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP); + bzero(lxpa, sizeof (*lxpa)); + + /* + * The initial reference count is 1 because we are referencing + * it in from the soon-to-be tracer LWP. + */ + lxpa->lxpa_refcnt = 1; + mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t), + offsetof(lx_lwp_data_t, br_ptrace_linkage)); + lxpa->lxpa_cvp = &p->p_cv; + + lxpa->lxpa_tracer = lwpd; + lwpd->br_ptrace_accord = lxpa; + } + + /* + * Lock the accord before returning it to the caller. + */ + lx_ptrace_accord_enter(lxpa); + + /* + * There should be at least one active reference to this accord, + * otherwise it should have been freed. + */ + VERIFY(lxpa->lxpa_refcnt > 0); + + *accordp = lxpa; + return (0); +} + +/* + * Accords belong to the tracer LWP. Get the accord for this tracer or return + * an error if it was not possible. To prevent deadlocks, the caller MUST NOT + * hold p_lock on its own or any other process. + * + * If successful, we return holding the accord lock (lxpa_lock). + */ +static int +lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp) +{ + int ret = ESRCH; + proc_t *aproc; + kthread_t *athr; + klwp_t *alwp; + lx_lwp_data_t *alwpd; + + VERIFY(MUTEX_NOT_HELD(&curproc->p_lock)); + + /* + * Locate the process containing the tracer LWP based on its Linux pid + * and lock it. 
+ */ + if (lx_lpid_lock(lxpid, curzone, LXP_PRLOCK, &aproc, &athr) != 0) { + return (ESRCH); + } + + /* + * Locate the tracer LWP itself and ensure that it is visible to + * ptrace(2). + */ + if ((alwp = ttolwp(athr)) == NULL || + (alwpd = lwptolxlwp(alwp)) == NULL || + !VISIBLE(alwpd)) { + sprunlock(aproc); + return (ESRCH); + } + + /* + * We should not fetch our own accord this way. + */ + if (athr == curthread) { + sprunlock(aproc); + return (EPERM); + } + + /* + * Fetch (or allocate) the accord owned by this tracer LWP: + */ + ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE); + + /* + * Unlock the process and return. + */ + sprunlock(aproc); + return (ret); +} + +/* + * Get (or allocate) the ptrace(2) accord for the current LWP, acting as a + * tracer. The caller MUST NOT currently hold p_lock on the process containing + * this LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). + */ +static int +lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + int ret; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Lock the tracer (this LWP). + */ + mutex_enter(&p->p_lock); + + /* + * Fetch (or allocate) the accord for this LWP: + */ + ret = lx_ptrace_accord_get_locked(lwp, accordp, allocate_one); + + mutex_exit(&p->p_lock); + + return (ret); +} + +/* + * Restart an LWP if it is in "ptrace-stop". This function may induce sleep, + * so the caller MUST NOT hold any mutexes other than p_lock for the process + * containing the LWP. + */ +static void +lx_ptrace_restart_lwp(klwp_t *lwp) +{ + kthread_t *rt = lwptot(lwp); + proc_t *rproc = lwptoproc(lwp); + lx_lwp_data_t *rlwpd = lwptolxlwp(lwp); + + VERIFY(rt != curthread); + VERIFY(MUTEX_HELD(&rproc->p_lock)); + + /* + * Exclude potential meddling from procfs. + */ + prbarrier(rproc); + + /* + * Check that the LWP is still in "ptrace-stop" and, if so, restart it. + */ + thread_lock(rt); + if (BSTOPPED(rt) && rt->t_whystop == PR_BRAND) { + rt->t_schedflag |= TS_BSTART; + setrun_locked(rt); + + /* + * Clear stop reason. + */ + rlwpd->br_ptrace_whystop = 0; + rlwpd->br_ptrace_whatstop = 0; + rlwpd->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); + } + thread_unlock(rt); +} + +static void +lx_ptrace_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag, + pid_t *event_ppid, pid_t *event_pid) +{ + int signo; + + /* + * Populate our k_siginfo_t with data about this "ptrace-stop" + * condition: + */ + bzero(ip, sizeof (*ip)); + ip->si_signo = SIGCLD; + ip->si_pid = LX_PID_TO_INIT(remote->br_pid); + ip->si_code = CLD_TRAPPED; + + switch (remote->br_ptrace_whatstop) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + ip->si_status = SIGTRAP; + if (remote->br_ptrace_options & LX_PTRACE_O_TRACESYSGOOD) { + ip->si_status |= 0x80; + } + break; + + case LX_PR_SIGNALLED: + signo = remote->br_ptrace_stopsig; + if (signo < 1 || signo >= LX_NSIG) { + /* + * If this signal number is not valid, pretend it + * was a SIGTRAP. + */ + ip->si_status = SIGTRAP; + } else { + ip->si_status = ltos_signo[signo]; + } + break; + + case LX_PR_EVENT: + ip->si_status = SIGTRAP | remote->br_ptrace_event; + /* + * Record the Linux pid of both this LWP and the create + * event we are dispatching. We will use this information + * to unblock any subsequent ptrace(2) events that depend + * on this one. 
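+ *
+ * From the tracer's point of view this surfaces as an ordinary Linux
+ * event stop; an illustrative userland decode after waitpid() is:
+ *
+ *        if ((status >> 8) == (SIGTRAP | (PTRACE_EVENT_CLONE << 8))) {
+ *                unsigned long msg;
+ *
+ *                (void) ptrace(PTRACE_GETEVENTMSG, pid, NULL, &msg);
+ *                /* msg holds the Linux pid of the new LWP */
+ *        }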
+ */ + if (event_ppid != NULL) + *event_ppid = remote->br_pid; + if (event_pid != NULL) + *event_pid = (pid_t)remote->br_ptrace_eventmsg; + break; + + default: + cmn_err(CE_PANIC, "unxpected stop subreason: %d", + remote->br_ptrace_whatstop); + } + + /* + * If WNOWAIT was specified, do not mark the event as posted + * so that it may be re-fetched on another call to waitid(). + */ + if (waitflag) + remote->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); +} + +/* + * Receive notification from stop() of a PR_BRAND stop. + */ +void +lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + klwp_t *plwp = NULL; + proc_t *pp = NULL; + lx_lwp_data_t *parent; + boolean_t cldpend = B_TRUE; + boolean_t cldpost = B_FALSE; + sigqueue_t *sqp = NULL; + + /* + * We currently only care about LX-specific stop reasons. + */ + if (why != PR_BRAND) + return; + + switch (what) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + case LX_PR_SIGNALLED: + case LX_PR_EVENT: + break; + default: + cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND" + " stop: %d", (int)what); + } + + /* + * We should be holding the lock on our containing process. The + * STOPPING flag should have been set by lx_ptrace_stop() for all + * PR_BRAND stops. + */ + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwpd->br_ptrace_flags & LX_PTF_STOPPING); + VERIFY((accord = lwpd->br_ptrace_tracer) != NULL); + + /* + * We must drop our process lock to take "pidlock". The + * LX_PTF_STOPPING flag protects us from an exiting or detaching tracer. + */ + mutex_exit(&p->p_lock); + + /* + * Allocate before we enter any mutexes. + */ + sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP); + + /* + * We take pidlock now, which excludes all callers of waitid() and + * prevents an exiting tracer from clearing critical accord members. + */ + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + + /* + * Get the ptrace(2) "parent" process, to which we may send + * a SIGCLD signal later. + */ + if ((parent = accord->lxpa_tracer) != NULL && + (plwp = parent->br_lwp) != NULL) { + pp = lwptoproc(plwp); + } + + /* + * Our tracer should not have been modified in our absence; the + * LX_PTF_STOPPING flag prevents it. + */ + VERIFY(lwpd->br_ptrace_tracer == accord); + + /* + * Stash data for this stop condition in the LWP data while we hold + * both pidlock and our p_lock. + */ + lwpd->br_ptrace_whystop = why; + lwpd->br_ptrace_whatstop = what; + lwpd->br_ptrace_flags |= LX_PTF_WAITPEND; + + /* + * If this event does not depend on an event from the parent LWP, + * populate the siginfo_t for the event pending on this tracee LWP. + */ + if (!(lwpd->br_ptrace_flags & LX_PTF_PARENT_WAIT) && pp != NULL) { + cldpost = B_TRUE; + lx_ptrace_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL); + } + + /* + * Drop our p_lock so that we may lock the tracer. + */ + mutex_exit(&p->p_lock); + if (cldpost && pp != NULL) { + /* + * Post the SIGCLD to the tracer. + */ + mutex_enter(&pp->p_lock); + if (!sigismember(&pp->p_sig, SIGCLD)) { + sigaddqa(pp, plwp->lwp_thread, sqp); + cldpend = B_FALSE; + sqp = NULL; + } + mutex_exit(&pp->p_lock); + } + + /* + * We re-take our process lock now. The lock will be held until + * the thread is actually marked stopped, so we will not race with + * lx_ptrace_lock_if_stopped() or lx_waitid_helper(). + */ + mutex_enter(&p->p_lock); + + /* + * We clear the STOPPING flag; stop() continues to hold our p_lock + * until our thread stop state is visible. 
+ */ + lwpd->br_ptrace_flags &= ~LX_PTF_STOPPING; + lwpd->br_ptrace_flags |= LX_PTF_STOPPED; + if (cldpend) { + /* + * We sent the SIGCLD for this new wait condition already. + */ + lwpd->br_ptrace_flags |= LX_PTF_CLDPEND; + } + + /* + * If lx_ptrace_exit_tracer(), or a detach operation, is trying to + * detach our tracer, it will be sleeping on this CV until + * LX_PTF_STOPPING is clear. Wake it now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * While still holding pidlock, we attempt to wake our tracer from a + * potential waitid() slumber. + */ + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + + /* + * We release pidlock and return as we were called: with our p_lock + * held. + */ + mutex_exit(&pidlock); + + if (sqp != NULL) { + kmem_free(sqp, sizeof (*sqp)); + } +} + +/* + * For any restarting action (e.g. PTRACE_CONT, PTRACE_SYSCALL or + * PTRACE_DETACH) to be allowed, the tracee LWP must be in "ptrace-stop". This + * check must ONLY be run on tracees of the current LWP. If the check is + * successful, we return with the tracee p_lock held. + * + * In the case of PTRACE_DETACH, we can return with the tracee locked even if + * it is not in "ptrace-stop". This can happen for various reasons, such as if + * the remote process is already job-stopped in the kernel. We must still be + * able to detach from this process. We return ENOENT in this case. + */ +static int +lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, + boolean_t detaching) +{ + klwp_t *rlwp = remote->br_lwp; + proc_t *rproc = lwptoproc(rlwp); + kthread_t *rt = lwptot(rlwp); + + /* + * We must never check that we, ourselves, are stopped. We must also + * have the accord tracee list locked while we lock our tracees. + */ + VERIFY(curthread != rt); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + VERIFY(accord->lxpa_tracer == ttolxlwp(curthread)); + + /* + * Lock the process containing the tracee LWP. + */ + mutex_enter(&rproc->p_lock); + if (!VISIBLE(remote)) { + /* + * The tracee LWP is currently detaching itself as it exits. + * It is no longer visible to ptrace(2). + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * We must only check whether tracees of the current LWP are stopped. + * We check this condition after confirming visibility as an exiting + * tracee may no longer be completely consistent. + */ + VERIFY(remote->br_ptrace_tracer == accord); + + if (!(remote->br_ptrace_flags & LX_PTF_STOPPED)) { + if (detaching) { + /* + * The tracee is not in "ptrace-stop", but we still + * return with the locked process. This is indicated + * by ENOENT. + */ + return (ENOENT); + } + + /* + * The tracee is not in "ptrace-stop", so we release the + * process. + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * The tracee is stopped. We return holding its process lock so that + * the caller may manipulate it. + */ + return (0); +} + +static int +lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options) +{ + /* + * Check for valid options. + */ + if ((options & ~LX_PTRACE_O_ALL) != 0) { + return (EINVAL); + } + + /* + * Set ptrace options on the target LWP. 
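+ * These are the option bits passed in by a Linux tracer, e.g.
+ * (illustrative usage only):
+ *
+ *        (void) ptrace(PTRACE_SETOPTIONS, pid, NULL,
+ *            PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACECLONE |
+ *            PTRACE_O_TRACEEXEC);
+ *
+ * with the LX_PTRACE_O_* definitions corresponding to the Linux
+ * PTRACE_O_* option bits.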
+ */ + remote->br_ptrace_options = (lx_ptrace_options_t)options; + + return (0); +} + +static int +lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp) +{ + int error; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + uint32_t tmp = remote->br_ptrace_eventmsg; + + error = copyout(&tmp, umsgp, sizeof (uint32_t)); + } else +#endif + { + error = copyout(&remote->br_ptrace_eventmsg, umsgp, + sizeof (ulong_t)); + } + + return (error); +} + +static int +lx_ptrace_getsiginfo(lx_lwp_data_t *remote, void *usiginfo) +{ + klwp_t *lwp = remote->br_lwp; + int lx_sig; + + lx_sig = lx_stol_signo(lwp->lwp_cursig, 0); + if (lx_sig < 1 || lwp->lwp_curinfo == NULL) { + return (EINVAL); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + if (stol_ksiginfo32_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } else +#endif + { + if (stol_ksiginfo_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } + + return (0); +} + + +/* + * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface. + */ +static int +lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo) +{ + klwp_t *lwp = remote->br_lwp; + + if (flags & LX_PTC_SINGLESTEP) { + /* + * We do not currently support single-stepping. + */ + lx_unsupported("PTRACE_SINGLESTEP not currently implemented"); + return (EINVAL); + } + + /* + * The tracer may choose to suppress the delivery of a signal, or + * select an alternative signal for delivery. If this is an + * appropriate ptrace(2) "signal-delivery-stop", br_ptrace_stopsig + * will be used as the new signal number. + * + * As with so many other aspects of the Linux ptrace(2) interface, this + * may fail silently if the state machine is not aligned correctly. + */ + remote->br_ptrace_stopsig = signo; + remote->br_ptrace_donesig = 0; + + /* + * Handle the syscall-stop flag if this is a PTRACE_SYSCALL restart: + */ + if (flags & LX_PTC_SYSCALL) { + remote->br_ptrace_flags |= LX_PTF_SYSCALL; + } else { + remote->br_ptrace_flags &= ~LX_PTF_SYSCALL; + } + + lx_ptrace_restart_lwp(lwp); + + return (0); +} + +/* + * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface. + * + * The LWP identified by the Linux pid "lx_pid" will, if it as a tracee of the + * current LWP, be detached and (optionally) set runnable. + */ +static void +lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo, + boolean_t restart) +{ + klwp_t *rlwp = remote->br_lwp; + + /* + * The tracee LWP may have been in "ptrace-stop" (restart is true if + * that was the case). We now hold the tracee's p_lock. + * Detach the LWP from the accord and set it running. + */ + VERIFY(!TRACEE_BUSY(remote)); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + remote->br_ptrace_flags &= ~(LX_PTF_SYSCALL | LX_PTF_INHERIT); + VERIFY(list_link_active(&remote->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, remote); + + remote->br_ptrace_attach = LX_PTA_NONE; + remote->br_ptrace_tracer = NULL; + remote->br_ptrace_flags = 0; + + /* + * Decrement traced-lwp count for the process. + */ + ASSERT(MUTEX_HELD(&rlwp->lwp_procp->p_lock)); + VERIFY(ptolxproc(rlwp->lwp_procp)->l_ptrace-- >= 1); + + /* + * The tracer may, as described in lx_ptrace_cont(), choose to suppress + * or modify the delivered signal. 
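+ * (In Linux terms this is the "data" argument to the detach request;
+ * for example, ptrace(PTRACE_DETACH, pid, NULL, 0) suppresses the
+ * pending signal, while ptrace(PTRACE_DETACH, pid, NULL, SIGTERM)
+ * substitutes SIGTERM. Illustrative usage only.)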
+ */ + remote->br_ptrace_stopsig = signo; + remote->br_ptrace_donesig = 0; + + if (restart) { + lx_ptrace_restart_lwp(rlwp); + } +} + +/* + * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2) + * interface. + * + * This LWP is requesting to be attached as a tracer to another LWP -- the + * tracee. If a ptrace accord to track the list of tracees has not yet been + * allocated, one will be allocated and attached to this LWP now. + * + * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the + * tracee LWP is then added to the "lxpa_tracees" list in the accord. We drop + * locks between these two phases; the only consumer of trace events from this + * accord is this LWP, which obviously cannot be running waitpid(2) at the same + * time as this call to ptrace(2). + */ +static int +lx_ptrace_attach(pid_t lx_pid) +{ + int error = ESRCH; + /* + * Our (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + /* + * Remote (Tracee) LWP: + */ + proc_t *rproc; + kthread_t *rthr; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + + if (lwpd->br_pid == lx_pid) { + /* + * We cannot trace ourselves. + */ + return (EPERM); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This + * routine should not fail because the LWP cannot make ptrace(2) system + * calls after it has begun exiting. + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * Place speculative hold in case the attach is successful. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * Locate the process containing the tracee LWP based on its Linux pid + * and lock it. + */ + if (lx_lpid_lock(lx_pid, curzone, LXP_PRLOCK, &rproc, &rthr) != 0) { + /* + * We could not find the target process. + */ + goto errout; + } + + /* + * Locate the tracee LWP. + */ + if ((rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL || + !VISIBLE(rlwpd)) { + /* + * The LWP could not be found, was not branded, or is not + * visible to ptrace(2) at this time. + */ + goto unlock_errout; + } + + /* + * We now hold the lock on the tracee. Attempt to install ourselves + * as the tracer. + */ + if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc, + NULL, VWRITE) != 0) { + /* + * This process does not have permission to trace the remote + * process. + */ + error = EPERM; + } else if (rlwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(list_link_active(&rlwpd->br_ptrace_linkage)); + VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + lx_proc_data_t *rprocd = ptolxproc(rproc); + + /* + * Bond the tracee to the accord. + */ + VERIFY0(rlwpd->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE); + rlwpd->br_ptrace_attach = LX_PTA_ATTACH; + rlwpd->br_ptrace_tracer = accord; + + /* Don't emit ptrace syscall-stop-exit event on kernel exit. */ + rlwpd->br_ptrace_flags |= LX_PTF_NOSTOP; + + /* + * We had no tracer, and are thus not in the tracees list. + * It is safe to take the tracee list lock while we insert + * ourselves. + */ + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, rlwpd); + /* + * Bump traced-lwp count for the remote process. + */ + rprocd->l_ptrace++; + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Send a thread-directed SIGSTOP. 
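+ * A Linux tracer typically waits for this stop to be reported before
+ * issuing further requests, along the lines of (illustrative sketch
+ * only):
+ *
+ *        (void) ptrace(PTRACE_ATTACH, pid, NULL, NULL);
+ *        (void) waitpid(pid, &status, 0);        /* attach SIGSTOP */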
+ */ + sigtoproc(rproc, rthr, SIGSTOP); + + + error = 0; + } + +unlock_errout: + /* + * Unlock the process containing the tracee LWP and the accord. + */ + sprunlock(rproc); + +errout: + if (error != 0) { + /* + * The attach was not successful. Remove our speculative + * hold. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + return (error); +} + +int +lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + switch (option) { + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + case LX_PTRACE_O_TRACECLONE: + break; + + default: + return (EINVAL); + } + + mutex_enter(&p->p_lock); + + lwpd->br_ptrace_clone_option = option; + + if (inherit_flag) { + lwpd->br_ptrace_flags |= LX_PTF_INHERIT; + } else { + lwpd->br_ptrace_flags &= ~LX_PTF_INHERIT; + } + + mutex_exit(&p->p_lock); + return (0); +} + +/* + * If the parent LWP is being traced, we want to attach ourselves to the + * same accord. + */ +void +lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst) +{ + proc_t *srcp = lwptoproc(src->br_lwp); + proc_t *dstp = lwptoproc(dst->br_lwp); + lx_ptrace_accord_t *accord; + boolean_t is_fork = B_FALSE; + + VERIFY(MUTEX_HELD(&dstp->p_lock)); + if (srcp != dstp) { + /* + * In the case of being called via forklwp, some lock shuffling + * is required. The destination p_lock must be dropped to + * avoid deadlocks when locking the source and manipulating + * ptrace accord resources. + */ + is_fork = B_TRUE; + sprlock_proc(dstp); + mutex_exit(&dstp->p_lock); + mutex_enter(&srcp->p_lock); + } + + if ((accord = src->br_ptrace_tracer) == NULL) { + /* + * The source LWP does not have a tracer to inherit. + */ + goto out; + } + + /* + * There are two conditions to check when determining if the new + * child should inherit the same tracer (and tracing options) as its + * parent. Either condition is sufficient to trigger inheritance. + */ + dst->br_ptrace_attach = LX_PTA_NONE; + if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) { + /* + * Condition 1: + * The clone(2), fork(2) and vfork(2) emulated system calls + * populate "br_ptrace_clone_option" with the specific + * ptrace(2) SETOPTIONS option that applies to this + * operation. If the relevant option has been enabled by the + * tracer then we inherit. + */ + dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS; + + } else if ((src->br_ptrace_flags & LX_PTF_INHERIT) != 0) { + /* + * Condition 2: + * If the caller opted in to inheritance with the + * PTRACE_CLONE flag to clone(2), the LX_PTF_INHERIT flag + * will be set and we inherit. + */ + dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE; + } + + /* + * These values only apply for the duration of a single clone(2), et + * al, system call. + */ + src->br_ptrace_flags &= ~LX_PTF_INHERIT; + src->br_ptrace_clone_option = 0; + + if (dst->br_ptrace_attach == LX_PTA_NONE) { + /* + * No condition triggered inheritance. + */ + goto out; + } + + /* + * Set the LX_PTF_CLONING flag to prevent us from being detached + * while our p_lock is dropped. + */ + src->br_ptrace_flags |= LX_PTF_CLONING; + mutex_exit(&srcp->p_lock); + + /* + * Hold the accord for the new LWP. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * Install the tracer and copy the current PTRACE_SETOPTIONS options. 
+ */ + dst->br_ptrace_tracer = accord; + dst->br_ptrace_options = src->br_ptrace_options; + + /* + * This flag prevents waitid() from seeing events for the new child + * until the parent is able to post the relevant ptrace event to + * the tracer. + */ + dst->br_ptrace_flags |= LX_PTF_PARENT_WAIT; + + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(list_link_active(&src->br_ptrace_linkage)); + VERIFY(!list_link_active(&dst->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, dst); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Relock our process and clear our busy flag. + */ + mutex_enter(&srcp->p_lock); + src->br_ptrace_flags &= ~LX_PTF_CLONING; + + /* + * Bump traced-lwp count for the process. + */ + ptolxproc(dstp)->l_ptrace++; + + /* + * If lx_ptrace_exit_tracer(), or a detach operation, is trying to + * detach our tracer, it will be sleeping on this CV until + * LX_PTF_CLONING is clear. Wake it now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + +out: + if (is_fork) { + mutex_exit(&srcp->p_lock); + mutex_enter(&dstp->p_lock); + sprunprlock(dstp); + } +} + +static int +lx_ptrace_traceme(void) +{ + int error; + boolean_t did_attach = B_FALSE; + /* + * Our (Tracee) LWP: + */ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + /* + * Remote (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + + /* + * We are intending to be the tracee. Fetch (or allocate) the accord + * for our parent LWP. + */ + if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL, + NULL), &accord)) != 0) { + /* + * Could not determine the Linux pid of the parent LWP, or + * could not get the accord for that LWP. + */ + return (error); + } + + /* + * We now hold the accord lock. + */ + if (accord->lxpa_flags & LX_ACC_TOMBSTONE) { + /* + * The accord is marked for death; give up now. + */ + lx_ptrace_accord_exit(accord); + return (ESRCH); + } + + /* + * Bump the reference count so that the accord is not freed. We need + * to drop the accord lock before we take our own p_lock. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * We now lock _our_ process and determine if we can install our parent + * as our tracer. + */ + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + /* + * Bond ourselves to the accord. We already bumped the accord + * reference count. + */ + VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE); + lwpd->br_ptrace_attach = LX_PTA_TRACEME; + lwpd->br_ptrace_tracer = accord; + did_attach = B_TRUE; + error = 0; + + /* + * Speculatively bump l_ptrace now before dropping p_lock. + * It will be reverted if the tracee attachment fails. + */ + ptolxproc(p)->l_ptrace++; + } + mutex_exit(&p->p_lock); + + /* + * Lock the accord tracee list and add this LWP. Once we are in the + * tracee list, it is the responsibility of the tracer to detach us. + */ + if (error == 0) { + lx_ptrace_accord_enter(accord); + mutex_enter(&accord->lxpa_tracees_lock); + + if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) { + /* + * Put ourselves in the tracee list for this accord. + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, lwpd); + mutex_exit(&accord->lxpa_tracees_lock); + lx_ptrace_accord_exit(accord); + + return (0); + } + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * The accord has been marked for death. We must + * untrace ourselves. 
+ */ + error = ESRCH; + lx_ptrace_accord_exit(accord); + + /* + * Undo speculative increment of ptracer count. + */ + mutex_enter(&p->p_lock); + ptolxproc(p)->l_ptrace--; + mutex_exit(&p->p_lock); + } + + /* + * Our optimism was unjustified: We were unable to attach. We need to + * lock the process containing this LWP again in order to remove the + * tracer. + */ + VERIFY(error != 0); + mutex_enter(&p->p_lock); + if (did_attach) { + /* + * Verify that things were as we left them: + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + VERIFY(lwpd->br_ptrace_tracer == accord); + + lwpd->br_ptrace_attach = LX_PTA_NONE; + lwpd->br_ptrace_tracer = NULL; + } + mutex_exit(&p->p_lock); + + /* + * Remove our speculative hold on the accord, possibly causing it to be + * freed in the process. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + return (error); +} + +static boolean_t +lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what) +{ + boolean_t reset_nostop = B_FALSE; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Mark this LWP as stopping and call stop() to enter "ptrace-stop". + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_STOPPING); + lwpd->br_ptrace_flags |= LX_PTF_STOPPING; + + if (lwpd->br_lwp->lwp_nostop == 1 && + lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC) { + /* We need to clear this to get the signal delivered. */ + lwpd->br_lwp->lwp_nostop = 0; + reset_nostop = B_TRUE; + } + + stop(PR_BRAND, what); + + if (reset_nostop) { + VERIFY(lwpd->br_lwp->lwp_nostop == 0); + lwpd->br_lwp->lwp_nostop = 1; + } + + /* + * We are back from "ptrace-stop" with our process lock held. + */ + lwpd->br_ptrace_flags &= ~(LX_PTF_STOPPING | LX_PTF_STOPPED | + LX_PTF_CLDPEND); + lwpd->br_ptrace_stopucp = NULL; + cv_broadcast(&lx_ptrace_busy_cv); + mutex_exit(&p->p_lock); + + return (B_TRUE); +} + +int +lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg, + uintptr_t ucp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer == NULL) { + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (!child) { + /* + * Only the first event posted by a new process is to be held + * until the matching parent event is dispatched, and only if + * it is a "child" event. This is not a child event, so we + * clear the wait flag. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + + } else if (option == LX_PTRACE_O_TRACEVFORK) { + /* + * For a child, we have to handle vfork as a special case. In + * lx_ptrace_inherit_tracer() we set LX_PTF_PARENT_WAIT to + * force events to be delayed until the parent posts its event. + * This flag is cleared in lx_waitid_helper() to enforce a + * "happens after" relationship. However, this obviously cannot + * work for the vfork case. Thus, we clear our flag now so that + * we can deliver the signal in lx_stop_notify(), if necessary. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + } + + if (!(lwpd->br_ptrace_options & option)) { + if (option == LX_PTRACE_O_TRACEEXEC) { + /* + * Without PTRACE_O_TRACEEXEC, the Linux kernel will + * send SIGTRAP to the process. + */ + sigtoproc(p, t, SIGTRAP); + mutex_exit(&p->p_lock); + return (0); + } + + /* + * The flag for this trace event is not enabled, so we will not + * stop. 
+ */ + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (child) { + switch (option) { + case LX_PTRACE_O_TRACECLONE: + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + /* + * Send the child LWP a directed SIGSTOP. + */ + sigtoproc(p, t, SIGSTOP); + mutex_exit(&p->p_lock); + return (0); + default: + goto nostop; + } + } + + lwpd->br_ptrace_eventmsg = msg; + + switch (option) { + case LX_PTRACE_O_TRACECLONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE; + break; + case LX_PTRACE_O_TRACEEXEC: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC; + lwpd->br_ptrace_eventmsg = 0; + break; + case LX_PTRACE_O_TRACEEXIT: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT; + break; + case LX_PTRACE_O_TRACEFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK; + break; + case LX_PTRACE_O_TRACEVFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK; + break; + case LX_PTRACE_O_TRACEVFORKDONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE; + lwpd->br_ptrace_eventmsg = 0; + break; + default: + goto nostop; + } + + /* + * Userland may have passed in a ucontext_t pointer for + * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped. + */ + lwpd->br_ptrace_stopucp = ucp; + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH); + +nostop: + lwpd->br_ptrace_event = 0; + lwpd->br_ptrace_eventmsg = 0; + mutex_exit(&p->p_lock); + return (ESRCH); +} + +boolean_t +lx_ptrace_stop(ushort_t what) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT || + what == LX_PR_SIGNALLED); + + /* + * If we do not have an accord, bail out early. + */ + if (lwpd->br_ptrace_tracer == NULL) + return (B_FALSE); + + /* + * Lock this process and re-check the condition. + */ + mutex_enter(&p->p_lock); + + /* + * The child after a fork/clone doesn't emit syscall-exit-stop event. + */ + if (what == LX_PR_SYSEXIT && (lwpd->br_ptrace_flags & LX_PTF_NOSTOP)) { + lwpd->br_ptrace_flags &= ~LX_PTF_NOSTOP; + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + if (lwpd->br_ptrace_tracer == NULL) { + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_SYSCALL); + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + if (what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT) { + if (what == LX_PR_SYSENTRY) { + lwpd->br_ptrace_flags |= LX_PTF_INSYSCALL; + } else { + lwpd->br_ptrace_flags &= ~LX_PTF_INSYSCALL; + } + + /* + * This is a syscall-entry-stop or syscall-exit-stop point. + */ + if (!(lwpd->br_ptrace_flags & LX_PTF_SYSCALL)) { + /* + * A system call stop has not been requested. + */ + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + /* + * The PTRACE_SYSCALL restart command applies only to the next + * system call entry or exit. The tracer must restart us with + * PTRACE_SYSCALL while we are in ptrace-stop for us to fire + * again at the next system call boundary. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_SYSCALL; + } + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, what)); +} + +/* + * In addition to performing the ptrace sig_stop handling, this function is + * also used to block signal from being delivered. + * + * Return 0 if issig_forreal() should continue on, -1 if issig_forreal should + * recheck after we've made changes, or 1 if issig_forreal should stop checking + * signals. 
+ */ +int +lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int lx_sig; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + if (ptolxproc(p)->l_block_all_signals != 0) + return (1); + + /* + * In very rare circumstances, a process which is almost completely + * through proc_exit() may incur issig checks in the current thread via + * clean-up actions. The process will still be branded, but the thread + * will have already been stripped of any LX-specific data on its way + * to the grave. Bail early if the brand data is missing. + */ + if (lwpd == NULL) { + return (0); + } + + /* + * If we do not have an accord, bail out now. Additionally, if there + * is no valid signal then we have no reason to stop. + */ + if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL || + (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) || + (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) { + if (lwp->lwp_cursig == 0) { + /* + * If this lwp has no current signal, it means that any + * signal ignorance enabled by br_ptrace_donesig has + * already taken place (the signal was consumed). + * By clearing donesig, we declare desire to ignore no + * signals for accurate ptracing. + */ + lwpd->br_ptrace_donesig = 0; + } + return (0); + } + + /* + * We can't deliver the signal-delivery-stop condition while we're + * between the syscall-enter-stop and syscall-exit-stop conditions. + * We must first let the signal interrupt the in-progress syscall, let + * it emit syscall-exit-stop with the interrupted result, then we'll + * come back here to emit signal-delivery-stop. + */ + if (lwpd->br_ptrace_flags & LX_PTF_INSYSCALL) { + return (0); + } + + /* + * We stash the signal on the LWP where our waitid_helper will find it + * and enter the ptrace "signal-delivery-stop" condition. + */ + lwpd->br_ptrace_stopsig = lx_sig; + lwpd->br_ptrace_donesig = 0; + (void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED); + mutex_enter(&p->p_lock); + + /* + * When we return, the signal may have been altered or suppressed. + */ + if (lwpd->br_ptrace_stopsig != lx_sig) { + int native_sig; + lx_sig = lwpd->br_ptrace_stopsig; + + if (lx_sig >= LX_NSIG) { + lx_sig = 0; + } + + /* + * Translate signal from Linux signal number back to + * an illumos native signal. + */ + if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig = + ltos_signo[lx_sig]) < 1) { + /* + * The signal is not deliverable. + */ + lwp->lwp_cursig = 0; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo) { + siginfofree(lwp->lwp_curinfo); + lwp->lwp_curinfo = NULL; + } + } else { + /* + * Alter the currently dispatching signal. + */ + if (native_sig == SIGKILL) { + /* + * We mark ourselves the victim and request + * a restart of signal processing. + */ + p->p_flag |= SKILLED; + p->p_flag &= ~SEXTKILLED; + return (-1); + } + lwp->lwp_cursig = native_sig; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo != NULL) { + lwp->lwp_curinfo->sq_info.si_signo = native_sig; + } + } + } + + lwpd->br_ptrace_donesig = lwp->lwp_cursig; + lwpd->br_ptrace_stopsig = 0; + return (0); +} + +boolean_t +lx_ptrace_sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + lx_proc_data_t *lxpd = ptolxproc(p); + + /* + * Ignored signals and ptrace: + * + * When a process is being ptraced by another, special care is needed + * while handling signals. Since the tracer is interested in all + * signals sent to the tracee, an effort must be made to initially + * bypass signal ignorance logic. 
This allows the signal to be placed + * in the tracee's sigqueue to be inspected and potentially altered by + * the tracer. + * + * A critical detail in this procedure is how a signal is handled after + * tracer has completed processing for the event. If the signal would + * have been ignored, were it not for the initial ptrace override, then + * lx_ptrace_sig_ignorable must report B_TRUE when the tracee is + * restarted and resumes signal processing. This is done by recording + * the most recent tracee signal consumed by ptrace. + */ + + if (lxpd->l_ptrace != 0 && lx_stol_signo(sig, 0) != 0) { + /* + * This process is being ptraced. Bypass signal ignorance for + * anything that maps to a valid Linux signal... + */ + if (lwp != NULL && lwptolxlwp(lwp)->br_ptrace_donesig == sig) { + /* + * ...Unless it is a signal which has already been + * processed by the tracer. + */ + return (B_TRUE); + } + return (B_FALSE); + } + return (B_TRUE); +} + +static void +lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd, + lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + lx_ptrace_accord_enter(accord); + /* + * Mark this accord for death. This means no new tracees can be + * attached to this accord. + */ + VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE); + accord->lxpa_flags |= LX_ACC_TOMBSTONE; + lx_ptrace_accord_exit(accord); + + /* + * Walk the list of tracees, detaching them and setting them runnable + * if they are stopped. + */ + for (;;) { + klwp_t *rlwp; + proc_t *rproc; + lx_lwp_data_t *remote; + kmutex_t *rmp; + + mutex_enter(&accord->lxpa_tracees_lock); + if (list_is_empty(&accord->lxpa_tracees)) { + mutex_exit(&accord->lxpa_tracees_lock); + break; + } + + /* + * Fetch the first tracee LWP in the list and lock the process + * which contains it. + */ + remote = list_head(&accord->lxpa_tracees); + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + /* + * The p_lock mutex persists beyond the life of the process + * itself. We save the address, here, to prevent the need to + * dereference the proc_t after awaking from sleep. + */ + rmp = &rproc->p_lock; + mutex_enter(rmp); + + if (TRACEE_BUSY(remote)) { + /* + * This LWP is currently detaching itself on exit, or + * mid-way through stop(). We must wait for this + * action to be completed. While we wait on the CV, we + * must drop the accord tracee list lock. + */ + mutex_exit(&accord->lxpa_tracees_lock); + cv_wait(&lx_ptrace_busy_cv, rmp); + + /* + * While we were waiting, some state may have changed. + * Restart the walk to be sure we don't miss anything. + */ + mutex_exit(rmp); + continue; + } + + /* + * We now hold p_lock on the process. Remove the tracee from + * the list. + */ + VERIFY(list_link_active(&remote->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, remote); + + /* + * Unlink the accord and clear our trace flags. + */ + remote->br_ptrace_attach = LX_PTA_NONE; + remote->br_ptrace_tracer = NULL; + remote->br_ptrace_flags = 0; + + /* + * Let go of the list lock before we restart the LWP. We must + * not hold any locks other than the process p_lock when + * we call lx_ptrace_restart_lwp() as it will thread_lock + * the tracee. + */ + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Decrement traced-lwp count for the remote process. + */ + VERIFY(ptolxproc(rproc)->l_ptrace-- >= 1); + + /* + * Ensure that the LWP is not stopped on our account. + */ + lx_ptrace_restart_lwp(rlwp); + + /* + * Unlock the former tracee. + */ + mutex_exit(rmp); + + /* + * Drop the hold this tracee had on the accord. 
+ */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + mutex_enter(&p->p_lock); + lwpd->br_ptrace_accord = NULL; + mutex_exit(&p->p_lock); + + /* + * Clean up and release our hold on the accord If we completely + * detached all tracee LWPs, this will free the accord. Otherwise, it + * will be freed when they complete their cleanup. + * + * We hold "pidlock" while clearing these members for easy exclusion of + * waitid(), etc. + */ + mutex_enter(&pidlock); + lx_ptrace_accord_enter(accord); + accord->lxpa_cvp = NULL; + accord->lxpa_tracer = NULL; + mutex_exit(&pidlock); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); +} + +static void +lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd) +{ + lx_ptrace_accord_t *accord; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Be careful in the face of detaching and attaching tracers. + * lwpd->br_ptrace_tracer is modified only when p->p_lock is held. Lock + * ordering says that accord->lxpa_tracees_lock must be taken prior to + * p->p_lock, so we have to get a reference to the accord and hold it + * across dropping p->p_lock. + * + * In the face of a tracer going away and a new one coming in, we may + * take a lap. + */ +again: + if ((accord = lwpd->br_ptrace_tracer) == NULL) { + return; + } + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + mutex_exit(&p->p_lock); + + /* + * We are the tracee LWP. Lock the accord tracee list and then our + * containing process. + */ + mutex_enter(&accord->lxpa_tracees_lock); + mutex_enter(&p->p_lock); + + /* + * Be sure that the accord currently associated with the lwp is the one + * for which we are holding lxpa_tracees_lock. + */ + if (lwpd->br_ptrace_tracer != accord) { + mutex_exit(&p->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + mutex_enter(&p->p_lock); + + goto again; + } + + /* + * Remove our reference to the accord. We will release our hold + * later. + */ + lwpd->br_ptrace_attach = LX_PTA_NONE; + lwpd->br_ptrace_tracer = NULL; + + /* + * Remove this LWP from the accord tracee list: + */ + VERIFY(list_link_active(&lwpd->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, lwpd); + + /* + * Wake up any tracers waiting for us to detach from the accord. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * Decrement traced-lwp count for the process. + */ + VERIFY(ptolxproc(p)->l_ptrace-- >= 1); + + mutex_exit(&p->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Grab "pidlock" and wake the tracer if it is blocked in waitid(). + */ + mutex_enter(&pidlock); + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + mutex_exit(&pidlock); + + /* + * Release the holds on the accord. One is the hold taken earlier in + * this function and the other is lwpd's hold. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + mutex_enter(&p->p_lock); +} + +/* + * This routine is called from lx_exitlwp() when an LWP is ready to exit. If + * this LWP is being traced, it will be detached from the tracer's accord. The + * routine will also detach any LWPs being traced by this LWP. 
+ */ +void +lx_ptrace_exit(proc_t *p, klwp_t *lwp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Mark our LWP as exiting from a ptrace perspective. This will + * prevent a new accord from being allocated if one does not exist + * already, and will make us invisible to PTRACE_ATTACH/PTRACE_TRACEME. + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING); + lwpd->br_ptrace_flags |= LX_PTF_EXITING; + + if (lwpd->br_ptrace_tracer != NULL) { + /* + * We are traced by another LWP and must detach ourselves. + */ + lx_ptrace_exit_tracee(p, lwpd); + VERIFY(MUTEX_HELD(&p->p_lock)); + } + + if ((accord = lwpd->br_ptrace_accord) != NULL) { + /* + * We have been tracing other LWPs, and must detach from + * them and clean up our accord. + */ + mutex_exit(&p->p_lock); + lx_ptrace_exit_tracer(p, lwpd, accord); + mutex_enter(&p->p_lock); + } +} + +/* + * Called when a SIGCLD signal is dispatched so that we may enqueue another. + * Return 0 if we enqueued a signal, or -1 if not. + */ +int +lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + boolean_t found = B_FALSE; + + VERIFY(MUTEX_HELD(&pidlock)); + VERIFY(MUTEX_NOT_HELD(&pp->p_lock)); + VERIFY(lwptoproc(lwp) == pp); + + mutex_enter(&pp->p_lock); + if ((accord = lwpd->br_ptrace_accord) == NULL) { + /* + * This LWP is not a tracer LWP, so there will be no + * SIGCLD. + */ + mutex_exit(&pp->p_lock); + return (-1); + } + mutex_exit(&pp->p_lock); + + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * Check if this LWP is in "ptrace-stop". If in the correct + * stop condition, lock the process containing the tracee LWP. + */ + if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) { + continue; + } + + if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) { + /* + * This event depends on waitid() clearing out the + * event of another LWP. Skip it for now. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTF_CLDPEND)) { + /* + * No SIGCLD is required for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) || + remote->br_ptrace_whystop == 0 || + remote->br_ptrace_whatstop == 0) { + /* + * No (new) stop reason to post for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + /* + * We found a process of interest. Leave the process + * containing the tracee LWP locked and break out of the loop. + */ + found = B_TRUE; + break; + } + mutex_exit(&accord->lxpa_tracees_lock); + + if (!found) { + return (-1); + } + + /* + * Generate siginfo for this tracee LWP. + */ + lx_ptrace_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL); + remote->br_ptrace_flags &= ~LX_PTF_CLDPEND; + mutex_exit(&rproc->p_lock); + + mutex_enter(&pp->p_lock); + if (sigismember(&pp->p_sig, SIGCLD)) { + mutex_exit(&pp->p_lock); + + mutex_enter(&rproc->p_lock); + remote->br_ptrace_flags |= LX_PTF_CLDPEND; + mutex_exit(&rproc->p_lock); + + return (-1); + } + sigaddqa(pp, curthread, sqp); + mutex_exit(&pp->p_lock); + + return (0); +} + +/* + * Consume the next available ptrace(2) event queued against the accord for + * this LWP. 
+ * The event will be emitted as if through waitid(), and converted
+ * by lx_waitpid() and friends before the return to usermode.
+ */
+int
+lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options,
+    boolean_t *brand_wants_wait, int *rval)
+{
+	lx_ptrace_accord_t *accord;
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *local = lwptolxlwp(lwp);
+	lx_lwp_data_t *remote;
+	boolean_t found = B_FALSE;
+	klwp_t *rlwp = NULL;
+	proc_t *rproc = NULL;
+	pid_t event_pid = 0, event_ppid = 0;
+	boolean_t waitflag = !(options & WNOWAIT);
+	boolean_t target_found = B_FALSE;
+
+	VERIFY(MUTEX_HELD(&pidlock));
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * By default, we do not expect waitid() to block on our account.
+	 */
+	*brand_wants_wait = B_FALSE;
+
+	if (!local->br_waitid_emulate) {
+		/*
+		 * This waitid() call is not expecting emulated results.
+		 */
+		return (-1);
+	}
+
+	switch (idtype) {
+	case P_ALL:
+	case P_PID:
+	case P_PGID:
+		break;
+	default:
+		/*
+		 * This idtype has no power here.
+		 */
+		return (-1);
+	}
+
+	if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) {
+		/*
+		 * This LWP does not have an accord; it cannot be tracing.
+		 */
+		return (-1);
+	}
+
+	/*
+	 * We do not need an additional hold on the accord as it belongs to
+	 * the running tracer LWP.
+	 */
+	lx_ptrace_accord_exit(accord);
+
+	mutex_enter(&accord->lxpa_tracees_lock);
+	if (list_is_empty(&accord->lxpa_tracees)) {
+		/*
+		 * Though it has an accord, there are currently no tracees in
+		 * the list for this LWP.
+		 */
+		mutex_exit(&accord->lxpa_tracees_lock);
+		return (-1);
+	}
+
+	/*
+	 * Walk the list of tracees and determine if any of them have events to
+	 * report.
+	 */
+	for (remote = list_head(&accord->lxpa_tracees); remote != NULL;
+	    remote = list_next(&accord->lxpa_tracees, remote)) {
+		rlwp = remote->br_lwp;
+		rproc = lwptoproc(rlwp);
+
+		/*
+		 * We check to see if this LWP matches an id we are waiting for.
+		 */
+		switch (idtype) {
+		case P_ALL:
+			break;
+		case P_PID:
+			if (remote->br_pid != id)
+				continue;
+			break;
+		case P_PGID:
+			if (rproc->p_pgrp != id)
+				continue;
+			break;
+		default:
+			cmn_err(CE_PANIC, "unexpected idtype: %d", idtype);
+		}
+
+		/* This tracee matches the provided idtype and id */
+		target_found = B_TRUE;
+
+		/*
+		 * Check if this LWP is in "ptrace-stop". If in the correct
+		 * stop condition, lock the process containing the tracee LWP.
+		 */
+		if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) {
+			continue;
+		}
+
+		if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) {
+			/*
+			 * This event depends on waitid() clearing out the
+			 * event of another LWP. Skip it for now.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) ||
+		    remote->br_ptrace_whystop == 0 ||
+		    remote->br_ptrace_whatstop == 0) {
+			/*
+			 * No (new) stop reason to post for this LWP.
+			 */
+			mutex_exit(&rproc->p_lock);
+			continue;
+		}
+
+		/*
+		 * We found a process of interest. Leave the process
+		 * containing the tracee LWP locked and break out of the loop.
+		 */
+		found = B_TRUE;
+		break;
+	}
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	if (!found) {
+		/*
+		 * There were no events of interest, but we have tracees.
+		 * If any of the tracees matched the specified criteria, signal
+		 * to waitid() that it should block if the provided flags allow
+		 * for it.
+		 */
+		if (target_found) {
+			*brand_wants_wait = B_TRUE;
+		}
+
+		return (-1);
+	}
+
+	/*
+	 * Populate the signal information.
+ */ + lx_ptrace_winfo(remote, ip, waitflag, &event_ppid, &event_pid); + + /* + * Unlock the tracee. + */ + mutex_exit(&rproc->p_lock); + + if (event_pid != 0 && event_ppid != 0) { + /* + * We need to do another pass around the tracee list and + * unblock any events that have a "happens after" relationship + * with this event. + */ + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + mutex_enter(&rproc->p_lock); + + if (remote->br_pid != event_pid || + remote->br_ppid != event_ppid) { + mutex_exit(&rproc->p_lock); + continue; + } + + remote->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + + mutex_exit(&rproc->p_lock); + } + mutex_exit(&accord->lxpa_tracees_lock); + } + + /* + * If we are consuming this wait state, we remove the SIGCLD from + * the queue and post another. + */ + if (waitflag) { + mutex_exit(&pidlock); + sigcld_delete(ip); + sigcld_repost(); + mutex_enter(&pidlock); + } + + *rval = 0; + return (0); +} + +static int +lx_ptrace_peek(lx_lwp_data_t *lwpd, uintptr_t addr, void *data) +{ + proc_t *p = lwptoproc(lwpd->br_lwp); + long buf; + int error = 0, size = sizeof (buf); + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + size = sizeof (uint32_t); + } +#endif + if ((addr & (size - 1)) != 0) { + /* unaligned access */ + return (EINVAL); + } + + mutex_exit(&p->p_lock); + error = uread(p, &buf, size, addr); + mutex_enter(&p->p_lock); + + if (error != 0) { + return (EIO); + } + if (copyout(&buf, data, size) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +lx_ptrace_poke(lx_lwp_data_t *lwpd, uintptr_t addr, uintptr_t data) +{ + proc_t *p = lwptoproc(lwpd->br_lwp); + int error = 0, size = sizeof (data); + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + size = sizeof (uint32_t); + } +#endif + if ((addr & (size - 1)) != 0) { + /* unaligned access */ + return (EINVAL); + } + + mutex_exit(&p->p_lock); + error = uwrite(p, &data, size, addr); + mutex_enter(&p->p_lock); + + if (error != 0) { + return (EIO); + } + return (0); +} + +static int +lx_ptrace_kill(lx_lwp_data_t *lwpd) +{ + sigtoproc(lwptoproc(lwpd->br_lwp), NULL, SIGKILL); + + return (0); +} + +static int +lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + lx_lwp_data_t *local = ttolxlwp(curthread); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + int error; + boolean_t found = B_FALSE, restart = B_TRUE; + + /* + * PTRACE_TRACEME and PTRACE_ATTACH operations induce the tracing of + * one LWP by another. The target LWP must not be traced already. + */ + switch (ptrace_op) { + case LX_PTRACE_TRACEME: + return (lx_ptrace_traceme()); + + case LX_PTRACE_ATTACH: + return (lx_ptrace_attach(lxpid)); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This routine + * should not fail because the LWP cannot make ptrace(2) system calls + * after it has begun exiting. + */ + VERIFY0(local->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * The accord belongs to this (the tracer) LWP, and we have a hold on + * it. We drop the lock so that we can take other locks. + */ + lx_ptrace_accord_exit(accord); + + /* + * Does the tracee list contain the pid in question? 
+ */ +retry: + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + if (remote->br_pid == lxpid) { + found = B_TRUE; + break; + } + } + if (!found) { + /* + * The requested pid does not appear in the tracee list. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (ESRCH); + } + + if (ptrace_op == LX_PTRACE_DETACH) { + /* + * We're detaching, make sure in-syscall flag is off so that + * signal will stop the process directly. + */ + remote->br_ptrace_flags &= ~LX_PTF_INSYSCALL; + } + + /* + * Attempt to lock the target LWP. + */ + if ((error = lx_ptrace_lock_if_stopped(accord, remote, + (ptrace_op == LX_PTRACE_DETACH))) != 0) { + /* + * The LWP was not in "ptrace-stop". For detach, ENOENT + * indicates that the LWP was not in "ptrace-stop", but is + * still locked. + */ + if (ptrace_op == LX_PTRACE_DETACH && error == ENOENT) { + /* + * We're detaching, but the process was not in + * ptrace_stop, so we don't want to try to restart it. + */ + restart = B_FALSE; + } else { + mutex_exit(&accord->lxpa_tracees_lock); + return (error); + } + } + + /* + * The target LWP is in "ptrace-stop". We have the containing process + * locked. + */ + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + if (ptrace_op == LX_PTRACE_DETACH) { + if (TRACEE_BUSY(remote)) { + kmutex_t *rmp; + + /* + * There is a tricky race condition we have to watch + * out for here (for example, if a tracee is in the + * kernel in the middle of a syscall). When the tracee + * is leaving the kernel, it will set LX_PTF_STOPPING. + * In lx_stop_notify() the tracee has to drop its + * p_lock, take pidlock, then reacquire p_lock, before + * it will clear LX_PTF_STOPPING and set LX_PTF_STOPPED. + * During that window, if this tracer is trying to + * detach, we have to make sure the tracee is restarted. + * We handle this case in the same way we handle + * the tracer exiting in lx_ptrace_exit_tracer(). + */ + rmp = &rproc->p_lock; + mutex_exit(&accord->lxpa_tracees_lock); + (void) cv_wait_sig(&lx_ptrace_busy_cv, rmp); + + /* + * While we were waiting, state will have changed, so + * retry. + */ + mutex_exit(rmp); + goto retry; + } + + lx_ptrace_detach(accord, remote, (int)data, restart); + /* + * Drop the lock on both the tracee process and the tracee list. + */ + mutex_exit(&rproc->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Release a hold from the accord. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + return (0); + } + + /* + * The tracees lock is not needed for any of the other operations. + * Drop it so further actions can avoid deadlock. 
+ */ + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Process the ptrace(2) request: + */ + switch (ptrace_op) { + case LX_PTRACE_CONT: + error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data); + break; + + case LX_PTRACE_SYSCALL: + error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data); + break; + + case LX_PTRACE_SINGLESTEP: + error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data); + break; + + case LX_PTRACE_SETOPTIONS: + error = lx_ptrace_setoptions(remote, data); + break; + + case LX_PTRACE_GETEVENTMSG: + error = lx_ptrace_geteventmsg(remote, (void *)data); + break; + + case LX_PTRACE_GETREGS: + error = lx_user_regs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETREGS: + error = lx_user_regs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETSIGINFO: + error = lx_ptrace_getsiginfo(remote, (void *)data); + break; + + case LX_PTRACE_PEEKTEXT: + case LX_PTRACE_PEEKDATA: + error = lx_ptrace_peek(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKETEXT: + case LX_PTRACE_POKEDATA: + error = lx_ptrace_poke(remote, addr, data); + break; + + case LX_PTRACE_PEEKUSER: + error = lx_ptrace_peekuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKEUSER: + error = lx_ptrace_pokeuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_GETFPREGS: + error = lx_user_fpregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPREGS: + error = lx_user_fpregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETFPXREGS: + error = lx_user_fpxregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPXREGS: + error = lx_user_fpxregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_KILL: + error = lx_ptrace_kill(remote); + break; + + default: + error = EINVAL; + } + + /* + * Drop the lock on both the tracee process and the tracee list. + */ + mutex_exit(&rproc->p_lock); + + return (error); +} + +int +lx_ptrace(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + int error; + + error = lx_ptrace_kernel(ptrace_op, LX_INIT_TO_PID(lxpid), addr, data); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +void +lx_ptrace_init(void) +{ + cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL); + + lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord", + sizeof (lx_ptrace_accord_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lx_ptrace_fini(void) +{ + cv_destroy(&lx_ptrace_busy_cv); + + kmem_cache_destroy(lx_ptrace_accord_cache); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_signal.c b/usr/src/uts/common/brand/lx/os/lx_signal.c new file mode 100644 index 0000000000..53e0cecc14 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_signal.c @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/signal.h> +#include <sys/sunddi.h> +#include <lx_signum.h> + +void +lx_ltos_sigset(lx_sigset_t *lsigp, k_sigset_t *ssigp) +{ + int lx_sig, sig; + + sigemptyset(ssigp); + for (lx_sig = 1; lx_sig <= LX_NSIG; lx_sig++) { + if (lx_sigismember(lsigp, lx_sig) && + ((sig = ltos_signo[lx_sig]) > 0)) + sigaddset(ssigp, sig); + } + + /* Emulate sigutok() restrictions */ + ssigp->__sigbits[0] &= (FILLSET0 & ~CANTMASK0); + ssigp->__sigbits[1] &= (FILLSET1 & ~CANTMASK1); + ssigp->__sigbits[2] &= (FILLSET2 & ~CANTMASK2); +} + +void +lx_stol_sigset(k_sigset_t *ssigp, lx_sigset_t *lsigp) +{ + int sig, lx_sig; + + bzero(lsigp, sizeof (lx_sigset_t)); + for (sig = 1; sig < NSIG; sig++) { + if (sigismember(ssigp, sig) && + ((lx_sig = stol_signo[sig]) > 0)) + lx_sigaddset(lsigp, lx_sig); + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c new file mode 100644 index 0000000000..5a8f9322a0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -0,0 +1,1229 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/privregs.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/sdt.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + + +/* + * Flags for sysent entries: + */ +#define LX_SYS_NOSYS_REASON 0x07 +#define LX_SYS_EBPARG6 0x08 + +/* + * Flags that denote the specific reason we do not have a particular system + * call. These reasons are only valid if the function is NULL. + */ +#define NOSYS_USERMODE 0 +#define NOSYS_NULL 1 +#define NOSYS_NONE 2 +#define NOSYS_NO_EQUIV 3 +#define NOSYS_KERNEL 4 +#define NOSYS_UNDOC 5 +#define NOSYS_OBSOLETE 6 +#define NOSYS_MAX NOSYS_OBSOLETE + +#if NOSYS_MAX > LX_SYS_NOSYS_REASON +#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON +#endif + +/* + * Strings describing the reason we do not emulate a particular system call + * in the kernel. 
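+ * These strings are indexed by the NOSYS_* reason code kept in the
+ * LX_SYS_NOSYS_REASON bits of each entry's sy_flags.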
+ */ +static char *nosys_reasons[] = { + NULL, /* NOSYS_USERMODE means this call is emulated in usermode */ + "Not done yet", + "No such Linux system call", + "No equivalent illumos functionality", + "Reads/modifies Linux kernel state", + "Undocumented and/or rarely used system call", + "Unsupported, obsolete system call" +}; + + +#if defined(_LP64) +/* + * System call handler table and entry count for Linux x86_64 (amd64): + */ +lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +int lx_nsysent64; +#endif +/* + * System call handler table and entry count for Linux x86 (i386): + */ +lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +int lx_nsysent32; + +#if defined(_LP64) +struct lx_vsyscall +{ + uintptr_t lv_addr; + uintptr_t lv_scnum; +} lx_vsyscalls[] = { + { LX_VSYS_gettimeofday, LX_SYS_gettimeofday }, + { LX_VSYS_time, LX_SYS_time }, + { LX_VSYS_getcpu, LX_SYS_getcpu }, + { NULL, NULL } +}; +#endif + +#if defined(__amd64) +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Note: Syscall argument passing is different from function + * call argument passing on amd64. For function calls, the + * fourth arg is passed via %rcx, but for system calls the 4th + * arg is passed via %r10. This is because in amd64, the + * syscall instruction puts the lower 32 bits of %rflags in + * %r11 and puts the %rip value to %rcx. + * + * Appendix A of the amd64 ABI (Linux conventions) states that + * syscalls are limited to 6 args and no arg is passed on the + * stack. + */ + args[0] = rp->r_rdi; + args[1] = rp->r_rsi; + args[2] = rp->r_rdx; + args[3] = rp->r_r10; + args[4] = rp->r_r8; + args[5] = rp->r_r9; + } else { + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + uint32_t args32[6]; + + if (copyin((void *)rp->r_rbx, &args32, + sizeof (args32)) != 0) { + /* + * Clear the argument vector so that the + * trace probe does not expose kernel + * memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + + args[0] = args32[0]; + args[1] = args32[1]; + args[2] = args32[2]; + args[3] = args32[3]; + args[4] = args32[4]; + args[5] = args32[5]; + } else { + args[0] = rp->r_rbx; + args[1] = rp->r_rcx; + args[2] = rp->r_rdx; + args[3] = rp->r_rsi; + args[4] = rp->r_rdi; + args[5] = rp->r_rbp; + } + } + + return (0); +} + +#else /* !__amd64 */ + +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) != + 0) { + /* + * Clear the argument vector so that the trace probe + * does not expose kernel memory. 
+			 */
+			bzero(args, 6 * sizeof (uintptr_t));
+			return (set_errno(EFAULT));
+		}
+	} else {
+		args[0] = rp->r_ebx;
+		args[1] = rp->r_ecx;
+		args[2] = rp->r_edx;
+		args[3] = rp->r_esi;
+		args[4] = rp->r_edi;
+		args[5] = rp->r_ebp;
+	}
+
+	return (0);
+}
+#endif
+
+void
+lx_syscall_return(klwp_t *lwp, int syscall_num, long ret)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int error = lwp->lwp_errno;
+
+	if (error != EINTR) {
+		/*
+		 * If this system call was not interrupted, clear the system
+		 * call restart flag before lx_setcontext() can pass it to
+		 * usermode.
+		 */
+		lwpd->br_syscall_restart = B_FALSE;
+	}
+
+	if (error != 0) {
+		/*
+		 * Convert from illumos to Linux errno:
+		 */
+		ret = -lx_errno(error, EINVAL);
+	}
+
+	/*
+	 * 32-bit Linux system calls return via %eax; 64-bit calls return via
+	 * %rax.
+	 */
+	rp->r_r0 = ret;
+
+	/*
+	 * Hold for the ptrace(2) "syscall-exit-stop" condition if required by
+	 * PTRACE_SYSCALL. Note that the register state may be modified by
+	 * the tracer.
+	 */
+	(void) lx_ptrace_stop(LX_PR_SYSEXIT);
+
+	/*
+	 * Emit audit record, if necessary.
+	 */
+	lx_audit_syscall_exit(syscall_num, ret);
+
+	/*
+	 * Fire the DTrace "lx-syscall:::return" probe:
+	 */
+	lx_trace_sysreturn(syscall_num, ret);
+
+	/*
+	 * Clear errno for next time. We do not clear "br_syscall_restart" or
+	 * "br_syscall_num" as they are potentially used by "lx_savecontext()"
+	 * in the signal delivery path.
+	 */
+	lwp->lwp_errno = 0;
+
+	lx_check_strict_failure(lwpd);
+
+	/*
+	 * We want complete control of the registers on return from this
+	 * emulated Linux system call:
+	 */
+	lwp->lwp_eosys = JUSTRETURN;
+}
+
+static void
+lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason)
+{
+	char buf[100];
+
+	if (s == NULL) {
+		(void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds",
+		    syscall_num);
+	} else {
+		VERIFY(unsup_reason < (sizeof (nosys_reasons) /
+		    sizeof (*nosys_reasons)));
+
+		if (s->sy_name == NULL) {
+			(void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s",
+			    syscall_num, nosys_reasons[unsup_reason]);
+		} else {
+			(void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s",
+			    s->sy_name, nosys_reasons[unsup_reason]);
+		}
+	}
+
+	lx_unsupported(buf);
+}
+
+/*
+ * This function is used to override the processing of arguments and
+ * invocation of a handler for emulated system calls, installed on each
+ * branded LWP as "lwp_brand_syscall". If this system call should use the
+ * native path, we return 1. If we handled this system call (and have made
+ * arrangements with respect to post-return usermode register state) we
+ * return 0.
+ */
+int
+lx_syscall_enter(void)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int syscall_num;
+	int error;
+	long ret = 0;
+	lx_sysent_t *s;
+	uintptr_t args[6];
+	unsigned int unsup_reason;
+
+	/*
+	 * If we got here, we should have an LWP-specific brand data
+	 * structure.
+	 */
+	VERIFY(lwpd != NULL);
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/*
+		 * The lwp is not in BRAND execution mode, so we return
+		 * to the regular native system call path.
+		 */
+		DTRACE_PROBE(brand__lx__syscall__hook__skip);
+		return (1);
+	}
+
+	/*
+	 * Clear the restartable system call flag. This flag will be turned
+	 * on in the system call handler if the call is a candidate for
+	 * a restart. It will be saved by lx_setcontext() in the event
+	 * that we take a signal, and used in the signal handling path
+	 * to restart the system call iff SA_RESTART was set for this
+	 * signal. Save the system call number so that we can store it
+	 * in the saved context if required.
+	 */
+	lwpd->br_syscall_restart = B_FALSE;
+	lwpd->br_syscall_num = (int)rp->r_r0;
+
+	/*
+	 * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by
+	 * PTRACE_SYSCALL. The system call number and arguments may be
+	 * modified by the tracer.
+	 */
+	(void) lx_ptrace_stop(LX_PR_SYSENTRY);
+
+	/*
+	 * Check that the system call number is within the bounds we expect.
+	 */
+	syscall_num = lwpd->br_syscall_num;
+	if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) {
+		lx_syscall_unsup_msg(NULL, syscall_num, 0);
+
+		(void) set_errno(ENOTSUP);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+#if defined(_LP64)
+	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+		s = &lx_sysent64[syscall_num];
+	} else
+#endif
+	{
+		s = &lx_sysent32[syscall_num];
+	}
+
+	/*
+	 * Process the arguments for this system call and fire the DTrace
+	 * "lx-syscall:::entry" probe:
+	 */
+	error = lx_emulate_args(lwp, s, args);
+	lx_trace_sysenter(syscall_num, args);
+	lwpd->br_syscall_args[0] = args[0];
+	lwpd->br_syscall_args[1] = args[1];
+	lwpd->br_syscall_args[2] = args[2];
+	lwpd->br_syscall_args[3] = args[3];
+	if (error != 0) {
+		/*
+		 * Could not read and process the arguments. Return the error
+		 * to the process.
+		 */
+		(void) set_errno(error);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+	if (s->sy_callc != NULL) {
+		/*
+		 * Call the in-kernel handler for this Linux system call:
+		 */
+		lwpd->br_eosys = NORMALRETURN;
+		ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+		    args[5]);
+		if (lwpd->br_eosys == NORMALRETURN) {
+			lx_syscall_return(lwp, syscall_num, ret);
+		}
+		return (0);
+	}
+
+	/*
+	 * There is no in-kernel handler.
+	 */
+	switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) {
+	case NOSYS_USERMODE:
+		/*
+		 * Pass to the usermode emulation routine.
+		 */
+#if defined(_LP64)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			lx_emulate_user32(lwp, syscall_num, args);
+		} else
+#endif
+		{
+			lx_emulate_user(lwp, syscall_num, args);
+		}
+		return (0);
+
+	default:
+		/*
+		 * We are not emulating this system call at all.
+		 */
+		lx_syscall_unsup_msg(s, syscall_num, unsup_reason);
+
+		(void) set_errno(ENOTSUP);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+}
+
+#if defined(_LP64)
+/*
+ * Emulate vsyscall support.
+ *
+ * Linux magically maps a single page into the address space of each process,
+ * allowing them to make 'vsyscalls'. Originally designed to counteract the
+ * perceived overhead of regular system calls, vsyscalls were implemented as
+ * code residing in userspace which could be called directly. Those userspace
+ * implementations have since been replaced by instructions which vector into
+ * the normal syscall path.
+ *
+ * Implementing vsyscalls on illumos is complicated by the fact that the
+ * required static address region resides inside the kernel address space.
+ * Rather than mapping a user-accessible page into the KAS, a different
+ * approach is taken. The vsyscall gate is emulated by interposing on
+ * pagefaults in trap(). An attempt to execute a known vsyscall address will
+ * result in emulating the appropriate system call rather than inducing a
+ * SIGSEGV.
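+ *
+ * (On Linux, the vsyscall page is mapped at a fixed address in every
+ * process, with the gettimeofday, time and getcpu entries at fixed offsets
+ * within it; the lx_vsyscalls[] table above lists the entry addresses we
+ * recognize and the system calls they are redirected to.)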
+ */ +void +lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum) +{ + struct regs *rp = lwptoregs(lwp); + uintptr_t raddr; + + /* + * Fetch the return address from the process stack. + */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) { +#if DEBUG + printf("lx_vsyscall_call: bad brand stack at vsyscall " + "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm, + p->p_pid, (void *)rp->r_rsp); +#endif + + /* + * The process jumped to the vsyscall address without a + * correctly configured stack. Terminate the process. + */ + exit(CLD_KILLED, SIGSEGV); + return; + } + + DTRACE_PROBE1(brand__lx__vsyscall, int, scnum); + + /* Simulate vectoring into the syscall */ + rp->r_rax = scnum; + rp->r_rip = raddr; + rp->r_rsp += sizeof (uintptr_t); + + (void) lx_syscall_enter(); +} + +boolean_t +lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int i; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * We only handle vsyscalls when running Linux code. + */ + return (B_FALSE); + } + + if (addr < LX_VSYSCALL_ADDR || + addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) { + /* + * Ignore faults outside the vsyscall page. + */ + return (B_FALSE); + } + + for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) { + if (addr == lx_vsyscalls[i].lv_addr) { + /* + * This is a valid vsyscall address. + */ + *scnum = lx_vsyscalls[i].lv_scnum; + return (B_TRUE); + } + } + + lx_unsupported("bad vsyscall access"); + return (B_FALSE); +} +#endif + +/* + * Linux defines system call numbers for 32-bit x86 in the file: + * arch/x86/syscalls/syscall_32.tbl + */ +lx_sysent_t lx_sysent32[] = { + {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ + {"exit", NULL, 0, 1}, /* 1 */ + {"fork", NULL, 0, 0}, /* 2 */ + {"read", lx_read, 0, 3}, /* 3 */ + {"write", lx_write, 0, 3}, /* 4 */ + {"open", lx_open, 0, 3}, /* 5 */ + {"close", lx_close, 0, 1}, /* 6 */ + {"waitpid", lx_waitpid, 0, 3}, /* 7 */ + {"creat", lx_creat, 0, 2}, /* 8 */ + {"link", lx_link, 0, 2}, /* 9 */ + {"unlink", lx_unlink, 0, 1}, /* 10 */ + {"execve", NULL, 0, 3}, /* 11 */ + {"chdir", lx_chdir, 0, 1}, /* 12 */ + {"time", lx_time, 0, 1}, /* 13 */ + {"mknod", NULL, 0, 3}, /* 14 */ + {"chmod", lx_chmod, 0, 2}, /* 15 */ + {"lchown16", lx_lchown16, 0, 3}, /* 16 */ + {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ + {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ + {"lseek", lx_lseek32, 0, 3}, /* 19 */ + {"getpid", lx_getpid, 0, 0}, /* 20 */ + {"mount", lx_mount, 0, 5}, /* 21 */ + {"umount", lx_umount, 0, 1}, /* 22 */ + {"setuid16", lx_setuid16, 0, 1}, /* 23 */ + {"getuid16", lx_getuid16, 0, 0}, /* 24 */ + {"stime", lx_stime, 0, 1}, /* 25 */ + {"ptrace", lx_ptrace, 0, 4}, /* 26 */ + {"alarm", lx_alarm, 0, 1}, /* 27 */ + {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ + {"pause", lx_pause, 0, 0}, /* 29 */ + {"utime", NULL, 0, 2}, /* 30 */ + {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ + {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ + {"access", lx_access, 0, 2}, /* 33 */ + {"nice", lx_nice, 0, 1}, /* 34 */ + {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */ + {"sync", lx_sync, 0, 0}, /* 36 */ + {"kill", lx_kill, 0, 2}, /* 37 */ + {"rename", lx_rename, 0, 2}, /* 38 */ + {"mkdir", lx_mkdir, 0, 2}, /* 39 */ + {"rmdir", NULL, 0, 1}, /* 40 */ + {"dup", lx_dup, 0, 1}, /* 41 */ + {"pipe", lx_pipe, 0, 1}, /* 42 */ + {"times", lx_times, 0, 1}, /* 43 */ + {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ + {"brk", lx_brk, 0, 1}, /* 45 */ + {"setgid16", lx_setgid16, 0, 1}, /* 46 */ + {"getgid16", lx_getgid16, 0, 0}, 
/* 47 */ + {"signal", NULL, 0, 2}, /* 48 */ + {"geteuid16", lx_geteuid16, 0, 0}, /* 49 */ + {"getegid16", lx_getegid16, 0, 0}, /* 50 */ + {"acct", lx_acct, 0, 1}, /* 51 */ + {"umount2", lx_umount2, 0, 2}, /* 52 */ + {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ + {"ioctl", lx_ioctl, 0, 3}, /* 54 */ + {"fcntl", lx_fcntl, 0, 3}, /* 55 */ + {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ + {"setpgid", lx_setpgid, 0, 2}, /* 57 */ + {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ + {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ + {"umask", lx_umask, 0, 1}, /* 60 */ + {"chroot", lx_chroot, 0, 1}, /* 61 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ + {"dup2", lx_dup2, 0, 2}, /* 63 */ + {"getppid", lx_getppid, 0, 0}, /* 64 */ + {"getpgrp", lx_getpgrp, 0, 0}, /* 65 */ + {"setsid", lx_setsid, 0, 0}, /* 66 */ + {"sigaction", NULL, 0, 3}, /* 67 */ + {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ + {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ + {"setreuid16", lx_setreuid16, 0, 2}, /* 70 */ + {"setregid16", lx_setregid16, 0, 2}, /* 71 */ + {"sigsuspend", NULL, 0, 1}, /* 72 */ + {"sigpending", NULL, 0, 1}, /* 73 */ + {"sethostname", lx_sethostname, 0, 2}, /* 74 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */ + {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */ + {"getrusage", lx_getrusage, 0, 2}, /* 77 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */ + {"settimeofday", NULL, 0, 2}, /* 79 */ + {"getgroups16", NULL, 0, 2}, /* 80 */ + {"setgroups16", NULL, 0, 2}, /* 81 */ + {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */ + {"symlink", lx_symlink, 0, 2}, /* 83 */ + {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ + {"readlink", lx_readlink, 0, 3}, /* 85 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ + {"swapon", lx_swapon, 0, 2}, /* 87 */ + {"reboot", lx_reboot, 0, 4}, /* 88 */ + {"readdir", NULL, 0, 3}, /* 89 */ + {"mmap", lx_mmap, 0, 6}, /* 90 */ + {"munmap", lx_munmap, 0, 2}, /* 91 */ + {"truncate", NULL, 0, 2}, /* 92 */ + {"ftruncate", NULL, 0, 2}, /* 93 */ + {"fchmod", lx_fchmod, 0, 2}, /* 94 */ + {"fchown16", lx_fchown16, 0, 3}, /* 95 */ + {"getpriority", lx_getpriority, 0, 2}, /* 96 */ + {"setpriority", lx_setpriority, 0, 3}, /* 97 */ + {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ + {"statfs", NULL, 0, 2}, /* 99 */ + {"fstatfs", NULL, 0, 2}, /* 100 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ + {"socketcall", lx_socketcall, 0, 2}, /* 102 */ + {"syslog", lx_syslog, 0, 3}, /* 103 */ + {"setitimer", NULL, 0, 3}, /* 104 */ + {"getitimer", lx_getitimer, 0, 2}, /* 105 */ + {"stat", lx_stat32, 0, 2}, /* 106 */ + {"lstat", lx_lstat32, 0, 2}, /* 107 */ + {"fstat", lx_fstat32, 0, 2}, /* 108 */ + {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ + {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ + {"vhangup", lx_vhangup, 0, 0}, /* 111 */ + {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ + {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ + {"wait4", lx_wait4, 0, 4}, /* 114 */ + {"swapoff", lx_swapoff, 0, 1}, /* 115 */ + {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ + {"ipc", NULL, 0, 5}, /* 117 */ + {"fsync", NULL, 0, 1}, /* 118 */ + {"sigreturn", NULL, 0, 1}, /* 119 */ + {"clone", NULL, 0, 5}, /* 120 */ + {"setdomainname", lx_setdomainname, 0, 2}, /* 121 */ + {"uname", lx_uname, 0, 1}, /* 122 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */ + {"adjtimex", NULL, 0, 1}, /* 124 */ + {"mprotect", lx_mprotect, 0, 3}, /* 125 */ + {"sigprocmask", NULL, 0, 3}, /* 126 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, 
/* 129 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ + {"getpgid", lx_getpgid, 0, 1}, /* 132 */ + {"fchdir", lx_fchdir, 0, 1}, /* 133 */ + {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"sysfs", NULL, 0, 3}, /* 135 */ + {"personality", lx_personality, 0, 1}, /* 136 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ + {"setfsuid16", lx_setfsuid16, 0, 1}, /* 138 */ + {"setfsgid16", lx_setfsgid16, 0, 1}, /* 139 */ + {"llseek", lx_llseek, 0, 5}, /* 140 */ + {"getdents", lx_getdents_32, 0, 3}, /* 141 */ + {"select", lx_select, 0, 5}, /* 142 */ + {"flock", lx_flock, 0, 2}, /* 143 */ + {"msync", lx_msync, 0, 3}, /* 144 */ + {"readv", lx_readv, 0, 3}, /* 145 */ + {"writev", lx_writev, 0, 3}, /* 146 */ + {"getsid", lx_getsid, 0, 1}, /* 147 */ + {"fdatasync", NULL, 0, 1}, /* 148 */ + {"sysctl", NULL, 0, 1}, /* 149 */ + {"mlock", lx_mlock, 0, 2}, /* 150 */ + {"munlock", lx_munlock, 0, 2}, /* 151 */ + {"mlockall", lx_mlockall, 0, 1}, /* 152 */ + {"munlockall", lx_munlockall, 0, 0}, /* 153 */ + {"sched_setparam", lx_sched_setparam, 0, 2}, /* 154 */ + {"sched_getparam", lx_sched_getparam, 0, 2}, /* 155 */ + {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 156 */ + {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 157 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */ + {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 159 */ + {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 160 */ + {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 161 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */ + {"mremap", lx_mremap, 0, 5}, /* 163 */ + {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */ + {"getresuid16", lx_getresuid16, 0, 3}, /* 165 */ + {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ + {"query_module", NULL, 0, 5}, /* 167 */ + {"poll", lx_poll, 0, 3}, /* 168 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ + {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */ + {"getresgid16", lx_getresgid16, 0, 3}, /* 171 */ + {"prctl", lx_prctl, 0, 5}, /* 172 */ + {"rt_sigreturn", NULL, 0, 0}, /* 173 */ + {"rt_sigaction", NULL, 0, 4}, /* 174 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 175 */ + {"rt_sigpending", NULL, 0, 2}, /* 176 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 179 */ + {"pread64", lx_pread32, 0, 5}, /* 180 */ + {"pwrite64", lx_pwrite32, 0, 5}, /* 181 */ + {"chown16", lx_chown16, 0, 3}, /* 182 */ + {"getcwd", lx_getcwd, 0, 2}, /* 183 */ + {"capget", NULL, 0, 2}, /* 184 */ + {"capset", NULL, 0, 2}, /* 185 */ + {"sigaltstack", NULL, 0, 2}, /* 186 */ + {"sendfile", NULL, 0, 4}, /* 187 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ + {"vfork", NULL, 0, 0}, /* 190 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */ + {"mmap2", lx_mmap2, LX_SYS_EBPARG6, 6}, /* 192 */ + {"truncate64", NULL, 0, 3}, /* 193 */ + {"ftruncate64", NULL, 0, 3}, /* 194 */ + {"stat64", lx_stat64, 0, 2}, /* 195 */ + {"lstat64", lx_lstat64, 0, 2}, /* 196 */ + {"fstat64", lx_fstat64, 0, 2}, /* 197 */ + {"lchown", lx_lchown, 0, 3}, /* 198 */ + {"getuid", lx_getuid, 0, 0}, /* 199 */ + {"getgid", lx_getgid, 0, 0}, /* 200 */ + {"geteuid", lx_geteuid, 0, 0}, /* 201 */ + {"getegid", lx_getegid, 0, 0}, /* 202 */ + {"setreuid", lx_setreuid, 0, 0}, /* 203 */ + {"setregid", lx_setregid, 0, 0}, /* 204 */ + {"getgroups", NULL, 0, 2}, /* 205 */ + {"setgroups", NULL, 0, 2}, /* 206 */ + {"fchown", lx_fchown, 
0, 3}, /* 207 */ + {"setresuid", lx_setresuid, 0, 3}, /* 208 */ + {"getresuid", lx_getresuid, 0, 3}, /* 209 */ + {"setresgid", lx_setresgid, 0, 3}, /* 210 */ + {"getresgid", lx_getresgid, 0, 3}, /* 211 */ + {"chown", lx_chown, 0, 3}, /* 212 */ + {"setuid", lx_setuid, 0, 1}, /* 213 */ + {"setgid", lx_setgid, 0, 1}, /* 214 */ + {"setfsuid", lx_setfsuid, 0, 1}, /* 215 */ + {"setfsgid", lx_setfsgid, 0, 1}, /* 216 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ + {"mincore", lx_mincore, 0, 3}, /* 218 */ + {"madvise", lx_madvise, 0, 3}, /* 219 */ + {"getdents64", lx_getdents64, 0, 3}, /* 220 */ + {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ + {"gettid", lx_gettid, 0, 0}, /* 224 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ + {"setxattr", lx_setxattr, 0, 5}, /* 226 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 227 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 228 */ + {"getxattr", lx_getxattr, 0, 4}, /* 229 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 230 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 231 */ + {"listxattr", lx_listxattr, 0, 3}, /* 232 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 233 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 234 */ + {"removexattr", lx_removexattr, 0, 2}, /* 235 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 236 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 237 */ + {"tkill", lx_tkill, 0, 2}, /* 238 */ + {"sendfile64", NULL, 0, 4}, /* 239 */ + {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */ + {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 241 */ + {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 242 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */ + {"io_setup", lx_io_setup, 0, 2}, /* 245 */ + {"io_destroy", lx_io_destroy, 0, 1}, /* 246 */ + {"io_getevents", lx_io_getevents, 0, 5}, /* 247 */ + {"io_submit", lx_io_submit, 0, 3}, /* 248 */ + {"io_cancel", lx_io_cancel, 0, 3}, /* 249 */ + {"fadvise64", lx_fadvise64_32, 0, 5}, /* 250 */ + {"nosys", NULL, 0, 0}, /* 251 */ + {"group_exit", NULL, 0, 1}, /* 252 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */ + {"timer_create", lx_timer_create, 0, 3}, /* 259 */ + {"timer_settime", NULL, 0, 4}, /* 260 */ + {"timer_gettime", NULL, 0, 2}, /* 261 */ + {"timer_getoverrun", NULL, 0, 1}, /* 262 */ + {"timer_delete", NULL, 0, 1}, /* 263 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */ + {"clock_nanosleep", NULL, 0, 4}, /* 267 */ + {"statfs64", NULL, 0, 2}, /* 268 */ + {"fstatfs64", NULL, 0, 2}, /* 269 */ + {"tgkill", lx_tgkill, 0, 3}, /* 270 */ + +/* + * The following system calls only exist in kernel 2.6 and greater: + */ + {"utimes", NULL, 0, 2}, /* 271 */ + {"fadvise64_64", lx_fadvise64_64, LX_SYS_EBPARG6, 6}, /* 272 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ 
+ {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ + {"waitid", lx_waitid, 0, 4}, /* 284 */ + {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 289 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 290 */ + {"inotify_init", NULL, 0, 0}, /* 291 */ + {"inotify_add_watch", NULL, 0, 3}, /* 292 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 293 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ + {"openat", lx_openat, 0, 4}, /* 295 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */ + {"mknodat", NULL, 0, 4}, /* 297 */ + {"fchownat", lx_fchownat, 0, 5}, /* 298 */ + {"futimesat", NULL, 0, 3}, /* 299 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */ + {"unlinkat", lx_unlinkat, 0, 3}, /* 301 */ + {"renameat", lx_renameat, 0, 4}, /* 302 */ + {"linkat", lx_linkat, 0, 5}, /* 303 */ + {"symlinkat", lx_symlinkat, 0, 3}, /* 304 */ + {"readlinkat", lx_readlinkat, 0, 4}, /* 305 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 306 */ + {"faccessat", lx_faccessat, 0, 4}, /* 307 */ + {"pselect6", lx_pselect, LX_SYS_EBPARG6, 6}, /* 308 */ + {"ppoll", lx_ppoll, 0, 5}, /* 309 */ + {"unshare", lx_unshare, 0, 1}, /* 310 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 311 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 312 */ + {"splice", lx_splice, LX_SYS_EBPARG6, 6}, /* 313 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 314 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getcpu", lx_getcpu, 0, 3}, /* 318 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */ + {"utimensat", NULL, 0, 4}, /* 320 */ + {"signalfd", NULL, 0, 3}, /* 321 */ + {"timerfd_create", NULL, 0, 2}, /* 322 */ + {"eventfd", lx_eventfd, 0, 1}, /* 323 */ + {"fallocate", lx_fallocate32, LX_SYS_EBPARG6, 6}, /* 324 */ + {"timerfd_settime", NULL, 0, 4}, /* 325 */ + {"timerfd_gettime", NULL, 0, 2}, /* 326 */ + {"signalfd4", NULL, 0, 4}, /* 327 */ + {"eventfd2", lx_eventfd2, 0, 2}, /* 328 */ + {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */ + {"dup3", lx_dup3, 0, 3}, /* 330 */ + {"pipe2", lx_pipe2, 0, 2}, /* 331 */ + {"inotify_init1", NULL, 0, 1}, /* 332 */ + {"preadv", lx_preadv32, 0, 5}, /* 333 */ + {"pwritev", lx_pwritev32, 0, 5}, /* 334 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ + {"recvmmsg", lx_recvmmsg, 0, 5}, /* 337 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ + {"syncfs", lx_syncfs, 0, 1}, /* 344 */ + {"sendmmsg", lx_sendmmsg, 0, 4}, /* 345 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ + {"sched_setattr", lx_sched_setattr, 0, 3}, /* 351 */ + {"sched_getattr", lx_sched_getattr, 0, 4}, /* 352 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 
*/ + {"getrandom", lx_getrandom, 0, 3}, /* 355 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */ +}; + +#if defined(_LP64) +/* + * Linux defines system call numbers for 64-bit x86 in the file: + * arch/x86/syscalls/syscall_64.tbl + */ +lx_sysent_t lx_sysent64[] = { + {"read", lx_read, 0, 3}, /* 0 */ + {"write", lx_write, 0, 3}, /* 1 */ + {"open", lx_open, 0, 3}, /* 2 */ + {"close", lx_close, 0, 1}, /* 3 */ + {"stat", lx_stat64, 0, 2}, /* 4 */ + {"fstat", lx_fstat64, 0, 2}, /* 5 */ + {"lstat", lx_lstat64, 0, 2}, /* 6 */ + {"poll", lx_poll, 0, 3}, /* 7 */ + {"lseek", lx_lseek64, 0, 3}, /* 8 */ + {"mmap", lx_mmap, 0, 6}, /* 9 */ + {"mprotect", lx_mprotect, 0, 3}, /* 10 */ + {"munmap", lx_munmap, 0, 2}, /* 11 */ + {"brk", lx_brk, 0, 1}, /* 12 */ + {"rt_sigaction", NULL, 0, 4}, /* 13 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 14 */ + {"rt_sigreturn", NULL, 0, 0}, /* 15 */ + {"ioctl", lx_ioctl, 0, 3}, /* 16 */ + {"pread64", lx_pread, 0, 4}, /* 17 */ + {"pwrite64", lx_pwrite, 0, 4}, /* 18 */ + {"readv", lx_readv, 0, 3}, /* 19 */ + {"writev", lx_writev, 0, 3}, /* 20 */ + {"access", lx_access, 0, 2}, /* 21 */ + {"pipe", lx_pipe, 0, 1}, /* 22 */ + {"select", lx_select, 0, 5}, /* 23 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */ + {"mremap", lx_mremap, 0, 5}, /* 25 */ + {"msync", lx_msync, 0, 3}, /* 26 */ + {"mincore", lx_mincore, 0, 3}, /* 27 */ + {"madvise", lx_madvise, 0, 3}, /* 28 */ + {"shmget", NULL, 0, 3}, /* 29 */ + {"shmat", NULL, 0, 4}, /* 30 */ + {"shmctl", NULL, 0, 3}, /* 31 */ + {"dup", lx_dup, 0, 1}, /* 32 */ + {"dup2", lx_dup2, 0, 2}, /* 33 */ + {"pause", lx_pause, 0, 0}, /* 34 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */ + {"getitimer", lx_getitimer, 0, 2}, /* 36 */ + {"alarm", lx_alarm, 0, 1}, /* 37 */ + {"setitimer", NULL, 0, 3}, /* 38 */ + {"getpid", lx_getpid, 0, 0}, /* 39 */ + {"sendfile", NULL, 0, 4}, /* 40 */ + {"socket", lx_socket, 0, 3}, /* 41 */ + {"connect", lx_connect, 0, 3}, /* 42 */ + {"accept", lx_accept, 0, 3}, /* 43 */ + {"sendto", lx_sendto, 0, 6}, /* 44 */ + {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */ + {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */ + {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */ + {"shutdown", lx_shutdown, 0, 2}, /* 48 */ + {"bind", lx_bind, 0, 3}, /* 49 */ + {"listen", lx_listen, 0, 2}, /* 50 */ + {"getsockname", lx_getsockname, 0, 3}, /* 51 */ + {"getpeername", lx_getpeername, 0, 3}, /* 52 */ + {"socketpair", lx_socketpair, 0, 4}, /* 53 */ + {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */ + {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */ + {"clone", NULL, 0, 5}, /* 56 */ + {"fork", NULL, 0, 0}, /* 57 */ + {"vfork", NULL, 0, 0}, /* 58 */ + {"execve", NULL, 0, 3}, /* 59 */ + {"exit", NULL, 0, 1}, /* 60 */ + {"wait4", lx_wait4, 0, 4}, /* 61 */ + {"kill", lx_kill, 0, 2}, /* 62 */ + {"uname", lx_uname, 0, 1}, /* 63 */ + {"semget", NULL, 0, 3}, /* 64 */ + {"semop", NULL, 0, 3}, /* 65 */ + {"semctl", NULL, 0, 4}, /* 66 */ + {"shmdt", NULL, 0, 1}, /* 67 */ + {"msgget", NULL, 0, 2}, /* 68 */ + {"msgsnd", NULL, 0, 4}, /* 69 */ + {"msgrcv", NULL, 0, 5}, /* 70 */ + {"msgctl", NULL, 0, 3}, /* 71 */ + {"fcntl", lx_fcntl64, 0, 3}, /* 72 */ + {"flock", lx_flock, 0, 2}, /* 73 */ + {"fsync", NULL, 0, 1}, /* 74 */ + {"fdatasync", NULL, 0, 1}, /* 75 */ + {"truncate", NULL, 0, 2}, /* 76 */ + {"ftruncate", NULL, 0, 2}, /* 77 */ + {"getdents", lx_getdents_64, 0, 3}, /* 78 */ + {"getcwd", lx_getcwd, 0, 2}, /* 79 */ + {"chdir", lx_chdir, 0, 1}, /* 80 */ + {"fchdir", lx_fchdir, 0, 1}, /* 81 
*/ + {"rename", lx_rename, 0, 2}, /* 82 */ + {"mkdir", lx_mkdir, 0, 2}, /* 83 */ + {"rmdir", NULL, 0, 1}, /* 84 */ + {"creat", lx_creat, 0, 2}, /* 85 */ + {"link", lx_link, 0, 2}, /* 86 */ + {"unlink", lx_unlink, 0, 1}, /* 87 */ + {"symlink", lx_symlink, 0, 2}, /* 88 */ + {"readlink", lx_readlink, 0, 3}, /* 89 */ + {"chmod", lx_chmod, 0, 2}, /* 90 */ + {"fchmod", lx_fchmod, 0, 2}, /* 91 */ + {"chown", lx_chown, 0, 3}, /* 92 */ + {"fchown", lx_fchown, 0, 3}, /* 93 */ + {"lchown", lx_lchown, 0, 3}, /* 94 */ + {"umask", lx_umask, 0, 1}, /* 95 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */ + {"getrusage", lx_getrusage, 0, 2}, /* 98 */ + {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */ + {"times", lx_times, 0, 1}, /* 100 */ + {"ptrace", lx_ptrace, 0, 4}, /* 101 */ + {"getuid", lx_getuid, 0, 0}, /* 102 */ + {"syslog", lx_syslog, 0, 3}, /* 103 */ + {"getgid", lx_getgid, 0, 0}, /* 104 */ + {"setuid", lx_setuid, 0, 1}, /* 105 */ + {"setgid", lx_setgid, 0, 1}, /* 106 */ + {"geteuid", lx_geteuid, 0, 0}, /* 107 */ + {"getegid", lx_getegid, 0, 0}, /* 108 */ + {"setpgid", lx_setpgid, 0, 2}, /* 109 */ + {"getppid", lx_getppid, 0, 0}, /* 110 */ + {"getpgrp", lx_getpgrp, 0, 0}, /* 111 */ + {"setsid", lx_setsid, 0, 0}, /* 112 */ + {"setreuid", lx_setreuid, 0, 0}, /* 113 */ + {"setregid", lx_setregid, 0, 0}, /* 114 */ + {"getgroups", NULL, 0, 2}, /* 115 */ + {"setgroups", NULL, 0, 2}, /* 116 */ + {"setresuid", lx_setresuid, 0, 3}, /* 117 */ + {"getresuid", lx_getresuid, 0, 3}, /* 118 */ + {"setresgid", lx_setresgid, 0, 3}, /* 119 */ + {"getresgid", lx_getresgid, 0, 3}, /* 120 */ + {"getpgid", lx_getpgid, 0, 1}, /* 121 */ + {"setfsuid", lx_setfsuid, 0, 1}, /* 122 */ + {"setfsgid", lx_setfsgid, 0, 1}, /* 123 */ + {"getsid", lx_getsid, 0, 1}, /* 124 */ + {"capget", NULL, 0, 2}, /* 125 */ + {"capset", NULL, 0, 2}, /* 126 */ + {"rt_sigpending", NULL, 0, 2}, /* 127 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 130 */ + {"sigaltstack", NULL, 0, 2}, /* 131 */ + {"utime", NULL, 0, 2}, /* 132 */ + {"mknod", NULL, 0, 3}, /* 133 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"personality", lx_personality, 0, 1}, /* 135 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ + {"statfs", NULL, 0, 2}, /* 137 */ + {"fstatfs", NULL, 0, 2}, /* 138 */ + {"sysfs", NULL, 0, 3}, /* 139 */ + {"getpriority", lx_getpriority, 0, 2}, /* 140 */ + {"setpriority", lx_setpriority, 0, 3}, /* 141 */ + {"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */ + {"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */ + {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */ + {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 145 */ + {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 146 */ + {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 147 */ + {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 148 */ + {"mlock", lx_mlock, 0, 2}, /* 149 */ + {"munlock", lx_munlock, 0, 2}, /* 150 */ + {"mlockall", lx_mlockall, 0, 1}, /* 151 */ + {"munlockall", lx_munlockall, 0, 0}, /* 152 */ + {"vhangup", lx_vhangup, 0, 0}, /* 153 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ + {"sysctl", NULL, 0, 1}, /* 156 */ + {"prctl", lx_prctl, 0, 5}, /* 157 */ + {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ + {"adjtimex", NULL, 0, 1}, /* 159 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */ + {"chroot", lx_chroot, 0, 1}, /* 161 */ + 
{"sync", lx_sync, 0, 0}, /* 162 */ + {"acct", lx_acct, 0, 1}, /* 163 */ + {"settimeofday", NULL, 0, 2}, /* 164 */ + {"mount", lx_mount, 0, 5}, /* 165 */ + {"umount2", lx_umount2, 0, 2}, /* 166 */ + {"swapon", lx_swapon, 0, 2}, /* 167 */ + {"swapoff", lx_swapoff, 0, 1}, /* 168 */ + {"reboot", lx_reboot, 0, 4}, /* 169 */ + {"sethostname", lx_sethostname, 0, 2}, /* 170 */ + {"setdomainname", lx_setdomainname, 0, 2}, /* 171 */ + {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ + {"query_module", NULL, 0, 5}, /* 178 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ + {"gettid", lx_gettid, 0, 0}, /* 186 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ + {"setxattr", lx_setxattr, 0, 5}, /* 188 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 189 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 190 */ + {"getxattr", lx_getxattr, 0, 4}, /* 191 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 192 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 193 */ + {"listxattr", lx_listxattr, 0, 3}, /* 194 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 195 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 196 */ + {"removexattr", lx_removexattr, 0, 2}, /* 197 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 198 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 199 */ + {"tkill", lx_tkill, 0, 2}, /* 200 */ + {"time", lx_time, 0, 1}, /* 201 */ + {"futex", lx_futex, 0, 6}, /* 202 */ + {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 203 */ + {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 204 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */ + {"io_setup", lx_io_setup, 0, 2}, /* 206 */ + {"io_destroy", lx_io_destroy, 0, 1}, /* 207 */ + {"io_getevents", lx_io_getevents, 0, 5}, /* 208 */ + {"io_submit", lx_io_submit, 0, 3}, /* 209 */ + {"io_cancel", lx_io_cancel, 0, 3}, /* 210 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 213 */ + {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ + {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ + {"getdents64", lx_getdents64, 0, 3}, /* 217 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */ + {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ + {"semtimedop", NULL, 0, 4}, /* 220 */ + {"fadvise64", lx_fadvise64, 0, 4}, /* 221 */ + {"timer_create", lx_timer_create, 0, 3}, /* 222 */ + {"timer_settime", NULL, 0, 4}, /* 223 */ + {"timer_gettime", NULL, 0, 2}, /* 224 */ + {"timer_getoverrun", NULL, 0, 1}, /* 225 */ + {"timer_delete", NULL, 0, 1}, /* 226 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */ + {"clock_nanosleep", NULL, 0, 4}, /* 230 */ + {"exit_group", NULL, 0, 1}, /* 231 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */ + {"tgkill", lx_tgkill, 0, 3}, /* 234 */ + {"utimes", 
NULL, 0, 2}, /* 235 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ + {"waitid", lx_waitid, 0, 4}, /* 247 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 251 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 252 */ + {"inotify_init", NULL, 0, 0}, /* 253 */ + {"inotify_add_watch", NULL, 0, 3}, /* 254 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 255 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ + {"openat", lx_openat, 0, 4}, /* 257 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */ + {"mknodat", NULL, 0, 4}, /* 259 */ + {"fchownat", lx_fchownat, 0, 5}, /* 260 */ + {"futimesat", NULL, 0, 3}, /* 261 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */ + {"unlinkat", lx_unlinkat, 0, 3}, /* 263 */ + {"renameat", lx_renameat, 0, 4}, /* 264 */ + {"linkat", lx_linkat, 0, 5}, /* 265 */ + {"symlinkat", lx_symlinkat, 0, 3}, /* 266 */ + {"readlinkat", lx_readlinkat, 0, 4}, /* 267 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 268 */ + {"faccessat", lx_faccessat, 0, 4}, /* 269 */ + {"pselect6", lx_pselect, 0, 6}, /* 270 */ + {"ppoll", lx_ppoll, 0, 5}, /* 271 */ + {"unshare", lx_unshare, 0, 1}, /* 272 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 273 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 274 */ + {"splice", lx_splice, 0, 6}, /* 275 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 277 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */ + {"utimensat", NULL, 0, 4}, /* 280 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */ + {"signalfd", NULL, 0, 3}, /* 282 */ + {"timerfd_create", NULL, 0, 2}, /* 283 */ + {"eventfd", lx_eventfd, 0, 1}, /* 284 */ + {"fallocate", lx_fallocate, 0, 4}, /* 285 */ + {"timerfd_settime", NULL, 0, 4}, /* 286 */ + {"timerfd_gettime", NULL, 0, 2}, /* 287 */ + {"accept4", lx_accept4, 0, 4}, /* 288 */ + {"signalfd4", NULL, 0, 4}, /* 289 */ + {"eventfd2", lx_eventfd2, 0, 2}, /* 290 */ + {"epoll_create1", lx_epoll_create1, 0, 1}, /* 291 */ + {"dup3", lx_dup3, 0, 3}, /* 292 */ + {"pipe2", lx_pipe2, 0, 2}, /* 293 */ + {"inotify_init1", NULL, 0, 1}, /* 294 */ + {"preadv", lx_preadv, 0, 4}, /* 295 */ + {"pwritev", lx_pwritev, 0, 4}, /* 296 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */ + {"recvmmsg", lx_recvmmsg, 0, 5}, /* 299 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */ + {"syncfs", lx_syncfs, 0, 1}, /* 306 */ + {"sendmmsg", lx_sendmmsg, 0, 4}, /* 307 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 308 */ + {"getcpu", lx_getcpu, 0, 3}, /* 309 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, 
/* 311 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sched_setattr", lx_sched_setattr, 0, 3}, /* 314 */ + {"sched_getattr", lx_sched_getattr, 0, 4}, /* 315 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getrandom", lx_getrandom, 0, 3}, /* 318 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */ + {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */ + + /* XXX TBD gap then x32 syscalls from 512 - 544 */ +}; +#endif diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h new file mode 100644 index 0000000000..ad86667997 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h @@ -0,0 +1,378 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _LX_PROC_H +#define _LX_PROC_H + +#ifdef _LXPROC_NATIVE_H +#error Attempted to include branded lx_proc.h after native lxproc.h +#endif + +#define _LXPROC_BRANDED_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/nvpair.h> +#include <vm/as.h> +#include <vm/anon.h> + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). + */ +typedef enum lxpr_nodetype { + LXPR_INVALID, /* nodes start at 1 */ + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_AUXV, /* /proc/<pid>/auxv */ + LXPR_PID_CGROUP, /* /proc/<pid>/cgroup */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_COMM, /* /proc/<pid>/comm */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_GIDMAP, /* /proc/<pid>/gid_map */ + LXPR_PID_LIMITS, /* /proc/<pid>/limits */ + LXPR_PID_LOGINUID, /* /proc/<pid>/loginuid */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_MOUNTINFO, /* /proc/<pid>/mountinfo */ + LXPR_PID_MOUNTS, /* /proc/<pid>/mounts */ + LXPR_PID_OOM_SCR_ADJ, /* /proc/<pid>/oom_score_adj */ + LXPR_PID_PERSONALITY, /* /proc/<pid>/personality */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_TASKDIR, /* /proc/<pid>/task */ + LXPR_PID_TASK_IDDIR, /* /proc/<pid>/task/<tid> */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_PID_UIDMAP, /* /proc/<pid>/uid_map */ + LXPR_PID_TID_AUXV, /* /proc/<pid>/task/<tid>/auxv */ + LXPR_PID_TID_CGROUP, /* /proc/<pid>/task/<tid>/cgroup */ + LXPR_PID_TID_CMDLINE, /* /proc/<pid>/task/<tid>/cmdline */ + LXPR_PID_TID_COMM, /* /proc/<pid>/task/<tid>/comm */ + LXPR_PID_TID_CPU, /* /proc/<pid>/task/<tid>/cpu */ + LXPR_PID_TID_CURDIR, /* /proc/<pid>/task/<tid>/cwd */ + LXPR_PID_TID_ENV, /* /proc/<pid>/task/<tid>/environ */ + LXPR_PID_TID_EXE, /* /proc/<pid>/task/<tid>/exe */ + LXPR_PID_TID_GIDMAP, /* /proc/<pid>/task/<tid>/gid_map */ + LXPR_PID_TID_LIMITS, /* 
/proc/<pid>/task/<tid>/limits */ + LXPR_PID_TID_LOGINUID, /* /proc/<pid>/task/<tid>/loginuid */ + LXPR_PID_TID_MAPS, /* /proc/<pid>/task/<tid>/maps */ + LXPR_PID_TID_MEM, /* /proc/<pid>/task/<tid>/mem */ + LXPR_PID_TID_MOUNTINFO, /* /proc/<pid>/task/<tid>/mountinfo */ + LXPR_PID_TID_OOM_SCR_ADJ, /* /proc/<pid>/task/<tid>/oom_score_adj */ + LXPR_PID_TID_PERSONALITY, /* /proc/<pid>/task/<tid>/personality */ + LXPR_PID_TID_ROOTDIR, /* /proc/<pid>/task/<tid>/root */ + LXPR_PID_TID_STAT, /* /proc/<pid>/task/<tid>/stat */ + LXPR_PID_TID_STATM, /* /proc/<pid>/task/<tid>/statm */ + LXPR_PID_TID_STATUS, /* /proc/<pid>/task/<tid>/status */ + LXPR_PID_TID_FDDIR, /* /proc/<pid>/task/<tid>/fd */ + LXPR_PID_TID_FD_FD, /* /proc/<pid>/task/<tid>/fd/nn */ + LXPR_PID_TID_UIDMAP, /* /proc/<pid>/task/<tid>/uid_map */ + LXPR_CGROUPS, /* /proc/cgroups */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DISKSTATS, /* /proc/diskstats */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MODULES, /* /proc/modules */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IF_INET6, /* /proc/net/if_inet6 */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_IPV6_ROUTE, /* /proc/net/ipv6_route */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_TCP6, /* /proc/net/tcp6 */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UDP6, /* /proc/net/udp6 */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_SWAPS, /* /proc/swaps */ + LXPR_SYSDIR, /* /proc/sys/ */ + LXPR_SYS_FSDIR, /* /proc/sys/fs/ */ + LXPR_SYS_FS_AIO_MAX_NR, /* /proc/sys/fs/aio-max-nr */ + LXPR_SYS_FS_AIO_NR, /* /proc/sys/fs/aio-nr */ + LXPR_SYS_FS_FILEMAX, /* /proc/sys/fs/file-max */ + LXPR_SYS_FS_FILENR, /* /proc/sys/fs/file-nr */ + LXPR_SYS_FS_INOTIFYDIR, /* /proc/sys/fs/inotify */ + LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, /* inotify/max_queued_events */ + LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, /* inotify/max_user_instances */ + LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, /* inotify/max_user_watches */ + LXPR_SYS_FS_PIPE_MAX, /* /proc/sys/fs/pipe-max-size */ + LXPR_SYS_KERNELDIR, /* /proc/sys/kernel/ */ + LXPR_SYS_KERNEL_CAPLCAP, /* /proc/sys/kernel/cap_last_cap */ + LXPR_SYS_KERNEL_COREPATT, /* /proc/sys/kernel/core_pattern */ + LXPR_SYS_KERNEL_HOSTNAME, /* /proc/sys/kernel/hostname */ + LXPR_SYS_KERNEL_MSGMAX, /* /proc/sys/kernel/msgmax */ + LXPR_SYS_KERNEL_MSGMNB, /* /proc/sys/kernel/msgmnb */ + LXPR_SYS_KERNEL_MSGMNI, /* /proc/sys/kernel/msgmni */ + LXPR_SYS_KERNEL_NGROUPS_MAX, /* /proc/sys/kernel/ngroups_max */ + LXPR_SYS_KERNEL_OSREL, /* /proc/sys/kernel/osrelease */ + 
LXPR_SYS_KERNEL_PID_MAX, /* /proc/sys/kernel/pid_max */ + LXPR_SYS_KERNEL_RANDDIR, /* /proc/sys/kernel/random */ + LXPR_SYS_KERNEL_RAND_BOOTID, /* /proc/sys/kernel/random/boot_id */ + LXPR_SYS_KERNEL_RAND_ENTAVL, /* /proc/sys/kernel/random/entropy_avail */ + LXPR_SYS_KERNEL_SEM, /* /proc/sys/kernel/sem */ + LXPR_SYS_KERNEL_SHMALL, /* /proc/sys/kernel/shmall */ + LXPR_SYS_KERNEL_SHMMAX, /* /proc/sys/kernel/shmmax */ + LXPR_SYS_KERNEL_SHMMNI, /* /proc/sys/kernel/shmmni */ + LXPR_SYS_KERNEL_THREADS_MAX, /* /proc/sys/kernel/threads-max */ + LXPR_SYS_NETDIR, /* /proc/sys/net */ + LXPR_SYS_NET_COREDIR, /* /proc/sys/net/core */ + LXPR_SYS_NET_CORE_SOMAXCON, /* /proc/sys/net/core/somaxconn */ + LXPR_SYS_NET_IPV4DIR, /* /proc/sys/net/ipv4 */ + LXPR_SYS_NET_IPV4_ICMP_EIB, /* .../icmp_echo_ignore_broadcasts */ + LXPR_SYS_NET_IPV4_IP_FORWARD, /* .../net/ipv4/ip_forward */ + LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, /* .../net/ipv4/ip_local_port_range */ + LXPR_SYS_NET_IPV4_TCP_FIN_TO, /* /proc/sys/net/ipv4/tcp_fin_timeout */ + LXPR_SYS_NET_IPV4_TCP_KA_INT, /* .../net/ipv4/tcp_keepalive_intvl */ + LXPR_SYS_NET_IPV4_TCP_KA_TIM, /* .../net/ipv4/tcp_keepalive_time */ + LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, /* .../net/ipv4/tcp_max_syn_backlog */ + LXPR_SYS_NET_IPV4_TCP_RETRY2, /* /proc/sys/net/ipv4/tcp_retries2 */ + LXPR_SYS_NET_IPV4_TCP_RMEM, /* /proc/sys/net/ipv4/tcp_rmem */ + LXPR_SYS_NET_IPV4_TCP_SACK, /* /proc/sys/net/ipv4/tcp_sack */ + LXPR_SYS_NET_IPV4_TCP_WINSCALE, /* .../net/ipv4/tcp_window_scaling */ + LXPR_SYS_NET_IPV4_TCP_WMEM, /* /proc/sys/net/ipv4/tcp_wmem */ + LXPR_SYS_VMDIR, /* /proc/sys/vm */ + LXPR_SYS_VM_DIRTY_BG_BYTES, /* .../vm/dirty_background_bytes */ + LXPR_SYS_VM_DIRTY_BG_RATIO, /* .../vm/dirty_background_ratio */ + LXPR_SYS_VM_DIRTY_BYTES, /* /proc/sys/vm/dirty_bytes */ + LXPR_SYS_VM_DIRTY_EXP_CS, /* .../vm/dirty_expire_centisecs */ + LXPR_SYS_VM_DIRTY_RATIO, /* /proc/sys/vm/dirty_ratio */ + LXPR_SYS_VM_DIRTYTIME_EXP_SEC, /* .../vm/dirtytime_expire_seconds */ + LXPR_SYS_VM_DIRTY_WB_CS, /* .../vm/dirty_writeback_centisecs */ + LXPR_SYS_VM_MAX_MAP_CNT, /* /proc/sys/vm/max_map_count */ + LXPR_SYS_VM_MINFR_KB, /* /proc/sys/vm/min_free_kbytes */ + LXPR_SYS_VM_NHUGEP, /* /proc/sys/vm/nr_hugepages */ + LXPR_SYS_VM_OVERCOMMIT_MEM, /* /proc/sys/vm/overcommit_memory */ + LXPR_SYS_VM_SWAPPINESS, /* /proc/sys/vm/swappiness */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_VMSTAT, /* /proc/vmstat */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * Linux sector size for /proc/diskstats + */ +#define LXPR_SECTOR_SIZE 512 + +/* + * external dirent characteristics + */ +typedef struct { + lxpr_nodetype_t d_type; + char *d_name; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + uint_t lxpr_desc; /* addl. 
descriptor (fd or tid) */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern boolean_t lxpr_is_writable(lxpr_nodetype_t); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); +extern vnode_t *lxpr_lookup_fdnode(vnode_t *, const char *); +extern int lxpr_readlink_fdnode(lxpr_node_t *, char *, size_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +} lxpr_uiobuf_t; + +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +extern int lxpr_core_path_l2s(const char *, char *, size_t); +extern int lxpr_core_path_s2l(const char *, char *, size_t); + +typedef enum lxpr_zombok { + NO_ZOMB = 0, + ZOMB_OK +} zombok_t; + +extern proc_t *lxpr_lock(lxpr_node_t *, zombok_t); +extern proc_t *lxpr_lock_pid(lxpr_node_t *, pid_t, zombok_t, kthread_t **); +extern void lxpr_unlock(proc_t *); +extern netstack_t *lxpr_netstack(lxpr_node_t *); +extern void lxpr_fixpid(zone_t *, proc_t *, pid_t *, pid_t *); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LX_PROC_H */ diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c new file mode 100644 index 0000000000..07dc432329 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c @@ -0,0 +1,917 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxprsubr.c: Various functions for the /lxproc vnodeops. + */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#include "lx_proc.h" + +#define LXPRCACHE_NAME "lxbpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +int lx_pr_bufsize = 4000; + +struct lxpr_zfs_ds { + list_node_t ds_link; + char ds_name[MAXPATHLEN]; + uint64_t ds_cookie; +}; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lx_pr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +boolean_t +lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf) +{ + if ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0) + return (B_TRUE); + return (B_FALSE); +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) 
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * Lookup process, potentially constrained by pid associated with lxpr_node and + * return with p_lock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock_pid(lxpr_node_t *lxpnp, pid_t pid, zombok_t zombie_ok, + kthread_t **tp) +{ + zone_t *zone = LXPTOZ(lxpnp); + proc_t *p; + kthread_t *t; + lx_pid_flag_t flags = LXP_PRLOCK; + + ASSERT(!MUTEX_HELD(&pidlock)); + + /* Consider zsched to be invisible to LX */ + if (pid == zone->zone_zsched->p_pid) { + return (NULL); + } + if (zombie_ok == ZOMB_OK) { + flags |= LXP_ZOMBOK; + } + +retry: + if (lx_lpid_lock(pid, zone, flags, &p, &t) != 0) { + return (NULL); + } + + /* + * Make sure that thread lookups (where non-main LX threads are + * assigned a pid not equal to the encompassing parent) match the pid + * of the encompasing directory. This must be performed carefully for + * the Linux pid 1 as it will not equal the native pid despite the + * process matching. + * + * This is necessary to constrain paths such as /proc/<pid>/task/<tid>. + */ + if (lxpnp->lxpr_pid != 0 && lxpnp->lxpr_pid != pid && + !(pid == 1 && lxpnp->lxpr_pid == zone->zone_proc_initpid)) { + klwp_t *lwp; + lx_lwp_data_t *lwpd; + + /* + * Only LWPs of branded processes will be accessible this way. + * The threads of native processes lack pid assignments which + * LX uses to emulate Linux's weird thread/process model. + */ + if ((lwp = ttolwp(t)) == NULL || + (lwpd = lwptolxlwp(lwp)) == NULL || + lwpd->br_pid != pid) { + sprunlock(p); + return (NULL); + } + } + + if (zombie_ok == NO_ZOMB && + ((p->p_flag & SEXITING) || p->p_stat == SZOMB)) { + sprunlock(p); + return (NULL); + } + + /* + * Accessing a process which is undergoing exec(2) is somewhat risky. + * In particular, the p_exec field is updated outside p_lock. To avoid + * this mess, access is denied when P_PR_EXEC set unless the caller + * happens to be the process itself. This allows actions such as + * re-exec()-ing /proc/<pid>/exe to make forward progress. + * + * All other callers must block until the flag is cleared. + */ + if ((p->p_proc_flag & P_PR_EXEC) != 0) { + if (p != curproc) { + kmutex_t *mp; + + /* + * Drop PR_LOCK and wait for the exec() to ping the CV + * once it has completed. Afterward, the pid is looked + * up again in case the process exited for some reason. + */ + mp = &p->p_lock; + sprunprlock(p); + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + goto retry; + } + } + + if (tp != NULL) { + *tp = t; + } + return (p); +} + +netstack_t * +lxpr_netstack(lxpr_node_t *lxpnp) +{ + return (netstack_hold_if_active(LXPTOZ(lxpnp)->zone_netstack)); +} + +/* + * Lookup process from pid associated with lxpr_node and return with p_lock and + * P_PR_LOCK held. 
+ */ +proc_t * +lxpr_lock(lxpr_node_t *lxpnp, zombok_t zombie_ok) +{ + return (lxpr_lock_pid(lxpnp, lxpnp->lxpr_pid, zombie_ok, NULL)); +} + +void +lxpr_fixpid(zone_t *zone, proc_t *p, pid_t *pidp, pid_t *ppidp) +{ + pid_t pid = p->p_pid; + pid_t ppid = p->p_ppid; + + ASSERT(p != NULL); + ASSERT(pidp != NULL); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(pid != zone->zone_zsched->p_pid); + + if (pid == zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + if (ppid == zone->zone_proc_initpid) { + /* + * Convert ppid to the Linux default of 1 if our parent + * is the zone's init process + */ + ppid = 1; + } else if (ppid == zone->zone_zsched->p_pid || + (p->p_flag & SZONETOP) != 0) { + /* + * Additionally, if the process has no valid parent + * inside the zone (or its parent is zsched), lie and + * claim init as the parent. + */ + ppid = 1; + } + } + + *pidp = pid; + if (ppidp != NULL) { + *ppidp = ppid; + } +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc) +{ + switch (type) { + case LXPR_PIDDIR: + return (maxpid + pid + 1); + case LXPR_PID_TASK_IDDIR: + return (maxpid + (desc * 10)); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + desc); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. 
It is deallocated in vop_inactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + lxpnp->lxpr_desc = desc; + VN_HOLD(dp); + if (p != NULL) { + lxpr_node_t *dlxpnp = VTOLXP(dp); + + lxpnp->lxpr_pid = p->p_pid; + /* Propagate the tid whenever possible. */ + if (desc == 0 && dlxpnp->lxpr_desc != 0) { + lxpnp->lxpr_desc = dlxpnp->lxpr_desc; + } + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + if (lxpr_is_writable(type)) { + /* These two have different modes; handled later. */ + if (type != LXPR_PID_FD_FD && type != LXPR_PID_TID_FD_FD) { + vp->v_type = VREG; + lxpnp->lxpr_mode = 0644; + return (lxpnp); + } + } + + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_TASKDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_TASK_IDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + vp->v_type = VLNK; + break; + + case LXPR_PID_FDDIR: + case LXPR_PID_TID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + case LXPR_SYSDIR: + case LXPR_SYS_FSDIR: + case LXPR_SYS_FS_INOTIFYDIR: + case LXPR_SYS_KERNELDIR: + case LXPR_SYS_KERNEL_RANDDIR: + case LXPR_SYS_NETDIR: + case LXPR_SYS_NET_COREDIR: + case LXPR_SYS_NET_IPV4DIR: + case LXPR_SYS_VMDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_AUXV: + case LXPR_PID_PERSONALITY: + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} + +/* + * Attempt to locate vnode for /proc/<pid>/fd/<#>. + */ +vnode_t * +lxpr_lookup_fdnode(vnode_t *dvp, const char *name) +{ + lxpr_node_t *lxdp = VTOLXP(dvp); + lxpr_node_t *lxfp; + char *endptr = NULL; + long num; + int fd; + proc_t *p; + vnode_t *vp = NULL; + file_t *fp; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(lxdp->lxpr_type == LXPR_PID_FDDIR || + lxdp->lxpr_type == LXPR_PID_TID_FDDIR); + + if (ddi_strtol(name, &endptr, 10, &num) != 0) { + return (NULL); + } else if (name[0] < '0' || name[0] > '9' || *endptr != '\0') { + /* + * ddi_strtol allows leading spaces and trailing garbage + * We do not tolerate such foolishness. + */ + return (NULL); + } else if ((fd = (int)num) < 0) { + return (NULL); + } + + /* Lock the owner process */ + if ((p = lxpr_lock(lxdp, NO_ZOMB)) == NULL) { + return (NULL); + } + + /* Not applicable to processes which are system-owned. */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + lxfp = lxpr_getnode(dvp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + fip = P_FINFO(p); + mutex_exit(&p->p_lock); + mutex_enter(&fip->fi_lock); + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + if ((fp = ufp->uf_file) != NULL) { + vp = fp->f_vnode; + VN_HOLD(vp); + } + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxfp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. 
The vnode is held on the realvp. + */ + lxfp->lxpr_realvp = vp; + + /* + * For certain entries (sockets, pipes, etc), Linux expects a + * bogus-named symlink. If that's the case, report the type as + * VNON to bypass link-following elsewhere in the vfs system. + * + * See lxpr_readlink for more details. + */ + if (lxpr_readlink_fdnode(lxfp, NULL, 0) == 0) + LXPTOV(lxfp)->v_type = VNON; + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + ASSERT(LXPTOV(lxfp) != NULL); + return (LXPTOV(lxfp)); +} + +/* + * Attempt to create Linux-proc-style fake symlinks contents for supported + * /proc/<pid>/fd/<#> entries. + */ +int +lxpr_readlink_fdnode(lxpr_node_t *lxpnp, char *bp, size_t len) +{ + const char *format; + vnode_t *rvp = lxpnp->lxpr_realvp; + vattr_t attr; + + switch (rvp->v_type) { + case VSOCK: + format = "socket:[%lu]"; + break; + case VFIFO: + format = "pipe:[%lu]"; + break; + default: + return (-1); + } + + /* Fetch the inode of the underlying vnode */ + if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0) + return (-1); + + if (bp != NULL) + (void) snprintf(bp, len, format, (ino_t)attr.va_nodeid); + return (0); +} + +/* + * Translate a Linux core_pattern path to a native Illumos one, by replacing + * the appropriate % escape sequences. + * + * Any % escape sequences that are not recognised are double-escaped so that + * they will be inserted literally into the path (to mimic Linux). + */ +int +lxpr_core_path_l2s(const char *inp, char *outp, size_t outsz) +{ + int i = 0, j = 0; + char x; + + while (j < outsz - 1) { + x = inp[i++]; + if (x == '\0') + break; + if (x != '%') { + outp[j++] = x; + continue; + } + + x = inp[i++]; + if (x == '\0') + break; + + /* Make sure we have enough space in the output buffer. */ + if (j + 2 >= outsz - 1) + return (EINVAL); + + switch (x) { + case 'E': + if (j + 4 >= outsz - 1) + return (EINVAL); + outp[j++] = '%'; + outp[j++] = 'd'; + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'e': + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'p': + case 'g': + case 'u': + case 't': + case '%': + outp[j++] = '%'; + outp[j++] = x; + break; + case 'h': + outp[j++] = '%'; + outp[j++] = 'n'; + break; + default: + /* No translation, make it literal. */ + if (j + 3 >= outsz - 1) + return (EINVAL); + outp[j++] = '%'; + outp[j++] = '%'; + outp[j++] = x; + break; + } + } + + outp[j] = '\0'; + return (0); +} + +/* + * Translate an Illumos core pattern path back to Linux format. + */ +int +lxpr_core_path_s2l(const char *inp, char *outp, size_t outsz) +{ + int i = 0, j = 0; + char x; + + while (j < outsz - 1) { + x = inp[i++]; + if (x == '\0') + break; + if (x != '%') { + outp[j++] = x; + continue; + } + + x = inp[i++]; + if (x == '\0') + break; + + /* Make sure we have enough space in the output buffer. */ + if (j + 2 >= outsz - 1) + return (EINVAL); + + switch (x) { + case 'd': + /* No Linux equivalent unless it's %d%f. */ + if (inp[i] == '%' && inp[i + 1] == 'f') { + i += 2; + outp[j++] = '%'; + outp[j++] = 'E'; + } + break; + case 'f': + outp[j++] = '%'; + outp[j++] = 'e'; + break; + case 'p': + case 'P': + case 'g': + case 'u': + case 't': + case '%': + outp[j++] = '%'; + outp[j++] = (x == 'P' ? 'p' : x); + break; + case 'n': + outp[j++] = '%'; + outp[j++] = 'h'; + break; + default: + /* No translation. 
*/ + break; + } + } + + outp[j] = '\0'; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c new file mode 100644 index 0000000000..b4dc5091c2 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c @@ -0,0 +1,377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * lxprvfsops.c: vfs operations for /lxprocfs. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#include "lx_proc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_proc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). + */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialise cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Mounting lx_proc is not allowed outside an LX zone. + */ + if (zone->zone_brand != &lx_brand) { + return (ENOTSUP); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Hold a zone reference for access to the lxzd structure. 
+ */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* + * Allocate the first vnode and arbitrarily set the parent vnode to the + * mounted over directory + */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. + */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + + ldi_ident_release(lxpr_mnt->lxprm_li); + + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/proc"); + (void) strcpy(&sp->f_fstr[6], "/proc"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c new file mode 100644 index 0000000000..e5ca432bbd --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -0,0 +1,8377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + */ + +/* + * lx_proc -- a Linux-compatible /proc for the LX brand + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support native (but Linux-borne) programs that wish to view the native + * system through the Linux /proc model; the other -- this one -- is to + * support Linux binaries via the LX brand. These two implementations differ + * greatly in their aspirations (and their willingness to bend the truth + * of the system to accommodate those aspirations); they should not be unified. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <lx_auxv.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/fcntl.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/rctl.h> +#include <sys/kstat.h> +#include <sys/lx_misc.h> +#include <sys/lx_types.h> +#include <sys/lx_userhz.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/corectl.h> +#include <sys/rctl_impl.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> +#include <inet/ipclassifier.h> +#include <sys/socketvar.h> +#include <fs/sockfs/socktpi.h> +#include <sys/random.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); +extern int prreadbuf(proc_t *, uintptr_t, uint8_t *, size_t, size_t *); + +#include "lx_proc.h" + +extern pgcnt_t swapfs_minfree; + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_create(struct vnode *, char *, struct vattr *, enum vcexcl, + int, struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_write(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_space(vnode_t *, int, flock64_t *, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_setattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_poll(vnode_t *, short, int, short *, pollhead_t **, + caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static int lxpr_doaccess(lxpr_node_t *, boolean_t, int, int, cred_t *, + caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kdir_randdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_coredir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_ipv4dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_vmdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kdir_randdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_coredir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_vmdir(lxpr_node_t *, uio_t *, int *); 
+static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cgroups(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_devices(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_filesystems(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_vmstat(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_auxv(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_cgroup(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_id_map(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_loginuid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_oom_scr_adj(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_personality(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_tid_comm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_aiomax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_aionr(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_filemax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_filenr(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_pipe_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_caplcap(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_corepatt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmnb(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_osrel(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_sem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmall(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_core_somaxc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_dirty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_minfr_kb(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_nhpages(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_sys_vm_swappiness(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static int lxpr_write_pid_tid_comm(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_pid_loginuid(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_fs_pipe_max(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_core_somaxc(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *, uio_t *,
+ cred_t *, caller_context_t *);
+static int lxpr_write_sys_kernel_corepatt(lxpr_node_t *, uio_t *, cred_t *,
+ caller_context_t *);
+
+/*
+ * Simple conversion
+ */
+#define btok(x) ((x) >> 10) /* bytes to kbytes */
+#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */
+
+#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t))
+
+extern rctl_hndl_t rc_process_semmsl;
+extern rctl_hndl_t rc_process_semopm;
+extern rctl_hndl_t rc_zone_semmni;
+extern rctl_hndl_t rc_process_msgmnb;
+
+extern rctl_hndl_t rc_zone_msgmni;
+extern rctl_hndl_t rc_zone_shmmax;
+extern rctl_hndl_t rc_zone_shmmni;
+
+/* From uts/common/crypto/io/swrand.c */
+extern swrand_stats_t swrand_stats;
+
+#define ONEGB 1073741824ULL
+#define FOURGB 4294967295ULL
+
+/*
+ * The maximum length of the concatenation of argument vector strings we
+ * will return to the user via the branded procfs. Likewise for the env vector.
+ */
+int lxpr_maxargvlen = 4096;
+int lxpr_maxenvvlen = 4096;
+
+/*
+ * The lx /proc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxpr_open },
+ VOPNAME_CLOSE, { .vop_close = lxpr_close },
+ VOPNAME_READ, { .vop_read = lxpr_read },
+ VOPNAME_WRITE, { .vop_write = lxpr_write },
+ VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxpr_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup },
+ VOPNAME_CREATE, { .vop_create = lxpr_create },
+ VOPNAME_READDIR, { .vop_readdir = lxpr_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxpr_readlink },
+ VOPNAME_SPACE, { .vop_space = lxpr_space },
+ VOPNAME_SETATTR, { .vop_setattr = lxpr_setattr },
+ VOPNAME_FSYNC, { .error = lxpr_sync },
+ VOPNAME_SEEK, { .error = lxpr_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxpr_cmp },
+ VOPNAME_REALVP, { .vop_realvp = lxpr_realvp },
+ VOPNAME_POLL, { .vop_poll = lxpr_poll },
+ NULL, NULL
+};
+
+
+/*
+ * file contents of an lx /proc directory.
+ */ +static lxpr_dirent_t lx_procdir[] = { + { LXPR_CGROUPS, "cgroups" }, + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DISKSTATS, "diskstats" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MODULES, "modules" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_SWAPS, "swaps" }, + { LXPR_SYSDIR, "sys" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" }, + { LXPR_VMSTAT, "vmstat" } +}; + +#define PROCDIRFILES (sizeof (lx_procdir) / sizeof (lx_procdir[0])) + +/* + * Contents of an lx /proc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_AUXV, "auxv" }, + { LXPR_PID_CGROUP, "cgroup" }, + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_COMM, "comm" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_GIDMAP, "gid_map" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_LOGINUID, "loginuid" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_MOUNTS, "mounts" }, + { LXPR_PID_OOM_SCR_ADJ, "oom_score_adj" }, + { LXPR_PID_PERSONALITY, "personality" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_TASKDIR, "task" }, + { LXPR_PID_FDDIR, "fd" }, + { LXPR_PID_UIDMAP, "uid_map" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * Contents of an lx /proc/<pid>/task/<tid> directory. 
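+ * It largely mirrors the per-pid directory above, but substitutes the
+ * tid-specific auxv, comm, oom_score_adj, stat and status entries and omits
+ * the task and mounts entries.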
+ */ +static lxpr_dirent_t tiddir[] = { + { LXPR_PID_TID_AUXV, "auxv" }, + { LXPR_PID_CGROUP, "cgroup" }, + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_TID_COMM, "comm" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_GIDMAP, "gid_map" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_LOGINUID, "loginuid" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_TID_OOM_SCR_ADJ, "oom_score_adj" }, + { LXPR_PID_PERSONALITY, "personality" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_TID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_TID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" }, + { LXPR_PID_UIDMAP, "uid_map" } +}; + +#define TIDDIRFILES (sizeof (tiddir) / sizeof (tiddir[0])) + +#define LX_RLIM_INFINITY 0xFFFFFFFFFFFFFFFF + +#define RCTL_INFINITE(x) \ + ((x.rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + (x.rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct lxpr_rlimtab { + char *rlim_name; /* limit name */ + char *rlim_unit; /* limit unit */ + char *rlim_rctl; /* rctl source */ +} lxpr_rlimtab_t; + +static lxpr_rlimtab_t lxpr_rlimtab[] = { + { "Max cpu time", "seconds", "process.max-cpu-time" }, + { "Max file size", "bytes", "process.max-file-size" }, + { "Max data size", "bytes", "process.max-data-size" }, + { "Max stack size", "bytes", "process.max-stack-size" }, + { "Max core file size", "bytes", "process.max-core-size" }, + { "Max resident set", "bytes", "zone.max-physical-memory" }, + { "Max processes", "processes", "zone.max-lwps" }, + { "Max open files", "files", "process.max-file-descriptor" }, + { "Max locked memory", "bytes", "zone.max-locked-memory" }, + { "Max address space", "bytes", "process.max-address-space" }, + { "Max file locks", "locks", NULL }, + { "Max pending signals", "signals", + "process.max-sigqueue-size" }, + { "Max msgqueue size", "bytes", "process.max-msg-messages" } +}; + +#define LX_RLIM_TAB_LEN (sizeof (lxpr_rlimtab) / sizeof (lxpr_rlimtab[0])) + + +/* + * contents of lx /proc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IF_INET6, "if_inet6" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_IPV6_ROUTE, "ipv6_route" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_TCP6, "tcp6" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UDP6, "udp6" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * contents of /proc/sys directory + */ +static lxpr_dirent_t sysdir[] = { + { LXPR_SYS_FSDIR, "fs" }, + { LXPR_SYS_KERNELDIR, "kernel" }, + { LXPR_SYS_NETDIR, "net" }, + { LXPR_SYS_VMDIR, "vm" }, +}; + +#define SYSDIRFILES (sizeof (sysdir) / sizeof (sysdir[0])) + +/* + * contents of /proc/sys/fs directory + */ +static lxpr_dirent_t sys_fsdir[] = { + { LXPR_SYS_FS_AIO_MAX_NR, "aio-max-nr" }, + { LXPR_SYS_FS_AIO_NR, "aio-nr" }, + { LXPR_SYS_FS_FILEMAX, "file-max" }, + { LXPR_SYS_FS_FILENR, "file-nr" }, + { LXPR_SYS_FS_INOTIFYDIR, "inotify" }, + { LXPR_SYS_FS_PIPE_MAX, "pipe-max-size" }, +}; + +#define 
SYS_FSDIRFILES (sizeof (sys_fsdir) / sizeof (sys_fsdir[0])) + +/* + * contents of /proc/sys/fs/inotify directory + */ +static lxpr_dirent_t sys_fs_inotifydir[] = { + { LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, +}; + +#define SYS_FS_INOTIFYDIRFILES \ + (sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0])) + +/* + * contents of /proc/sys/kernel directory + */ +static lxpr_dirent_t sys_kerneldir[] = { + { LXPR_SYS_KERNEL_CAPLCAP, "cap_last_cap" }, + { LXPR_SYS_KERNEL_COREPATT, "core_pattern" }, + { LXPR_SYS_KERNEL_HOSTNAME, "hostname" }, + { LXPR_SYS_KERNEL_MSGMAX, "msgmax" }, + { LXPR_SYS_KERNEL_MSGMNB, "msgmnb" }, + { LXPR_SYS_KERNEL_MSGMNI, "msgmni" }, + { LXPR_SYS_KERNEL_NGROUPS_MAX, "ngroups_max" }, + { LXPR_SYS_KERNEL_OSREL, "osrelease" }, + { LXPR_SYS_KERNEL_PID_MAX, "pid_max" }, + { LXPR_SYS_KERNEL_RANDDIR, "random" }, + { LXPR_SYS_KERNEL_SEM, "sem" }, + { LXPR_SYS_KERNEL_SHMALL, "shmall" }, + { LXPR_SYS_KERNEL_SHMMAX, "shmmax" }, + { LXPR_SYS_KERNEL_SHMMNI, "shmmni" }, + { LXPR_SYS_KERNEL_THREADS_MAX, "threads-max" }, +}; + +#define SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0])) + +/* + * contents of /proc/sys/kernel/random directory + */ +static lxpr_dirent_t sys_randdir[] = { + { LXPR_SYS_KERNEL_RAND_BOOTID, "boot_id" }, + { LXPR_SYS_KERNEL_RAND_ENTAVL, "entropy_avail" }, +}; + +#define SYS_RANDDIRFILES (sizeof (sys_randdir) / sizeof (sys_randdir[0])) + +/* + * contents of /proc/sys/net directory + */ +static lxpr_dirent_t sys_netdir[] = { + { LXPR_SYS_NET_COREDIR, "core" }, + { LXPR_SYS_NET_IPV4DIR, "ipv4" }, +}; + +#define SYS_NETDIRFILES (sizeof (sys_netdir) / sizeof (sys_netdir[0])) + +/* + * contents of /proc/sys/net/core directory + */ +static lxpr_dirent_t sys_net_coredir[] = { + { LXPR_SYS_NET_CORE_SOMAXCON, "somaxconn" }, +}; + +#define SYS_NET_COREDIRFILES \ + (sizeof (sys_net_coredir) / sizeof (sys_net_coredir[0])) + +/* + * contents of /proc/sys/net/ipv4 directory + * See the Linux ip(7) & tcp(7) man pages for descriptions and the illumos + * ip(7p) & tcp(7p) man pages for the native descriptions. 
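+ * Each entry below has a matching lxpr_read_sys_net_ipv4_* handler above;
+ * the writable ones are enumerated in wr_tab further down in this file.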
+ */ +static lxpr_dirent_t sys_net_ipv4dir[] = { + { LXPR_SYS_NET_IPV4_ICMP_EIB, "icmp_echo_ignore_broadcasts" }, + { LXPR_SYS_NET_IPV4_IP_FORWARD, "ip_forward" }, + { LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, "ip_local_port_range" }, + { LXPR_SYS_NET_IPV4_TCP_FIN_TO, "tcp_fin_timeout" }, + { LXPR_SYS_NET_IPV4_TCP_KA_INT, "tcp_keepalive_intvl" }, + { LXPR_SYS_NET_IPV4_TCP_KA_TIM, "tcp_keepalive_time" }, + { LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, "tcp_max_syn_backlog" }, + { LXPR_SYS_NET_IPV4_TCP_RETRY2, "tcp_retries2" }, + { LXPR_SYS_NET_IPV4_TCP_RMEM, "tcp_rmem" }, + { LXPR_SYS_NET_IPV4_TCP_SACK, "tcp_sack" }, + { LXPR_SYS_NET_IPV4_TCP_WINSCALE, "tcp_window_scaling" }, + { LXPR_SYS_NET_IPV4_TCP_WMEM, "tcp_wmem" }, +}; + +#define SYS_NET_IPV4DIRFILES \ + (sizeof (sys_net_ipv4dir) / sizeof (sys_net_ipv4dir[0])) + +/* + * contents of /proc/sys/vm directory + */ +static lxpr_dirent_t sys_vmdir[] = { + { LXPR_SYS_VM_DIRTY_BG_BYTES, "dirty_background_bytes" }, + { LXPR_SYS_VM_DIRTY_BG_RATIO, "dirty_background_ratio" }, + { LXPR_SYS_VM_DIRTY_BYTES, "dirty_bytes" }, + { LXPR_SYS_VM_DIRTY_EXP_CS, "dirty_expire_centisecs" }, + { LXPR_SYS_VM_DIRTY_RATIO, "dirty_ratio" }, + { LXPR_SYS_VM_DIRTYTIME_EXP_SEC, "dirtytime_expire_seconds" }, + { LXPR_SYS_VM_DIRTY_WB_CS, "dirty_writeback_centisecs" }, + { LXPR_SYS_VM_MAX_MAP_CNT, "max_map_count" }, + { LXPR_SYS_VM_MINFR_KB, "min_free_kbytes" }, + { LXPR_SYS_VM_NHUGEP, "nr_hugepages" }, + { LXPR_SYS_VM_OVERCOMMIT_MEM, "overcommit_memory" }, + { LXPR_SYS_VM_SWAPPINESS, "swappiness" }, +}; + +#define SYS_VMDIRFILES (sizeof (sys_vmdir) / sizeof (sys_vmdir[0])) + +/* + * Table for standard writable files. Non-standard writable files not in this + * table can be handled explicitly as special cases. + * This table drives lxpr_is_writable, lxpr_write, and lxpr_create. + * Note that the entries LXPR_PID_FD_FD and LXPR_PID_TID_FD_FD exist in the + * table both to verify writability and to satisfy opening with O_CREATE. 
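+ *
+ * For example, using the table below: a write to
+ * /proc/sys/kernel/core_pattern resolves to LXPR_SYS_KERNEL_COREPATT and is
+ * dispatched to lxpr_write_sys_kernel_corepatt(), whereas LXPR_PID_FD_FD
+ * carries a NULL write function and is present only to satisfy the
+ * writability and O_CREATE checks described above.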
+ */ +typedef struct wftab { + lxpr_nodetype_t wft_type; /* file entry type */ + int (*wft_wrf)(lxpr_node_t *, struct uio *, cred_t *, + caller_context_t *); /* write function */ +} wftab_t; + +static wftab_t wr_tab[] = { + {LXPR_PID_COMM, lxpr_write_pid_tid_comm}, + {LXPR_PID_FD_FD, NULL}, + {LXPR_PID_LOGINUID, lxpr_write_pid_loginuid}, + {LXPR_PID_OOM_SCR_ADJ, NULL}, + {LXPR_PID_TID_COMM, lxpr_write_pid_tid_comm}, + {LXPR_PID_TID_FD_FD, NULL}, + {LXPR_PID_TID_OOM_SCR_ADJ, NULL}, + {LXPR_SYS_FS_FILEMAX, NULL}, + {LXPR_SYS_KERNEL_COREPATT, lxpr_write_sys_kernel_corepatt}, + {LXPR_SYS_KERNEL_SHMALL, NULL}, + {LXPR_SYS_KERNEL_SHMMAX, NULL}, + {LXPR_SYS_FS_PIPE_MAX, lxpr_write_sys_fs_pipe_max}, + {LXPR_SYS_NET_CORE_SOMAXCON, lxpr_write_sys_net_core_somaxc}, + {LXPR_SYS_NET_IPV4_ICMP_EIB, lxpr_write_sys_net_ipv4_icmp_eib}, + {LXPR_SYS_NET_IPV4_IP_FORWARD, NULL}, + {LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, + lxpr_write_sys_net_ipv4_ip_lport_range}, + {LXPR_SYS_NET_IPV4_TCP_FIN_TO, lxpr_write_sys_net_ipv4_tcp_fin_to}, + {LXPR_SYS_NET_IPV4_TCP_KA_INT, lxpr_write_sys_net_ipv4_tcp_ka_int}, + {LXPR_SYS_NET_IPV4_TCP_KA_TIM, lxpr_write_sys_net_ipv4_tcp_ka_tim}, + {LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, + lxpr_write_sys_net_ipv4_tcp_max_syn_bl}, + {LXPR_SYS_NET_IPV4_TCP_RETRY2, lxpr_write_sys_net_ipv4_tcp_retry2}, + {LXPR_SYS_NET_IPV4_TCP_RMEM, lxpr_write_sys_net_ipv4_tcp_rwmem}, + {LXPR_SYS_NET_IPV4_TCP_SACK, lxpr_write_sys_net_ipv4_tcp_sack}, + {LXPR_SYS_NET_IPV4_TCP_WINSCALE, lxpr_write_sys_net_ipv4_tcp_winscale}, + {LXPR_SYS_NET_IPV4_TCP_WMEM, lxpr_write_sys_net_ipv4_tcp_rwmem}, + {LXPR_SYS_VM_DIRTY_BG_BYTES, NULL}, + {LXPR_SYS_VM_DIRTY_BG_RATIO, NULL}, + {LXPR_SYS_VM_DIRTY_BYTES, NULL}, + {LXPR_SYS_VM_DIRTY_EXP_CS, NULL}, + {LXPR_SYS_VM_DIRTY_RATIO, NULL}, + {LXPR_SYS_VM_DIRTYTIME_EXP_SEC, NULL}, + {LXPR_SYS_VM_DIRTY_WB_CS, NULL}, + {LXPR_SYS_VM_OVERCOMMIT_MEM, NULL}, + {LXPR_SYS_VM_SWAPPINESS, NULL}, + {LXPR_INVALID, NULL} +}; + +/* + * Centralized test for the standard writable proc files. Other non-standard + * writable files might be handled separately. + */ +boolean_t +lxpr_is_writable(lxpr_nodetype_t type) +{ + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* Restrict writes to certain files */ + if ((flag & FWRITE) && !lxpr_is_writable(type)) { + return (EPERM); + } + + /* + * If we are opening an underlying file only allow regular files, + * fifos or sockets; reject the open for anything else. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG && + rvp->v_type != VFIFO && rvp->v_type != VSOCK) { + error = EACCES; + } else { + if (type == LXPR_PID_FD_FD && rvp->v_type == VFIFO) { + /* + * This flag lets the fifo open know that + * we're using proc/fd to open a fd which we + * already have open. Otherwise, the fifo might + * reject an open if the other end has closed. + */ + flag |= FKLYR; + } + /* + * Need to hold rvp since VOP_OPEN() may release it. 
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ +#ifdef DEBUG + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); +#endif /* DEBUG */ + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_auxv, /* /proc/<pid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_pid_tid_comm, /* /proc/<pid>/comm */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_id_map, /* /proc/<pid>/gid_map */ + lxpr_read_pid_limits, /* /proc/<pid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_pid_mountinfo, /* /proc/<pid>/mountinfo */ + lxpr_read_mounts, /* /proc/<pid>/mounts */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/oom_score_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_tid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task */ + lxpr_read_isdir, /* /proc/<pid>/task/nn */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_pid_id_map, /* /proc/<pid>/uid_map */ + lxpr_read_pid_auxv, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_read_pid_tid_comm, /* /proc/<pid>/task/<tid>/comm */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/task/<tid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/exe */ + lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_read_pid_limits, /* /proc/<pid>/task/<tid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/task/<tid>/maps */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/mem */ + lxpr_read_pid_mountinfo, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/task/<tid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/root */ + lxpr_read_pid_tid_stat, /* /proc/<pid>/task/<tid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/task/<tid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/task/<tid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_read_fd, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_read_cgroups, /* /proc/cgroups */ + lxpr_read_cmdline, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* 
/proc/cpuinfo */ + lxpr_read_devices, /* /proc/devices */ + lxpr_read_diskstats, /* /proc/diskstats */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_filesystems, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_empty, /* /proc/modules */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_if_inet6, /* /proc/net/if_inet6 */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_ipv6_route, /* /proc/net/ipv6_route */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_tcp6, /* /proc/net/tcp6 */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_udp6, /* /proc/net/udp6 */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_swaps, /* /proc/swaps */ + lxpr_read_invalid, /* /proc/sys */ + lxpr_read_invalid, /* /proc/sys/fs */ + lxpr_read_sys_fs_aiomax, /* /proc/sys/fs/aio-max-nr */ + lxpr_read_sys_fs_aionr, /* /proc/sys/fs/aio-nr */ + lxpr_read_sys_fs_filemax, /* /proc/sys/fs/file-max */ + lxpr_read_sys_fs_filenr, /* /proc/sys/fs/file-nr */ + lxpr_read_invalid, /* /proc/sys/fs/inotify */ + lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */ + lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */ + lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */ + lxpr_read_sys_fs_pipe_max, /* /proc/sys/fs/pipe-max-size */ + lxpr_read_invalid, /* /proc/sys/kernel */ + lxpr_read_sys_kernel_caplcap, /* /proc/sys/kernel/cap_last_cap */ + lxpr_read_sys_kernel_corepatt, /* /proc/sys/kernel/core_pattern */ + lxpr_read_sys_kernel_hostname, /* /proc/sys/kernel/hostname */ + lxpr_read_sys_kernel_msgmax, /* /proc/sys/kernel/msgmax */ + lxpr_read_sys_kernel_msgmnb, /* /proc/sys/kernel/msgmnb */ + lxpr_read_sys_kernel_msgmni, /* /proc/sys/kernel/msgmni */ + lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */ + lxpr_read_sys_kernel_osrel, /* /proc/sys/kernel/osrelease */ + lxpr_read_sys_kernel_pid_max, /* /proc/sys/kernel/pid_max */ + lxpr_read_invalid, /* /proc/sys/kernel/random */ + lxpr_read_sys_kernel_rand_bootid, /* /proc/sys/kernel/random/boot_id */ + lxpr_read_sys_kernel_rand_entavl, /* .../kernel/random/entropy_avail */ + lxpr_read_sys_kernel_sem, /* /proc/sys/kernel/sem */ + lxpr_read_sys_kernel_shmall, /* /proc/sys/kernel/shmall */ + lxpr_read_sys_kernel_shmmax, /* /proc/sys/kernel/shmmax */ + lxpr_read_sys_kernel_shmmni, /* /proc/sys/kernel/shmmni */ + lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */ + lxpr_read_invalid, /* /proc/sys/net */ + lxpr_read_invalid, /* 
/proc/sys/net/core */ + lxpr_read_sys_net_core_somaxc, /* /proc/sys/net/core/somaxconn */ + lxpr_read_invalid, /* /proc/sys/net/ipv4 */ + lxpr_read_sys_net_ipv4_icmp_eib, /* .../icmp_echo_ignore_broadcasts */ + lxpr_read_sys_net_ipv4_ip_forward, /* .../ipv4/ip_forward */ + lxpr_read_sys_net_ipv4_ip_lport_range, /* ../ipv4/ip_local_port_range */ + lxpr_read_sys_net_ipv4_tcp_fin_to, /* .../ipv4/tcp_fin_timeout */ + lxpr_read_sys_net_ipv4_tcp_ka_int, /* .../ipv4/tcp_keepalive_intvl */ + lxpr_read_sys_net_ipv4_tcp_ka_tim, /* .../ipv4/tcp_keepalive_time */ + lxpr_read_sys_net_ipv4_tcp_max_syn_bl, /* ../ipv4/tcp_max_syn_backlog */ + lxpr_read_sys_net_ipv4_tcp_retry2, /* .../ipv4/tcp_retries2 */ + lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_rmem */ + lxpr_read_sys_net_ipv4_tcp_sack, /* .../ipv4/tcp_sack */ + lxpr_read_sys_net_ipv4_tcp_winscale, /* .../ipv4/tcp_window_scaling */ + lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_wmem */ + lxpr_read_invalid, /* /proc/sys/vm */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_bytes */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_ratio */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_bytes */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_expire_centisecs */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_ratio */ + lxpr_read_sys_vm_dirty, /* .../vm/dirtytime_expire_seconds */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_writeback_centisecs */ + lxpr_read_sys_vm_max_map_cnt, /* /proc/sys/vm/max_map_count */ + lxpr_read_sys_vm_minfr_kb, /* /proc/sys/vm/min_free_kbytes */ + lxpr_read_sys_vm_nhpages, /* /proc/sys/vm/nr_hugepages */ + lxpr_read_sys_vm_overcommit_mem, /* /proc/sys/vm/overcommit_memory */ + lxpr_read_sys_vm_swappiness, /* /proc/sys/vm/swappiness */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ + lxpr_read_vmstat, /* /proc/vmstat */ +}; + +/* + * Array of lookup functions, indexed by lx /proc file type. 
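+ * The entries must stay in lxpr_nodetype_t order, since the node type is
+ * used directly as the array index.  In outline the lookup path is
+ *
+ *	lxpnp = VTOLXP(dp);
+ *	vp = lxpr_lookup_function[lxpnp->lxpr_type](dp, comp);
+ *
+ * (an illustrative sketch only; the actual call site is in lxpr_lookup(),
+ * just as lxpr_read() below dispatches through lxpr_read_function[]).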
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/gid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mounts */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_taskdir, /* /proc/<pid>/task */ + lxpr_lookup_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/uid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_lookup_not_a_dir, /* /proc/cgroups */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/diskstats */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/modules */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + 
lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/swaps */ + lxpr_lookup_sysdir, /* /proc/sys */ + lxpr_lookup_sys_fsdir, /* /proc/sys/fs */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-max-nr */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-nr */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-max */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-nr */ + lxpr_lookup_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_lookup_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/pipe-max-size */ + lxpr_lookup_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/core_pattern */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmnb */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_lookup_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_lookup_not_a_dir, /* .../kernel/random/entropy_avail */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_lookup_sys_netdir, /* /proc/sys/net */ + lxpr_lookup_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_lookup_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_lookup_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + lxpr_lookup_not_a_dir, /* .../icmp_echo_ignore_broadcasts */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_forward */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */ + lxpr_lookup_not_a_dir, /* 
.../net/ipv4/tcp_retries2 */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_rmem */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_wmem */ + lxpr_lookup_sys_vmdir, /* /proc/sys/vm */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_background_bytes */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_background_ratio */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_bytes */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_expire_centisecs */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_ratio */ + lxpr_lookup_not_a_dir, /* .../vm/dirtytime_expire_seconds */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_writeback_centisecs */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ + lxpr_lookup_not_a_dir, /* /proc/vmstat */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + NULL, /* invalid */ + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/gid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mounts */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_taskdir, /* /proc/<pid>/task */ + lxpr_readdir_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/uid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + 
lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid/oom_scr_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_readdir_not_a_dir, /* /proc/cgroups */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/diskstats */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/modules */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/swaps */ + lxpr_readdir_sysdir, /* /proc/sys */ + lxpr_readdir_sys_fsdir, /* /proc/sys/fs */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-max-nr */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-nr */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-max */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-nr */ + lxpr_readdir_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_readdir_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/pipe-max-size */ + lxpr_readdir_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/core_pattern */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmax */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmnb */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmni */ + 
lxpr_readdir_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_readdir_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_readdir_not_a_dir, /* .../kernel/random/entropy_avail */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_readdir_sys_netdir, /* /proc/sys/net */ + lxpr_readdir_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_readdir_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_readdir_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + lxpr_readdir_not_a_dir, /* .../icmp_echo_ignore_broadcasts */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_forward */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_retries2 */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_rmem */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_wmem */ + lxpr_readdir_sys_vmdir, /* /proc/sys/vm */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_background_bytes */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_background_ratio */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_bytes */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_expire_centisecs */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_ratio */ + lxpr_readdir_not_a_dir, /* .../vm/dirtytime_expire_seconds */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_writeback_centisecs */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ + lxpr_readdir_not_a_dir, /* /proc/vmstat */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in the lx procfs is human + * readable and not binary structures there do not have to be different + * read variants depending on whether the reading process model is 32 or 64 bits + * (at least in general, and certainly the difference is unlikely to be enough + * to justify have different routines for 32 and 64 bit reads + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. 
+ */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. + */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_auxv(): read process aux vector + */ +static void +lxpr_read_pid_auxv(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + lx_elf_data_t *edp = NULL; + int i, cnt; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_AUXV || + lxpnp->lxpr_type == LXPR_PID_TID_AUXV); + + p = lxpr_lock(lxpnp, NO_ZOMB); + + if (p == NULL) { + return; + } + if ((pd = ptolxproc(p)) == NULL) { + /* Emit a single AT_NULL record for non-branded processes */ + auxv_t buf; + + bzero(&buf, sizeof (buf)); + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)&buf, sizeof (buf)); + return; + } else { + edp = &pd->l_elf_data; + } + + if (p->p_model == DATAMODEL_NATIVE) { + auxv_t buf[__KERN_NAUXV_IMPL]; + + /* + * Because a_type is only of size int (not long), the buffer + * contents must be zeroed first to ensure cleanliness. 
+ */
+ bzero(buf, sizeof (buf));
+ for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+ if (lx_auxv_stol(&p->p_user.u_auxv[i],
+ &buf[cnt], edp) == 0) {
+ cnt++;
+ }
+ if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+ break;
+ }
+ }
+ lxpr_unlock(p);
+ lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ auxv32_t buf[__KERN_NAUXV_IMPL];
+
+ for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) {
+ auxv_t temp;
+
+ if (lx_auxv_stol(&p->p_user.u_auxv[i],
+ &temp, edp) == 0) {
+ buf[cnt].a_type = (int)temp.a_type;
+ buf[cnt].a_un.a_val = (int)temp.a_un.a_val;
+ cnt++;
+ }
+ if (p->p_user.u_auxv[i].a_type == AT_NULL) {
+ break;
+ }
+ }
+ lxpr_unlock(p);
+ lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0]));
+ }
+#endif /* defined(_SYSCALL32_IMPL) */
+}
+
+/*
+ * lxpr_read_pid_cgroup(): read cgroups for process
+ */
+static void
+lxpr_read_pid_cgroup(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CGROUP ||
+ lxpnp->lxpr_type == LXPR_PID_TID_CGROUP);
+
+ p = lxpr_lock(lxpnp, ZOMB_OK);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+ lxpr_unlock(p);
+
+ /* basic stub, 3rd field will need to be populated */
+ lxpr_uiobuf_printf(uiobuf, "1:name=systemd:/\n");
+}
+
+static void
+lxpr_copy_cmdline(proc_t *p, lx_proc_data_t *pd, lxpr_uiobuf_t *uiobuf)
+{
+ uio_t *uiop = uiobuf->uiop;
+ char *buf = uiobuf->buffer;
+ int bsz = uiobuf->buffsize;
+ boolean_t env_overflow = B_FALSE;
+ uintptr_t pos = pd->l_args_start + uiop->uio_offset;
+ uintptr_t estart = pd->l_envs_start;
+ uintptr_t eend = pd->l_envs_end;
+ size_t chunk, copied;
+ int err = 0;
+
+ /* Do not bother with data beyond the end of the envp strings area. */
+ if (pos > eend) {
+ return;
+ }
+ mutex_exit(&p->p_lock);
+
+ /*
+ * If the starting or ending bounds are outside the argv strings area,
+ * check to see if the process has overwritten the terminating NUL.
+ * If not, no data needs to be copied from outside the argv area.
+ */
+ if (pos >= estart || (pos + uiop->uio_resid) >= estart) {
+ uint8_t term;
+ if (uread(p, &term, sizeof (term), estart - 1) != 0) {
+ err = EFAULT;
+ } else if (term != 0) {
+ env_overflow = B_TRUE;
+ }
+ }
+
+ /* Data between the argv start (l_args_start) and estart - 1 can be
+ * copied freely. */
+ while (pos < estart && uiop->uio_resid > 0 && err == 0) {
+ chunk = MIN(estart - pos, uiop->uio_resid);
+ chunk = MIN(chunk, bsz);
+
+ if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) != 0 ||
+ copied != chunk) {
+ err = EFAULT;
+ break;
+ }
+ err = uiomove(buf, copied, UIO_READ, uiop);
+ pos += copied;
+ }
+
+ /*
+ * Onward from estart, data is copied as a contiguous string. To
+ * protect env data from potential snooping, only one buffer-sized copy
+ * is allowed to avoid complex seek logic.
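+ *
+ * Illustrative layout of the ranges involved (addresses hypothetical):
+ *   l_args_start: "arg0\0arg1\0...\0"   argv strings, NUL-terminated
+ *   l_envs_start: "VAR=value\0...\0"    envp strings
+ *   l_envs_end:   first byte past the envp strings
+ * A read below l_envs_start only ever returns argv data; env data is
+ * exposed only when the process itself has overwritten the terminator
+ * just before l_envs_start (env_overflow above).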
+ */ + if (err == 0 && env_overflow && pos == estart && uiop->uio_resid > 0) { + chunk = MIN(eend - pos, uiop->uio_resid); + chunk = MIN(chunk, bsz); + if (prreadbuf(p, pos, (uint8_t *)buf, chunk, &copied) == 0) { + int len = strnlen(buf, copied); + if (len > 0) { + err = uiomove(buf, len, UIO_READ, uiop); + } + } + } + + uiobuf->error = err; + /* reset any uiobuf state */ + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + + mutex_enter(&p->p_lock); +} + +/* + * lxpr_read_pid_cmdline(): read argument vector from process + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxargvlen, sz; + lx_proc_data_t *pd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE || + lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + if ((pd = ptolxproc(p)) != NULL && pd->l_args_start != 0 && + pd->l_envs_start != 0 && pd->l_envs_end != 0) { + /* Use Linux-style argv bounds if possible. */ + lxpr_copy_cmdline(p, pd, uiobuf); + lxpr_unlock(p); + } else { + int r; + + r = prreadargv(p, buf, asz, &sz); + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + } + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_tid_comm(): read command name from thread + */ +static void +lxpr_read_pid_tid_comm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + pid_t tid; + char buf[LX_PR_SET_NAME_NAMELEN], *pnm; + + VERIFY(lxpnp->lxpr_type == LXPR_PID_COMM || + lxpnp->lxpr_type == LXPR_PID_TID_COMM); + + tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + if (t == NULL) { + lxpr_unlock(p); + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* + * If a thread name has not been set, use the process command name. + * This also covers the /proc/{pid}/comm case. + */ + if (t->t_name == NULL) { + pnm = p->p_user.u_comm; + } else { + pnm = t->t_name; + } + + /* Truncate with NUL if the name is longer than the Linux size. */ + (void) strlcpy(buf, pnm, sizeof (buf)); + + lxpr_unlock(p); + lxpr_uiobuf_printf(uiobuf, "%s\n", buf); +} + +/* ARGSUSED */ +static int +lxpr_write_pid_tid_comm(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + int error; + size_t olen; + char *buf; + proc_t *p; + kthread_t *t; + pid_t tid; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_COMM || + lxpnp->lxpr_type == LXPR_PID_TID_COMM); + + /* + * Only a thread in the process can update one of the thread names. Not + * even a process with root privileges. Linux returns EINVAL (not EPERM) + * for this case. + */ + if (lxpnp->lxpr_pid != curproc->p_pid) + return (EINVAL); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > LX_PR_SET_NAME_NAMELEN - 1) + olen = LX_PR_SET_NAME_NAMELEN - 1; + + buf = kmem_zalloc(THREAD_NAME_MAX, KM_SLEEP); + + error = uiomove(buf, olen, UIO_WRITE, uio); + if (error != 0) { + kmem_free(buf, THREAD_NAME_MAX); + return (error); + } + buf[LX_PR_SET_NAME_NAMELEN - 1] = '\0'; + + tid = (lxpnp->lxpr_desc == 0) ? 
lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, NO_ZOMB, &t); + if (p == NULL) { + kmem_free(buf, THREAD_NAME_MAX); + return (ENXIO); + } + if (t == NULL) { + lxpr_unlock(p); + kmem_free(buf, THREAD_NAME_MAX); + return (ENXIO); + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* + * See comments for thread_setname() and prctl(LX_PR_SET_NAME) handling. + */ + if (t->t_name == NULL) { + t->t_name = buf; + } else { + (void) strlcpy(t->t_name, buf, THREAD_NAME_MAX); + kmem_free(buf, THREAD_NAME_MAX); + } + + if (t->t_tid == 1) { + (void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1); + (void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ); + } + + lxpr_unlock(p); + return (0); +} + +/* + * lxpr_read_pid_env(): read env vector from process + */ +static void +lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxenvvlen, sz; + int r; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + r = prreadenvv(p, buf, asz, &sz); + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_limits(): ulimit file + */ +static void +lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + rctl_qty_t cur[LX_RLIM_TAB_LEN], max[LX_RLIM_TAB_LEN]; + int i; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS || + lxpnp->lxpr_type == LXPR_PID_TID_LIMITS); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + return; + } + + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + char *kname = lxpr_rlimtab[i].rlim_rctl; + rctl_val_t nval, *oval = NULL; + rctl_hndl_t hndl; + + /* default to unlimited for resources without an analog */ + cur[i] = RLIM_INFINITY; + max[i] = RLIM_INFINITY; + if (kname == NULL || (hndl = rctl_hndl_lookup(kname)) == -1) { + continue; + } + while (rctl_local_get(hndl, oval, &nval, p) == 0) { + oval = &nval; + switch (nval.rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + cur[i] = nval.rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + max[i] = nval.rcv_value; + break; + } + } + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name); + if (cur[i] == RLIM_INFINITY || cur[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", cur[i]); + } + if (max[i] == RLIM_INFINITY || max[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", max[i]); + } + lxpr_uiobuf_printf(uiobuf, " %-10s\n", + lxpr_rlimtab[i].rlim_unit); + } +} +/* + * lxpr_read_pid_id_map(): gid_map and uid_map file + */ +static void +lxpr_read_pid_id_map(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_GIDMAP || + lxpnp->lxpr_type == LXPR_PID_UIDMAP); + + lxpr_uiobuf_printf(uiobuf, "%10u %10u %10u\n", 0, 0, MAXUID); +} + +/* + * lxpr_read_pid_loginuid(): loginuid file + */ +static void +lxpr_read_pid_loginuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + uid_t lu = 0; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID || + lxpnp->lxpr_type == LXPR_PID_TID_LOGINUID); + + p = 
lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + if ((pd = ptolxproc(p)) != NULL) { + lu = pd->l_loginuid; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%d", lu); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + uintptr_t saddr; + uintptr_t eaddr; + int type; + char prot[5]; + uintptr_t offset; + vnode_t *vp; + char *name_override; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS || + lxpnp->lxpr_type == LXPR_PID_TID_MAPS); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + return; + } + + as = p->p_as; + lxpd = ptolxproc(p); + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = (uintptr_t)seg->s_base; + pbuf->eaddr = pbuf->saddr + seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr); + + pbuf->name_override = NULL; + if (lxpd != NULL) { + if (pbuf->saddr == lxpd->l_vdso) { + pbuf->name_override = "[vdso]"; + } else if (pbuf->saddr == p->p_user.u_commpagep) { + pbuf->name_override = "[vvar]"; + } + } + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + ino_t inode = 0; + + *buf = '\0'; + if (pbuf->name_override != NULL) { + (void) strncpy(buf, pbuf->name_override, buflen); + } else if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (p->p_model == DATAMODEL_LP64) { + lxpr_uiobuf_printf(uiobuf, + "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, *buf != '\0' ? 
" " : "", buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02x:%02x %llu%s%s\n", + (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr, + pbuf->prot, (uint32_t)pbuf->offset, maj, min, + inode, *buf != '\0' ? " " : "", buf); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * Make mount entry look more like Linux. Non-zero return to skip it. + */ +static int +lxpr_clean_mntent(char **mntpt, char **fstype, char **resource) +{ + if (strcmp(*mntpt, "/var/ld") == 0 || + strcmp(*fstype, "objfs") == 0 || + strcmp(*fstype, "mntfs") == 0 || + strcmp(*fstype, "ctfs") == 0 || + strncmp(*mntpt, "/native/", 8) == 0) { + return (1); + } + + if (strcmp(*fstype, "tmpfs") == 0) { + *resource = "tmpfs"; + } else if (strcmp(*fstype, "lx_proc") == 0) { + *resource = *fstype = "proc"; + } else if (strcmp(*fstype, "lx_sysfs") == 0) { + *resource = *fstype = "sysfs"; + } else if (strcmp(*fstype, "lx_devfs") == 0) { + *resource = *fstype = "devtmpfs"; + } else if (strcmp(*fstype, "lx_cgroup") == 0) { + *resource = *fstype = "cgroup"; + } else if (strcmp(*fstype, "lxautofs") == 0) { + *fstype = "autofs"; + } + + return (0); +} + + +typedef struct lxpr_mount_entry { + list_node_t lme_link; + uint_t lme_id; + uint_t lme_parent_id; + refstr_t *lme_mntpt; + refstr_t *lme_resource; + uint_t lme_mntopts_len; + char *lme_mntopts; + uint_t lme_flag; + int lme_fstype; + dev_t lme_dev; + boolean_t lme_force; +} lxpr_mount_entry_t; + +static int lxpr_zfs_fstype = -1; + +#define LXPR_ROOT_MOUNT_ID 15 +#define LXPR_MNT_OPT_CHUNK 128 + +/* List of native, non-Linux mount options we should omit. */ +static const char *lx_invalid_mnt_opts[] = { + "xattr", + NULL +}; + +/* First see if we should omit this option */ +static boolean_t +lxpr_skip_mntopt(const char *s) +{ + uint_t i; + + for (i = 0; lx_invalid_mnt_opts[i] != NULL; i++) { + if (strcmp(s, lx_invalid_mnt_opts[i]) == 0) + return (B_TRUE); + } + return (B_FALSE); +} + +static void +lxpr_append_mntopt(lxpr_mount_entry_t *lme, char *s) +{ + while (strlcat(lme->lme_mntopts, s, lme->lme_mntopts_len) >= + lme->lme_mntopts_len) { + /* expand option string */ + uint_t tlen = lme->lme_mntopts_len + LXPR_MNT_OPT_CHUNK; + char *t = kmem_alloc(tlen, KM_SLEEP); + + (void) strlcpy(t, lme->lme_mntopts, tlen); + kmem_free(lme->lme_mntopts, lme->lme_mntopts_len); + lme->lme_mntopts_len = tlen; + lme->lme_mntopts = t; + } +} + +/* + * Perform the somewhat complicated work of getting the mount options string + * for the mount. + */ +static void +lxpr_get_mntopts(vfs_t *vfsp, lxpr_mount_entry_t *lme) +{ + uint_t i; + mntopt_t *mop; + boolean_t have_nosuid = B_FALSE, have_nodev = B_FALSE; + + lme->lme_mntopts_len = LXPR_MNT_OPT_CHUNK; + lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP); + lme->lme_mntopts[0] = '\0'; + + /* Always show rw/ro option */ + lxpr_append_mntopt(lme, + (lme->lme_flag & VFS_RDONLY) == 0 ? 
"rw" : "ro"); + + for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) { + mop = &vfsp->vfs_mntopts.mo_list[i]; + if ((mop->mo_flags & MO_NODISPLAY) || !(mop->mo_flags & MO_SET)) + continue; + + if (strcmp(mop->mo_name, "ro") == 0 || + strcmp(mop->mo_name, "rw") == 0) + continue; + + if (strcmp(mop->mo_name, "nosuid") == 0) + have_nosuid = B_TRUE; + /* sigh, either option string is used */ + if (strcmp(mop->mo_name, "nodev") == 0 || + strcmp(mop->mo_name, "nodevices") == 0) + have_nodev = B_TRUE; + + if (!lxpr_skip_mntopt(mop->mo_name)) { + lxpr_append_mntopt(lme, ","); + lxpr_append_mntopt(lme, mop->mo_name); + if (mop->mo_arg != NULL) { + lxpr_append_mntopt(lme, "="); + lxpr_append_mntopt(lme, mop->mo_arg); + } + } + } + + /* + * Sometimes nosuid is an explicit string, other times it's a flag. + * The same is true for nodevices. + */ + if (!have_nosuid && (lme->lme_flag & VFS_NOSETUID)) { + lxpr_append_mntopt(lme, ",nosuid"); + } + if (!have_nodev && (lme->lme_flag & VFS_NODEVICES)) { + lxpr_append_mntopt(lme, ",nodevices"); + } +} + +static list_t * +lxpr_enumerate_mounts(zone_t *zone) +{ + vfs_t *vfsp, *rvfsp, *vfslist; + lx_zone_data_t *lxzd = ztolxzd(zone); + list_t *result; + lxpr_mount_entry_t *lme; + lx_virt_disk_t *vd; + uint_t root_id, mount_id; + char tmppath[MAXPATHLEN]; + + result = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(result, sizeof (lxpr_mount_entry_t), + offsetof(lxpr_mount_entry_t, lme_link)); + /* use an arbitrary start value for the root mount_id */ + root_id = 15; + mount_id = root_id + 1; + + ASSERT(zone != global_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + vfs_list_read_lock(); + vfsp = vfslist = zone->zone_vfslist; + + /* + * If the zone has a root entry, it will be the first in the list. + * Conjure one up if needed. 
+ */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + rvfsp = zone->zone_rootvp->v_vfsp; + } else { + rvfsp = vfslist; + vfsp = vfslist->vfs_zone_next; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = root_id; + lme->lme_parent_id = 0; + lme->lme_mntpt = refstr_alloc(zone->zone_rootpath); + lme->lme_flag = rvfsp->vfs_flag; + lme->lme_fstype = rvfsp->vfs_fstype; + lme->lme_force = B_TRUE; + lxpr_get_mntopts(rvfsp, lme); + + lme->lme_resource = NULL; + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == rvfsp->vfs_dev) { + (void) snprintf(tmppath, sizeof (tmppath), + "%sdev/%s", zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(tmppath); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if (lme->lme_resource == NULL) { + lme->lme_resource = refstr_alloc(zone->zone_rootpath); + lme->lme_dev = rvfsp->vfs_dev; + } + list_insert_head(result, lme); + + do { + if (vfsp == NULL) { + break; + } + /* Skip mounts we shouldn't show */ + if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_mntpt); + lme->lme_flag = vfsp->vfs_flag; + lme->lme_fstype = vfsp->vfs_fstype; + lme->lme_force = B_FALSE; + lxpr_get_mntopts(vfsp, lme); + + lme->lme_resource = NULL; + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == vfsp->vfs_dev) { + char vdev[MAXPATHLEN]; + + (void) snprintf(vdev, sizeof (vdev), + "%sdev/%s", + zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(vdev); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if (lme->lme_resource == NULL) { + lme->lme_resource = vfsp->vfs_resource; + refstr_hold(vfsp->vfs_resource); + lme->lme_dev = vfsp->vfs_dev; + } + list_insert_tail(result, lme); + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* Add a single dummy entry for /native/usr */ + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_flag = VFS_RDONLY; + lme->lme_dev = makedevice(0, 1); + (void) snprintf(tmppath, sizeof (tmppath), + "%snative/usr", zone->zone_rootpath); + lme->lme_mntpt = refstr_alloc(tmppath); + lme->lme_resource = lme->lme_mntpt; + lme->lme_mntopts_len = 3; + lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP); + (void) strlcpy(lme->lme_mntopts, "ro", lme->lme_mntopts_len); + refstr_hold(lme->lme_mntpt); + if (lxpr_zfs_fstype == -1) { + vfssw_t *zfssw = vfs_getvfssw("zfs"); + VERIFY(zfssw != NULL); + lxpr_zfs_fstype = ((uintptr_t)zfssw - (uintptr_t)vfssw) / + sizeof (vfssw[0]); + VERIFY(&vfssw[lxpr_zfs_fstype] == zfssw); + } + lme->lme_fstype = lxpr_zfs_fstype; + lme->lme_force = B_TRUE; + list_insert_tail(result, lme); + + return (result); +} + +/* + * lxpr_read_pid_mountinfo(): information about process mount points. 
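+ *
+ * Each entry is emitted in (a simplified form of) the Linux mountinfo
+ * format:
+ *   <id> <parent id> <maj>:<min> / <mount point> <rw|ro> - <fstype>
+ *   <resource> <mount options>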
+ */ +static void +lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + list_t *mounts; + lxpr_mount_entry_t *lme; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO || + lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO); + + mounts = lxpr_enumerate_mounts(zone); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + while (lme != NULL) { + char *resource, *mntpt, *fstype, *rwflag; + vnode_t *vp; + int error; + + mntpt = (char *)refstr_value(lme->lme_mntpt); + resource = (char *)refstr_value(lme->lme_resource); + + if (mntpt == NULL || mntpt[0] == '\0') { + goto nextp; + } + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + goto nextp; + } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* Make things look more like Linux. */ + fstype = vfssw[lme->lme_fstype].vsw_name; + if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 && + !lme->lme_force) { + goto nextp; + } + rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro"; + + /* + * XXX parent ID is not tracked correctly here. Currently we + * always assume the parent ID is the root ID. + */ + lxpr_uiobuf_printf(uiobuf, + "%d %d %d:%d / %s %s - %s %s %s\n", + lme->lme_id, lme->lme_parent_id, + getmajor(lme->lme_dev), getminor(lme->lme_dev), + mntpt, rwflag, fstype, resource, lme->lme_mntopts); + +nextp: + refstr_rele(lme->lme_mntpt); + refstr_rele(lme->lme_resource); + kmem_free(lme->lme_mntopts, lme->lme_mntopts_len); + kmem_free(lme, sizeof (lxpr_mount_entry_t)); + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + } + + list_destroy(mounts); + kmem_free(mounts, sizeof (list_t)); +} + +/* + * lxpr_read_pid_oom_scr_adj(): read oom_score_adj for process + */ +static void +lxpr_read_pid_oom_scr_adj(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_OOM_SCR_ADJ || + lxpnp->lxpr_type == LXPR_PID_TID_OOM_SCR_ADJ); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + lxpr_unlock(p); + + /* always 0 */ + lxpr_uiobuf_printf(uiobuf, "0\n"); +} + +/* + * lxpr_read_pid_personality(): read personality for process + */ +static void +lxpr_read_pid_personality(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + unsigned int personality; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_PERSONALITY); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + if ((lxpd = ptolxproc(p)) != NULL) { + personality = lxpd->l_personality; + } else { + /* Report native processes as having the SunOS personality */ + personality = LX_PER_SUNOS; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%08x\n", personality); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize, rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM || + lxpnp->lxpr_type == LXPR_PID_TID_STATM); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + 
return; + } + + as = p->p_as; + mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * Determine number of LWPs visible in the process. In particular we want to + * ignore aio in-kernel threads. + */ +static uint_t +lxpr_count_tasks(proc_t *p) +{ + uint_t cnt = 0; + kthread_t *t; + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + return (0); + } + + if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) { + cnt = p->p_lwpcnt; + } else { + do { + lx_lwp_data_t *lwpd = ttolxlwp(t); + /* Don't count aio kernel worker threads */ + if ((t->t_proc_flag & TP_KTHREAD) != 0 && + lwpd != NULL && + (lwpd->br_lwp_flags & BR_AIO_LWP) == 0) { + cnt++; + } + + t = t->t_forw; + } while (t != p->p_tlist); + } + + return (cnt); +} + +/* + * pid/tid common code to read status file + */ +static void +lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + struct as *as; + char *status; + pid_t pid, ppid; + pid_t tid = (lookup_id == 0) ? lxpnp->lxpr_pid : lookup_id; + k_sigset_t current, ignore, handle; + int i, lx_sig, lwpcnt, ngroups; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t fdlim; + size_t vsize = 0, nlocked = 0, rss = 0, stksize = 0; + boolean_t printsz = B_FALSE; + + + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + /* Translate the pid (e.g. initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, &ppid); + + if (t != NULL) { + thread_lock(t); + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + (void) strlcpy(buf_comm, up->u_comm, sizeof (buf_comm)); + fdlim = p->p_fno_ctl; + lwpcnt = lxpr_count_tasks(p); + + /* + * Gather memory information + */ + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + nlocked = p->p_locked_mem; + stksize = p->p_stksize; + printsz = B_TRUE; + } + + /* + * Gather signal information + */ + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + for (i = 1; i < NSIG; i++) { + lx_sig = stol_signo[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + buf_comm, + status, + pid, /* thread group id - same as pid */ + (lookup_id == 0) ? pid : lxpnp->lxpr_desc, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + fdlim); + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + if (printsz) { + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + btok(nlocked), + ptok(rss), + 0l, + btok(stksize), + ptok(rss), + 0l); + } + lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u\n", lwpcnt); + lxpr_uiobuf_printf(uiobuf, + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0]); + /* Report only the full bounding set for now */ + lxpr_uiobuf_printf(uiobuf, + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n" + "CapBnd:\t%016llx\n", + 0, 0, 0, 0x1fffffffffLL); +} + +/* + * lxpr_read_pid_tid_status(): status file + */ +static void +lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS || + lxpnp->lxpr_type == LXPR_PID_TID_STATUS); + + lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc); +} + +/* + * Same logic as the lx devfs lxd_pts_devt_translator. 
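+ *
+ * For illustration (values hypothetical): a native pts minor of
+ * (2 * LX_MAXMIN + 5) maps to Linux major LX_PTS_MAJOR_MIN + 2, minor 5.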
+ */ +static dev_t +lxpr_xlate_pts_dev(dev_t dev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + + return (LX_MAKEDEVICE(lx_maj, lx_min)); +} + +/* + * pid/tid common code to read stat file + */ +static void +lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + zone_t *zone; + char stat; + pid_t pid, ppid, pgpid, spid, tid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri, lwpcnt; + caddr_t wchan, stackbase; + processorid_t cpu; + clock_t utime, stime, cutime, cstime, ticks, boottime; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t vmem_ctl; + int exit_signal = -1; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT || + lxpnp->lxpr_type == LXPR_PID_TID_STAT); + + zone = LXPTOZ(lxpnp); + tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + /* Set Linux defaults if we're the zone's init process */ + pid = p->p_pid; + lxpr_fixpid(zone, p, &pid, &ppid); + if (pid == 1) { + /* init process */ + pgpid = 0; + psgid = (gid_t)-1; + spid = 0; + psdev = 0; + } else { + pgpid = p->p_pgrp; + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = lxpr_xlate_pts_dev(p->p_sessp->s_dev); + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + stackbase = 0; + } else { + /* from prgetstackbase() */ + stackbase = p->p_usrstack - p->p_stksize; + } + + utime = stime = 0; + if (t != NULL) { + klwp_t *lwp = ttolwp(t); + hrtime_t utm = 0, stm = 0; + + /* + * For field 38 (the exit signal), some apps explicitly use + * this field in a check to distinguish processes from threads, + * and assume only processes have a valid signal in this field! 
+ */ + if (t->t_tid == 1) { + lx_proc_data_t *lxpd = ptolxproc(p); + + if (lxpd != NULL) { + exit_signal = lxpd->l_signal; + } else { + exit_signal = SIGCHLD; + } + } + + thread_lock(t); + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; + break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; + break; + case TS_ZOMB: + stat = 'Z'; + break; + case TS_STOPPED: + stat = 'T'; + break; + default: + stat = '!'; + break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + + if (lwp != NULL) { + struct mstate *ms = &lwp->lwp_mstate; + + utm = ms->ms_acct[LMS_USER]; + stm = ms->ms_acct[LMS_SYSTEM]; + + /* convert unscaled high-res time to nanoseconds */ + scalehrtime(&utm); + scalehrtime(&stm); + } + + thread_unlock(t); + + /* Linux /proc expects these values in ticks */ + utime = (clock_t)NSEC_TO_TICK(utm); + stime = (clock_t)NSEC_TO_TICK(stm); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + + if (tid == p->p_pid) { + /* process */ + utime = p->p_utime; + stime = p->p_stime; + } else { + /* tid: utime & stime for the thread set in block above */ + /* EMPTY */ + } + cutime = p->p_cutime; + cstime = p->p_cstime; + lwpcnt = lxpr_count_tasks(p); + vmem_ctl = p->p_vmem_ctl; + (void) strlcpy(buf_comm, p->p_user.u_comm, sizeof (buf_comm)); + ticks = p->p_user.u_ticks; /* lbolt at process start */ + /* adjust ticks to account for zone boot time */ + boottime = zone->zone_zsched->p_user.u_ticks; + ticks -= boottime; + lxpr_unlock(p); + + /* Adjust hz for relevant fields */ + utime = HZ_TO_LX_USERHZ(utime); + stime = HZ_TO_LX_USERHZ(stime); + cutime = HZ_TO_LX_USERHZ(cutime); + cstime = HZ_TO_LX_USERHZ(cstime); + ticks = HZ_TO_LX_USERHZ(ticks); + + lxpr_uiobuf_printf(uiobuf, + "%d " /* 1 */ + "(%s) %c %d %d %d %d %d " /* 2-8 */ + "%lu %lu %lu %lu %lu " /* 9-13 */ + "%lu %lu %ld %ld " /* 14-17 */ + "%d %d %d " /* 18-20 */ + "%lu " /* 21 */ + "%lu " /* 22 */ + "%lu %ld %llu " /* 23-25 */ + "%lu %lu %llu " /* 26-28 */ + "%lu %lu " /* 29-30 */ + "%lu %lu %lu %lu " /* 31-34 */ + "%lu " /* 35 */ + "%lu %lu " /* 36-37 */ + "%d " /* 38 */ + "%d" /* 39 */ + "\n", + tid, /* 1 */ + buf_comm, stat, ppid, pgpid, spid, psdev, psgid, /* 2-8 */ + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + utime, stime, cutime, cstime, /* 14-17 */ + pri, nice, lwpcnt, /* 18-20 */ + 0l, /* itrealvalue (time before next SIGALRM) 21 */ + ticks, /* 22 */ + vsize, rss, vmem_ctl, /* 23-25 */ + 0l, 0l, stackbase, /* startcode, endcode, startstack 26-28 */ + 0l, 0l, /* kstkesp, kstkeip 29-30 */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch 31-34 */ + wchan, /* 35 */ + 0l, 0l, /* nswap,cnswap 36-37 */ + exit_signal, /* exit_signal 38 */ + cpu /* 39 */); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +struct lxpr_ifstat { + uint64_t rx_bytes; + uint64_t rx_packets; + uint64_t rx_errors; + uint64_t rx_drop; + uint64_t tx_bytes; + uint64_t tx_packets; + uint64_t tx_errors; + uint64_t tx_drop; + uint64_t collisions; + uint64_t rx_multicast; +}; + +static void * +lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num, + zoneid_t zoneid) +{ + kstat_t *kp; + int i, nrec = 0; + 
size_t bufsize; + void *buf = NULL; + + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, zoneid); + } else { + kp = kstat_hold_bykid(kn->ks_kid, zoneid); + } + if (kp == NULL) { + return (NULL); + } + if (kp->ks_flags & KSTAT_FLAG_INVALID) { + kstat_rele(kp); + return (NULL); + } + + bufsize = kp->ks_data_size + 1; + kstat_rele(kp); + + /* + * The kstat in question is released so that kmem_alloc(KM_SLEEP) is + * performed without it held. After the alloc, the kstat is reacquired + * and its size is checked again. If the buffer is no longer large + * enough, the alloc and check are repeated up to three times. + */ + for (i = 0; i < 2; i++) { + buf = kmem_alloc(bufsize, KM_SLEEP); + + /* Check if bufsize still appropriate */ + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, zoneid); + } else { + kp = kstat_hold_bykid(kn->ks_kid, zoneid); + } + if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) { + if (kp != NULL) { + kstat_rele(kp); + } + kmem_free(buf, bufsize); + return (NULL); + } + KSTAT_ENTER(kp); + (void) KSTAT_UPDATE(kp, KSTAT_READ); + if (bufsize < kp->ks_data_size) { + kmem_free(buf, bufsize); + buf = NULL; + bufsize = kp->ks_data_size + 1; + KSTAT_EXIT(kp); + kstat_rele(kp); + continue; + } else { + if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) { + kmem_free(buf, bufsize); + buf = NULL; + } + nrec = kp->ks_ndata; + KSTAT_EXIT(kp); + kstat_rele(kp); + break; + } + } + + if (buf != NULL) { + *size = bufsize; + *num = nrec; + } + return (buf); +} + +static int +lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs, zoneid_t zoneid) +{ + kstat_named_t *kp; + int i, num; + size_t size; + + /* + * Search by name instead of by kid since there's a small window to + * race against kstats being added/removed. 
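+ * (A ks_kid captured in an earlier snapshot may refer to a kstat that
+ * has since been deleted and re-created, while the module/instance/name
+ * tuple stays stable for a given interface.)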
+ */ + bzero(ifs, sizeof (*ifs)); + kp = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num, zoneid); + if (kp == NULL) + return (-1); + for (i = 0; i < num; i++) { + if (strncmp(kp[i].name, "rbytes64", KSTAT_STRLEN) == 0) + ifs->rx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ipackets64", KSTAT_STRLEN) == 0) + ifs->rx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ierrors", KSTAT_STRLEN) == 0) + ifs->rx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "norcvbuf", KSTAT_STRLEN) == 0) + ifs->rx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "multircv", KSTAT_STRLEN) == 0) + ifs->rx_multicast = kp[i].value.ui32; + else if (strncmp(kp[i].name, "obytes64", KSTAT_STRLEN) == 0) + ifs->tx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "opackets64", KSTAT_STRLEN) == 0) + ifs->tx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "oerrors", KSTAT_STRLEN) == 0) + ifs->tx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "noxmtbuf", KSTAT_STRLEN) == 0) + ifs->tx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "collisions", KSTAT_STRLEN) == 0) + ifs->collisions = kp[i].value.ui32; + } + kmem_free(kp, size); + return (0); +} + +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + int i, nidx; + size_t sidx; + struct lxpr_ifstat ifs; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid); + if (ksr == NULL) + return; + + for (i = 1; i < nidx; i++) { + if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 || + strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) { + if (lxpr_kstat_ifstat(&ksr[i], &ifs, zoneid) != 0) + continue; + + /* Overwriting the name is ok in the local snapshot */ + lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE); + lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu " + "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u " + "%5lu %7u %10u\n", + ksr[i].ks_name, + ifs.rx_bytes, ifs.rx_packets, + ifs.rx_errors, ifs.rx_drop, + 0, 0, 0, ifs.rx_multicast, + ifs.tx_bytes, ifs.tx_packets, + ifs.tx_errors, ifs.tx_drop, + 0, ifs.collisions, 0, 0); + } + } + + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_inet6_out(const in6_addr_t *addr, char buf[33]) +{ + const uint8_t *ip = addr->s6_addr; + char digits[] = "0123456789abcdef"; + int i; + for (i = 0; i < 16; i++) { + buf[2 * i] = digits[ip[i] >> 4]; + buf[2 * i + 1] = digits[ip[i] & 0xf]; + } + buf[32] = '\0'; +} + +/* ARGSUSED */ +static void +lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + ipif_t *ipif; + ill_walk_context_t ctx; + char ifname[LIFNAMSIZ], ip6out[33]; + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V6(&ctx, ipst); + + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + uint_t index = ill->ill_phyint->phyint_ifindex; + int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); + unsigned int scope = lx_ipv6_scope_convert( + 
&ipif->ipif_v6lcl_addr); + /* Always report PERMANENT flag */ + int flag = 0x80; + + (void) snprintf(ifname, LIFNAMSIZ, "%s", ill->ill_name); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out); + + lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x" + " %8s\n", ip6out, index, plen, scope, flag, ifname); + } + } + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + char ipv6addr[33]; + + lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr, + ip_mask_to_plen_v6(&ire->ire_mask_v6)); + + /* punt on this for now */ + lxpr_uiobuf_printf(uiobuf, "%s %02x ", + "00000000000000000000000000000000", 0); + + lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr); + + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + /* Linux's RTF_LOCAL equivalent */ + if (ire->ire_metrics.iulp_local) + flags |= 0x80000000; + + if (ire->ire_ill != NULL) { + ill_get_name(ire->ire_ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n", + 0, /* metric */ + ire->ire_refcnt, + 0, + flags, + name); +} + +/* ARGSUSED */ +static void +lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +#define LXPR_SKIP_ROUTE(type) \ + (((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \ + IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & type) != 0) + +static void +lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + ill_t *ill; + ire_t *nire; + ipif_t *ipif; + ipaddr_t gateway; + + if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0) + return; + + /* These route flags have direct Linux equivalents */ + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + + /* + * Search for a suitable IRE for naming purposes. + * On Linux, the default route is typically associated with the + * interface used to access gateway. The default IRE on Illumos + * typically lacks an ill reference but its parent might have one. 
+ */ + nire = ire; + do { + ill = nire->ire_ill; + nire = nire->ire_dep_parent; + } while (ill == NULL && nire != NULL); + if (ill != NULL) { + ill_get_name(ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '*'; + name[1] = '\0'; + } + + /* + * Linux suppresses the gateway address for directly connected + * interface networks. To emulate this behavior, we walk all addresses + * of a given route interface. If one matches the gateway, it is + * displayed as NULL. + */ + gateway = ire->ire_gateway_addr; + if ((ill = ire->ire_ill) != NULL) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (ipif->ipif_lcl_addr == gateway) { + gateway = 0; + break; + } + } + } + + lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u\n", + name, + ire->ire_addr, + gateway, + flags, 0, 0, + 0, /* priority */ + ire->ire_mask, + 0, 0, /* mss, window */ + ire->ire_metrics.iulp_rtt); +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t" + "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n"); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +typedef struct lxpr_snmp_table { + const char *lst_proto; + const char **lst_fields; +} lxpr_snmp_table_t; + +static const char *lxpr_snmp_ip_fields[] = { + "forwarding", "defaultTTL", "inReceives", "inHdrErrors", + "inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards", + "inDelivers", "outRequests", "outDiscards", "outNoRoutes", + "reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs", + "fragFails", "fragCreates", + NULL +}; + +static const char *lxpr_snmp_icmp_fields[] = { + "inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds", + "inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", "inEchoReps", + "inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps", + "outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds", + "outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos", + "outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks", + "outAddrMaskReps", + NULL +}; + +static const char *lxpr_snmp_tcp_fields[] = { + "rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens", + "passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs", + "outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors", + NULL +}; + +static const char *lxpr_snmp_udp_fields[] = { + "inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors", + "sndbufErrors", "inCsumErrors", + NULL +}; + +static lxpr_snmp_table_t lxpr_snmp_ip = { "ip", lxpr_snmp_ip_fields }; +static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp", lxpr_snmp_icmp_fields }; +static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp", lxpr_snmp_tcp_fields }; +static lxpr_snmp_table_t lxpr_snmp_udp = { "udp", lxpr_snmp_udp_fields }; + +static lxpr_snmp_table_t 
*lxpr_net_snmptab[] = { + &lxpr_snmp_ip, + &lxpr_snmp_icmp, + &lxpr_snmp_tcp, + &lxpr_snmp_udp, + NULL +}; + +static void +lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table, + kstat_t *kn, zoneid_t zoneid) +{ + kstat_named_t *klist; + char upname[KSTAT_STRLEN], upfield[KSTAT_STRLEN]; + int i, j, num; + size_t size; + + klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num, + zoneid); + if (klist == NULL) + return; + + /* Print the header line, fields capitalized */ + (void) strncpy(upname, table->lst_proto, KSTAT_STRLEN); + upname[0] = toupper(upname[0]); + lxpr_uiobuf_printf(uiobuf, "%s:", upname); + for (i = 0; table->lst_fields[i] != NULL; i++) { + (void) strncpy(upfield, table->lst_fields[i], KSTAT_STRLEN); + upfield[0] = toupper(upfield[0]); + lxpr_uiobuf_printf(uiobuf, " %s", upfield); + } + lxpr_uiobuf_printf(uiobuf, "\n%s:", upname); + + /* Then loop back through to print the value line. */ + for (i = 0; table->lst_fields[i] != NULL; i++) { + kstat_named_t *kpoint = NULL; + for (j = 0; j < num; j++) { + if (strncmp(klist[j].name, table->lst_fields[i], + KSTAT_STRLEN) == 0) { + kpoint = &klist[j]; + break; + } + } + if (kpoint == NULL) { + /* Output 0 for unknown fields */ + lxpr_uiobuf_printf(uiobuf, " 0"); + } else { + switch (kpoint->data_type) { + case KSTAT_DATA_INT32: + lxpr_uiobuf_printf(uiobuf, " %d", + kpoint->value.i32); + break; + case KSTAT_DATA_UINT32: + lxpr_uiobuf_printf(uiobuf, " %u", + kpoint->value.ui32); + break; + case KSTAT_DATA_INT64: + lxpr_uiobuf_printf(uiobuf, " %ld", + kpoint->value.l); + break; + case KSTAT_DATA_UINT64: + lxpr_uiobuf_printf(uiobuf, " %lu", + kpoint->value.ul); + break; + } + } + } + lxpr_uiobuf_printf(uiobuf, "\n"); + kmem_free(klist, size); +} + +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + lxpr_snmp_table_t **table = lxpr_net_snmptab; + int i, t, nidx; + size_t sidx; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid); + if (ksr == NULL) + return; + + for (t = 0; table[t] != NULL; t++) { + for (i = 0; i < nidx; i++) { + if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0) + continue; + if (strncmp(ksr[i].ks_name, table[t]->lst_proto, + KSTAT_STRLEN) == 0) { + lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i], + zoneid); + break; + } + } + } + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static int +lxpr_convert_tcp_state(int st) +{ + /* + * Derived from the enum located in the Linux kernel sources: + * include/net/tcp_states.h + */ + switch (st) { + case TCPS_ESTABLISHED: + return (1); + case TCPS_SYN_SENT: + return (2); + case TCPS_SYN_RCVD: + return (3); + case TCPS_FIN_WAIT_1: + return (4); + case TCPS_FIN_WAIT_2: + return (5); + case TCPS_TIME_WAIT: + return (6); + case TCPS_CLOSED: + return (7); + case TCPS_CLOSE_WAIT: + return (8); + case TCPS_LAST_ACK: + return (9); + case TCPS_LISTEN: + return (10); + case TCPS_CLOSING: + return (11); + default: + /* No translation for TCPS_IDLE, TCPS_BOUND or anything else */ + return (0); + } +} + +static void +lxpr_format_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + int sonode_shift; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address 
rem_address " + "st tx_queue rx_queue tr tm->when retrnsmt uid timeout " + "inode\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode\n"); + } + /* + * Due to differences between the Linux and illumos TCP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st + * - tx_queue + * - rx_queue + * - uid + * - inode + * + * Omitted/invalid fields + * - tr + * - tm->when + * - retrnsmt + * - timeout + */ + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + sonode_shift = highbit(sizeof (sonode_t)); + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp_t *tcp; + ino_t inode; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + if (connp->conn_ipversion != ipver) + continue; + tcp = connp->conn_tcp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + /* + * We cannot use VOP_GETATTR here to fetch the + * simulated inode for the socket via the + * so->so_vnode. This is because there is a (very + * tight) race for when the v_vfsp is set on the + * sonode's vnode. However, all we really want here is + * the inode number, which we can compute using the + * same algorithm as socket_vop_getattr. 
+ */ + inode = ((ino_t)so >> sonode_shift) & 0xFFFF; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %u %u %u %u %d\n", + lxpr_convert_tcp_state(tcp->tcp_state), + tcp->tcp_rcv_cnt, tcp->tcp_unsent, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* per-connection rexmits aren't tracked today */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode + more */ + inode, 0, NULL, 0, 0, 0, 0, 0); + } + } + netstack_rele(ns); +} + +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(lxpnp, uiobuf, IPV4_VERSION); +} + +static void +lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(lxpnp, uiobuf, IPV6_VERSION); +} + +static void +lxpr_format_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + int sonode_shift; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address" + " st tx_queue rx_queue tr tm->when retrnsmt uid" + " timeout inode ref pointer drops\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode ref pointer drops\n"); + } + /* + * Due to differences between the Linux and illumos UDP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st: limited + * - uid + * + * Omitted/invalid fields + * - tx_queue + * - rx_queue + * - tr + * - tm->when + * - retrnsmt + * - timeout + * - inode + */ + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + sonode_shift = highbit(sizeof (sonode_t)); + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) { + udp_t *udp; + ino_t inode; + int state = 0; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + if (connp->conn_ipversion != ipver) + continue; + udp = connp->conn_udp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + switch (udp->udp_state) { + case TS_UNBND: + case TS_IDLE: + state = 7; + break; + case TS_DATA_XFER: + state = 1; + break; + } + + /* + * We cannot use VOP_GETATTR here to fetch the + * simulated inode for the socket via the + * so->so_vnode. This is because there is a (very + * tight) race for when the v_vfsp is set on the + * sonode's vnode. However, all we really want here is + * the inode number, which we can compute using the + * same algorithm as socket_vop_getattr. 
+ */ + inode = ((ino_t)so >> sonode_shift) & 0xFFFF; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %d\n", + state, + 0, 0, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* retrans */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode, ref, pointer, drops */ + inode, 0, NULL, 0); + } + } + netstack_rele(ns); +} + +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(lxpnp, uiobuf, IPV4_VERSION); +} + +static void +lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(lxpnp, uiobuf, IPV6_VERSION); +} + +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + sonode_t *so; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + lxpr_uiobuf_printf(uiobuf, "Num RefCount Protocol Flags Type " + "St Inode Path\n"); + + mutex_enter(&socklist.sl_lock); + for (so = socklist.sl_list; so != NULL; + so = _SOTOTPI(so)->sti_next_so) { + vnode_t *vp = so->so_vnode; + vattr_t attr; + sotpi_info_t *sti; + const char *name = NULL; + int status = 0; + int type = 0; + int flags = 0; + + /* Only process active sonodes in this zone */ + if (so->so_count == 0 || so->so_zoneid != zoneid) + continue; + + /* + * Grab the inode, if possible. + * This must be done before entering so_lock. + */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + mutex_enter(&so->so_lock); + sti = _SOTOTPI(so); + + if (sti->sti_laddr_sa != NULL && + sti->sti_laddr_len > 0) { + name = sti->sti_laddr_sa->sa_data; + } else if (sti->sti_faddr_sa != NULL && + sti->sti_faddr_len > 0) { + name = sti->sti_faddr_sa->sa_data; + } + + /* + * Derived from enum values in Linux kernel source: + * include/uapi/linux/net.h + */ + if ((so->so_state & SS_ISDISCONNECTING) != 0) { + status = 4; + } else if ((so->so_state & SS_ISCONNECTING) != 0) { + status = 2; + } else if ((so->so_state & SS_ISCONNECTED) != 0) { + status = 3; + } else { + status = 1; + /* Add ACC flag for stream-type server sockets */ + if (so->so_type != SOCK_DGRAM && + sti->sti_laddr_sa != NULL) + flags |= 0x10000; + } + + /* Convert to Linux type */ + switch (so->so_type) { + case SOCK_DGRAM: + type = 2; + break; + case SOCK_SEQPACKET: + type = 5; + break; + default: + type = 1; + } + + lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu", + so, + so->so_count, + 0, /* proto, always 0 */ + flags, + type, + status, + (ino_t)attr.va_nodeid); + + /* + * Due to shortcomings in the abstract socket emulation, they + * cannot be properly represented here (as @<path>). + * + * This will be the case until they are better implemented. + */ + if (name != NULL) + lxpr_uiobuf_printf(uiobuf, " %s\n", name); + else + lxpr_uiobuf_printf(uiobuf, "\n"); + mutex_exit(&so->so_lock); + } + mutex_exit(&socklist.sl_lock); +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced, unless we're open non-blocking, in which case we return after + * 1ms. 
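+ *
+ * A rough, purely illustrative sketch of how a Linux-side consumer
+ * (e.g. dmesg or a logger) might poll the zone console through this
+ * file:
+ *
+ *	fd = open("/proc/kmsg", O_RDONLY | O_NONBLOCK);
+ *	n = read(fd, buf, sizeof (buf));
+ *
+ * A successful read yields one "<0>"-prefixed console message; a
+ * non-blocking read with nothing pending returns without data after
+ * roughly 1ms.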
+ */ + +#define LX_KMSG_PRI "<0>" + +/* ARGSUSED */ +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + timestruc_t to; + timestruc_t *tp = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (lxpr_uiobuf_nonblock(uiobuf)) { + to.tv_sec = 0; + to.tv_nsec = 1000000; /* 1msec */ + tp = &to; + } + + if (ldi_getmsg(lh, &mp, tp) == 0) { + /* + * lx procfs doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. + */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(LXPTOZ(lxpnp)); + + ASSERT(LXPTOZ(lxpnp) != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. 
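+ *
+ * For a zone capped at 1 GiB of memory and swap, the output looks
+ * roughly like the following (values are purely illustrative):
+ *
+ *	MemTotal:     1048576 kB
+ *	MemFree:       524288 kB
+ *	...
+ *	SwapTotal:    1048576 kB
+ *	SwapFree:     1048576 kB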
+ */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd = ztolxzd(zone); + ulong_t total_mem, free_mem, total_swap; + boolean_t swap_disabled; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + swap_disabled = lxzd->lxzd_swap_disabled; + + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); + + if (swap_disabled) { + total_swap = 0; + } else { + if (zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = ptob(k_anoninfo.ani_max); + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + mutex_exit(&zone->zone_mem_lock); + } + } + + /* + * SwapFree + * On illumos we reserve swap up front, whereas on Linux they just + * wing it and kill a random process if they run out of backing store + * for virtual memory. Our swap reservation doesn't translate to that + * model, so just inform the caller that no swap is being used. + */ + lxpr_uiobuf_printf(uiobuf, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + * + * Note: we currently also use this for /proc/{pid}/mounts since we don't + * yet support mount namespaces. + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + list_t *mounts; + lxpr_mount_entry_t *lme; + + mounts = lxpr_enumerate_mounts(zone); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + lme = list_remove_head(mounts); + while (lme != NULL) { + char *resource, *mntpt, *fstype; + vnode_t *vp; + int error; + + mntpt = (char *)refstr_value(lme->lme_mntpt); + resource = (char *)refstr_value(lme->lme_resource); + + if (mntpt == NULL || mntpt[0] == '\0') { + goto nextp; + } + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + goto nextp; + } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* Make things look more like Linux. 
*/ + fstype = vfssw[lme->lme_fstype].vsw_name; + if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 && + !lme->lme_force) { + goto nextp; + } + + lxpr_uiobuf_printf(uiobuf, "%s %s %s %s 0 0\n", + resource, mntpt, fstype, lme->lme_mntopts); + +nextp: + refstr_rele(lme->lme_mntpt); + refstr_rele(lme->lme_resource); + kmem_free(lme->lme_mntopts, lme->lme_mntopts_len); + kmem_free(lme, sizeof (lxpr_mount_entry_t)); + lme = list_remove_head(mounts); + } + + list_destroy(mounts); + kmem_free(mounts, sizeof (list_t)); +} + +/* + * lxpr_read_partitions(): + * + * Over the years, /proc/partitions has been made considerably smaller -- to + * the point that it really is only major number, minor number, number of + * blocks (which we report as 0), and partition name. + * + * We support this because some things want to see it to make sense of + * /proc/diskstats, and also because "fdisk -l" and a few other things look + * here to find all disks on the system. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS); + + lxpr_uiobuf_printf(uiobuf, "major minor #blocks name\n\n"); + + lxzd = ztolxzd(LXPTOZ(lxpnp)); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + lxpr_uiobuf_printf(uiobuf, "%4d %7d %10d %s\n", + getmajor(vd->lxvd_emul_dev), getminor(vd->lxvd_emul_dev), + 0, vd->lxvd_name); + vd = list_next(lxzd->lxzd_vdisks, vd); + } +} + +/* + * There aren't many actual devices inside a zone but we want to provide the + * major numbers for the pseudo devices that do exist, including our pts/ptm + * device, as well as the zvol virtual disk device. We simply hardcode the + * emulated major numbers that are used elsewhere in the code and that match + * the expected Linux major numbers. See lx devfs where some of the major + * numbers have no defined constants. + */ +/* ARGSUSED */ +static void +lxpr_read_devices(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_DEVICES); + + lxpr_uiobuf_printf(uiobuf, "Character devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/tty\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/console\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/ptmx\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d ptm\n", LX_PTM_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d pts\n", LX_PTS_MAJOR_MIN); + + lxpr_uiobuf_printf(uiobuf, "\nBlock devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d zvol\n", LX_MAJOR_DISK); +} + +/* + * lxpr_read_diskstats(): + * + * See the block comment above the per-device output-generating line for the + * details of the format. + */ +/* ARGSUSED */ +static void +lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd; + kstat_t kn; + int num; + zone_vfs_kstat_t *kip; + size_t size; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS); + + lxzd = ztolxzd(zone); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + /* + * Use the zone_vfs kstat, which is a superset of a kstat_io_t, since + * it tracks IO at the zone level. 
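+ *
+ * The same counters can be inspected from the global zone with
+ * kstat(1M), e.g. (illustrative):
+ *
+ *	# kstat -p zone_vfs:<zoneid>::nread
+ *
+ * where the instance is the zone ID and the kstat name is the zone
+ * name, as set up below.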
+ */ + (void) strlcpy(kn.ks_module, "zone_vfs", sizeof (kn.ks_module)); + (void) strlcpy(kn.ks_name, zone->zone_name, sizeof (kn.ks_name)); + kn.ks_instance = zone->zone_id; + + kip = (zone_vfs_kstat_t *)lxpr_kstat_read(&kn, B_TRUE, &size, &num, + zone->zone_id); + if (kip == NULL) + return; + + if (size < sizeof (kstat_io_t)) { + kmem_free(kip, size); + return; + } + + /* + * Because the zone vfs stats are tracked at the zone level we use + * the same kstat for the zone's virtual disk (the zpool) and any + * zvols that might also visible within the zone. + */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + /* + * /proc/diskstats is defined to have one line of output for + * each block device, with each line containing the following + * 14 fields: + * + * 1 - major number + * 2 - minor mumber + * 3 - device name + * 4 - reads completed successfully + * 5 - reads merged + * 6 - sectors read + * 7 - time spent reading (ms) + * 8 - writes completed + * 9 - writes merged + * 10 - sectors written + * 11 - time spent writing (ms) + * 12 - I/Os currently in progress + * 13 - time spent doing I/Os (ms) + * 14 - weighted time spent doing I/Os (ms) + * + * One small hiccup: we don't actually keep track of time + * spent reading vs. time spent writing -- we keep track of + * time waiting vs. time actually performing I/O. While we + * could divide the total time by the I/O mix (making the + * obviously wrong assumption that I/O operations all take the + * same amount of time), this has the undesirable side-effect + * of moving backwards. Instead, we report the total time + * (read + write) for all three stats (read, write, total). + * This is also a lie of sorts, but it should be more + * immediately clear to the user that reads and writes are + * each being double-counted as the other. + * + * Since certain consumers interpret the major/minor numbers to + * infer device names, some translation is required to avoid + * output which results in totally unexpected results. + */ + + lxpr_uiobuf_printf(uiobuf, "%4d %7d %s ", + getmajor(vd->lxvd_emul_dev), + getminor(vd->lxvd_emul_dev), + vd->lxvd_name); + + if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Use the zone-wide vfs stats for any zfs datasets + * represented via virtual devices. + */ +#define KV(N) kip->zv_ ## N.value.ui64 +#define NS_PER_MS (uint64_t)(NANOSEC / MILLISEC) + lxpr_uiobuf_printf(uiobuf, + "%llu %llu %llu %llu " + "%llu %llu %llu %llu " + "%llu %llu %llu\n", + (uint64_t)KV(reads), 0LL, + KV(nread) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)KV(writes), 0LL, + KV(nwritten) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)(KV(rcnt) + KV(wcnt)), + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (KV(rlentime) + KV(wlentime)) / NS_PER_MS); +#undef KV +#undef NS_PER_MS + } else { + /* + * Report nearly-zeroed statistics for other devices. + * + * Since iostat will ignore devices which report no + * succesful reads or writes, a single read of one + * sector, taking 1ms, is reported. + */ + lxpr_uiobuf_printf(uiobuf, + "1 0 1 1 0 0 0 0 0 0 0\n"); + } + + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + kmem_free(kip, size); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. 
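+ *
+ * The output is a single line of the form
+ *
+ *	<sysname> version <release> (gcc version X.Y.Z) <version>
+ *
+ * e.g. "Linux version 4.4.31 (gcc version 4.4.4) BrandZ virtual linux"
+ * (release and version strings are zone- and process-configurable; the
+ * values here are illustrative only).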
+ */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp)); + lx_proc_data_t *lxpd = ptolxproc(curproc); + char release[LX_KERN_RELEASE_MAX]; + char version[LX_KERN_VERSION_MAX]; + + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(release, lxzd->lxzd_kernel_release, sizeof (release)); + (void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version)); + mutex_exit(&lxzd->lxzd_lock); + + /* Use per-process overrides, if specified */ + if (lxpd != NULL && lxpd->l_uname_release[0] != '\0') { + (void) strlcpy(release, lxpd->l_uname_release, + sizeof (release)); + } + if (lxpd != NULL && lxpd->l_uname_version[0] != '\0') { + (void) strlcpy(version, lxpd->l_uname_version, + sizeof (version)); + } + + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) %s\n", + LX_UNAME_SYSNAME, release, +#if defined(__GNUC__) + "gcc", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, +#else + "cc", 1, 0, 0, +#endif + version); +} + +/* ARGSUSED */ +static void +lxpr_read_vmstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* Only count CPUs which are present and active. */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* + * Needless to say, the metrics presented by vmstat are very specific + * to the internals of the Linux kernel. There is little per-zone + * information which can be translated in a meaningful way to fit the + * expected fields. For the time being, the output is kept sparse. + */ + lxpr_uiobuf_printf(uiobuf, + "pgpgin %lu\n" + "pgpgout %lu\n" + "pswpin %lu\n" + "pswpout %lu\n", + pgpgin_cum, + pgpgout_cum, + pgswapin_cum, + pgswapout_cum); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + /* is the emulated release > 2.4 */ + boolean_t newer_than24 = lx_kern_release_cmp(LXPTOZ(lxpnp), "2.4") > 0; + zone_t *zone = LXPTOZ(lxpnp); + const char *fmtstr0, *fmtstr1; + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + + if (newer_than24) { + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + if (lx_kern_release_cmp(zone, "2.6.33") >= 0) { + fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0 0\n"; + fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0 0\n"; + } else if (lx_kern_release_cmp(zone, "2.6.24") >= 0) { + fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0\n"; + fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0\n"; + } else if (lx_kern_release_cmp(zone, "2.6.11") >= 0) { + fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0\n"; + fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0\n"; + } else if (lx_kern_release_cmp(zone, "2.5.41") >= 0) { + fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0\n"; + fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0\n"; + } else { + /* Note: we pass an unused param to these fmt strings */ + fmtstr0 = "cpu %lu 0 %lu %lu\n"; + fmtstr1 = "cpu%d %lu 0 %lu %lu\n"; + } + + /* Adjust hz */ + user_cum = HZ_TO_LX_USERHZ(user_cum); + sys_cum = HZ_TO_LX_USERHZ(sys_cum); + idle_cum = HZ_TO_LX_USERHZ(idle_cum); + irq_cum = HZ_TO_LX_USERHZ(irq_cum); + + lxpr_uiobuf_printf(uiobuf, fmtstr0, + user_cum, sys_cum, idle_cum, irq_cum); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_IDLE])); + sys_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_SYSTEM])); + user_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_USER])); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + irq_ticks = HZ_TO_LX_USERHZ(irq_ticks); + + lxpr_uiobuf_printf(uiobuf, fmtstr1, HZ_TO_LX_USERHZ(cp->cpu_id), + user_ticks, sys_ticks, idle_ticks, irq_ticks); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + if (newer_than24) { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + zone->zone_boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); + } else { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + zone->zone_boot_time, + forks_cum); + } +} + +/* + * lxpr_read_swaps(): + * + * We don't support swap files or partitions, but some programs like to look + * here just to check we have some swap on the system, so we lie and show + * our entire swap cap as one swap partition. See lxpr_read_meminfo for an + * explanation on why we report 0 used swap. + * + * The zone's lxzd_swap_disabled boolean controls whether or not we pretend + * swap space is configured. + * + * It is important to use formatting identical to the Linux implementation + * so that consumers do not break. See swap_show() in mm/swapfile.c. + */ +/* ARGSUSED */ +static void +lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + boolean_t swap_enabled; + lx_zone_data_t *lxzd = ztolxzd(zone); + + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + swap_enabled = !lxzd->lxzd_swap_disabled; + + lxpr_uiobuf_printf(uiobuf, + "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + + if (swap_enabled) { + uint64_t totswap, usedswap; + + if (zone->zone_max_swap_ctl == UINT64_MAX) { + totswap = (k_anoninfo.ani_max * PAGESIZE) >> 10; + } else { + mutex_enter(&zone->zone_mem_lock); + /* Uses units of 1 kb (2^10). 
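+ * e.g., a 2 GiB swap cap (2147483648 bytes) is reported as
+ * 2147483648 >> 10 == 2097152.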
*/ + totswap = zone->zone_max_swap_ctl >> 10; + mutex_exit(&zone->zone_mem_lock); + } + usedswap = 0; + + lxpr_uiobuf_printf(uiobuf, "%-40s%s\t%llu\t%llu\t%d\n", + "/dev/swap", "partition", totswap, usedswap, -1); + } +} + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_aiomax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_MAX_NR); + lxpr_uiobuf_printf(uiobuf, "%llu\n", LX_AIO_MAX_NR); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_aionr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd = ztolxzd(zone); + uint64_t curr; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_NR); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + + mutex_enter(&lxzd->lxzd_lock); + curr = (uint64_t)(lxzd->lxzd_aio_nr); + mutex_exit(&lxzd->lxzd_lock); + lxpr_uiobuf_printf(uiobuf, "%llu\n", curr); +} + +/* + * lxpr_read_sys_fs_filemax(): + * + * The zone's total number of open files is not fixed or tunable, but we can + * provide a number by taking: + * (zone's proc limit) * (process.max-file-descriptor rctl privileged limit). + * The privileged rctl limit is the same as rlim_fd_max. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_fs_filemax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + uint64_t max_fh, proc_lim; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILEMAX); + proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ? + maxpid : zone->zone_nprocs_ctl); + max_fh = proc_lim * (uint64_t)rlim_fd_max; + lxpr_uiobuf_printf(uiobuf, "%llu\n", max_fh); +} + +/* + * lxpr_read_sys_fs_filenr(): + * + * Contains 3 numbers: current number of allocated file handles (open files), + * number of free file handles, and max. number of file handles (same value as + * we use in lxpr_read_sys_fs_filemax). Note that since Linux 2.6 the "free" + * value is always 0, so we just do the same here. We don't keep track of the + * number of files in use within a zone, so we approximate that value by + * looking at the current "fi_nfiles" value for each process in the zone. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_fs_filenr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + uint64_t max_fh, proc_lim, curr_files = 0; + int i; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILENR); + proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ? + maxpid : zone->zone_nprocs_ctl); + max_fh = proc_lim * (uint64_t)rlim_fd_max; + + for (i = 1; i < v.v_proc; i++) { + uint_t nfiles; + proc_t *p; + uf_info_t *fip; + + mutex_enter(&pidlock); + + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || p->p_zone != zone || + p == zone->zone_zsched || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + continue; + } + + fip = P_FINFO(p); + mutex_enter(&fip->fi_lock); + nfiles = fip->fi_nfiles; + mutex_exit(&fip->fi_lock); + + mutex_exit(&pidlock); + + curr_files += nfiles; + } + + lxpr_uiobuf_printf(uiobuf, "%llu\t0\t%llu\n", curr_files, max_fh); +} + +/* + * inotify tunables exported via /proc. 
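+ *
+ * These back the Linux paths under /proc/sys/fs/inotify/, so inside the
+ * zone e.g. "sysctl fs.inotify.max_user_watches" reports the current
+ * value of inotify_maxwatches.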
+ */ +extern int inotify_maxevents; +extern int inotify_maxinstances; +extern int inotify_maxwatches; + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_fs_pipe_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp)); + uint_t pipe_max; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX); + ASSERT(lxzd != NULL); + + mutex_enter(&lxzd->lxzd_lock); + pipe_max = lxzd->lxzd_pipe_max_sz; + mutex_exit(&lxzd->lxzd_lock); + + lxpr_uiobuf_printf(uiobuf, "%u\n", pipe_max); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_caplcap(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_CAPLCAP); + lxpr_uiobuf_printf(uiobuf, "%d\n", LX_CAP_MAX_VALID); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_corepatt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + struct core_globals *cg; + refstr_t *rp; + corectl_path_t *ccp; + char tr[MAXPATHLEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT); + + cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + /* If core dumps are disabled, return an empty string. */ + if ((cg->core_options & CC_PROCESS_PATH) == 0) { + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + ccp = cg->core_default_path; + mutex_enter(&ccp->ccp_mtx); + if ((rp = ccp->ccp_path) != NULL) + refstr_hold(rp); + mutex_exit(&ccp->ccp_mtx); + + if (rp == NULL) { + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + bzero(tr, sizeof (tr)); + if (lxpr_core_path_s2l(refstr_value(rp), tr, sizeof (tr)) != 0) { + refstr_rele(rp); + lxpr_uiobuf_printf(uiobuf, "\n"); + return; + } + + refstr_rele(rp); + lxpr_uiobuf_printf(uiobuf, "%s\n", tr); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME); + lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename()); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_msgmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + /* + * We don't have an rctl for this. See our definition for LX_MSGMAX + * in the user-level emulation library. Once that code moves into + * the kernel, we can use a common definition. This matches the + * value on Linux. 
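+ *
+ * e.g., "sysctl kernel.msgmax" inside the zone reports 8192, matching
+ * the Linux default.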
+ */ + uint_t val = 8192; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMAX); + + lxpr_uiobuf_printf(uiobuf, "%u\n", val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_msgmnb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + proc_t *pp = curproc; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNB); + + mutex_enter(&pp->p_lock); + val = rctl_enforced_value(rc_process_msgmnb, pp->p_rctls, pp); + mutex_exit(&pp->p_lock); + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_msgmni, + LXPTOZ(lxpnp)->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_osrel(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd = ztolxzd(zone); + char version[LX_KERN_VERSION_MAX]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_OSREL); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + + mutex_enter(&lxzd->lxzd_lock); + (void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version)); + mutex_exit(&lxzd->lxzd_lock); + lxpr_uiobuf_printf(uiobuf, "%s\n", version); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + /* + * This file isn't documented on the Linux proc(5) man page but + * according to the blog of the author of systemd/journald (the + * consumer), he says: + * boot_id: A random ID that is regenerated on each boot. As such it + * can be used to identify the local machine's current boot. It's + * universally available on any recent Linux kernel. It's a good and + * safe choice if you need to identify a specific boot on a specific + * booted kernel. + * + * We'll just generate a random ID if necessary. On Linux the format + * appears to resemble a uuid but since it is not documented to be a + * uuid, we don't worry about that. 
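+ *
+ * The generated string follows the familiar 8-4-4-4-12 hex layout,
+ * e.g. "3f9a1c2e-0b7d-4e2a-9c31-5d6e7f8a9b0c" (illustrative only).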
+ */ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd = ztolxzd(zone); + char bootid[LX_BOOTID_LEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_BOOTID); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_bootid[0] == '\0') { + int i; + + for (i = 0; i < 5; i++) { + u_longlong_t n; + char s[32]; + + (void) random_get_bytes((uint8_t *)&n, sizeof (n)); + switch (i) { + case 0: (void) snprintf(s, sizeof (s), "%08llx", n); + s[8] = '\0'; + break; + case 4: (void) snprintf(s, sizeof (s), "%012llx", n); + s[12] = '\0'; + break; + default: (void) snprintf(s, sizeof (s), "%04llx", n); + s[4] = '\0'; + break; + } + if (i > 0) + (void) strlcat(lxzd->lxzd_bootid, "-", + sizeof (lxzd->lxzd_bootid)); + (void) strlcat(lxzd->lxzd_bootid, s, + sizeof (lxzd->lxzd_bootid)); + } + } + (void) strlcpy(bootid, lxzd->lxzd_bootid, sizeof (bootid)); + mutex_exit(&lxzd->lxzd_lock); + + lxpr_uiobuf_printf(uiobuf, "%s\n", bootid); +} + +/* + * The amount of entropy available (in bits). + */ +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_ENTAVL); + ASSERT(LXPTOZ(lxpnp)->zone_brand == &lx_brand); + + lxpr_uiobuf_printf(uiobuf, "%d\n", swrand_stats.ss_entEst); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_sem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *pp = curproc; + zone_t *zone = LXPTOZ(lxpnp); + rctl_qty_t vmsl, vopm, vmni, vmns; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SEM); + + mutex_enter(&pp->p_lock); + vmsl = rctl_enforced_value(rc_process_semmsl, pp->p_rctls, pp); + vopm = rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp); + vmni = rctl_enforced_value(rc_zone_semmni, zone->zone_rctls, pp); + mutex_exit(&pp->p_lock); + vmns = vmsl * vmni; + if (vmns < vmsl || vmns < vmni) { + vmns = ULLONG_MAX; + } + /* + * Format: semmsl semmns semopm semmni + * - semmsl: Limit semaphores in a sempahore set. 
+ * - semmns: Limit semaphores in all semaphore sets + * - semopm: Limit operations in a single semop call + * - semmni: Limit number of semaphore sets + */ + lxpr_uiobuf_printf(uiobuf, "%llu\t%llu\t%llu\t%llu\n", + vmsl, vmns, vopm, vmni); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_shmall(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + zone_t *zone = LXPTOZ(lxpnp); + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMALL); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + /* value is in pages */ + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)btop(val)); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + zone_t *zone = LXPTOZ(lxpnp); + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + if (val > FOURGB) + val = FOURGB; + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_shmmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + zone_t *zone = LXPTOZ(lxpnp); + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMNI); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmni, zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + if (val > FOURGB) + val = FOURGB; + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", LXPTOZ(lxpnp)->zone_nlwps_ctl); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_net_core_somaxc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_printf(uiobuf, "%d\n", SOMAXCONN); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q); + netstack_rele(ns); +} + +/* + * icmp_echo_ignore_broadcasts + * integer; 0 or 1 + * + * illumos: ndd /dev/ip ip_respond_to_echo_broadcast + * From the tunable guide: Control whether IPv4 responds to broadcast ICMPv4 + * echo request. default: 1 (enabled) + * Not in ip(7p) man page. + * + * Note that the Linux setting is the inverse of the illumos value. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + ipst = ns->netstack_ip; + lxpr_uiobuf_printf(uiobuf, "%d\n", !ipst->ips_ip_g_resp_to_echo_bcast); + netstack_rele(ns); +} + +/* + * ip_forward + * integer; default: 0 + * + * illumos: ndd /dev/ip ip_forwarding + * default: 0 (disabled) + * Forwarding is described in the ip(7p) man page. We do not support forwarding + * in lx at this time, thus we do not support Linux-ABI methods for + * enabling/disabling forwarding, and this is always 0. 
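+ *
+ * e.g., "cat /proc/sys/net/ipv4/ip_forward" (or "sysctl
+ * net.ipv4.ip_forward") inside the zone always reports 0.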
+ */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_FORWARD); + lxpr_uiobuf_printf(uiobuf, "0\n"); +} + +/* + * ip_local_port_range + * + * The low & high port number range. + * integers; default: 32768 61000 + * + * illumos: tcp_smallest_anon_port & tcp_largest_anon_port + * Not in tcp(7p) man page. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\t%d\n", + tcps->tcps_smallest_anon_port, tcps->tcps_largest_anon_port); + netstack_rele(ns); +} + +/* + * tcp_fin_timeout + * + * This specifies how many seconds to wait for a final FIN packet before the + * socket is forcibly closed. This is strictly a violation of the TCP + * specification, but required to prevent denial-of-service attacks. + * integer; default: 60; + * + * illumos: tcp_fin_wait_2_flush_interval + * Not in tcp(7p) man page but see comment in uts/common/inet/tcp/tcp_input.c + * in the tcp_input_data() function on the use of tcp_fin_wait_2_flush_interval. + * The value is in milliseconds. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + tcps->tcps_fin_wait_2_flush_interval / 1000); + netstack_rele(ns); +} + +/* + * tcp_keepalive_intvl + * + * The number of seconds between TCP keep-alive probes. default: 75 + * Linux retries tcp_keepalive_probes (9) times before timing out. + * + * illumos: + * We have tcp_ka_rinterval but there is no corresponding tcps_* tunable for + * this. The closest is tcps_keepalive_abort_interval which specifies the + * time threshold for aborting a TCP connection in milliseconds. Linux retries + * 9 times (giving a total of 11.25 minutes) so we emulate this by dividing out + * tcps_keepalive_abort_interval by 9. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_keepalive_abort_interval / 1000) / 9); + netstack_rele(ns); +} + +/* + * tcp_keepalive_time + * + * The number of seconds a connection needs to be idle before TCP begins + * sending out keep-alive probes. The default value is 7200 seconds (2 hours). + * + * illumos: tcp_keepalive_interval + * The interval for sending out the first probe in milliseconds. The default is + * two hours. 
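+ *
+ * e.g., with the usual illumos default of 7200000 ms, the value
+ * reported here is 7200000 / 1000 == 7200 seconds, matching the Linux
+ * default of two hours.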
+ */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_keepalive_interval / 1000)); + netstack_rele(ns); +} + +/* + * tcp_max_syn_backlog + * + * The number of half-open connections that can be kept by the backlog queue. + * See the Linux tcp(7) man page. + * + * illumos: tcp_conn_req_max_q0 + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q0); + netstack_rele(ns); +} + +/* + * tcp_retries2 + * + * Controls number of TCP retries for data packets. Often tuned down for HA + * configurations. RFC 1122 recommends at least 100 seconds for the timeout, + * which, for Linux, corresponds to a value of ~8. Oracle suggests a value of + * 3 for a RAC configuration, as do various HA tuning guides. + * integer; Ubuntu 16.04 default: 15 + * + * illumos: There are 4 ndd parameters that are related to this: + * tcp_rexmit_interval_initial: 1000 + * tcp_rexmit_interval_min: 400 + * tcp_rexmit_interval_max: 60000 + * tcp_rexmit_interval_extra: 0 + * Not in tcp(7p) man page. + * + * From the tunables guide: + * tcp_rexmit_interval_initial is the initial retransmission timeout (RTO) for + * a TCP connection in milliseconds (ms). + * The interval_min value is the minimum RTO in ms. + * The interval_max value is the maximum RTO in ms. + * The extra value is an extra time (in ms) to add in to the RTO. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + uint_t i, retry, rx_min, rx_max; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + rx_min = tcps->tcps_rexmit_interval_min; + rx_max = tcps->tcps_rexmit_interval_max; + netstack_rele(ns); + + for (i = rx_min, retry = 0; i < rx_max; retry++) { + i *= 2; + } + + lxpr_uiobuf_printf(uiobuf, "%u\n", retry); +} + +/* + * tcp_rmem and tcp_wmem + * + * Display the minimum, default, and maximum TCP receive/transmit window sizes, + * in bytes. See the Linux tcp(7) man page. + * + * In illumos this roughly corresponds to: tcp_recv_hiwat or tcp_xmit_hiwat, + * and tcp_max_buf. 
+ * tcp_recv_hiwat is the default TCP receive window size + * tcp_xmit_hiwat is the default TCP send window size + * tcp_max_buf is the maximum TCP send and receive buffer size + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + uint_t min; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM || + lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + + /* Linux defaults to a page */ + min = MIN((lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? + tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat), PAGESIZE); + + lxpr_uiobuf_printf(uiobuf, "%d\t%d\t%d\n", + min, + (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? + tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat), + tcps->tcps_max_buf); + netstack_rele(ns); +} + +/* + * tcp_sack + * + * Enable RFC 2018 TCP Selective Acknowledgements. Boolean, default: enabled + * + * illumos: tcp_sack_permitted + * tcp_sack_permitted 0 == disabled, 1 == no initiate but accept, + * 2 == initiate and accept. default is 2. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", + (tcps->tcps_sack_permitted == 0 ? 0 : 1)); + netstack_rele(ns); +} + +/* + * tcp_window_scaling + * + * RFC 1323 TCP window scaling. This feature allows the use of a large window + * (> 64K) on a TCP connection. Boolean; default: enabled + * + * illumos: tcp_wscale_always + * tcp_wscale_always is set to 1, the window scale option will always be + * set when connecting to a remote system. If tcp_wscale_always is 0, the + * window scale option will be set only if the user has requested a send or + * receive window larger than 64K. The default value of is 1. + */ +/* ARGSUSED */ +static void +lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + tcp_stack_t *tcps; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + tcps = ns->netstack_tcp; + lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_wscale_always); + netstack_rele(ns); +} + +/* + * The /proc/sys/vm/dirty* files are (poorly) documented in the Linux + * source file Documentation/sysctl/vm.txt. These are various VM tunables + * that we'll never support, but that a few misguided apps want to inspect and + * modify. We simply hardcode some default values and we'll lie about write + * success to these files. 
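+ *
+ * e.g., "sysctl vm.dirty_ratio" inside the zone reports the hardcoded
+ * 20 and "sysctl vm.dirty_expire_centisecs" reports 3000 (see the
+ * switch below).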
+ */ +static void +lxpr_read_sys_vm_dirty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + uint_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_BYTES || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_RATIO || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BYTES || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_EXP_CS || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_RATIO || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTYTIME_EXP_SEC || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_WB_CS); + + switch (lxpnp->lxpr_type) { + case LXPR_SYS_VM_DIRTY_BG_RATIO: + val = 10; + break; + case LXPR_SYS_VM_DIRTY_EXP_CS: + val = 3000; + break; + case LXPR_SYS_VM_DIRTY_RATIO: + val = 20; + break; + case LXPR_SYS_VM_DIRTYTIME_EXP_SEC: + val = 43200; + break; + case LXPR_SYS_VM_DIRTY_WB_CS: + val = 500; + break; + default: + val = 0; + break; + } + + lxpr_uiobuf_printf(uiobuf, "%u\n", val); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MAX_MAP_CNT); + /* We don't limit mappings, just say we have a large limit. */ + lxpr_uiobuf_printf(uiobuf, "%d\n", 16777215); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_vm_minfr_kb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MINFR_KB); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_vm_nhpages(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_NHUGEP); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_OVERCOMMIT_MEM); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* ARGSUSED */ +static void +lxpr_read_sys_vm_swappiness(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_SWAPPINESS); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +/* + * Report a list of each cgroup subsystem supported by our emulated cgroup fs. + * This needs to exist for systemd to run but for now we don't report any + * cgroup subsystems as being installed. The commented example below shows + * how to print a subsystem entry. + */ +/* ARGSUSED */ +static void +lxpr_read_cgroups(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n", + "#subsys_name", "hierarchy", "num_cgroups", "enabled"); + + /* + * lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n", + * "cpu,cpuacct", "2", "1", "1"); + */ +} + +/* + * Report the zone boot arguments. + */ +static void +lxpr_read_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lxpr_uiobuf_printf(uiobuf, "%s\n", zone->zone_bootargs); +} + + +typedef enum { + LXCS_ALWAYS = 0, + LXCS_CPUID1_ECX, + LXCS_CPUID1_EDX, + LXCS_CPUID7_EBX, + LXCS_CPUID7_ECX, + LXCS_CPUID7_EDX, + LXCS_CPUIDD1_EAX, + LXCS_CPUIDX1_ECX, + LXCS_CPUIDX1_EDX, + LXCS_REG_MAX +} lx_cpuinfo_source_t; + +typedef struct { + lx_cpuinfo_source_t lxcm_source; + uint32_t lxcm_flag; + const char *lxcm_name; +} lx_cpuinfo_mapping_t; + +/* + * This listing is derived from the X86_FEATURE flags data in the Linux kernel. + * Some entries are missing detectino routines. They remain in the list, + * although commented out, to preserve proper order should they be fixed later. 
+ */ +lx_cpuinfo_mapping_t lx_cpuinfo_mappings[] = { + /* CPUID EDX: */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FPU, "fpu" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_VME, "vme" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DE, "de" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE, "pse" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TSC, "tsc" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MSR, "msr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAE, "pae" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCE, "mce" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CX8, "cx8" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_APIC, "apic" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SEP, "sep" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MTRR, "mtrr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PGE, "pge" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCA, "mca" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CMOV, "cmov" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAT, "pat" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE36, "pse36" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSN, "pn" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CLFSH, "clflush" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DS, "dts" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_ACPI, "acpi" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MMX, "mmx" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FXSR, "fxsr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE, "sse" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE2, "sse2" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SS, "ss" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_HTT, "ht" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TM, "tm" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PBE, "pbe" }, + + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +#if defined(__amd64) + { LXCS_ALWAYS, 1, "syscall" }, +#endif + /* Present in the Linux listing but not in recent AMD docs: "mp" */ + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_NX, "nx" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_MMXamd, "mmxext" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_FFXSR, "fxsr_opt" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_1GPG, "pdpe1gb" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_TSCP, "rdtscp" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_LM, "lm" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNowx, "3dnowext" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNow, "3dnow" }, + + /* CPUID ECX: */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE3, "pni" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCLMULQDQ, "pclmulqdq" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DTES64, "dtes64" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MON, "monitor" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DSCPL, "ds_cpl" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_VMX, "vmx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SMX, "smx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_EST, "est" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TM2, "tm2" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSSE3, "ssse3" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CID, "cid" }, + { LXCS_CPUID1_ECX, 0x00000800, "sdbg" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_FMA, "fma" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CX16, "cx16" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_ETPRD, "xtpr" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PDCM, "pdcm" }, + /* reserved */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCID, "pcid" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DCA, "dca" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_1, "sse4_1" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_2, "sse4_2" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_X2APIC, "x2apic" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MOVBE, "movbe" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_POPCNT, "popcnt" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TSCDL, "tsc_deadline_timer" }, + { LXCS_CPUID1_ECX, 
CPUID_INTC_ECX_AES, "aes" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_XSAVE, "xsave" }, + /* osxsave */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_AVX, "avx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_F16C, "f16c" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_RDRAND, "rdrand" }, + /* not used */ + + /* + * Other features, Linux-defined mapping + * This range is used for feature bits which conflict or are synthesized + * Skipped: + * "recovery", + * "longrun", + * "lrti", + * "cxmmx", + * "k6_mtrr", + * "cyrix_arr", + * "centaur_mcr", + * "constant_tsc", + * "up", + * "arch_perfmon", + * "pebs", + * "bts", + * "rep_good", + * "nopl", + * "xtopology", + * "tsc_reliable", + * "nonstop_tsc", + * "extd_apicid", + * "amd_dcm", + * "aperfmperf", + * "eagerfpu", + * "nonstop_tsc_s3", + * + * "hypervisor", + * "rng", + * "rng_en", + * "ace", + * "ace_en", + * "ace2", + * "ace2_en", + * "phe", + * "phe_en", + * "pmm", + * "pmm_en", + */ + + /* + * More extended AMD flags: CPUID level 0x80000001, ecx, word 6 + */ + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_AHF64, "lahf_lm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CMP_LGCY, "cmp_legacy" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SVM, "svm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_EAS, "extapic" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CR8D, "cr8_legacy" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LZCNT, "abm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SSE4A, "sse4a" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_MAS, "misalignsse" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_3DNP, "3dnowprefetch" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_OSVW, "osvw" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_IBS, "ibs" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_XOP, "xop" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SKINIT, "skinit" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_WDT, "wdt" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LWP, "lwp" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_FMA4, "fma4" }, + { LXCS_CPUIDX1_ECX, 0x00020000, "tce" }, + + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_NIDMSR, "nodeid_msr" }, + + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TBM, "tbm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TOPOEXT, "topoext" }, + { LXCS_CPUIDX1_ECX, 0x00800000, "perfctr_core" }, + { LXCS_CPUIDX1_ECX, 0x01000000, "perfctr_nb" }, + { LXCS_CPUIDX1_ECX, 0x02000000, "bpext" }, + { LXCS_CPUIDX1_ECX, 0x04000000, "perfctr_l2" }, + { LXCS_CPUIDX1_ECX, 0x08000000, "mwaitx" }, + + /* + * Aux flags and virt bits. 
+ * Skipped: + * "cpb", + * "epb", + * "hw_pstate", + * "proc_feedback", + * "intel_pt", + * "tpr_shadow", + * "vnmi", + * "flexpriority", + * "ept", + * "vpid", + * "vmmcall", + */ + + /* + * Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 + */ + { LXCS_CPUID7_EBX, 0x00000001, "fsgsbase" }, + { LXCS_CPUID7_EBX, 0x00000002, "tsc_adjust" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI1, "bmi1" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_HLE, "hle" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX2, "avx2" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMEP, "smep" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI2, "bmi2" }, + { LXCS_CPUID7_EBX, 0x00000200, "erms" }, + { LXCS_CPUID7_EBX, 0x00000400, "invpcid" }, + { LXCS_CPUID7_EBX, 0x00000800, "rtm" }, + { LXCS_CPUID7_EBX, 0x00001000, "cqm" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_MPX, "mpx" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512F, "avx512f" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512DQ, "avx512dq" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_RDSEED, "rdseed" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_ADX, "adx" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMAP, "smap" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512IFMA, "avx512ifma" }, + + { LXCS_CPUID7_EBX, 0x00400000, "pcommit" }, + { LXCS_CPUID7_EBX, 0x00800000, "clflushopt" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_CLWB, "clwb" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512PF, "avx512pf" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512ER, "avx512er" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512CD, "avx512cd" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SHA, "sha_ni" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512BW, "avx512bw" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512VL, "avx512vl" }, + + /* + * Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) + */ + { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VBMI, "avx512vbmi" }, + { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VPOPCDQ, + "avx512_vpopcntdq" }, + + /* + * Intel-defined CPU features, CPUID level 0x00000007:0 (edx) + */ + { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124NNIW, "avx512_4nniw" }, + { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124FMAPS, "avx512_4fmaps" }, + + /* + * Extended state features, CPUID level 0x0000000d:1 (eax) + */ + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEOPT, "xsaveopt" }, + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEC, "xsavec" }, + { LXCS_CPUIDD1_EAX, 0x00000004, "xgetbv1" }, + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVES, "xsaves" }, + + /* + * Skipped: + * "cqm_llc", + * "cqm_occup_llc", + * "clzero", + */ + + /* + * Thermal and Power Management Leaf, CPUID level 0x00000006 (eax) + * Skipped: + * "dtherm", + * "ida", + * "arat", + * "pln", + * "pts", + * "hwp", + * "hwp_notify", + * "hwp_act_window", + * "hwp_epp", + * "hwp_pkg_req", + */ + + /* + * AMD SVM Feature Identification, CPUID level 0x8000000a (edx) + * Skipped: + * "npt", + * "lbrv", + * "svm_lock", + * "nrip_save", + * "tsc_scale", + * "vmcb_clean", + * "flushbyasid", + * "decodeassists", + * "pausefilter", + * "pfthreshold", + */ +}; + +#define LX_CPUINFO_MAPPING_MAX \ + (sizeof (lx_cpuinfo_mappings) / sizeof (lx_cpuinfo_mappings[0])) + +/* ARGSUSED */ +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + cpu_t *cp, *cpstart; + int pools_enabled; + char brandstr[CPU_IDSTRLEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + struct 
cpuid_regs cpr; + uint32_t maxeax, xmaxeax, cpuid_res[LXCS_REG_MAX] = { 0 }; + + cpr.cp_eax = 0; + maxeax = cpuid_insn(cp, &cpr); + cpr.cp_eax = 0x80000000; + xmaxeax = cpuid_insn(cp, &cpr); + + cpuid_res[LXCS_ALWAYS] = 1; + if (maxeax >= 1) { + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUID1_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUID1_EDX] = cpr.cp_edx; + } + if (maxeax >= 7) { + cpr.cp_eax = 7; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUID7_EBX] = cpr.cp_ebx; + cpuid_res[LXCS_CPUID7_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUID7_EDX] = cpr.cp_edx; + } + if (maxeax >= 0xd) { + cpr.cp_eax = 0xd; + cpr.cp_ecx = 1; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUIDD1_EAX] = cpr.cp_eax; + } + if (xmaxeax >= 0x80000001) { + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUIDX1_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUIDX1_EDX] = cpr.cp_edx; + } + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no", + maxeax); + + /* Print CPUID feature flags */ + for (i = 0; i < LX_CPUINFO_MAPPING_MAX; i++) { + lx_cpuinfo_mapping_t *lxm = &lx_cpuinfo_mappings[i]; + + ASSERT(lxm->lxcm_source < LXCS_REG_MAX); + if (cpuid_res[lxm->lxcm_source] & lxm->lxcm_flag) { + lxpr_uiobuf_printf(uiobuf, " %s", + lxm->lxcm_name); + } + } + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * Report a list of file systems loaded in the kernel. We only report the ones + * which we support and which may be checked by various components to see if + * they are loaded. 
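The flags line emitted by lxpr_read_cpuinfo above is driven entirely by the lx_cpuinfo_mappings table: each entry pairs a CPUID register word with a bit mask and the Linux flag name to print when that bit is set. The following is only a minimal userspace sketch of that same table-driven lookup; the EDX value and the handful of leaf-1 bits shown are made up for illustration and are not taken from the patch.

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint32_t mask;		/* bit within the hypothetical CPUID.1:EDX word */
	const char *name;	/* Linux flag name to emit */
} flag_map_t;

static const flag_map_t edx_map[] = {
	{ 0x00000001, "fpu" },
	{ 0x00000010, "tsc" },
	{ 0x00800000, "mmx" },
	{ 0x02000000, "sse" },
	{ 0x04000000, "sse2" },
};

int
main(void)
{
	uint32_t edx = 0x06800011;	/* hypothetical register value */
	size_t i;

	printf("flags\t\t:");
	for (i = 0; i < sizeof (edx_map) / sizeof (edx_map[0]); i++) {
		if (edx & edx_map[i].mask)
			printf(" %s", edx_map[i].name);
	}
	printf("\n");
	return (0);
}

Run standalone this prints "flags : fpu tsc mmx sse sse2", which is the shape of the line the kernel routine builds per CPU.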
+ */ +/* ARGSUSED */ +static void +lxpr_read_filesystems(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "autofs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "cgroup"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "nfs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "proc"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "sysfs"); + lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "tmpfs"); +} + +/* + * Calculate the number of links in the task dir. Some code (e.g. chromium) + * depends on this value being accurate. + */ +static uint_t +lxpr_count_taskdir(lxpr_node_t *lxpnp) +{ + proc_t *p; + uint_t cnt; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) + return (0); + + cnt = lxpr_count_tasks(p); + + lxpr_unlock(p); + + /* Add the fixed entries ("." & "..") */ + cnt += 2; + return (cnt); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink, fifo or socket + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = lxpnp->lxpr_realvp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_PID_TASKDIR: + vap->va_nlink = lxpr_count_taskdir(lxpnp); + vap->va_size = vap->va_nlink * LXPR_SDSIZE; + break; + case LXPR_PID_TASK_IDDIR: + vap->va_nlink = TIDDIRFILES; + vap->va_size = TIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FD_FD: + /* + * Restore VLNK type for lstat-type activity. + * See lxpr_readlink for more details. 
+ */ + if ((flags & FOLLOW) == 0) + vap->va_type = VLNK; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + return (lxpr_doaccess(VTOLXP(vp), B_FALSE, mode, flags, cr, ct)); +} + +/* + * This makes up the bulk of the logic for lxpr_access. An extra parameter + * ('shallow') is present to differentiate checks that must pass muster against + * an underlying resource (lxpr_realvp) and those that are only concerned with + * permission to the process. + */ +static int +lxpr_doaccess(lxpr_node_t *lxpnp, boolean_t shallow, int mode, int flags, + cred_t *cr, caller_context_t *ct) +{ + lxpr_nodetype_t type = lxpnp->lxpr_type; + boolean_t allow_pid_access = B_FALSE; + int shift = 0; + proc_t *tp; + + /* + * lx /proc is primarily a read only file system + * We handle LXPR_SYSDIR as a special case. At least 'systemd' expects + * access() to report /proc/sys is writable, but we can't do that in + * lxpr_is_writable since it breaks other code paths that check if they + * can write there. + */ + if ((mode & VWRITE) && !lxpr_is_writable(type)) { + if (type != LXPR_SYSDIR) + return (EROFS); + } + + if (type == LXPR_PIDDIR) { + return (0); + } + if (lxpnp->lxpr_pid != 0) { + if ((tp = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) { + return (ENOENT); + } + if (tp == curproc || secpolicy_proc_access(cr) == 0 || + priv_proc_cred_perm(cr, tp, NULL, mode) == 0) { + allow_pid_access = B_TRUE; + } + lxpr_unlock(tp); + switch (type) { + case LXPR_PID_CGROUP: + case LXPR_PID_CMDLINE: + case LXPR_PID_COMM: + case LXPR_PID_LIMITS: + case LXPR_PID_LOGINUID: + case LXPR_PID_MOUNTINFO: + case LXPR_PID_MOUNTS: + case LXPR_PID_OOM_SCR_ADJ: + case LXPR_PID_STAT: + case LXPR_PID_STATM: + case LXPR_PID_STATUS: + case LXPR_PID_TASKDIR: + case LXPR_PID_TASK_IDDIR: + case LXPR_PID_TID_CGROUP: + case LXPR_PID_TID_CMDLINE: + case LXPR_PID_TID_COMM: + case LXPR_PID_TID_LIMITS: + case LXPR_PID_TID_LOGINUID: + case LXPR_PID_TID_MOUNTINFO: + case LXPR_PID_TID_OOM_SCR_ADJ: + case LXPR_PID_TID_STAT: + case LXPR_PID_TID_STATM: + case LXPR_PID_TID_STATUS: + /* + * These entries are accessible to any process on the + * system which wishes to query them. + */ + break; + default: + /* + * All other entries under the pid/tid hierarchy + * require proper authorization to be accessed. + */ + if (!allow_pid_access) { + return (EACCES); + } + break; + } + } + + /* + * If this entry has an underlying vnode, rely upon its access checks. + * Skip this if a shallow check has been requested. + */ + if (lxpnp->lxpr_realvp != NULL && !shallow) { + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* + * Allow access to those (root) possessing the correct privilege or + * already authorized against a pid-specific resource. + */ + if (allow_pid_access || secpolicy_proc_access(cr) == 0) { + return (0); + } + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
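The owner/group/other fallback described above (and implemented just below) is the classic mode-bit shift check: the requested V* bits occupy the owner position (VREAD is 0400 in illumos), so shifting the node's mode left by 3 or 6 bits lines the group or other permission class up under the requested bits. A self-contained sketch of that arithmetic follows; the is_owner/in_group parameters are hypothetical stand-ins for the crgetuid()/groupmember() credential checks.

#include <stdio.h>

#define VREAD	0400
#define VWRITE	0200

static int
check_access(int requested, int fmode, int is_owner, int in_group)
{
	int shift = 0;

	if (!is_owner) {
		shift += 3;
		if (!in_group)
			shift += 3;
	}
	/* Clear each requested bit that the selected mode class grants. */
	requested &= ~(fmode << shift);
	return (requested == 0 ? 0 : -1);	/* 0 means allowed */
}

int
main(void)
{
	/* "other" reading a 0644 node: allowed (prints 0) */
	printf("%d\n", check_access(VREAD, 0644, 0, 0));
	/* "other" writing a 0644 node: denied (prints -1) */
	printf("%d\n", check_access(VWRITE, 0644, 0, 0));
	return (0);
}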
+ */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp), ZOMB_OK); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_taskdir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + uint_t tid; + int c; + kthread_t *t; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR); + + /* + * convert the string rendition of the filename to a thread ID + */ + tid = 0; + while ((c = *comp++) != '\0') { + int otid; + if (c < '0' || c > '9') + return (NULL); + + otid = tid; + tid = 10 * tid + c - '0'; + /* integer overflow */ + if (tid / 10 != otid) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock_pid(dlxpnp, tid, NO_ZOMB, &t); + if (p == NULL) + return (NULL); + + /* + * Bail if this is a system process. + */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + if (p->p_brand != &lx_brand) { + /* + * Only the main thread is visible for non-branded processes. + */ + t = p->p_tlist; + if (tid != p->p_pid || t == NULL) { + t = NULL; + } + } else if (t != NULL) { + /* + * Disallow any access to aio in-kernel worker threads. 
+ * To prevent a potential race while looking at the lwp data + * for an exiting thread, we clear the TP_KTHREAD bit in + * lx_cleanlwp() while the p_lock is held. + */ + if ((t->t_proc_flag & TP_KTHREAD) != 0) { + lx_lwp_data_t *lwpd; + + VERIFY((lwpd = ttolxlwp(t)) != NULL); + if ((lwpd->br_lwp_flags & BR_AIO_LWP) != 0) { + lxpr_unlock(p); + return (NULL); + } + } + } + + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + + /* + * Allocate and fill in a new lx /proc taskid node. + * Instead of the last arg being a fd, it is a tid. + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + kthread_t *t; + int i; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock_pid(dlxpnp, dlxpnp->lxpr_desc, NO_ZOMB, &t); + if (p == NULL) + return (NULL); + + /* + * Bail if this is a system process. + */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in the new lx /proc taskid dir node + */ + for (i = 0; i < TIDDIRFILES; i++) { + if (strcmp(tiddir[i].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p, + dlxpnp->lxpr_desc); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); + } + } + + lxpr_unlock(p); + return (NULL); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PID_FDDIR || + VTOLXP(dp)->lxpr_type == LXPR_PID_TID_FDDIR); + + return (lxpr_lookup_fdnode(dp, comp)); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + vnode_t *vp; + proc_t *p; + kthread_t *t; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock_pid(VTOLXP(dp), pid, ZOMB_OK, &t); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * Allocate and populate a new LX /proc node. + * + * Directory entries for non-main threads can be looked up as + * /proc/<tid> despite the fact that they do not appear in the + * readdir output. Record the lookup pid (tid) so that later + * operations can be aware of this context. 
+ */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, pid); + + lxpr_unlock(p); + vp = LXPTOV(lxpnp); + ASSERT(vp != NULL); + + return (vp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sysdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kdir_randdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_randdir, + SYS_RANDDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_netdir, + SYS_NETDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_coredir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_ipv4dir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_vmdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_vmdir, + SYS_VMDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fsdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir, + SYS_FSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zone_t *zone; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zone = LXPTOZ(lxpnp); + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). 
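Directory offsets in these readdir routines advance in fixed LXPR_SDSIZE slots rather than in bytes: slot 0 is ".", slot 1 is "..", and slot N+2 maps to table (or process/fd) index N, which is why lxpr_readdir_common computes dirindex as (uoffset / LXPR_SDSIZE) - 2. A small sketch of that offset-to-index arithmetic is below; the slot size used here is illustrative, since the real LXPR_SDSIZE is defined elsewhere in lx_proc.

#include <stdio.h>

#define SDSIZE	16	/* stand-in for LXPR_SDSIZE */

int
main(void)
{
	long uoffset;

	for (uoffset = 0; uoffset < 5 * SDSIZE; uoffset += SDSIZE) {
		long dirindex = (uoffset / SDSIZE) - 2;

		if (dirindex == -2)
			printf("offset %3ld -> \".\"\n", uoffset);
		else if (dirindex == -1)
			printf("offset %3ld -> \"..\"\n", uoffset);
		else
			printf("offset %3ld -> dirtab[%ld]\n", uoffset, dirindex);
	}
	return (0);
}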
If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + pid_t pid, raw_pid; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, the + * zsched process for the zone, and anything the security + * policy doesn't allow us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || p->p_zone != zone || + p == zone->zone_zsched || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + + /* Translate the pid (e.g. initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, NULL); + raw_pid = p->p_pid; + + ASSERT(p->p_stat != 0); + + mutex_exit(&pidlock); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, raw_pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 
1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + int err; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + if ((p = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) { + return (ENOENT); + } + err = lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES); + lxpr_unlock(p); + return (err); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error, ceof, tiddirsize, tasknum; + proc_t *p; + kthread_t *t; + boolean_t branded; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR); + + oresid = uiop->uio_resid; + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + return (ENOENT); + } + + /* + * Just emit static entries for system processes and zombies. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its threads. + */ + tiddirsize = p->p_lwpcnt; + branded = (p->p_brand == &lx_brand); + mutex_exit(&p->p_lock); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + if ((t = p->p_tlist) == NULL) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until all thread's have + * been returned. + */ + for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) { + int i, reclen, len; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the thread list + */ + i = (uoffset / LXPR_SDSIZE) - 2; + if (i < 0 || i >= tiddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (i != tasknum) + goto next; + + if (!branded) { + /* + * Emulating the goofy linux task model is impossible + * to do for native processes. We can compromise by + * presenting only the main thread to the consumer. + */ + emul_tid = p->p_pid; + } else { + if ((lwpd = ttolxlwp(t)) == NULL) { + goto next; + } + /* Don't show aio kernel worker threads */ + if ((t->t_proc_flag & TP_KTHREAD) != 0 && + (lwpd->br_lwp_flags & BR_AIO_LWP) != 0) { + goto next; + } + emul_tid = lwpd->br_pid; + /* + * Convert pid to Linux default of 1 if we're the + * zone's init. + */ + if (emul_tid == LXPTOZ(lxpnp)->zone_proc_initpid) + emul_tid = 1; + } + + dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, p->p_pid, + emul_tid); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. 
+ */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + + if ((t = t->t_forw) == p->p_tlist || !branded) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + } + + if (eofp != NULL) + *eofp = 0; + +out: + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + kthread_t *t; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* Confirm that process and thread are still present */ + p = lxpr_lock_pid(lxpnp, lxpnp->lxpr_desc, NO_ZOMB, &t); + if (p == NULL) { + return (ENOENT); + } + lxpr_unlock(p); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error, ceof, fddirsize; + proc_t *p; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR || + lxpnp->lxpr_type == LXPR_PID_TID_FDDIR); + + oresid = uiop->uio_resid; + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) + return (ENOENT); + + /* + * For exiting/exited processes or those belonging to the system, only + * emit the fixed entries. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. + */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, p->p_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. 
+ */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES)); +} + +static int +lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir, + SYS_FSDIRFILES)); +} + +static int +lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +static int +lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static int +lxpr_readdir_sys_kdir_randdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_randdir, + SYS_RANDDIRFILES)); +} + +static int +lxpr_readdir_sys_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_netdir, + SYS_NETDIRFILES)); +} + +static int +lxpr_readdir_sys_net_coredir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static int +lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static int +lxpr_readdir_sys_vmdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_vmdir, + SYS_VMDIRFILES)); +} + +#define isdigit(c) ((c) >= '0' && (c) <= '9') +#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') + +/* + * Obtain a numeric value from the null-terminated input string. + * We don't have strtok in the kernel, so tokenize this ourselves and + * validate the input. 
+ */ +static int +lxpr_tokenize_num(char *str, long *pv, char **ep) +{ + char *pstart, *pc, c, *endptr; + long v; + + for (pc = str; isspace(*pc); pc++) + ; + + for (pstart = pc; isdigit(*pc); pc++) + ; + if (pc == pstart || (!isspace(*pc) && *pc != '\0')) + return (EINVAL); + c = *pc; + *pc = '\0'; + + if (ddi_strtol(pstart, &endptr, 10, &v) != 0) { + *pc = c; + return (EINVAL); + } + if (*endptr != '\0') { + *pc = c; + return (EINVAL); + } + + if (pv != NULL) + *pv = v; + if (ep != NULL) + *ep = ++pc; + + return (0); +} + +/* ARGSUSED */ +static int +lxpr_write_tcp_property(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct, char *prop, + int (*xlate)(char *, int)) +{ + int error; + int res = 0; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + netstack_t *ns; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '\0') /* no input */ + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + if (xlate != NULL && xlate(val, sizeof (val)) != 0) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + pinfo = mod_prop_lookup(ptbl, prop, MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, val, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} + +static int +lxpr_write_sys_net_core_somaxc(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_conn_req_max_q", NULL)); +} + +static int +lxpr_xlate_sec2ms(char *val, int size) +{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000) >= size) + return (EINVAL); + return (0); +} + +static int +lxpr_xlate_ka_intvl(char *val, int size) +{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000 * 9) >= size) + return (EINVAL); + return (0); +} + +/* + * Approximately translate the input count value into a reasonable + * _rexmit_interval_max timeout. + */ +static int +lxpr_xlate_retry2(char *val, int size) +{ + long cnt; + char *ep; + uint_t i, rx_max; + + if (lxpr_tokenize_num(val, &cnt, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + /* + * The _rexmit_interval_max is limited to 2 hours, so a count of 15 + * or more will exceed that due to exponential backoff. + */ + if (cnt > 15) + cnt = 15; + + rx_max = 400; /* Start with default _rexmit_interval_min in ms */ + for (i = 0; i < cnt; i++) + rx_max *= 2; + + /* + * The _rexmit_interval_max is limited to 2 hours, so if we went over + * the limit, just use 2 hours (in ms). 
+ */ + if (rx_max > (7200 * 1000)) + rx_max = 7200 * 1000; + + if (snprintf(val, size, "%u", rx_max) >= size) + return (EINVAL); + return (0); +} + +static int +lxpr_xlate_sack(char *val, int size) +{ + long flag; + char *ep; + + if (lxpr_tokenize_num(val, &flag, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (flag != 0 && flag != 1) + return (EINVAL); + /* see comment on lxpr_read_sys_net_ipv4_tcp_sack */ + if (snprintf(val, size, "%d", (flag == 0 ? 0 : 2)) >= size) + return (EINVAL); + return (0); +} + +/* + * We're updating a property on the ip stack so we can't reuse + * lxpr_write_tcp_property. + */ +/* ARGSUSED */ +static int +lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + long flag; + char *ep; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '\0') /* no input */ + return (EINVAL); + + if (lxpr_tokenize_num(val, &flag, &ep) != 0) + return (EINVAL); + + if (*ep != '\0' || (flag != 0 && flag != 1)) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + ipst = ns->netstack_ip; + ipst->ips_ip_g_resp_to_echo_bcast = !flag; + + netstack_rele(ns); + return (0); +} + +/* + * We expect two port numbers on a line as input for the range, and we have to + * set two properties on the netstack_tcp, so we can't reuse + * lxpr_write_tcp_property. 
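The ip_local_port_range writer below expects a single line carrying two port numbers, parsed one token at a time with lxpr_tokenize_num() and then applied as the smallest/largest anonymous port properties. The following userspace approximation shows the same parsing shape, with strtol() standing in for ddi_strtol() and a hypothetical input line.

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

int
main(void)
{
	char buf[] = "32768 65000\n";	/* hypothetical write to ip_local_port_range */
	char *ep;
	long low, high;

	low = strtol(buf, &ep, 10);	/* first token */
	high = strtol(ep, &ep, 10);	/* second token */
	while (isspace((unsigned char)*ep))
		ep++;			/* nothing else may follow */
	if (*ep != '\0' || low > high || high > 65535) {
		fprintf(stderr, "invalid range\n");
		return (1);
	}
	printf("smallest_anon_port=%ld largest_anon_port=%ld\n", low, high);
	return (0);
}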
+ */ +/* ARGSUSED */ +static int +lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int res; + size_t olen; + char vals[32]; /* big enough for a line w/ 2 16-bit numeric strings */ + char *ep; + long low, high; + netstack_t *ns; + tcp_stack_t *tcps; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (vals) - 1) + return (EINVAL); + + bzero(vals, sizeof (vals)); + res = uiomove(vals, olen, UIO_WRITE, uio); + if (res != 0) + return (res); + + if (lxpr_tokenize_num(vals, &low, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &high, &ep) != 0) + return (EINVAL); + + if (*ep != '\0') { + /* make sure no other tokens on the line */ + *ep++ = '\0'; + for (; isspace(*ep); ep++) + ; + if (*ep != '\0') + return (EINVAL); + } + + if (low > high || high > 65535) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + tcps = ns->netstack_tcp; + if (low < tcps->tcps_smallest_nonpriv_port) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + + (void) snprintf(vals, sizeof (vals), "%ld", low); + pinfo = mod_prop_lookup(ptbl, "smallest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + (void) snprintf(vals, sizeof (vals), "%ld", high); + pinfo = mod_prop_lookup(ptbl, "largest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} + +/* + * We expect three numbers on a line as input for the range, and we have to + * set two properties on the netstack_tcp, so we can't reuse + * lxpr_write_tcp_property. + * + * See the Linux tcp(7) man page. + */ +/* ARGSUSED */ +static int +lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int res; + size_t olen; + char vals[80]; /* big enough for a line w/ 3 numeric strings */ + char *ep; + long min, def, max, min_limit; + netstack_t *ns; + tcp_stack_t *tcps; + mod_prop_info_t *ptbl; + mod_prop_info_t *pinfo; + char *attr; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM || + lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (vals) - 1) + return (EINVAL); + + bzero(vals, sizeof (vals)); + res = uiomove(vals, olen, UIO_WRITE, uio); + if (res != 0) + return (res); + + if (lxpr_tokenize_num(vals, &min, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &def, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &max, &ep) != 0) + return (EINVAL); + + if (*ep != '\0') { + /* make sure no other tokens on the line */ + *ep++ = '\0'; + for (; isspace(*ep); ep++) + ; + if (*ep != '\0') + return (EINVAL); + } + + /* + * Ensure the numbers are valid, low to high. + * Valid ranges from the tunable's guide. + */ + min_limit = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? 
+ 2048 : 4096); + if (min > def || def > max || min < min_limit || + def > ONEGB || max < 8192) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + tcps = ns->netstack_tcp; + + /* recv_hiwat and xmit_hiwat are aliased to recv_buf and send_buf. */ + attr = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? + "recv_buf" : "send_buf"); + + (void) snprintf(vals, sizeof (vals), "%ld", def); + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + pinfo = mod_prop_lookup(ptbl, attr, MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + /* + * Don't reduce max for one side (recv or xmit) since that impacts the + * other. + */ + if (res == 0 && max > tcps->tcps_max_buf) { + (void) snprintf(vals, sizeof (vals), "%ld", max); + pinfo = mod_prop_lookup(ptbl, "max_buf", MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + } + + netstack_rele(ns); + return (res); +} + +static int +lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_fin_wait_2_flush_interval", lxpr_xlate_sec2ms)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_abort_interval", lxpr_xlate_ka_intvl)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_interval", lxpr_xlate_sec2ms)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_conn_req_max_q0", NULL)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_rexmit_interval_max", lxpr_xlate_retry2)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "sack", + lxpr_xlate_sack)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "_wscale_always", + NULL)); +} + +/* ARGSUSED */ +static int +lxpr_write_sys_fs_pipe_max(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + char *ep; + long u; + size_t size; + lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp)); + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) 
- 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (lxpr_tokenize_num(val, &u, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + /* + * Bound to PAGESIZE <= input <= lx_pipe_max_limit, then round to the + * nearest page. Linux is a little more picky, rounding to the nearest + * power-of-two pages. Such strengthened behavior can be added later + * if needed. + */ + size = (size_t)u; + size = P2ROUNDUP(MIN(MAX(PAGESIZE, size), lx_pipe_max_limit), PAGESIZE); + + ASSERT(size <= lx_pipe_max_limit); + + mutex_enter(&lxzd->lxzd_lock); + lxzd->lxzd_pipe_max_sz = size; + mutex_exit(&lxzd->lxzd_lock); + + return (0); +} + +/* ARGSUSED */ +static int +lxpr_write_sys_kernel_corepatt(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + zone_t *zone = LXPTOZ(lxpnp); + struct core_globals *cg; + refstr_t *rp, *nrp; + corectl_path_t *ccp; + char val[MAXPATHLEN]; + char valtr[MAXPATHLEN]; + size_t olen; + int error; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT); + + cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + if (secpolicy_coreadm(cr) != 0) + return (EPERM); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '|') + return (EINVAL); + + if ((error = lxpr_core_path_l2s(val, valtr, sizeof (valtr))) != 0) + return (error); + + nrp = refstr_alloc(valtr); + + ccp = cg->core_default_path; + mutex_enter(&ccp->ccp_mtx); + rp = ccp->ccp_path; + refstr_hold((ccp->ccp_path = nrp)); + cg->core_options |= CC_PROCESS_PATH; + mutex_exit(&ccp->ccp_mtx); + + if (rp != NULL) + refstr_rele(rp); + + return (0); +} + +/* ARGSUSED */ +static int +lxpr_write_pid_loginuid(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + char *ep; + long u; + proc_t *p; + lx_proc_data_t *pd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (lxpr_tokenize_num(val, &u, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + if ((p = lxpr_lock(lxpnp, NO_ZOMB)) == NULL) + return (ENXIO); + + if ((pd = ptolxproc(p)) != NULL) { + pd->l_loginuid = (uid_t)u; + } + lxpr_unlock(p); + + return (0); +} + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* + * Linux does something very "clever" for /proc/<pid>/fd/<num> entries. + * Open FDs are represented as symlinks, the link contents + * corresponding to the open resource. For plain files or devices, + * this isn't absurd since one can dereference the symlink to query + * the underlying resource. 
For sockets or pipes, it becomes ugly in a + * hurry. To maintain this human-readable output, those FD symlinks + * point to bogus targets such as "socket:[<inodenum>]". This requires + * circumventing vfs since the stat/lstat behavior on those FD entries + * will be unusual. (A stat must retrieve information about the open + * socket or pipe. It cannot fail because the link contents point to + * an absent file.) + * + * To accomplish this, lxpr_getnode returns an vnode typed VNON for FD + * entries. This bypasses code paths which would normally + * short-circuit on symlinks and allows us to emulate the vfs behavior + * expected by /proc consumers. + */ + if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + error = lxpr_doaccess(lxpnp, B_TRUE, VREAD, 0, cr, ct); + if (error != 0) + return (error); + + if ((error = vnodetopath(NULL, rvp, bp, buflen, cr)) != 0) { + /* + * Special handling possible for /proc/<pid>/fd/<num> + * Generate <type>:[<inode>] links, if allowed. + */ + if (lxpnp->lxpr_type != LXPR_PID_FD_FD || + lxpr_readlink_fdnode(lxpnp, bp, buflen) != 0) { + return (error); + } + } + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* Translate the pid (e.g. initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), curproc, &pid, NULL); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
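From inside an lx zone, the effect of the readlink emulation above is that readlink() on a /proc/&lt;pid&gt;/fd entry returns a usable target even for descriptors with no pathname, either a real path or a synthetic form such as "socket:[&lt;inode&gt;]". A minimal consumer-side example using only standard POSIX calls:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char target[256];
	ssize_t n;

	n = readlink("/proc/self/fd/0", target, sizeof (target) - 1);
	if (n < 0) {
		perror("readlink");
		return (1);
	}
	target[n] = '\0';	/* readlink() does not NUL-terminate */
	printf("fd 0 -> %s\n", target);
	return (0);
}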
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} + +/* Pollhead for fake POLLET support below */ +static struct pollhead lxpr_pollhead; + +/* ARGSUSED */ +static int +lxpr_poll(vnode_t *vp, short ev, int anyyet, short *reventsp, + pollhead_t **phpp, caller_context_t *ct) +{ + *reventsp = 0; + if (ev & POLLIN) + *reventsp |= POLLIN; + if (ev & POLLRDNORM) + *reventsp |= POLLRDNORM; + if (ev & POLLRDBAND) + *reventsp |= POLLRDBAND; + if (ev & POLLOUT) + *reventsp |= POLLOUT; + if (ev & POLLWRBAND) + *reventsp |= POLLWRBAND; + + /* + * Newer versions of systemd will monitor /proc/self/mountinfo with + * edge-triggered epoll (via libmount). If adding said resource to an + * epoll descriptor fails, as would be the expectation for a call to + * fs_poll when POLLET is present, then systemd will abort and the zone + * will fail to properly boot. Until proper pollwakeup() support is + * wired into lx_proc, valid POLLET support must be faked. + * + * While the only known (at this time) lx_proc resource where POLLET + * support is mandatory is LXPR_PID_MOUNTINFO, we cast a wide net to + * avoid other unexpected trouble. Normal devpoll caching (emitting a + * pollhead when (*reventsp == 0 && !anyyet)) is not enabled. + */ + if ((ev & POLLET) != 0) { + *phpp = &lxpr_pollhead; + } + return (0); +} + +/* ARGSUSED */ +static int +lxpr_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) { + if (wr_tab[i].wft_wrf != NULL) { + return (wr_tab[i].wft_wrf(lxpnp, uiop, cr, ct)); + } + break; + } + } + + /* pretend we wrote the whole thing */ + uiop->uio_offset += uiop->uio_resid; + uiop->uio_resid = 0; + return (0); +} + +/* Needed for writable files which are first "truncated" */ +/* ARGSUSED */ +static int +lxpr_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + cred_t *cred, caller_context_t *ct) +{ + int error; + + if (cmd != F_FREESP) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cred, ct)) != 0) + return (error); + + return (0); +} + +/* + * Needed for writable files which are first "truncated". We only support + * truncation. 
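The POLLET handling in lxpr_poll above exists for exactly the pattern sketched below: libmount (on behalf of systemd) registers /proc/self/mountinfo with an edge-triggered epoll watch, and that epoll_ctl() call must not fail inside an lx zone. This is a Linux-side illustration of the consumer, not part of the patch.

#include <stdio.h>
#include <fcntl.h>
#include <sys/epoll.h>

int
main(void)
{
	int epfd, fd;
	struct epoll_event ev = { 0 };

	fd = open("/proc/self/mountinfo", O_RDONLY);
	epfd = epoll_create1(0);
	if (fd < 0 || epfd < 0) {
		perror("setup");
		return (1);
	}
	ev.events = EPOLLIN | EPOLLET;	/* edge-triggered, as libmount uses */
	ev.data.fd = fd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) != 0) {
		perror("epoll_ctl");
		return (1);
	}
	printf("mountinfo registered for edge-triggered polling\n");
	return (0);
}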
+ */ +/* ARGSUSED */ +static int +lxpr_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (vap->va_mask != AT_SIZE) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cr, ct)) != 0) + return (error); + + return (0); +} + +/* + * We need to allow open with O_CREAT for the writable files. + */ +/* ARGSUSED */ +static int +lxpr_create(vnode_t *dvp, char *nm, vattr_t *vap, enum vcexcl exclusive, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + lxpr_node_t *lxpnp = VTOLXP(dvp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *vp = NULL; + int error; + + ASSERT(type < LXPR_NFILES); + + /* + * restrict create permission to owner or root + */ + if ((error = lxpr_access(dvp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + if (*nm == '\0') + return (EPERM); + + if (dvp->v_type != VDIR) + return (EPERM); + + if (exclusive == EXCL) + return (EEXIST); + + /* + * No writable files in top-level proc dir. We check this to avoid + * getting a non-proc node via "..". + */ + if (type != LXPR_PROCDIR && + lxpr_lookup(dvp, nm, &vp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + lxpr_nodetype_t ftype = VTOLXP(vp)->lxpr_type; + if (!lxpr_is_writable(ftype)) { + VN_RELE(vp); + vp = NULL; + } + } + + if (vp != NULL) { + ASSERT(vp->v_type != VDIR); + + /* confirm permissions against existing file */ + if ((error = lxpr_access(vp, mode, 0, cr, ct)) != 0) { + VN_RELE(vp); + return (error); + } + + *vpp = vp; + return (0); + } + + /* + * Linux proc does not allow creation of addition, non-subsystem + * specific files inside the hierarchy. ENOENT is tossed when such + * actions are attempted. + */ + return (ENOENT); +} diff --git a/usr/src/uts/common/brand/lx/sys/lx_acl.h b/usr/src/uts/common/brand/lx/sys/lx_acl.h new file mode 100644 index 0000000000..1e5ab26407 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_acl.h @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. + */ + +#ifndef _LX_ACL_H +#define _LX_ACL_H + +#include <sys/vnode.h> +#include <sys/uio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Both fall under the 'system.' namespace */ +#define LX_XATTR_POSIX_ACL_ACCESS "posix_acl_access" +#define LX_XATTR_POSIX_ACL_DEFAULT "posix_acl_default" + +enum lx_acl_type { + LX_ACL_ACCESS, + LX_ACL_DEFAULT +}; + +extern int lx_acl_setxattr(vnode_t *, enum lx_acl_type, void *, size_t); +extern int lx_acl_getxattr(vnode_t *, enum lx_acl_type, void *, size_t, + ssize_t *); +extern int lx_acl_removexattr(vnode_t *, enum lx_acl_type); +extern int lx_acl_listxattr(vnode_t *, uio_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_ACL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_audit.h b/usr/src/uts/common/brand/lx/sys/lx_audit.h new file mode 100644 index 0000000000..76686dd9ec --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_audit.h @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
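The xattr names in lx_acl.h above follow the Linux convention of exposing POSIX ACLs through the "system." extended-attribute namespace. A minimal, hypothetical sketch of the consumer side (roughly what getfacl(1) does on Linux) is shown below; the raw value is the binary posix_acl encoding, which the lx_acl_* routines translate to and from native ACLs.

#include <sys/types.h>
#include <sys/xattr.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
	const char *path = (argc > 1) ? argv[1] : ".";
	char buf[1024];
	ssize_t n;

	/* "system.posix_acl_access" == "system." + LX_XATTR_POSIX_ACL_ACCESS */
	n = getxattr(path, "system.posix_acl_access", buf, sizeof (buf));
	if (n < 0) {
		perror("getxattr");	/* ENODATA if no access ACL is set */
		return (1);
	}
	(void) printf("%s: %zd bytes of posix_acl_access data\n", path, n);
	return (0);
}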
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2018 Joyent, Inc. All rights reserved. + */ + +#ifndef _LX_AUDIT_H +#define _LX_AUDIT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void lx_audit_init(int (*)(void *, uint_t, const char *, uint_t)); +extern void lx_audit_cleanup(void); +extern void lx_audit_stop_worker(void *, void (*)(void *, boolean_t)); +extern int lx_audit_append_rule(void *, uint_t); +extern int lx_audit_delete_rule(void *, uint_t); +extern void lx_audit_list_rules(void *, + void (*)(void *, void *, uint_t, void *, uint_t)); +extern void lx_audit_get_feature(void *, void (*)(void *, void *, uint_t)); +extern void lx_audit_get(void *, void (*)(void *, void *, uint_t)); +extern int lx_audit_set(void *, void *, uint_t, void (*cb)(void *, boolean_t)); +extern void lx_audit_emit_user_msg(uint_t, uint_t, char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUDIT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h new file mode 100644 index 0000000000..17b19895f4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h @@ -0,0 +1,511 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_H +#define _LX_AUTOFS_H + +/* + * The lxautofs filesystem and driver exist to emulate the Linux autofs + * filesystem and /dev/autofs device (this code emulates both). The + * purpose is to provide support for the Linux "automount" automounter. + * + * The device ioctls map fairly closely to the filesystem ioctls. The device + * ioctls have superseded the filesystem ioctls and the automounter will + * use the device ioctls if the device exists. + * + * The device ioctls are used by the automounter to perform recovery + * in cases where the automounter is restarted while mounts are present. It + * also allows for better management operations when a filesystem is mounted + * on top of an autofs mountpoint, as in the case of an NFS direct mount on + * top of an autofs mount. + * + * + * +++ Linux automounter background. + * + * Linux has two automounters: "amd" (not used in any popular, modern distro) + * and "automount". + * + * "automount" is the normal Linux automounter. It utilizes a kernel + * filesystem (autofs) and device (/dev/autofs) to provide its functionality. 
+ * Basically, it mounts the autofs filesystem at any automounter controlled
+ * mountpoint. This filesystem then intercepts and redirects lookup operations
+ * to the userland automounter process via a pipe. The pipe to the automounter
+ * is established via a mount option when the autofs filesystem is mounted or
+ * via the setpipefd ioctl if the automounter restarts. When the automounter
+ * receives a request via this pipe, it does lookups (or unmounts) to whatever
+ * backing store it's configured to use, does mkdir operations on the autofs
+ * filesystem, mounts remote NFS filesystems on any directories it manages or
+ * just created, and signals the autofs device via an ioctl to let it know
+ * that the lookup (or expire) can continue. Other management operations (such
+ * as querying expiration for unmounting) are performed using the autofs device.
+ *
+ *
+ * +++ Linux autofs documentation.
+ *
+ * Within the Linux src tree, see the file:
+ * Documentation/filesystems/autofs4-mount-control.txt
+ * This documents some of the autofs behavior and the device driver ioctls.
+ *
+ * The following URL (https://lwn.net/Articles/606960/) documents autofs in
+ * general. This patch was targeted for Documentation/filesystems/autofs4.txt,
+ * but seems to have never been integrated into the Linux src tree.
+ *
+ *
+ * +++ Linux autofs (and automount daemon) notes
+ *
+ * Since we're mimicking the behavior of the Linux autofs filesystem and
+ * device, we document some of the observed behavior here.
+ *
+ * There are multiple versions of the autofs filesystem kernel API protocol
+ * and modern implementations of the user-land automount daemon would depend
+ * on v5, although the filesystem API has been superseded by the driver ioctl
+ * API, which is roughly similar.
+ *
+ * We'll describe the filesystem ioctls first, since support for those was
+ * implemented first. The device ioctls roughly correspond to the filesystem
+ * ioctls and were implemented last, but the automounter will use those
+ * ioctls, instead of the filesystem ioctls, when the device is present.
+ *
+ * Our original autofs implementation was developed in the mid-2000s around the
+ * v2 protocol, but that is currently obsolete. Our current implementation is
+ * based around the v5 protocol API. There was no autofs device support at that
+ * time.
+ *
+ * The automounter supports 3 different, mutually exclusive, mount options for
+ * each mountpoint:
+ * - indirect (this was all you got with the v2 support)
+ * - direct
+ * - offset
+ *
+ * An 'indirect' mountpoint is managed with dynamic mounts below that
+ * mountpoint. For example, if '/home' were an indirect autofs mount, then
+ * accessing a username under /home would traverse the 'lookup' code described
+ * below, cause a local subdirectory to be created, and trigger a mount,
+ * usually NFS, onto that username subdirectory.
+ *
+ * A 'direct' mountpoint is an autofs mountpoint which will trigger the
+ * mounting of another filesystem overtop that mountpoint when accessed.
+ *
+ * An 'offset' mountpoint behaves like a 'direct' mountpoint but it is
+ * created dynamically by the automounter underneath an 'indirect' mountpoint.
+ * For example, if '/net' were an indirect autofs mountpoint and the host
+ * 'jurassic' exported two NFS filesystems; '/var/crash' and '/var/core', then
+ * accessing '/net/jurassic' would trigger the automounter to create two
+ * subdirectories; '/net/jurassic/var/crash' and '/net/jurassic/var/core'.
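To make the pipe/mount-option handshake described above concrete, here is a rough, hypothetical sketch of the automounter side of an indirect autofs mount: it creates the request pipe first and then passes the pipe fd, its process group, and the protocol range as mount options. The option names correspond to the LX_MNTOPT_* strings defined later in this header; the source label is arbitrary.

#include <sys/mount.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char opts[128];
	int pipefd[2];

	if (pipe(pipefd) != 0) {
		perror("pipe");
		return (1);
	}

	/* fd= names the kernel-to-daemon request pipe; pgrp= identifies us */
	(void) snprintf(opts, sizeof (opts),
	    "fd=%d,pgrp=%d,minproto=5,maxproto=5,indirect",
	    pipefd[1], (int)getpgrp());

	if (mount("automount", "/net", "autofs", 0, opts) != 0) {
		perror("mount");
		return (1);
	}
	(void) printf("autofs mounted on /net with \"%s\"\n", opts);
	return (0);
}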
The
+ * automounter would then mount an autofs offset mount onto each one of these
+ * directories. Accessing either of those directories would then trigger the
+ * automounter to perform another mount on top, as is done with a 'direct'
+ * mount.
+ *
+ * General behavior
+ *
+ * A) Autofs allows root owned, non-automounter processes to create
+ * directories in the autofs filesystem. The autofs filesystem treats the
+ * automounter's process group as special, but it doesn't prevent root
+ * processes outside of the automounter's process group from creating new
+ * directories in the autofs filesystem.
+ *
+ * B) Autofs doesn't allow creation of any non-directory entries in the
+ * autofs filesystem. No entity can create files (e.g. /bin/touch or
+ * VOP_CREATE/VOP_SYMLINK/etc.) The only entries that can exist within
+ * the autofs filesystem are directories.
+ *
+ * C) Autofs only intercepts vop lookup operations. Notably, it does _not_
+ * intercept and re-direct vop readdir operations. This means that the
+ * observed behavior of the Linux automounter can be considerably different
+ * from that of the illumos automounter. Specifically, on illumos if an autofs
+ * mountpoint is mounted _without_ the -nobrowse option then if a user does
+ * an ls operation (which translates into a vop readdir operation) then the
+ * automounter will intercept that operation and list all the possible
+ * directories and mountpoints without actually mounting any filesystems.
+ * Essentially, all automounter managed mountpoints on Linux will behave
+ * like "-nobrowse" mountpoints on illumos. Here's an example to illustrate
+ * this. If /ws was mounted on illumos without the -nobrowse option and an
+ * auto_ws yp map was set up as the backing store for this mountpoint, then an
+ * "ls /ws" would list all the keys in the map as valid directories, but an
+ * "ls /ws" on Linux would list an empty directory.
+ *
+ * D) NFS mounts are performed by the automount process. When the automount
+ * process gets a redirected lookup request, it determines _all_ the
+ * possible remote mountpoints for that request, creates directory paths
+ * via mkdir, and mounts the remote filesystems on the newly created paths.
+ * This is described in the offset mount example above. Once the automounter
+ * completed the mounts it would signal the autofs filesystem (via an ioctl)
+ * that the lookup could continue.
+ *
+ * E.1) Autofs only redirects vop lookup operations for path entries that
+ * don't already exist in the autofs filesystem. So for the example above,
+ * an initial (after the start of the automounter) "ls /net/jurassic" would
+ * result in a request to the automounter. A subsequent "ls /net/jurassic"
+ * would not result in a request to the automounter. Even if
+ * /net/jurassic/var/crash and /net/jurassic/var/core were manually unmounted
+ * after the initial "ls /net/jurassic", a subsequent "ls /net/jurassic"
+ * would not result in a new request to the automounter.
+ *
+ * E.2) Autofs lookup requests that are sent to the automounter only include
+ * the root directory path component. So for example, after starting up
+ * the automounter if a user were to do a "ls /net/jurassic/var/crash", the
+ * initial lookup request actually sent to the automounter would just be for
+ * "jurassic" (the same request as if the user had done "ls /net/jurassic").
+ * After the initial mounting of the two offset mounts onto crash and core the
+ * lookup would continue and a final lookup request would be sent to the
+ * automounter for "crash" (but this would be on a different vfs from the
+ * /net vfs).
+ *
+ * E.3) The two statements above aren't entirely true. The Linux
+ * autofs filesystem will also redirect lookup operations for leaf
+ * directories that don't have a filesystem mounted on them. Using the
+ * example above, if a user did a "ls /net/jurassic", then manually
+ * unmounted /net/jurassic/var/crash, and then did an "ls
+ * /net/jurassic/var/crash", this would result in a request for
+ * "jurassic/var/crash" being sent to the automounter. The strange thing
+ * (a Linux bug perhaps) is that the automounter won't do anything with this
+ * request and the lookup will fail.
+ *
+ * F) The autofs filesystem communication protocol (what ioctls it supports
+ * and what data it passes to the automount process) is versioned. The
+ * userland automount daemon (as of version v5.0.7) expects v5 of the protocol
+ * (by running the AUTOFS_IOC_PROTOSUBVER ioctl), and exits if that is not
+ * supported. For v2-v5 the structure passed through the pipe always begins
+ * with a common header followed by different fields depending on the packet
+ * type. In addition, the different versions support additional ioctls.
+ *
+ * v2 - basic lookup request
+ * v3 - adds expiring (umounting)
+ * v4 - adds expire multi
+ * v5 - adds missing indirect, expire indirect, missing direct & expire direct.
+ * Defines a new protocol structure layout.
+ * The v5 'missing indirect' and 'missing direct' ioctls are analogous to
+ * the v2 'missing' ioctl. These ioctls are used to initiate a mount via
+ * a lookup. The 'expire' ioctls are used by the automounter to query if
+ * it is possible to unmount the filesystem. 'direct' and 'indirect'
+ * refer to the mount option type that the automounter performed and
+ * correlate to an automounter direct or indirect map mountpoint.
+ *
+ * G) The automounter periodically issues an 'expire' ioctl to autofs to
+ * obtain the name of a mountpoint which the automounter can unmount.
+ * Unmounting is discussed in more detail below.
+ *
+ * H) The device ioctls roughly correspond to the filesystem ioctls, but
+ * instead of being tied to an autofs mountpoint vnode, they can be called any
+ * time. The argument structure uses either a path or an autofs pipe file
+ * descriptor to indicate what is being operated on.
+ *
+ * +++ lxautofs notes
+ *
+ * 1) In general, the lxautofs filesystem tries to mimic the behavior of the
+ * Linux autofs filesystem with the following exceptions:
+ *
+ * 1.1) We don't bother to implement the E.3 functionality listed above
+ * since it doesn't appear to be of any use.
+ *
+ * 1.2) We only fully implement v2 and v5 of the autofs protocol.
+ *
+ * 2) In general, the approach taken for lxautofs is to keep it as simple
+ * as possible and to minimize its memory usage. To do this all information
+ * about the contents of the lxautofs filesystem is mirrored in the
+ * underlying filesystem that lxautofs is mounted on and most vop operations
+ * are simply passed onto this underlying filesystem. This means we don't
+ * have to implement most of the complex operations that a full filesystem
+ * normally has to implement.
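A rough, hypothetical sketch of the v5 request/acknowledge cycle described in F) above, seen from the daemon's side: read one packet from the pipe, perform the mount, then acknowledge using the unique id from the packet header. It assumes the lx_autofs_v5_pkt_t layout and the LX_AUTOFS_IOC_READY/LX_AUTOFS_IOC_FAIL ioctls defined later in this header are available to the build, that mount_fd is an open descriptor on the autofs mountpoint, and that mount_nfs_for() is a stand-in for the daemon's map lookup and mount step.

#include <sys/lx_autofs.h>
#include <sys/ioctl.h>
#include <unistd.h>

extern int mount_nfs_for(const char *name);	/* hypothetical helper */

static int
handle_one_request(int pipe_fd, int mount_fd)
{
	lx_autofs_v5_pkt_t pkt;
	int err;

	/* The kernel writes one fixed-size v5 packet per request. */
	if (read(pipe_fd, &pkt, sizeof (pkt)) != (ssize_t)sizeof (pkt))
		return (-1);

	if (pkt.lap_hdr.laph_type != LX_AUTOFS_PTYPE_MISSING_INDIR)
		return (0);	/* only indirect lookups handled in this sketch */

	err = mount_nfs_for(pkt.lap_name);

	/* Acknowledge with the unique id carried in the packet header. */
	return (ioctl(mount_fd,
	    err == 0 ? LX_AUTOFS_IOC_READY : LX_AUTOFS_IOC_FAIL,
	    pkt.lap_hdr.laph_id));
}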
It also means that most of our filesystem state + * (wrt the contents of the filesystem) doesn't actually have to be stored + * in memory, we can simply go to the underlying filesystem to get it when + * it's requested. For the purposes of discussion, we'll call the underlying + * filesystem the "backing store." + * + * The backing store is actually a directory called ".lxautofs" which is created + * in the directory where the lxautofs filesystem is mounted. When the + * lxautofs filesystem is unmounted this backing store directory is deleted. + * If this directory exists at mount time (perhaps the system crashed while a + * previous lxautofs instance was mounted at the same location) it will be + * deleted. There are a few implications of using a backing store worth + * mentioning. + * + * 2.1) lxautofs can't be mounted on a read only filesystem. If this + * proves to be a problem we can probably move the location of the + * backing store. + * + * 2.2) If the backing store filesystem runs out of space then the + * automounter process won't be able to create more directories and mount + * new filesystems. Of course, strange failures usually happen when + * filesystems run out of space. + * + * 3) Why aren't we using gfs? gfs has two different usage models. + * + * 3.1) I'm my own filesystem but i'm using gfs to help with managing + * readdir operations. + * + * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes + * + * We're not using the 3.1 interfaces because we don't implement readdir + * ourselves. We pass all readdir operations onto the backing store + * filesystem and utilize its readdir implementation. + * + * We're not using the 3.2 interfaces because they are really designed for + * in memory filesystems where all of the filesystem state is stored in + * memory. They don't lend themselves to filesystems where part of the + * state is in memory and part of the state is on disk. + * + * For more information on gfs take a look at the block comments in the + * top of gfs.c + * + * 4) Unmounting + * + * The automounter has a timeout associated with each mount. It informs autofs + * of this timeout using the LX_AUTOFS_DEV_IOC_TIMEOUT_CMD ioctl after autofs + * has been mounted on the mountpoint. + * + * After the automounter has mounted something associated with the mountpoint + * then periodically (<timeout>/4 seconds) the automounter will issue the + * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl on the autofs mount. autofs is expected + * to respond with an underlying mountpoint entry which is a candidate for + * unmounting. The automounter will attempt to unmount the filesystem + * (which may fail if it is busy, since this is obviously racy) and then + * acknowledge the expire ioctl. The successful acknowledgement is independent + * of the success of unmounting the underlying filesystem. + * + * Unmount handling varies based on which type of mount the autofs was mounted + * with (indirect, direct or offset). + * + * To support 'indirect' mount expiration, the autofs vfs keeps track of the + * filesystems mounted immediately under the autofs mountpoint (in + * lav_mnt_list) after a lookup has completed successfully. Upon receipt of the + * LX_AUTOFS_IOC_DEV_EXPIRE_CMD ioctl, autofs removes the first element from + * the list, attempts to check if it is busy and if not, returns that mountpoint + * over the fifo (if busy the entry is added to the end of the list). 
When the
+ * ioctl is acknowledged, if the mountpoint still exists, that means the unmount
+ * failed and the entry is added at the back of the list. If there are no
+ * elements or the first one is busy, EAGAIN is returned for the 'expire' ioctl
+ * and the automounter will check again in <timeout>/4 seconds.
+ *
+ * For example, if /home is an autofs indirect mount, then there are typically
+ * many different {username}-specific NFS mounts under that /home autofs mount.
+ * autofs uses the lav_mnt_list to respond to 'expire' ioctls in a round-robin
+ * fashion so that the automounter can unmount user file systems that aren't in
+ * use.
+ *
+ * Expiring 'direct' mounts is similar, but since there is only a single mount,
+ * the lav_mnt_list will only have at most one entry if there is a filesystem
+ * mounted overtop of the autofs mount.
+ *
+ * Expiring 'offset' mounts is more complicated because there are at least
+ * two different autofs VFSs involved (the top-level and one for each offset
+ * mount underneath). The actual offset mount is handled exactly like a 'direct'
+ * mount. The top-level is an indirect mount and is handled in a similar way
+ * as described above for indirect mounts, but special handling is needed for
+ * each offset mount below.
+ *
+ * This can be explained using the same 'jurassic' example described earlier
+ * (/net is an autofs 'indirect' mount and the host 'jurassic' has two exported
+ * file systems; /var/crash and /var/core). If the user accesses
+ * /net/jurassic/var/crash then the automounter would set up the system so that
+ * the following mounts exist:
+ * - /net (the original autofs indirect mount which triggers everything)
+ * - /net/jurassic/var/crash (autofs offset mount)
+ * - /net/jurassic/var/crash (NFS mount on top of the autofs offset mount)
+ * - /net/jurassic/var/core (autofs offset mount)
+ *
+ * For expiration the automounter will issue the LX_AUTOFS_IOC_EXPIRE_MULTI
+ * ioctl on each autofs vfs for which something is mounted, so we would receive
+ * an expire ioctl on /net and another on /net/jurassic/var/crash. The vfs for
+ * /net will be tracking "jurassic", but we detect it is busy and won't do
+ * anything at first. The vfs for "crash" will work like a direct mount and
+ * acknowledge the expire ioctl to the automounter once that filesystem times
+ * out and is no longer busy. The automounter will then unmount the "crash"
+ * NFS mount.
+ *
+ * Once the "crash" NFS mount has been unmounted by the automounter, we're left
+ * with the two autofs offset mounts under jurassic. The automounter will not
+ * try to unmount either of those, so we have to do that. Once we get another
+ * expire ioctl on /net and check "jurassic", we'll see there are only autofs
+ * mounts under /net/jurassic. We umount those using the lx_autofs_umount_offset
+ * function and respond to the automounter expire ioctl with "jurassic", in the
+ * same way as we would for any other indirect mount.
+ *
+ * 5) Recovery
+ *
+ * If the automounter is restarted for any reason, it needs to cope with
+ * pre-existing autofs mounts, as well as other automount-initiated mounts (e.g.
+ * a direct mount on top of an autofs mountpoint). The automounter uses the
+ * /proc/mounts file to correlate mounts to the managed mountpoints. It then
+ * uses the /dev/autofs device to openmount each of the autofs devices and
+ * reinitialize them using the various dev ioctls (timeout, requester, etc.).
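A hypothetical sketch of the first step of the recovery flow described above: walk /proc/self/mounts (equivalent to /proc/mounts) and collect the autofs entries that a restarted daemon would need to re-adopt via the /dev/autofs openmount/timeout/requester ioctls.

#include <mntent.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct mntent *me;
	FILE *fp;

	if ((fp = setmntent("/proc/self/mounts", "r")) == NULL) {
		perror("setmntent");
		return (1);
	}
	while ((me = getmntent(fp)) != NULL) {
		if (strcmp(me->mnt_type, "autofs") != 0)
			continue;
		/* each of these would be re-opened through /dev/autofs */
		(void) printf("recover: %s on %s (%s)\n",
		    me->mnt_fsname, me->mnt_dir, me->mnt_opts);
	}
	(void) endmntent(fp);
	return (0);
}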
+ * + * In general, the autoumounter will closemount the mountpoint once it's done, + * but it doesn't in the case of an offset mountpoint with nothing mounted + * on top. In this case the automounter expects autofs to expire that mountpoint + * before it will closemount (so things can subsequently cleanup). We handle + * this special case in the expire code path. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that the name of the actual file system is lxautofs, not lx_autofs, but + * the code uses lx_autofs to prefix the various names. This is because file + * system names are limited to 8 characters. + */ +#define LX_AUTOFS_NAME "lxautofs" + +#define LX_AUTOFS_MINORNAME "autofs" + +/* + * Mount options supported. + */ +#define LX_MNTOPT_FD "fd" +#define LX_MNTOPT_PGRP "pgrp" +#define LX_MNTOPT_MINPROTO "minproto" +#define LX_MNTOPT_MAXPROTO "maxproto" +#define LX_MNTOPT_INDIRECT "indirect" +#define LX_MNTOPT_DIRECT "direct" +#define LX_MNTOPT_OFFSET "offset" + +/* + * Version/subversion of the Linux kernel automount protocol we support. + * + * We fully support v2 and v5. We'll return ENOTSUP for all of the ioctls we + * don't yet handle. + */ +#define LX_AUTOFS_PROTO_VERS5 5 +#define LX_AUTOFS_PROTO_SUBVERSION 2 +#define LX_AUTOFS_PROTO_VERS2 2 + +/* packet types */ +typedef enum laph_ptype { + LX_AUTOFS_PTYPE_MISSING, /* 0 */ + LX_AUTOFS_PTYPE_EXPIRE, /* 1 */ + LX_AUTOFS_PTYPE_EXPIRE_MULTI, /* 2 */ + LX_AUTOFS_PTYPE_MISSING_INDIR, /* 3 */ + LX_AUTOFS_PTYPE_EXPIRE_INDIR, /* 4 */ + LX_AUTOFS_PTYPE_MISSING_DIRECT, /* 5 */ + LX_AUTOFS_PTYPE_EXPIRE_DIRECT /* 6 */ +} laph_ptype_t; + +/* + * Common header for all versions of the protocol. + */ +typedef struct lx_autofs_pkt_hdr { + int laph_protover; /* protocol version number */ + laph_ptype_t laph_type; + int laph_id; /* every pkt must have a unique id */ +} lx_autofs_pkt_hdr_t; + +/* + * Command structure sent to automount process from lxautofs via a pipe. + * This structure is the same for v2-v4 of the automount protocol + * (the communication pipe is established at mount time). + */ +typedef struct lx_autofs_v2_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component to lookup */ +} lx_autofs_v2_pkt_t; + +/* v4 multi-expire */ +typedef struct lx_autofs_v4_exp_pkt { + lx_autofs_pkt_hdr_t lape_hdr; + int lape_len; + char lape_name[MAXNAMELEN]; +} lx_autofs_v4_exp_pkt_t; + +/* v5 */ +typedef struct lx_autofs_v5_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + uint32_t lap_dev; + uint64_t lap_ino; + uint32_t lap_uid; + uint32_t lap_gid; + uint32_t lap_pid; + uint32_t lap_tgid; + uint32_t lap_name_len; + char lap_name[256]; +} lx_autofs_v5_pkt_t; + +union lx_autofs_pkt { + lx_autofs_v2_pkt_t lap_v2; + lx_autofs_v5_pkt_t lap_v5; +}; + +#define lap_protover lap_v2.lap_hdr.laph_protover +#define lap_type lap_v2.lap_hdr.laph_type +#define lap_id lap_v2.lap_hdr.laph_id + +/* + * Ioctls fully supported (v2 protocol). + */ +#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */ +#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */ +#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */ + +/* + * Ioctls supported (v3/v4 protocol). + */ +#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */ +#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0089364 /* arg: ulong_t */ + +/* + * Ioctls not supported (v3/v4 protocol). + */ + /* arg: lx_autofs_v3_exp_pkt_t * */ +#define LX_AUTOFS_IOC_EXPIRE 0x81109365 + +/* + * Ioctls supported (v5 protocol). 
+ */ +#define LX_AUTOFS_IOC_PROTOSUBVER 0x80049367 /* arg: int */ +#define LX_AUTOFS_IOC_ASKUMOUNT 0x80049370 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_MULTI 0x40049366 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_INDIRECT LX_AUTOFS_IOC_EXPIRE_MULTI +#define LX_AUTOFS_IOC_EXPIRE_DIRECT LX_AUTOFS_IOC_EXPIRE_MULTI + +/* + * autofs device ioctls + */ +#define LX_AUTOFS_DEV_IOC_VERSION_CMD 0xc0189371 +#define LX_AUTOFS_DEV_IOC_PROTOVER_CMD 0xc0189372 +#define LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD 0xc0189373 +#define LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD 0xc0189374 +#define LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD 0xc0189375 +#define LX_AUTOFS_DEV_IOC_READY_CMD 0xc0189376 +#define LX_AUTOFS_DEV_IOC_FAIL_CMD 0xc0189377 +#define LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD 0xc0189378 +#define LX_AUTOFS_DEV_IOC_CATATONIC_CMD 0xc0189379 +#define LX_AUTOFS_DEV_IOC_TIMEOUT_CMD 0xc018937a +#define LX_AUTOFS_DEV_IOC_REQUESTER_CMD 0xc018937b +#define LX_AUTOFS_DEV_IOC_EXPIRE_CMD 0xc018937c +#define LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD 0xc018937d +#define LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD 0xc018937e + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h new file mode 100644 index 0000000000..39ea96d1fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_IMPL_H +#define _LX_AUTOFS_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/modhash.h> +#include <sys/vnode.h> + +#include <sys/lx_autofs.h> + +/* + * Space key. + * Used to persist data across lx_autofs filesystem module unloads. + */ +#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev" + +/* + * Name of the backing store directory. + */ +#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME + +#define LX_AUTOFS_VFS_ID_HASH_SIZE 15 +#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15 +#define LX_AUTOFS_VFS_VN_HASH_SIZE 15 + +enum lx_autofs_mnttype { LXAMT_NONE, LXAMT_INDIR, LXAMT_DIRECT, LXAMT_OFFSET }; + +typedef struct lx_autofs_mntent { + list_node_t lxafme_lst; + uint64_t lxafme_ts; /* time stamp */ + uint_t lxafme_len; + char *lxafme_path; +} lx_autofs_mntent_t; + +/* + * VFS data object. + */ +typedef struct lx_autofs_vfs { + /* Info about the underlying filesystem and backing store. */ + vnode_t *lav_mvp; + char *lav_bs_name; + vnode_t *lav_bs_vp; + + /* Info about the automounter process managing this filesystem. 
*/ + int lav_fd; + pid_t lav_pgrp; + file_t *lav_fifo_wr; + file_t *lav_fifo_rd; + + /* The mount's dev and ino values for v5 protocol msg */ + uint64_t lav_dev; + u_longlong_t lav_ino; + + /* options from the mount */ + enum lx_autofs_mnttype lav_mnttype; + int lav_min_proto; + + /* + * ioctl-set timeout value. The automounter will perform an expire + * ioctl every timeout/4 seconds. We use this to expire a mount once + * it is inactive for the full timeout. + */ + ulong_t lav_timeout; + + /* ioctl-set catatonic value (prevents future mounts). */ + boolean_t lav_catatonic; + + /* Mount initiator's uid/gid for recovery handling. */ + uid_t lav_uid; + gid_t lav_gid; + + /* Each automount requests needs a unique id. */ + id_space_t *lav_ids; + + /* All remaining structure members are protected by lav_lock. */ + kmutex_t lav_lock; + /* openmount counter */ + int lav_openmnt_cnt; + + + /* Hashes to keep track of outstanding automounter requests. */ + mod_hash_t *lav_path_hash; + mod_hash_t *lav_id_hash; + + /* We need to keep track of all our vnodes. */ + vnode_t *lav_root; + mod_hash_t *lav_vn_hash; + + /* list of current mounts */ + list_t lav_mnt_list; +} lx_autofs_vfs_t; + +enum lx_autofs_callres { LXACR_NONE, LXACR_READY, LXACR_FAIL }; + +/* + * Structure to keep track of automounter requests sent to user-land. + */ +typedef struct lx_autofs_automnt_req { + /* Packet that gets sent to the automounter. */ + union lx_autofs_pkt laar_pkt; + int laar_pkt_size; + + /* Reference count. Always updated atomically. */ + uint_t laar_ref; + + /* + * Fields to keep track and sync threads waiting on a lookup. + * Fields are protected by lalr_lock. + */ + kmutex_t laar_lock; + kcondvar_t laar_cv; + int laar_complete; + + enum lx_autofs_callres laar_result; +} lx_autofs_automnt_req_t; + +/* + * Generic stack structure. + */ +typedef struct stack_elem { + list_node_t se_list; + caddr_t se_ptr1; + caddr_t se_ptr2; + caddr_t se_ptr3; +} stack_elem_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h new file mode 100644 index 0000000000..9c1579cc82 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -0,0 +1,778 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#ifndef _LX_BRAND_H +#define _LX_BRAND_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/zone.h> +#include <sys/ksocket.h> +#include <sys/vfs.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/cpuvar.h> +#include <sys/lx_futex.h> +#include <sys/lx_userhz.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_BRANDNAME "lx" + +/* + * Brand uname info + */ +#define LX_UNAME_SYSNAME "Linux" +#define LX_UNAME_RELEASE_2_6 "2.6.18" +#define LX_UNAME_RELEASE_2_4 "2.4.21" +#define LX_UNAME_VERSION "BrandZ virtual linux" +#define LX_UNAME_MACHINE32 "i686" +#define LX_UNAME_MACHINE64 "x86_64" + +#define LX_LIB_PATH32 "/native/usr/lib/lx_brand.so.1" +#define LX_LIB_PATH64 "/native/usr/lib/amd64/lx_brand.so.1" + +#define LX_VDSO_PATH32 "/native/usr/lib/brand/lx/lx_vdso.so.1" +#define LX_VDSO_PATH64 "/native/usr/lib/brand/lx/amd64/lx_vdso.so.1" + +#if defined(_LP64) +#define LX_LIB_PATH LX_LIB_PATH64 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE64 +#define LX_VDSO_PATH LX_VDSO_PATH64 +#else +#define LX_LIB_PATH LX_LIB_PATH32 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE32 +#define LX_VDSO_PATH LX_VDSO_PATH32 +#endif + +/* + * This must be large enough for both the 32-bit table and 64-bit table. + */ +#define LX_NSYSCALLS 358 + +/* Highest capability we know about */ +#define LX_CAP_MAX_VALID 36 + +/* sched attr flag values */ +#define LX_SCHED_FLAG_RESET_ON_FORK 0x1 +/* + * brand(2) subcommands + * + * Everything >= 128 is a brand-specific subcommand. + * > 192 is reserved for in-kernel emulated system calls. + */ +#define B_LPID_TO_SPAIR 128 +#define B_GET_CURRENT_CONTEXT 129 +#define B_EMULATION_DONE 130 +#define B_START_NFS_LOCKD 131 +#define B_BLOCK_ALL_SIGS 132 +#define B_UNBLOCK_ALL_SIGS 133 +#define B_PTRACE_CLONE_BEGIN 134 +#define B_PTRACE_STOP_FOR_OPT 135 +#define B_UNSUPPORTED 136 +#define B_STORE_ARGS 137 +#define B_GETPID 138 +#define B_JUMP_TO_LINUX 139 +#define B_ALL_SIGS_BLOCKED 140 +#define B_EXIT_AS_SIG 141 +/* formerly B_HELPER_WAITID 142 */ +#define B_HELPER_CLONE 143 +#define B_HELPER_SETGROUPS 144 +#define B_HELPER_SIGQUEUE 145 +#define B_HELPER_TGSIGQUEUE 146 +#define B_SET_NATIVE_STACK 147 +/* formerly B_SIGEV_THREAD_ID 148 */ +#define B_OVERRIDE_KERN_VER 149 +#define B_PTRACE_SIG_RETURN 150 +#define B_GET_PERSONALITY 151 + +#ifndef _ASM +/* + * Support for Linux PTRACE_SETOPTIONS handling. + */ +typedef enum lx_ptrace_options { + LX_PTRACE_O_TRACESYSGOOD = 0x0001, + LX_PTRACE_O_TRACEFORK = 0x0002, + LX_PTRACE_O_TRACEVFORK = 0x0004, + LX_PTRACE_O_TRACECLONE = 0x0008, + LX_PTRACE_O_TRACEEXEC = 0x0010, + LX_PTRACE_O_TRACEVFORKDONE = 0x0020, + LX_PTRACE_O_TRACEEXIT = 0x0040, + LX_PTRACE_O_TRACESECCOMP = 0x0080 +} lx_ptrace_options_t; + +#define LX_PTRACE_O_ALL \ + (LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | \ + LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | \ + LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE | \ + LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP) +#endif /* !_ASM */ + +/* siginfo si_status for traced events */ +#define LX_PTRACE_EVENT_FORK 0x100 +#define LX_PTRACE_EVENT_VFORK 0x200 +#define LX_PTRACE_EVENT_CLONE 0x300 +#define LX_PTRACE_EVENT_EXEC 0x400 +#define LX_PTRACE_EVENT_VFORK_DONE 0x500 +#define LX_PTRACE_EVENT_EXIT 0x600 +#define LX_PTRACE_EVENT_SECCOMP 0x700 + +/* + * Brand-private values for the "pr_what" member of lwpstatus, for use with the + * PR_BRAND stop reason. These reasons are validated in lx_stop_notify(); + * update it if you add new reasons here. 
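The LX_PTRACE_EVENT_* values above are what a Linux tracer sees in the upper bits of the wait(2) status for an event stop: the PTRACE_EVENT_* code shifted left by 8, combined with SIGTRAP. A small, hypothetical tracer-side sketch of decoding such a stop (not code from this patch; the status value below is constructed by hand for illustration):

#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>

/*
 * Given a Linux wait status for a traced process, return 0 if this is not
 * a ptrace event stop, otherwise the event number (1 == fork, 3 == clone,
 * 4 == exec, ..., i.e. LX_PTRACE_EVENT_* >> 8).
 */
static int
ptrace_event_of(int status)
{
	if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP)
		return (0);
	return ((status >> 16) & 0xff);
}

int
main(void)
{
	/* a SIGTRAP stop carrying PTRACE_EVENT_CLONE (LX_PTRACE_EVENT_CLONE >> 8) */
	int status = ((SIGTRAP | (3 << 8)) << 8) | 0x7f;

	(void) printf("event = %d\n", ptrace_event_of(status));
	return (0);
}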
+ */ +#define LX_PR_SYSENTRY 1 +#define LX_PR_SYSEXIT 2 +#define LX_PR_SIGNALLED 3 +#define LX_PR_EVENT 4 + + +#define LX_VERSION_1 1 +#define LX_VERSION LX_VERSION_1 + +#define LX_ATTR_KERN_RELEASE ZONE_ATTR_BRAND_ATTRS +#define LX_ATTR_KERN_VERSION (ZONE_ATTR_BRAND_ATTRS + 1) +#define LX_ATTR_TTY_GID (ZONE_ATTR_BRAND_ATTRS + 2) + +/* + * Aux vector containing phdr of Linux executable and ehdr of interpreter + * (if any), both of which are used by lx_librtld_db to ascertain r_debug. + * We repurpose the 3rd brand-specific aux vector slot for the Linux + * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library). + */ +#define AT_SUN_BRAND_LX_PHDR AT_SUN_BRAND_AUX1 +#define AT_SUN_BRAND_LX_INTERP AT_SUN_BRAND_AUX2 +#define AT_SUN_BRAND_LX_CLKTCK AT_SUN_BRAND_AUX3 +#define AT_SUN_BRAND_LX_SYSINFO_EHDR AT_SUN_BRAND_AUX4 + +/* Aux vectors containing real/effective user/group IDs */ +#define AT_LX_UID 11 +#define AT_LX_EUID 12 +#define AT_LX_GID 13 +#define AT_LX_EGID 14 +/* Aux vector containing hz value */ +#define AT_CLKTCK 17 +/* Aux vector containing secure boolean */ +#define AT_SECURE 23 +/* Aux vector containing vDSO addr */ +#define AT_SYSINFO_EHDR 33 + +/* + * Usermode emulation routines are run on an alternate stack allocated by + * the brand library. Every LWP in a process will incur this overhead beyond + * the regular thread stack: + */ +#define LX_NATIVE_STACK_PAGE_COUNT 64 + +/* + * When returning in a new child process created with vfork(2) (or CLONE_VFORK) + * we discard some of the native stack to prevent corruption of the parent + * emulation state. + */ +#define LX_NATIVE_STACK_VFORK_GAP 0x3000 + +#ifndef _ASM + +extern struct brand lx_brand; + +typedef struct lx_brand_registration { + uint_t lxbr_version; /* version number */ + void *lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration_t; + +typedef struct lx_brand_registration32 { + uint_t lxbr_version; /* version number */ + uint32_t lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration32_t; + +#endif /* _ASM */ + +/* + * GDT usage + */ +#define GDT_TLSMIN (GDT_BRANDMIN) +#define GDT_TLSMAX (GDT_TLSMIN + 2) +#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN) + +#ifndef _ASM + +/* + * Stores information needed by the lx linker to launch the main + * lx executable. + */ +typedef struct lx_elf_data64 { + uintptr_t ed_phdr; + uintptr_t ed_phent; + uintptr_t ed_phnum; + uintptr_t ed_entry; + uintptr_t ed_base; + uintptr_t ed_ldentry; +} lx_elf_data64_t; + +typedef struct lx_elf_data32 { + uint32_t ed_phdr; + uint32_t ed_phent; + uint32_t ed_phnum; + uint32_t ed_entry; + uint32_t ed_base; + uint32_t ed_ldentry; +} lx_elf_data32_t; + +#if defined(_LP64) +typedef lx_elf_data64_t lx_elf_data_t; +#else +typedef lx_elf_data32_t lx_elf_data_t; +#endif + +typedef enum lx_proc_flags { + /* flags configurable via brandsys() and members of LX_PROC_ALL */ + LX_PROC_INSTALL_MODE = 0x01, + LX_PROC_STRICT_MODE = 0x02, + /* internal flags */ + LX_PROC_CHILD_DEATHSIG = 0x04, + LX_PROC_NO_DUMP = 0x08 /* for lx_prctl LX_PR_[GS]ET_DUMPABLE */ +} lx_proc_flags_t; + +#define LX_PROC_ALL (LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE) + +/* Maximum length for fields of LX uname */ +#define LX_SYS_UTS_LN 65 + +/* Max. length of kernel release string */ +#define LX_KERN_RELEASE_MAX LX_SYS_UTS_LN +#define LX_KERN_VERSION_MAX LX_SYS_UTS_LN + +#ifdef _KERNEL + +/* + * Entry points for cgroup integration. 
+ */ +extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +#define LX_RLFAKE_LOCKS 0 +#define LX_RLFAKE_NICE 1 +#define LX_RLFAKE_RTPRIO 2 +#define LX_RLFAKE_RTTIME 3 + +#define LX_RLFAKE_NLIMITS 4 + +#define LX_RLIM64_INFINITY (~0ULL) + +typedef struct { + uint64_t rlim_cur; + uint64_t rlim_max; +} lx_rlimit64_t; + +typedef struct { + list_node_t lx_clgrpm_link; + proc_t *lx_clgrpm_pp; +} lx_clone_grp_member_t; + +typedef struct { + kmutex_t lx_clgrp_lock; /* protects cnt & member list */ + uint_t lx_clgrp_cnt; + list_t lx_clgrp_members; +} lx_clone_grp_t; + +/* Entries in the l_clone_grps clone-group array */ +#define LX_CLGRP_FS 0 +#define LX_CLGRP_MAX 1 + +/* See explanation in lx_mem.c about lx_mremap */ +#define LX_REMAP_ANONCACHE_NENTRIES 4 +typedef struct lx_segmap { + uintptr_t lxsm_vaddr; /* virtual address of mapping */ + size_t lxsm_size; /* size of mapping in bytes */ + uint64_t lxsm_lru; /* LRU field for cache */ + uint_t lxsm_flags; /* protection and attribute flags */ +} lx_segmap_t; + +typedef struct lx_proc_data { + uintptr_t l_handler; /* address of user-space handler */ + pid_t l_ppid; /* pid of originating parent proc */ + uid_t l_loginuid; /* /proc/{pid}/loginuid */ + int64_t l_ptrace; /* count of process lwps observed by ptrace */ + lx_elf_data_t l_elf_data; /* ELF data for linux executable */ + /* signal to deliver to parent when this thread group dies */ + int l_signal; + /* native signal to deliver to process when parent dies */ + int l_parent_deathsig; + lx_proc_flags_t l_flags; + + kmutex_t l_clone_grp_lock; /* protects the following member */ + lx_clone_grp_t *l_clone_grps[LX_CLGRP_MAX]; + + lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS]; + + kmutex_t l_io_ctx_lock; /* protects the following members */ + uintptr_t l_io_ctxpage; + kcondvar_t l_io_destroy_cv; + uint_t l_io_ctx_cnt; + struct lx_io_ctx **l_io_ctxs; + + /* original start/end bounds of arg/env string data */ + uintptr_t l_args_start; + uintptr_t l_envs_start; + uintptr_t l_envs_end; + + /* Override zone-wide settings for uname release and version */ + char l_uname_release[LX_KERN_RELEASE_MAX]; + char l_uname_version[LX_KERN_VERSION_MAX]; + + /* Linux process personality */ + unsigned int l_personality; + + /* VDSO location */ + uintptr_t l_vdso; + + /* mremap anon cache */ + kmutex_t l_remap_anoncache_lock; + uint64_t l_remap_anoncache_generation; + lx_segmap_t l_remap_anoncache[LX_REMAP_ANONCACHE_NENTRIES]; + + /* Block all signals to all threads; used during vfork */ + uint_t l_block_all_signals; +} lx_proc_data_t; + +#endif /* _KERNEL */ + +/* + * Linux process personality(2) flags stored in l_personality + */ +#define LX_PER_UNAME26 0x0020000 +#define LX_PER_ADDR_NO_RANDOMIZE 0x0040000 +#define LX_PER_FDPIC_FUNCPTRS 0x0080000 +#define LX_PER_MMAP_PAGE_ZERO 0x0100000 +#define LX_PER_ADDR_COMPAT_LAYOUT 0x0200000 +#define LX_PER_READ_IMPLIES_EXEC 0x0400000 +#define LX_PER_ADDR_LIMIT_32BIT 0x0800000 +#define LX_PER_SHORT_INODE 0x1000000 +#define LX_PER_WHOLE_SECONDS 0x2000000 +#define LX_PER_STICKY_TIMEOUTS 0x4000000 +#define LX_PER_ADDR_LIMIT_3GB 0x8000000 + +#define LX_PER_LINUX 0x00 +#define LX_PER_SUNOS (0x06 | LX_PER_STICKY_TIMEOUTS) +#define LX_PER_MASK 0xff + +/* max. number of aio control blocks (see lx_io_setup) allowed across zone */ +#define LX_AIO_MAX_NR 65536 + +/* + * A data type big enough to bitmap all Linux possible cpus. 
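For reference, the LX_PER_* bits above mirror the flag values a Linux process manipulates through personality(2); l_personality is the state those calls read and update under lx. A hypothetical guest-side sketch (disabling address-space randomization, i.e. the bit lx tracks as LX_PER_ADDR_NO_RANDOMIZE):

#include <sys/personality.h>
#include <stdio.h>

int
main(void)
{
	/* Passing 0xffffffff queries the current persona without changing it. */
	int cur = personality(0xffffffff);

	if (cur == -1 || personality(cur | ADDR_NO_RANDOMIZE) == -1) {
		perror("personality");
		return (1);
	}
	/* the new persona takes effect for images exec'd from here on */
	(void) printf("persona 0x%x -> 0x%x\n", cur, cur | ADDR_NO_RANDOMIZE);
	return (0);
}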
+ * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages + * for sched_getaffinity() and sched_getaffinity(). + */ +#define LX_NCPU (1024) +#define LX_AFF_ULONGS (LX_NCPU / (8 * sizeof (ulong_t))) +typedef ulong_t lx_affmask_t[LX_AFF_ULONGS]; + +/* Length of proc boot_id string */ +#define LX_BOOTID_LEN 37 + +/* + * Flag values for uc_brand_data[0] in the ucontext_t: + */ +#define LX_UC_STACK_NATIVE 0x00001 +#define LX_UC_STACK_BRAND 0x00002 +#define LX_UC_RESTORE_NATIVE_SP 0x00010 +#define LX_UC_FRAME_IS_SYSCALL 0x00100 +#define LX_UC_RESTART_SYSCALL 0x01000 +#define LX_UC_IGNORE_LINK 0x10000 + +#ifdef _KERNEL + +typedef struct lx_lwp_data lx_lwp_data_t; + +/* + * Flag values for "lxpa_flags" on a ptrace(2) accord. + */ +typedef enum lx_accord_flags { + LX_ACC_TOMBSTONE = 0x01 +} lx_accord_flags_t; + +/* + * Flags values for "br_ptrace_flags" in the LWP-specific data. + */ +typedef enum lx_ptrace_flags { + LX_PTF_SYSCALL = 0x01, /* handling syscall or a trap */ + LX_PTF_EXITING = 0x02, + LX_PTF_STOPPING = 0x04, + LX_PTF_INHERIT = 0x08, + LX_PTF_STOPPED = 0x10, + LX_PTF_PARENT_WAIT = 0x20, + LX_PTF_CLDPEND = 0x40, + LX_PTF_CLONING = 0x80, + LX_PTF_WAITPEND = 0x100, + LX_PTF_NOSTOP = 0x200, /* disable syscall stop event */ + LX_PTF_INSYSCALL = 0x400 /* between syscall enter & exit */ +} lx_ptrace_flags_t; + +/* + * A ptrace(2) accord represents the relationship between a tracer LWP and the + * set of LWPs that it is tracing: the tracees. This data structure belongs + * primarily to the tracer, but is reference counted so that it may be freed by + * whoever references it last. + */ +typedef struct lx_ptrace_accord { + kmutex_t lxpa_lock; + uint_t lxpa_refcnt; + lx_accord_flags_t lxpa_flags; + + /* + * The tracer must hold "pidlock" while clearing these fields for + * exclusion of waitid(), etc. + */ + lx_lwp_data_t *lxpa_tracer; + kcondvar_t *lxpa_cvp; + + /* + * The "lxpa_tracees_lock" mutex protects the tracee list. + */ + kmutex_t lxpa_tracees_lock; + list_t lxpa_tracees; +} lx_ptrace_accord_t; + +/* + * These values are stored in the per-LWP data for a tracee when it is attached + * to a tracer. They record the method that was used to attach. + */ +typedef enum lx_ptrace_attach { + LX_PTA_NONE = 0x00, /* not attached */ + LX_PTA_ATTACH = 0x01, /* due to tracer using PTRACE_ATTACH */ + LX_PTA_TRACEME = 0x02, /* due to child using PTRACE_TRACEME */ + LX_PTA_INHERIT_CLONE = 0x04, /* due to PTRACE_CLONE clone(2) flag */ + LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */ +} lx_ptrace_attach_t; + +typedef enum lx_stack_mode { + LX_STACK_MODE_PREINIT = 0, + LX_STACK_MODE_INIT, + LX_STACK_MODE_NATIVE, + LX_STACK_MODE_BRAND +} lx_stack_mode_t; + +struct lx_pid { + pid_t lxp_spid; /* the SunOS pid and ... */ + id_t lxp_stid; /* ... tid pair */ + pid_t lxp_lpid; /* the corresponding linux pid */ + time_t lxp_start; /* birthday of this pid */ + struct pid *lxp_pidp; /* allocated pid struct */ + proc_t *lxp_procp; /* proc_t corresponding to lxp_spid */ + struct lx_pid *lxp_stol_next; /* link in stol hash table */ + struct lx_pid *lxp_ltos_next; /* link in ltos hash table */ +}; + +/* + * lx-specific data in the klwp_t + */ +struct lx_lwp_data { + uint_t br_lwp_flags; /* misc. 
flags */ + klwp_t *br_lwp; /* back pointer to container lwp */ + int br_signal; /* signal to send to parent when */ + /* clone()'ed child terminates */ + int br_exitwhy; /* reason for thread (process) exit */ + int br_exitwhat; /* exit code / killing signal */ + cpuset_t *br_affinitymask; /* bitmask of CPU sched affinities */ + struct user_desc br_tls[LX_TLSNUM]; + /* descriptors used by libc for TLS */ + ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */ + ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */ + ulong_t br_lx_gsbase; /* lx user-land gsbase */ + ulong_t br_ntv_gsbase; /* native user-land gsbase */ + pid_t br_pid; /* converted pid for this thread */ + pid_t br_tgid; /* thread group ID for this thread */ + pid_t br_ppid; /* parent pid for this thread */ + id_t br_ptid; /* parent tid for this thread */ + void *br_clear_ctidp; /* clone thread id ptr */ + void *br_set_ctidp; /* clone thread id ptr */ + void *br_robust_list; /* robust lock list, if any */ + + /* first 4 syscall args - used for auditing */ + uintptr_t br_syscall_args[4]; + + /* + * The following struct is used by some system calls to pass extra + * flags into the kernel without impinging on the namespace for + * illumos. + */ + void *br_scall_args; + int br_args_size; /* size in bytes of br_scall_args */ + + boolean_t br_waitid_emulate; + int br_waitid_flags; + + lx_ptrace_flags_t br_ptrace_flags; /* ptrace flags for this LWP */ + lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */ + lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */ + + lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */ + lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */ + lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */ + list_node_t br_ptrace_linkage; /* linkage for lxpa_tracees list */ + + ushort_t br_ptrace_whystop; /* stop reason, 0 for no stop */ + ushort_t br_ptrace_whatstop; /* stop sub-reason */ + + int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */ + /* + * Track the last (native) signal number processed by a ptrace. + * This allows the tracee to properly handle ignored signals after + * the tracer has been notified and the tracee restarted. + */ + int32_t br_ptrace_donesig; + uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */ + + uint_t br_ptrace_event; + ulong_t br_ptrace_eventmsg; + + int br_syscall_num; /* current system call number */ + boolean_t br_syscall_restart; /* should restart on EINTR */ + + /* + * Store the LX_STACK_MODE for this LWP, and the current extent of the + * native (emulation) stack. This is similar, in principle, to the + * sigaltstack mechanism for signal handling. We also use this mode + * flag to determine how to process system calls from this LWP. + */ + lx_stack_mode_t br_stack_mode; + uintptr_t br_ntv_stack; + uintptr_t br_ntv_stack_current; + + /* + * If strict mode is enabled (via LX_STRICT in the environment), any + * call to lx_unsupported() will set this boolean to B_TRUE. This will + * cause us to drop SIGSYS on the LWP as it attempts to return to + * usermode. + */ + boolean_t br_strict_failure; + + /* + * Some syscalls emulated in-kernel still call back out to the + * userspace emulation for certain functions. When that is the case, + * the syscall_return logic must be bypassed at the end of the + * in-kernel syscall code. The NORMALRETURN and JUSTRETURN constants + * are used to choose the behavior. 
+ */ + char br_eosys; + + /* + * Hold a pre-allocated lx_pid structure to be used during lx_initlwp. + */ + struct lx_pid *br_lpid; + + /* + * ID of the cgroup this thread belongs to. + */ + uint_t br_cgroupid; + + /* + * When the zone is running under FSS (which is the common case) then + * we cannot change scheduling class, so we emulate that. By default + * Linux uses LX_SCHED_OTHER (which is 0) and that only supports a + * priority of 0, so no special initialization is needed. + */ + int br_schd_class; /* emulated scheduling class */ + int br_schd_pri; /* emulated scheduling priority */ + uint64_t br_schd_flags; /* emulated [sg]et_attr flags */ + uint64_t br_schd_runtime; /* emulated DEADLINE */ + uint64_t br_schd_deadline; /* emulated DEADLINE */ + uint64_t br_schd_period; /* emulated DEADLINE */ + + fwaiter_t br_fwaiter; /* futex upon which we're waiting */ + uint_t br_clone_grp_flags; /* pending clone group */ +}; + +/* + * Upper limit on br_args_size, low because this value can persist until + * overridden with another value, and the size is given from userland. + */ +#define LX_BR_ARGS_SIZE_MAX (1024) + +typedef enum lx_audit_enbl { + LXAE_DISABLED, + LXAE_ENABLED, + LXAE_LOCKED +} lx_audit_enbl_t; + +/* + * brand specific data + * + * We currently only support a single cgroup mount in an lx zone so we only have + * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever + * enhanced to support different mounts with different subsystem controllers. + */ +typedef struct lx_zone_data { + kmutex_t lxzd_lock; /* protects all members */ + char lxzd_kernel_release[LX_KERN_RELEASE_MAX]; + char lxzd_kernel_version[LX_KERN_VERSION_MAX]; + ksocket_t lxzd_ioctl_sock; + char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */ + gid_t lxzd_ttygrp; /* tty gid for pty chown */ + vfs_t *lxzd_cgroup; /* cgroup for this zone */ + pid_t lxzd_lockd_pid; /* pid of NFS lockd */ + list_t *lxzd_vdisks; /* virtual disks (zvols) */ + dev_t lxzd_zfs_dev; /* major num for zfs */ + uint_t lxzd_aio_nr; /* see lx_aio.c */ + uint_t lxzd_pipe_max_sz; /* pipe-max-size sysctl val */ + boolean_t lxzd_swap_disabled; /* no fake swap in zone? */ + lx_audit_enbl_t lxzd_audit_enabled; /* auditing? */ + struct lx_audit_state *lxzd_audit_state; /* zone's audit state */ +} lx_zone_data_t; + +/* LWP br_lwp_flags values */ +#define BR_CPU_BOUND 0x0001 +#define BR_AIO_LWP 0x0002 /* aio kernel worker thread */ + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) +#define lwptolxlwp(l) ((struct lx_lwp_data *)lwptolwpbrand(l)) +#define ttolxproc(t) \ + (((t)->t_procp->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL) +#define ptolxproc(p) \ + (((p)->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(p)->p_brand_data : NULL) +#define ztolxzd(z) \ + (((z)->zone_brand == &lx_brand) ? \ + (lx_zone_data_t *)(z)->zone_brand_data : NULL) + +/* Macro for converting to system call arguments. 
*/ +#define LX_ARGS(scall) ((struct lx_##scall##_args *)\ + (ttolxlwp(curthread)->br_scall_args)) + +typedef enum lx_virt_disk_type { + LXVD_NONE, + LXVD_ZFS_DS, + LXVD_ZVOL +} lx_virt_disk_type_t; + +typedef struct lx_virt_disk { + list_node_t lxvd_link; + char lxvd_name[MAXNAMELEN]; + lx_virt_disk_type_t lxvd_type; + dev_t lxvd_emul_dev; + dev_t lxvd_real_dev; + uint64_t lxvd_volsize; + uint64_t lxvd_blksize; + char lxvd_real_name[MAXPATHLEN]; +} lx_virt_disk_t; + +/* + * Determine the upper bound on the system call number: + */ +#if defined(_LP64) +#define LX_MAX_SYSCALL(lwp) \ + ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \ + lx_nsysent64 : lx_nsysent32) +#else +#define LX_MAX_SYSCALL(lwp) lx_nsysent32 +#endif + +extern int lx_kern_release_cmp(zone_t *, const char *); + +extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t); +extern void lx_divert(klwp_t *, uintptr_t); +extern int lx_runexe(klwp_t *, void *); +extern void lx_switch_to_native(klwp_t *); + +extern int lx_syscall_enter(void); +extern void lx_syscall_return(klwp_t *, int, long); + +extern void lx_trace_sysenter(int, uintptr_t *); +extern void lx_trace_sysreturn(int, long); + +extern void lx_emulate_user(klwp_t *, int, uintptr_t *); + +extern void lx_audit_ld(); +extern void lx_audit_unld(); +extern void lx_audit_fini(zone_t *); +extern void lx_audit_syscall_exit(int, long); + +#if defined(_SYSCALL32_IMPL) +extern void lx_emulate_user32(klwp_t *, int, uintptr_t *); +#endif + +extern int lx_debug; +#define lx_print if (lx_debug) printf + +/* + * Flags for lx_lpid_lock() + */ +typedef enum { + LXP_PRLOCK = 0x1, /* acquire PR_LOCK as part of locking */ + LXP_ZOMBOK = 0x2 /* allow locking of zombies */ +} lx_pid_flag_t; + +extern void lx_pid_assign(kthread_t *, struct lx_pid *); +extern void lx_pid_reassign(kthread_t *); +extern void lx_pid_rele(pid_t, id_t); +extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *); +extern int lx_lpid_lock(pid_t, zone_t *, lx_pid_flag_t, proc_t **, + kthread_t **); +extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *); +extern void lx_pid_init(void); +extern void lx_pid_fini(void); +extern void lx_acct_out(vnode_t *, int); + +extern uint_t lx_pipe_max_limit; +extern uint_t lx_pipe_max_default; + +/* + * In-Kernel Linux System Call Description. + */ +typedef struct lx_sysent { + char *sy_name; + long (*sy_callc)(); + char sy_flags; + char sy_narg; +} lx_sysent_t; + +#if defined(_LP64) +extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +extern int lx_nsysent64; +#endif +extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +extern int lx_nsysent32; + +#endif /* _KERNEL */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_BRAND_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h new file mode 100644 index 0000000000..f82c6b867d --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h @@ -0,0 +1,161 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_LX_FCNTL_H +#define _SYS_LX_FCNTL_H + +#include <sys/vnode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Lx open/fcntl flags + */ +#define LX_O_RDONLY 00 +#define LX_O_WRONLY 01 +#define LX_O_RDWR 02 +#define LX_O_ACCMODE (LX_O_RDONLY | LX_O_WRONLY | LX_O_RDWR) +#define LX_O_CREAT 0100 +#define LX_O_EXCL 0200 +#define LX_O_NOCTTY 0400 +#define LX_O_TRUNC 01000 +#define LX_O_APPEND 02000 +#define LX_O_NONBLOCK 04000 +#define LX_O_NDELAY LX_O_NONBLOCK +#define LX_O_SYNC 010000 +#define LX_O_FSYNC LX_O_SYNC +#define LX_O_ASYNC 020000 +#define LX_O_DIRECT 040000 +#define LX_O_LARGEFILE 0100000 +#define LX_O_DIRECTORY 0200000 +#define LX_O_NOFOLLOW 0400000 +#define LX_O_CLOEXEC 02000000 +#define LX_O_PATH 010000000 + +#define LX_F_DUPFD 0 +#define LX_F_GETFD 1 +#define LX_F_SETFD 2 +#define LX_F_GETFL 3 +#define LX_F_SETFL 4 +#define LX_F_GETLK 5 +#define LX_F_SETLK 6 +#define LX_F_SETLKW 7 +#define LX_F_SETOWN 8 +#define LX_F_GETOWN 9 +#define LX_F_SETSIG 10 +#define LX_F_GETSIG 11 + +#define LX_F_GETLK64 12 +#define LX_F_SETLK64 13 +#define LX_F_SETLKW64 14 + +#define LX_F_SETLEASE 1024 +#define LX_F_GETLEASE 1025 +#define LX_F_NOTIFY 1026 +#define LX_F_CANCELLK 1029 +#define LX_F_DUPFD_CLOEXEC 1030 +#define LX_F_SETPIPE_SZ 1031 +#define LX_F_GETPIPE_SZ 1032 + +#define LX_F_RDLCK 0 +#define LX_F_WRLCK 1 +#define LX_F_UNLCK 2 + +/* Test for emulated O_PATH setting in file_t flags */ +#define LX_IS_O_PATH(f) (((f)->f_flag & (FREAD|FWRITE)) == 0) + +extern int lx_vp_at(int, char *, vnode_t **, int); + +/* + * Lx flock codes. + */ +#define LX_NAME_MAX 255 +#define LX_LOCK_SH 1 /* shared */ +#define LX_LOCK_EX 2 /* exclusive */ +#define LX_LOCK_NB 4 /* non-blocking */ +#define LX_LOCK_UN 8 /* unlock */ + +/* + * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value. + * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by + * faccessat. + */ +#define LX_AT_FDCWD (-100) +#define LX_AT_SYMLINK_NOFOLLOW 0x100 +#define LX_AT_REMOVEDIR 0x200 +#define LX_AT_EACCESS 0x200 +#define LX_AT_SYMLINK_FOLLOW 0x400 +#define LX_AT_NO_AUTOMOUNT 0x800 +#define LX_AT_EMPTY_PATH 0x1000 + +typedef struct lx_flock { + short l_type; + short l_whence; + long l_start; + long l_len; + int l_pid; +} lx_flock_t; + +typedef struct lx_flock64 { + short l_type; + short l_whence; + long long l_start; + long long l_len; + int l_pid; +} lx_flock64_t; + +#if defined(_KERNEL) + +/* + * 64-bit kernel view of 32-bit usermode structs. 
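+ * (These are packed to 4-byte alignment on the assumption that the i386 ABI
+ * only aligns 64-bit integers to 4 bytes, so the 64-bit l_start and l_len
+ * members of lx_flock64_32 match the layout a 32-bit process actually uses.)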
+ */ +#pragma pack(4) +typedef struct lx_flock32 { + int16_t l_type; + int16_t l_whence; + int32_t l_start; + int32_t l_len; + int32_t l_pid; +} lx_flock32_t; + +typedef struct lx_flock64_32 { + int16_t l_type; + int16_t l_whence; + int64_t l_start; + int64_t l_len; + int32_t l_pid; +} lx_flock64_32_t; +#pragma pack() + +#endif /* _KERNEL && _SYSCALL32_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FCNTL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h new file mode 100644 index 0000000000..7eba389218 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#ifndef _SYS_LX_FUTEX_H +#define _SYS_LX_FUTEX_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 +#define FUTEX_CMP_REQUEUE 4 +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI + +/* + * Flags that can be OR'd into a futex operation. + */ +#define FUTEX_CMD_MASK 0x007f +#define FUTEX_PRIVATE_FLAG 0x0080 +#define FUTEX_CLOCK_REALTIME 0x0100 + +#define FUTEX_BITSET_MATCH_ANY 0xffffffff +/* + * FUTEX_WAKE_OP operations + */ +#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ +#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ +#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ +#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ +#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ + +/* + * FUTEX_WAKE_OP comparison operations + */ +#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ +#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ +#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ +#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ +#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ +#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ + +/* + * The encoding of the FUTEX_WAKE_OP operation in 32 bits: + * + * +--+-- - --+-- - --+-- - --+-- - --+ + * |S |OP |CMP |OPARG |CMPARG | + * +--+-- - --+-- - --+-- - --+-- - --+ + * |31|30 - 28|27 - 24|23 - 12|11 - 0| + * + * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG. + * (Yes, this whole thing is entirely absurd -- see the block comment in + * lx_futex.c for an explanation of this nonsense.) 
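+ * As an illustrative example (a value invented here, not taken from the
+ * code): an encoded operation word of 0x14001000 has S=0, OP=FUTEX_OP_ADD,
+ * CMP=FUTEX_OP_CMP_GT, OPARG=1 and CMPARG=0, i.e. *(int *)UADDR2 += 1, then
+ * wake if the old value was greater than 0.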
Macros to extract the + * various components from the operation, given the above encoding: + */ +#define FUTEX_OP_OP(x) (((x) >> 28) & 7) +#define FUTEX_OP_CMP(x) (((x) >> 24) & 15) +#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \ + ((((x) << 8) >> 20))) +#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20) + +#ifdef _KERNEL + +/* + * This structure is used to track all the threads currently waiting on a + * futex. There is one fwaiter_t for each blocked thread. We store all + * fwaiter_t's in a hash structure, indexed by the memid_t of the integer + * containing the futex's value. + * + * At the moment, all fwaiter_t's for a single futex are simply dumped into + * the hash bucket. If futex contention ever becomes a hot path, we can + * chain a single futex's waiters together. + */ +typedef struct fwaiter { + memid_t fw_memid; /* memid of the user-space futex */ + kcondvar_t fw_cv; /* cond var */ + struct fwaiter *fw_next; /* hash queue */ + struct fwaiter *fw_prev; /* hash queue */ + uint32_t fw_bits; /* bits waiting on */ + pid_t fw_tid; /* for PI futexes; the waiter's tid */ + int fw_opri; /* for PI futexes; original pri. */ + boolean_t fw_pri_up; /* for PI futexes; pri. increased */ + volatile int fw_woken; +} fwaiter_t; + +#define FUTEX_WAITERS 0x80000000 +#define FUTEX_OWNER_DIED 0x40000000 +#define FUTEX_TID_MASK 0x3fffffff + +#define FUTEX_ROBUST_LOCK_PI 1 +#define FUTEX_ROBUST_LIST_LIMIT 2048 + +extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val2); +extern void lx_futex_init(void); +extern int lx_futex_fini(void); +extern long lx_set_robust_list(void *listp, size_t len); +extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp); +extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FUTEX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h new file mode 100644 index 0000000000..03b9d43038 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LX_IMPL_H +#define _LX_IMPL_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t, + ulong_t, ulong_t); + + +extern lx_systrace_f *lx_systrace_entry_ptr; +extern lx_systrace_f *lx_systrace_return_ptr; + +extern void lx_brand_systrace_enable(void); +extern void lx_brand_systrace_disable(void); + +extern void lx_unsupported(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h new file mode 100644 index 0000000000..08d4d78efb --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2018 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LINUX_LDT_H +#define _SYS_LINUX_LDT_H + +#include <sys/segments.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct ldt_info { + uint_t entry_number; + uint_t base_addr; + uint_t limit; + uint_t seg_32bit:1, + contents:2, + read_exec_only:1, + limit_in_pages:1, + seg_not_present:1, + useable:1; +}; + +#define LDT_INFO_EMPTY(info) \ + ((info)->base_addr == 0 && (info)->limit == 0 && \ + (info)->contents == 0 && (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && (info)->useable == 0) + +#if defined(__amd64) +#define SETMODE(desc) (desc)->usd_long = SDP_SHORT; +#else +#define SETMODE(desc) +#endif + +#define LDT_INFO_TO_DESC(info, desc) { \ + USEGD_SETBASE(desc, (info)->base_addr); \ + USEGD_SETLIMIT(desc, (info)->limit); \ + (desc)->usd_type = ((info)->contents << 2) | \ + ((info)->read_exec_only ^ 1) << 1 | SDT_S | SDT_A; \ + (desc)->usd_dpl = SEL_UPL; \ + (desc)->usd_p = (info)->seg_not_present ^ 1; \ + (desc)->usd_def32 = (info)->seg_32bit; \ + (desc)->usd_gran = (info)->limit_in_pages; \ + (desc)->usd_avl = (info)->useable; \ + SETMODE(desc); \ +} + +#define DESC_TO_LDT_INFO(desc, info) { \ + bzero((info), sizeof (*(info))); \ + (info)->base_addr = USEGD_GETBASE(desc); \ + (info)->limit = USEGD_GETLIMIT(desc); \ + (info)->seg_not_present = (desc)->usd_p ^ 1; \ + (info)->contents = ((desc)->usd_type >> 2) & 3; \ + (info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1; \ + (info)->seg_32bit = (desc)->usd_def32; \ + (info)->limit_in_pages = (desc)->usd_gran; \ + (info)->useable = (desc)->usd_avl; \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_LDT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h new file mode 100644 
index 0000000000..0418d3e9f9 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -0,0 +1,136 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS__LX_MISC_H +#define _SYS__LX_MISC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/siginfo.h> +#include <sys/lx_brand.h> + +#ifdef _KERNEL + +extern void lx_setrval(klwp_t *, int, int); +extern void lx_exec(); +extern void lx_exitlwp(klwp_t *); +extern void lx_freelwp(klwp_t *); +extern void *lx_lwpdata_alloc(proc_t *); +extern void lx_lwpdata_free(void *); +extern void lx_initlwp(klwp_t *, void *); +extern void lx_initlwp_post(klwp_t *); +extern void lx_forklwp(klwp_t *, klwp_t *); + +extern void lx_affinity_forklwp(klwp_t *, klwp_t *); + +extern void lx_set_gdt(int, user_desc_t *); +extern void lx_clear_gdt(int); + +extern longlong_t lx_nosys(); + +extern void lx_clone_grp_create(uint_t); +extern void lx_clone_grp_enter(uint_t, proc_t *, proc_t *); +extern void lx_clone_grp_exit(proc_t *, boolean_t); +extern boolean_t lx_clone_grp_member(lx_proc_data_t *, uint_t); +extern int lx_clone_grp_walk(lx_proc_data_t *, uint_t, + int (*)(proc_t *, void *), void *); + +extern greg_t lx_fixsegreg(greg_t, model_t); +extern uintptr_t lx_fsbase(klwp_t *, uintptr_t); +extern void lx_exit_with_sig(proc_t *, sigqueue_t *); +extern boolean_t lx_wait_filter(proc_t *, proc_t *); +extern void lx_sigfd_translate(k_siginfo_t *); +extern int stol_ksiginfo_copyout(k_siginfo_t *, void *); + +extern int ltos_at_flag(int, int, boolean_t); +#if defined(_SYSCALL32_IMPL) +extern int stol_ksiginfo32_copyout(k_siginfo_t *, void *); +#endif +extern void lx_read_argv_bounds(proc_t *p); + +typedef enum lx_regs_location { + LX_REG_LOC_UNAVAIL, + LX_REG_LOC_LWP, + LX_REG_LOC_UCP +} lx_regs_location_t; + +extern lx_regs_location_t lx_regs_location(lx_lwp_data_t *, void **, boolean_t); + + +typedef enum lx_if_action { + LX_IF_FROMNATIVE, + LX_IF_TONATIVE +} lx_if_action_t; + +/* Linux ARP protocol hardware identifiers */ +#define LX_ARPHRD_ETHER 1 /* Ethernet */ +#define LX_ARPHRD_LOOPBACK 772 /* Loopback */ +#define LX_ARPHRD_VOID 0xffff /* Unknown */ + +/* IPv6 address scope values used in /proc/net/if_inet6 */ +#define LX_IPV6_ADDR_LOOPBACK 0x0010U +#define LX_IPV6_ADDR_LINKLOCAL 0x0020U +#define LX_IPV6_ADDR_SITELOCAL 0x0040U +#define LX_IPV6_ADDR_COMPATv4 0x0080U + +/* Maximum length of a thread name, including the NUL terminator */ +#define LX_PR_SET_NAME_NAMELEN 16 + +extern void lx_ifname_convert(char *, lx_if_action_t); +extern void lx_ifflags_convert(uint64_t *, lx_if_action_t); +extern unsigned int lx_ipv6_scope_convert(const in6_addr_t *); +extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *, + int *); + +extern boolean_t lx_ptrace_stop(ushort_t); +extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t); +extern void lx_ptrace_init(void); +extern void lx_ptrace_fini(void); +extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *, + int *); +extern void lx_ptrace_exit(proc_t *, klwp_t *); +extern void 
lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *); +extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t); +extern int lx_ptrace_set_clone_inherit(int, boolean_t); +extern int lx_sigcld_repost(proc_t *, sigqueue_t *); +extern int lx_ptrace_issig_stop(proc_t *, klwp_t *); +extern boolean_t lx_ptrace_sig_ignorable(proc_t *, klwp_t *, int); + +extern int lx_helper_clone(int64_t *, int, void *, void *, void *); +extern int lx_helper_setgroups(int, gid_t *); +extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *); +extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *); + +extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *); +extern void lx_vsyscall_enter(proc_t *, klwp_t *, int); + +extern void lx_check_strict_failure(lx_lwp_data_t *); + +extern boolean_t lx_is_eventfd(file_t *); + +extern int lx_read_common(file_t *, uio_t *, size_t *, boolean_t); +extern int lx_write_common(file_t *, uio_t *, size_t *, boolean_t); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS__LX_MISC_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h new file mode 100644 index 0000000000..74bbc939a3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PTM_LINUX_H +#define _SYS_PTM_LINUX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_PTM_DRV "lx_ptm" +#define LX_PTM_MINOR_NODE "lx_ptmajor" + +#define LX_PTM_DEV_TO_PTS(dev) (getminor(dev) - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PTM_LINUX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h new file mode 100644 index 0000000000..9f606b614f --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGINFO_H +#define _LX_SIGINFO_H + +#include <sys/lx_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_siginfo_t lsi_code values + * + * LX_SI_ASYNCNL: Sent by asynch name lookup completion + * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads + * LX_SI_SIGIO: Sent by queued SIGIO + * LX_SI_ASYNCIO: Sent by asynchronous I/O completion + * LX_SI_MESGQ: Sent by real time message queue state change + * LX_SI_TIMER: Sent by timer expiration + * LX_SI_QUEUE: Sent by sigqueue + * LX_SI_USER: Sent by kill, sigsend, raise, etc. + * LX_SI_KERNEL: Sent by kernel + * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to + * illumos errors, if there is no translation available, this value + * should be used. This value should have no meaning as an si_code in + * illumos or Linux. + * + * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by + * BrandZ. + */ +#define LX_SI_CODE_NOT_EXIST (-61) +#define LX_SI_ASYNCNL (-60) +#define LX_SI_DETHREAD (-7) +#define LX_SI_TKILL (-6) +#define LX_SI_SIGIO (-5) +#define LX_SI_ASYNCIO (-4) +#define LX_SI_MESGQ (-3) +#define LX_SI_TIMER (-2) +#define LX_SI_QUEUE (-1) +#define LX_SI_USER (0) +#define LX_SI_KERNEL (0x80) + +#define LX_SI_MAX_SIZE 128 +#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3) +#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4) + +#if defined(_LP64) +/* + * Because of the odd number (3) of ints before the union, we need to account + * for the smaller padding needed on x64 due to the union being offset to an 8 + * byte boundary. + */ +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64 +#else +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32 +#endif + +typedef struct lx_siginfo { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE]; + + struct { + pid_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid_t _pid; + lx_uid16_t _uid; + union sigval _sigval; + } _rt; + + struct { + pid_t _pid; + lx_uid16_t _uid; + int _status; + clock_t _utime; + clock_t _stime; + } _sigchld; + + struct { + void *_addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "lx_siginfo_t" object. 
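+ * (With LX_SI_MAX_SIZE at 128 bytes and three leading ints, the
+ * LX_SI_PAD_SIZE_32 padding used below works out to 29 ints, so this 32-bit
+ * layout also comes to 128 bytes overall.)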
+ */ +#pragma pack(4) +typedef struct lx_siginfo32 { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE_32]; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + union sigval32 _sigval; + } _rt; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + int _status; + clock32_t _utime; + clock32_t _stime; + } _sigchld; + + struct { + caddr32_t _addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo32_t; +#pragma pack() +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#define lsi_pid _sifields._kill._pid +#define lsi_uid _sifields._kill._uid +#define lsi_status _sifields._sigchld._status +#define lsi_utime _sifields._sigchld._utime +#define lsi_stime _sifields._sigchld._stime +#define lsi_value _sifields._rt._sigval +#define lsi_int _sifields._rt._sigval.sivalx_int +#define lsi_ptr _sifields._rt._sigval.sivalx_ptr +#define lsi_addr _sifields._sigfault._addr +#define lsi_band _sifields._sigpoll._band +#define lsi_fd _sifields._sigpoll._fd + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGINFO_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_signal.h b/usr/src/uts/common/brand/lx/sys/lx_signal.h new file mode 100644 index 0000000000..552c36238b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_signal.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGNAL_H +#define _LX_SIGNAL_H + +#include <lx_signum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void lx_ltos_sigset(lx_sigset_t *, k_sigset_t *); +extern void lx_stol_sigset(k_sigset_t *, lx_sigset_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGNAL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_socket.h b/usr/src/uts/common/brand/lx/sys/lx_socket.h new file mode 100644 index 0000000000..99489e4d13 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_socket.h @@ -0,0 +1,444 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+ */ + +#ifndef _SYS_LX_SOCKET_H +#define _SYS_LX_SOCKET_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Linux address family definitions + * Some of these are not supported + */ +#define LX_AF_UNSPEC 0 /* Unspecified */ +#define LX_AF_UNIX 1 /* local file/pipe name */ +#define LX_AF_INET 2 /* IP protocol family */ +#define LX_AF_AX25 3 /* Amateur Radio AX.25 */ +#define LX_AF_IPX 4 /* Novell Internet Protocol */ +#define LX_AF_APPLETALK 5 /* Appletalk */ +#define LX_AF_NETROM 6 /* Amateur radio */ +#define LX_AF_BRIDGE 7 /* Multiprotocol bridge */ +#define LX_AF_ATMPVC 8 /* ATM PVCs */ +#define LX_AF_X25 9 /* X.25 */ +#define LX_AF_INET6 10 /* IPV 6 */ +#define LX_AF_ROSE 11 /* Amateur Radio X.25 */ +#define LX_AF_DECNET 12 /* DECnet */ +#define LX_AF_NETBEUI 13 /* 802.2LLC */ +#define LX_AF_SECURITY 14 /* Security callback */ +#define LX_AF_KEY 15 /* key management */ +#define LX_AF_ROUTE 16 /* Alias to emulate 4.4BSD */ +#define LX_AF_NETLINK LX_AF_ROUTE +#define LX_AF_PACKET 17 /* Packet family */ +#define LX_AF_ASH 18 /* Ash ? */ +#define LX_AF_ECONET 19 /* Acorn Econet */ +#define LX_AF_ATMSVC 20 /* ATM SVCs */ +#define LX_AF_SNA 22 /* Linux SNA */ +#define LX_AF_IRDA 23 /* IRDA sockets */ +#define LX_AF_PPPOX 24 /* PPPoX sockets */ +#define LX_AF_WANPIPE 25 /* Wanpipe API sockets */ +#define LX_AF_LLC 26 +/* gap in Linux defines for 27 and 28 */ +#define LX_AF_CAN 29 +#define LX_AF_TIPC 30 +#define LX_AF_BLUETOOTH 31 /* Bluetooth sockets */ +#define LX_AF_IUCV 32 +#define LX_AF_RXRPC 33 + +/* limit of AF mappings */ +#define LX_AF_MAX LX_AF_RXRPC + +#define AF_NOTSUPPORTED -1 +#define AF_INVAL -2 + +/* + * Options for use with [gs]etsockopt at the SOL_SOCKET level. + */ +#define LX_SOL_SOCKET 1 + +#define LX_SCM_RIGHTS 1 +#define LX_SCM_CRED 2 + +#define LX_SO_DEBUG 1 +#define LX_SO_REUSEADDR 2 +#define LX_SO_TYPE 3 +#define LX_SO_ERROR 4 +#define LX_SO_DONTROUTE 5 +#define LX_SO_BROADCAST 6 +#define LX_SO_SNDBUF 7 +#define LX_SO_RCVBUF 8 +#define LX_SO_KEEPALIVE 9 +#define LX_SO_OOBINLINE 10 +#define LX_SO_NO_CHECK 11 +#define LX_SO_PRIORITY 12 +#define LX_SO_LINGER 13 +#define LX_SO_BSDCOMPAT 14 +#define LX_SO_REUSEPORT 15 +/* + * For Linux see unix(7) man page SO_PASSCRED description. For Illumos see + * socket.h(3HEAD) man page SO_RECVUCRED description. 
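+ * (Presumably the emulation backs LX_SO_PASSCRED below with SO_RECVUCRED, so
+ * that the peer credentials arrive as ancillary data; that pairing is an
+ * inference from the references above rather than something stated here.)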
+ */ +#define LX_SO_PASSCRED 16 +#define LX_SO_PEERCRED 17 +#define LX_SO_RCVLOWAT 18 +#define LX_SO_SNDLOWAT 19 +#define LX_SO_RCVTIMEO 20 +#define LX_SO_SNDTIMEO 21 +/* Security levels - as per NRL IPv6 - don't actually do anything */ +#define LX_SO_SECURITY_AUTHENTICATION 22 +#define LX_SO_SECURITY_ENCRYPTION_TRANSPORT 23 +#define LX_SO_SECURITY_ENCRYPTION_NETWORK 24 +#define LX_SO_BINDTODEVICE 25 +/* Socket filtering */ +#define LX_SO_ATTACH_FILTER 26 +#define LX_SO_DETACH_FILTER 27 +#define LX_SO_PEERNAME 28 +#define LX_SO_TIMESTAMP 29 +#define LX_SCM_TIMESTAMP LX_SO_TIMESTAMP +#define LX_SO_ACCEPTCONN 30 + +#define LX_SO_PEERSEC 31 +#define LX_SO_SNDBUFFORCE 32 +#define LX_SO_RCVBUFFORCE 33 +#define LX_SO_PASSSEC 34 +#define LX_SO_TIMESTAMPNS 35 +#define LX_SCM_TIMESTAMPNS LX_SO_TIMESTAMPNS +#define LX_SO_MARK 36 +#define LX_SO_TIMESTAMPING 37 +#define LX_SCM_TIMESTAMPING LX_SO_TIMESTAMPING +#define LX_SO_PROTOCOL 38 +#define LX_SO_DOMAIN 39 +#define LX_SO_RXQ_OVFL 40 +#define LX_SO_WIFI_STATUS 41 +#define LX_SCM_WIFI_STATUS LX_SO_WIFI_STATUS +#define LX_SO_PEEK_OFF 42 +#define LX_SO_NOFCS 43 +#define LX_SO_LOCK_FILTER 44 +#define LX_SO_SELECT_ERR_QUEUE 45 +#define LX_SO_BUSY_POLL 46 +#define LX_SO_MAX_PACING_RATE 47 +#define LX_SO_BPF_EXTENSIONS 48 + +/* + * Options for use with [gs]etsockopt at the RAW level. + * IPPROTO_RAW + */ +#define LX_ICMP_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the PACKET level. + * SOL_PACKET + */ +#define LX_SOL_PACKET 263 + +#define LX_PACKET_ADD_MEMBERSHIP 1 +#define LX_PACKET_DROP_MEMBERSHIP 2 +#define LX_PACKET_RECV_OUTPUT 3 +#define LX_PACKET_RX_RING 5 +#define LX_PACKET_STATISTICS 6 + +/* + * Options for use with [gs]etsockopt at the NETLINK level. + * SOL_NETLINK + */ +#define LX_SOL_NETLINK 270 + +/* + * Linux socket type definitions + */ +#define LX_SOCK_STREAM 1 /* Connection-based byte streams */ +#define LX_SOCK_DGRAM 2 /* Connectionless, datagram */ +#define LX_SOCK_RAW 3 /* Raw protocol interface */ +#define LX_SOCK_RDM 4 /* Reliably-delivered message */ +#define LX_SOCK_SEQPACKET 5 /* Sequenced packet stream */ +#define LX_SOCK_PACKET 10 /* Linux specific */ +#define LX_SOCK_MAX 11 + +/* + * The Linux socket type can be or-ed with other flags (e.g. SOCK_CLOEXEC). + */ +#define LX_SOCK_TYPE_MASK 0xf + +/* + * Linux flags for socket, socketpair and accept4. These are or-ed into the + * socket type value. In the Linux net.h header these come from fcntl.h (note + * that they are in octal in the Linux header). + */ +#define LX_SOCK_CLOEXEC 0x80000 +#define LX_SOCK_NONBLOCK 0x800 + +#define SOCK_NOTSUPPORTED -1 +#define SOCK_INVAL -2 + +/* + * PF_PACKET protocol definitions. + */ +#define LX_ETH_P_802_3 0x0001 +#define LX_ETH_P_ALL 0x0003 +#define LX_ETH_P_802_2 0x0004 +#define LX_ETH_P_IP 0x0800 +#define LX_ETH_P_ARP 0x0806 +#define LX_ETH_P_IPV6 0x86DD + +/* + * IP Protocol levels. Some of these match the Illumos IPPROTO_* values. + */ +#define LX_IPPROTO_IP 0 +#define LX_IPPROTO_ICMP 1 +#define LX_IPPROTO_IGMP 2 +#define LX_IPPROTO_TCP 6 +#define LX_IPPROTO_UDP 17 +#define LX_IPPROTO_IPV6 41 +#define LX_IPPROTO_ICMPV6 58 +#define LX_IPPROTO_RAW 255 + +/* + * Options for use with [gs]etsockopt at the IP level. 
+ * IPPROTO_IP + */ +#define LX_IP_TOS 1 +#define LX_IP_TTL 2 +#define LX_IP_HDRINCL 3 +#define LX_IP_OPTIONS 4 +#define LX_IP_ROUTER_ALERT 5 +#define LX_IP_RECVOPTS 6 +#define LX_IP_RETOPTS 7 +#define LX_IP_PKTINFO 8 +#define LX_IP_PKTOPTIONS 9 +#define LX_IP_MTU_DISCOVER 10 +#define LX_IP_RECVERR 11 +#define LX_IP_RECVTTL 12 +#define LX_IP_RECVTOS 13 +#define LX_IP_MTU 14 +#define LX_IP_FREEBIND 15 +#define LX_IP_IPSEC_POLICY 16 +#define LX_IP_XFRM_POLICY 17 +#define LX_IP_PASSSEC 18 +#define LX_IP_TRANSPARENT 19 +#define LX_IP_ORIGDSTADDR 20 +#define LX_IP_MINTTL 21 +#define LX_IP_NODEFRAG 22 +/* Linux apparently leaves a gap here */ +#define LX_IP_MULTICAST_IF 32 +#define LX_IP_MULTICAST_TTL 33 +#define LX_IP_MULTICAST_LOOP 34 +#define LX_IP_ADD_MEMBERSHIP 35 +#define LX_IP_DROP_MEMBERSHIP 36 +#define LX_IP_UNBLOCK_SOURC 37 +#define LX_IP_BLOCK_SOURCE 38 +#define LX_IP_ADD_SOURCE_MEMBERSHIP 39 +#define LX_IP_DROP_SOURCE_MEMBERSHIP 40 +#define LX_IP_MSFILTER 41 +#define LX_MCAST_JOIN_GROUP 42 +#define LX_MCAST_BLOCK_SOURCE 43 +#define LX_MCAST_UNBLOCK_SOURCE 44 +#define LX_MCAST_LEAVE_GROUP 45 +#define LX_MCAST_JOIN_SOURCE_GROUP 46 +#define LX_MCAST_LEAVE_SOURCE_GROUP 47 +#define LX_MCAST_MSFILTER 48 +#define LX_IP_MULTICAST_ALL 49 +#define LX_IP_UNICAST_IF 50 + +/* + * LX_IP_MTU_DISCOVER values + */ +#define LX_IP_PMTUDISC_DONT 0 +#define LX_IP_PMTUDISC_WANT 1 +#define LX_IP_PMTUDISC_DO 2 +#define LX_IP_PMTUDISC_PROBE 3 +#define LX_IP_PMTUDISC_INTERFACE 4 +#define LX_IP_PMTUDISC_OMIT 5 + +/* + * Options for use with [gs]etsockopt at the IP level. + * IPPROTO_IPV6 + */ + +#define LX_IPV6_ADDRFORM 1 +#define LX_IPV6_2292PKTINFO 2 +#define LX_IPV6_2292HOPOPTS 3 +#define LX_IPV6_2292DSTOPTS 4 +#define LX_IPV6_2292RTHDR 5 +#define LX_IPV6_2292PKTOPTIONS 6 +#define LX_IPV6_CHECKSUM 7 +#define LX_IPV6_2292HOPLIMIT 8 +#define LX_IPV6_NEXTHOP 9 +#define LX_IPV6_AUTHHDR 10 +#define LX_IPV6_UNICAST_HOPS 16 +#define LX_IPV6_MULTICAST_IF 17 +#define LX_IPV6_MULTICAST_HOPS 18 +#define LX_IPV6_MULTICAST_LOOP 19 +#define LX_IPV6_JOIN_GROUP 20 +#define LX_IPV6_LEAVE_GROUP 21 +#define LX_IPV6_ROUTER_ALERT 22 +#define LX_IPV6_MTU_DISCOVER 23 +#define LX_IPV6_MTU 24 +#define LX_IPV6_RECVERR 25 +#define LX_IPV6_V6ONLY 26 +#define LX_IPV6_JOIN_ANYCAST 27 +#define LX_IPV6_LEAVE_ANYCAST 28 +#define LX_IPV6_IPSEC_POLICY 34 +#define LX_IPV6_XFRM_POLICY 35 + +#define LX_IPV6_RECVPKTINFO 49 +#define LX_IPV6_PKTINFO 50 +#define LX_IPV6_RECVHOPLIMIT 51 +#define LX_IPV6_HOPLIMIT 52 +#define LX_IPV6_RECVHOPOPTS 53 +#define LX_IPV6_HOPOPTS 54 +#define LX_IPV6_RTHDRDSTOPTS 55 +#define LX_IPV6_RECVRTHDR 56 +#define LX_IPV6_RTHDR 57 +#define LX_IPV6_RECVDSTOPTS 58 +#define LX_IPV6_DSTOPTS 59 +#define LX_IPV6_RECVTCLASS 66 +#define LX_IPV6_TCLASS 67 + +/* + * Options for use with [gs]etsockopt at the IP level. + * IPPROTO_ICMPV6 + */ + +#define LX_ICMP6_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the TCP level. 
+ * IPPROTO_TCP + */ +#define LX_TCP_NODELAY 1 /* Don't delay send to coalesce packets */ +#define LX_TCP_MAXSEG 2 /* Set maximum segment size */ +#define LX_TCP_CORK 3 /* Control sending of partial frames */ +#define LX_TCP_KEEPIDLE 4 /* Start keeplives after this period */ +#define LX_TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define LX_TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define LX_TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define LX_TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define LX_TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define LX_TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define LX_TCP_INFO 11 /* Information about this connection. */ +#define LX_TCP_QUICKACK 12 /* Bock/reenable quick ACKs. */ +#define LX_TCP_CONGESTION 13 /* Congestion control algorithm */ +#define LX_TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define LX_TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts on thin streams */ +#define LX_TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define LX_TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define LX_TCP_REPAIR 19 /* TCP socket under repair */ +#define LX_TCP_REPAIR_QUEUE 20 +#define LX_TCP_QUEUE_SEQ 21 +#define LX_TCP_REPAIR_OPTIONS 22 +#define LX_TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define LX_TCP_TIMESTAMP 24 +#define LX_TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes */ + +/* + * Options for use with [gs]etsockopt at the IGMP level. + * IPPROTO_IGMP + */ +#define LX_IGMP_MINLEN 8 +#define LX_IGMP_MAX_HOST_REPORT_DELAY 10 +#define LX_IGMP_HOST_MEMBERSHIP_QUERY 0x11 +#define LX_IGMP_HOST_MEMBERSHIP_REPORT 0x12 +#define LX_IGMP_DVMRP 0x13 +#define LX_IGMP_PIM 0x14 +#define LX_IGMP_TRACE 0x15 +#define LX_IGMP_HOST_NEW_MEMBERSHIP_REPORT 0x16 +#define LX_IGMP_HOST_LEAVE_MESSAGE 0x17 +#define LX_IGMP_MTRACE_RESP 0x1e +#define LX_IGMP_MTRACE 0x1f + +/* + * Linux socket flags for use with recv(2)/send(2)/recvmsg(2)/sendmsg(2) + */ +#define LX_MSG_OOB 0x1 +#define LX_MSG_PEEK 0x2 +#define LX_MSG_DONTROUTE 0x4 +#define LX_MSG_CTRUNC 0x8 +#define LX_MSG_PROXY 0x10 +#define LX_MSG_TRUNC 0x20 +#define LX_MSG_DONTWAIT 0x40 +#define LX_MSG_EOR 0x80 +#define LX_MSG_WAITALL 0x100 +#define LX_MSG_FIN 0x200 +#define LX_MSG_SYN 0x400 +#define LX_MSG_CONFIRM 0x800 +#define LX_MSG_RST 0x1000 +#define LX_MSG_ERRQUEUE 0x2000 +#define LX_MSG_NOSIGNAL 0x4000 +#define LX_MSG_MORE 0x8000 +#define LX_MSG_WAITFORONE 0x10000 +#define LX_MSG_FASTOPEN 0x20000000 +#define LX_MSG_CMSG_CLOEXEC 0x40000000 + +typedef struct lx_msghdr { + void *msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + size_t msg_iovlen; /* # elements in msg_iov */ + void *msg_control; /* ancillary data */ + size_t msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ +} lx_msghdr_t; + +typedef struct lx_mmsghdr { + lx_msghdr_t msg_hdr; /* message header */ + unsigned int msg_len; /* no. 
of bytes transmitted */ +} lx_mmsghdr_t; + +#if defined(_LP64) + +typedef struct lx_msghdr32 { + caddr32_t msg_name; /* optional address */ + uint32_t msg_namelen; /* size of address */ + caddr32_t msg_iov; /* scatter/gather array */ + int32_t msg_iovlen; /* # elements in msg_iov */ + caddr32_t msg_control; /* ancillary data */ + uint32_t msg_controllen; /* ancillary data buffer len */ + int32_t msg_flags; /* flags on received message */ +} lx_msghdr32_t; + +typedef struct lx_mmsghdr32 { + lx_msghdr32_t msg_hdr; /* message header */ + unsigned int msg_len; /* no. of bytes transmitted */ +} lx_mmsghdr32_t; + +#endif + +typedef struct lx_sockaddr_in6 { + sa_family_t sin6_family; + in_port_t sin6_port; + uint32_t sin6_flowinfo; + struct in6_addr sin6_addr; + uint32_t sin6_scope_id; /* Depends on scope of sin6_addr */ + /* one 32-bit field shorter than illumos */ +} lx_sockaddr_in6_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_SOCKET_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h new file mode 100644 index 0000000000..78fbf6e0a8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h @@ -0,0 +1,341 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+ */ + +#ifndef _SYS_LINUX_SYSCALLS_H +#define _SYS_LINUX_SYSCALLS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern long lx_accept(); +extern long lx_accept4(); +extern long lx_access(); +extern long lx_acct(); +extern long lx_alarm(); +extern long lx_arch_prctl(); +extern long lx_bind(); +extern long lx_brk(); +extern long lx_chdir(); +extern long lx_chmod(); +extern long lx_chown(); +extern long lx_chown16(); +extern long lx_chroot(); +extern long lx_clock_getres(); +extern long lx_clock_gettime(); +extern long lx_clock_settime(); +extern long lx_close(); +extern long lx_connect(); +extern long lx_creat(); +extern long lx_dup(); +extern long lx_dup2(); +extern long lx_dup3(); +extern long lx_epoll_create(); +extern long lx_epoll_create1(); +extern long lx_epoll_ctl(); +extern long lx_epoll_pwait(); +extern long lx_epoll_wait(); +extern long lx_eventfd(); +extern long lx_eventfd2(); +extern long lx_faccessat(); +extern long lx_fadvise64(); +extern long lx_fadvise64_32(); +extern long lx_fadvise64_64(); +extern long lx_fallocate(); +extern long lx_fallocate32(); +extern long lx_fchdir(); +extern long lx_fchmod(); +extern long lx_fchmodat(); +extern long lx_fchown(); +extern long lx_fchown16(); +extern long lx_fchownat(); +extern long lx_fcntl(); +extern long lx_fcntl64(); +extern long lx_fgetxattr(); +extern long lx_flistxattr(); +extern long lx_flock(); +extern long lx_fremovexattr(); +extern long lx_fsetxattr(); +extern long lx_fstat32(); +extern long lx_fstat64(); +extern long lx_fstatat64(); +extern long lx_futex(); +extern long lx_get_robust_list(); +extern long lx_get_thread_area(); +extern long lx_getcpu(); +extern long lx_getcwd(); +extern long lx_getdents_32(); +extern long lx_getdents_64(); +extern long lx_getdents64(); +extern long lx_getegid(); +extern long lx_getegid16(); +extern long lx_geteuid(); +extern long lx_geteuid16(); +extern long lx_getgid(); +extern long lx_getgid16(); +extern long lx_getitimer(); +extern long lx_getpeername(); +extern long lx_getpgid(); +extern long lx_getpgrp(); +extern long lx_getsockname(); +extern long lx_getpid(); +extern long lx_getppid(); +extern long lx_getpriority(); +extern long lx_getrandom(); +extern long lx_getresgid(); +extern long lx_getresgid16(); +extern long lx_getresuid(); +extern long lx_getresuid16(); +extern long lx_getrlimit(); +extern long lx_getrusage(); +extern long lx_getsid(); +extern long lx_getsockopt(); +extern long lx_gettid(); +extern long lx_gettimeofday(); +extern long lx_getuid(); +extern long lx_getuid16(); +extern long lx_getxattr(); +extern long lx_io_cancel(); +extern long lx_io_destroy(); +extern long lx_io_getevents(); +extern long lx_io_setup(); +extern long lx_io_submit(); +extern long lx_ioctl(); +extern long lx_ioprio_get(); +extern long lx_ioprio_set(); +extern long lx_kill(); +extern long lx_lchown(); +extern long lx_lchown16(); +extern long lx_lgetxattr(); +extern long lx_link(); +extern long lx_linkat(); +extern long lx_listen(); +extern long lx_llistxattr(); +extern long lx_llseek(); +extern long lx_lremovexattr(); +extern long lx_lseek32(); +extern long lx_lseek64(); +extern long lx_lsetxattr(); +extern long lx_lstat32(); +extern long lx_lstat64(); +extern long lx_listxattr(); +extern long lx_madvise(); +extern long lx_mincore(); +extern long lx_mkdir(); +extern long lx_mkdirat(); +extern long lx_mlock(); +extern long lx_mlockall(); +extern long lx_mmap(); +extern long lx_mmap2(); +extern long lx_mremap(); +extern long lx_mprotect(); +extern long lx_modify_ldt(); +extern 
long lx_mount(); +extern long lx_msync(); +extern long lx_munlock(); +extern long lx_munlockall(); +extern long lx_munmap(); +extern long lx_nanosleep(); +extern long lx_nice(); +extern long lx_oldgetrlimit(); +extern long lx_open(); +extern long lx_openat(); +extern long lx_pause(); +extern long lx_personality(); +extern long lx_pipe(); +extern long lx_pipe2(); +extern long lx_poll(); +extern long lx_ppoll(); +extern long lx_pread(); +extern long lx_pread32(); +extern long lx_preadv(); +extern long lx_preadv32(); +extern long lx_prctl(); +extern long lx_prlimit64(); +extern long lx_pselect(); +extern long lx_ptrace(); +extern long lx_pwrite(); +extern long lx_pwrite32(); +extern long lx_pwritev(); +extern long lx_pwritev32(); +extern long lx_read(); +extern long lx_readlink(); +extern long lx_readlinkat(); +extern long lx_readv(); +extern long lx_reboot(); +extern long lx_recv(); +extern long lx_recvmsg(); +extern long lx_recvmmsg(); +extern long lx_recvfrom(); +extern long lx_rename(); +extern long lx_renameat(); +extern long lx_sched_getaffinity(); +extern long lx_sched_getparam(); +extern long lx_sched_getscheduler(); +extern long lx_sched_getattr(); +extern long lx_sched_get_priority_max(); +extern long lx_sched_get_priority_min(); +extern long lx_sched_rr_get_interval(); +extern long lx_sched_setaffinity(); +extern long lx_sched_setattr(); +extern long lx_sched_setparam(); +extern long lx_sched_setscheduler(); +extern long lx_sched_yield(); +extern long lx_select(); +extern long lx_send(); +extern long lx_sendmsg(); +extern long lx_sendmmsg(); +extern long lx_sendto(); +extern long lx_set_robust_list(); +extern long lx_set_thread_area(); +extern long lx_set_tid_address(); +extern long lx_setdomainname(); +extern long lx_setfsuid(); +extern long lx_setfsuid16(); +extern long lx_setfsgid(); +extern long lx_setfsgid16(); +extern long lx_setgid(); +extern long lx_setgid16(); +extern long lx_sethostname(); +extern long lx_setpgid(); +extern long lx_setpriority(); +extern long lx_setregid(); +extern long lx_setregid16(); +extern long lx_setresgid(); +extern long lx_setresgid16(); +extern long lx_setresuid(); +extern long lx_setresuid16(); +extern long lx_setreuid(); +extern long lx_setreuid16(); +extern long lx_setrlimit(); +extern long lx_setsid(); +extern long lx_setuid(); +extern long lx_setuid16(); +extern long lx_setxattr(); +extern long lx_setsockopt(); +extern long lx_symlink(); +extern long lx_symlinkat(); +extern long lx_shutdown(); +extern long lx_socket(); +extern long lx_socketcall(); +extern long lx_socketpair(); +extern long lx_splice(); +extern long lx_stat32(); +extern long lx_stat64(); +extern long lx_stime(); +extern long lx_swapoff(); +extern long lx_swapon(); +extern long lx_sync(); +extern long lx_sync_file_range(); +extern long lx_syncfs(); +extern long lx_sysinfo32(); +extern long lx_sysinfo64(); +extern long lx_syslog(); +extern long lx_removexattr(); +extern long lx_tgkill(); +extern long lx_time(); +extern long lx_times(); +extern long lx_timer_create(); +extern long lx_tkill(); +extern long lx_umask(); +extern long lx_umount(); +extern long lx_umount2(); +extern long lx_uname(); +extern long lx_unlink(); +extern long lx_unlinkat(); +extern long lx_unshare(); +extern long lx_vhangup(); +extern long lx_wait4(); +extern long lx_waitid(); +extern long lx_waitpid(); +extern long lx_write(); +extern long lx_writev(); + +#if defined(_LP64) +/* + * Linux vsyscall addresses: + */ +#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000 +#define LX_VSYS_time 
(uintptr_t)0xffffffffff600400 +#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800 + +#define LX_VSYSCALL_ADDR (uintptr_t)0xffffffffff600000 +#define LX_VSYSCALL_SIZE (uintptr_t)0x1000 +#endif + +#endif /* _KERNEL */ + +/* + * System call numbers for revectoring: + */ + +#if defined(__amd64) +#define LX_SYS_close 3 +#define LX_SYS_gettimeofday 96 +#define LX_SYS_mount 165 +#define LX_SYS_time 201 +#define LX_SYS_io_setup 206 +#define LX_SYS_clock_gettime 228 +#define LX_SYS_getcpu 309 + +#define LX_SYS32_close 6 +#define LX_SYS32_gettimeofday 78 +#define LX_SYS32_time 13 +#define LX_SYS32_mount 21 +#define LX_SYS32_clock_gettime 265 +#define LX_SYS32_io_setup 245 +#define LX_SYS32_getcpu 318 +#elif defined(__i386) +#define LX_SYS_close 6 +#define LX_SYS_mount 21 +#define LX_SYS_gettimeofday 78 +#define LX_SYS_time 13 +#define LX_SYS_clock_gettime 265 +#define LX_SYS_io_setup 245 +#define LX_SYS_getcpu 318 +#else +#error "Architecture not supported" +#endif /* defined(__amd64) */ + +/* + * The current code in the VDSO operates under the expectation that it will be + * mapped at a fixed offset from the comm page. This simplifies the act of + * locating said page without any other reference. The VDSO must fit within + * this offset, matching the same value as COMM_PAGE_ALIGN. + * See: uts/i86pc/sys/comm_page.h + */ +#define LX_VDSO_SIZE 0x4000 +#define LX_VDSO_ADDR_MASK ~(LX_VDSO_SIZE - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SYSCALLS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h new file mode 100644 index 0000000000..90363c8939 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_types.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_LX_TYPES_H +#define _SYS_LX_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#define SHRT_MIN (-32768) /* min value of a "short int" */ +#define SHRT_MAX 32767 /* max value of a "short int" */ +#define USHRT_MAX 65535 /* max of "unsigned short int" */ +#define INT_MIN (-2147483647-1) /* min value of an "int" */ +#define INT_MAX 2147483647 /* max value of an "int" */ +#define UINT_MAX 4294967295U /* max value of an "unsigned int" */ + +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + +#if defined(_LP64) +#define LONG_MAX 9223372036854775807L +#define ULONG_MAX 18446744073709551615UL +#else +#define LONG_MAX 2147483647L /* max value of a 32-bit "long int" */ +#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */ +#endif + +#endif /* !_KERNEL */ + + +typedef uint64_t lx_dev_t; +typedef uint16_t lx_dev16_t; +typedef uint32_t lx_ino_t; +typedef uint64_t lx_ino64_t; +typedef uint32_t lx_uid_t; +typedef uint16_t lx_uid16_t; +typedef uint32_t lx_gid_t; +typedef uint16_t lx_gid16_t; +typedef uint32_t lx_off_t; +typedef uint64_t lx_off64_t; +typedef uint32_t lx_blksize_t; +typedef uint32_t lx_blkcnt_t; +typedef uint64_t lx_blkcnt64_t; +typedef uint32_t lx_mode_t; +typedef uint16_t lx_mode16_t; + +/* + * Linux mangles major/minor numbers into dev_t differently than SunOS. + */ +#ifdef _LP64 +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | \ + ((uint64_t)((min) & ~0xff) << 12) | ((uint64_t)((maj) & ~0xfff) << 32)) + +#define LX_GETMAJOR(lx_dev) ((((lx_dev) >> 8) & 0xfff) | \ + ((((uint64_t)(lx_dev)) >> 32) & ~0xfff)) + +#else +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | (((min) & ~0xff) << 12)) + +#define LX_GETMAJOR(lx_dev) (((lx_dev) >> 8) & 0xfff) +#endif + +#define LX_GETMINOR(lx_dev) (((lx_dev) & 0xff) | (((lx_dev) >> 12) & ~0xff)) +/* Linux supports 20 bits for the minor, and 12 bits for the major number */ +#define LX_MAXMIN 0xfffff +#define LX_MAXMAJ 0xfff + +/* + * Certain Linux tools care deeply about major/minor number mapping. + * Map virtual disks (zfs datasets, zvols, etc) into a safe reserved range. + */ +#define LX_MAJOR_DISK 203 + +/* LX ptm driver major/minor number */ +#define LX_PTM_MAJOR 5 +#define LX_PTM_MINOR 2 + +/* LX pts driver major number range */ +#define LX_PTS_MAJOR_MIN 136 +#define LX_PTS_MAJOR_MAX 143 + +/* LX tty/cons driver major number */ +#define LX_TTY_MAJOR 5 + +#define LX_UID16_TO_UID32(uid16) \ + (((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16)) + +#define LX_GID16_TO_GID32(gid16) \ + (((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16)) + +/* Overflow values default to NFS nobody. */ + +#define UID16_OVERFLOW ((lx_uid16_t)65534) +#define GID16_OVERFLOW ((lx_gid16_t)65534) + +/* + * All IDs with high word non-zero are converted to default overflow values to + * avoid inadvertent truncation to zero (root) (!). + */ +#define LX_UID32_TO_UID16(uid32) \ + ((((uid32) & 0xffff0000) == 0) ? ((lx_uid16_t)(uid32)) : \ + (((uid32) == ((lx_uid_t)-1)) ? ((lx_uid16_t)-1) : UID16_OVERFLOW)) + +#define LX_GID32_TO_GID16(gid32) \ + ((((gid32) & 0xffff0000) == 0) ? ((lx_gid16_t)(gid32)) : \ + (((gid32) == ((lx_gid_t)-1)) ? 
((lx_gid16_t)-1) : GID16_OVERFLOW))
+
+#define LX_32TO64(lo, hi) \
+ ((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32)))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_TYPES_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_userhz.h b/usr/src/uts/common/brand/lx/sys/lx_userhz.h new file mode 100644 index 0000000000..ebbda28698 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_userhz.h @@ -0,0 +1,64 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _LX_USERHZ_H
+#define _LX_USERHZ_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * Within the kernel, Linux implements an internal hz that it refers to as a
+ * "jiffy". Linux can be built with different hz, but on modern kernels
+ * it is frequently 250. However, Linux has a separate concept for the hz
+ * that is visible outside the kernel. This is called "USER_HZ" and is the
+ * value returned by 'sysconf(_SC_CLK_TCK)'. This is almost universally set to
+ * 100hz. Some (lazy) applications just hardcode 100hz instead of checking.
+ * To accommodate these broken applications, we always work with a USER_HZ of
+ * 100 and scale accordingly. See the Linux time(7) man page for a more
+ * detailed discussion of their behavior. See the comment in our
+ * uts/common/conf/param.c for a discussion of valid native hz values.
+ *
+ * There are a few interfaces which expose a clock_t to user-land and which
+ * need to be considered for USER_HZ adjustment.
+ * 1) The times(2) syscall. This is handled correctly.
+ * 2) The waitid(2) syscall passes a siginfo_t which contains si_stime and
+ * si_utime. Testing waitid(2) on various Linux distributions shows that
+ * these fields are garbage. This aligns with the Linux waitid(2) man page,
+ * which describes the subset of the siginfo_t structure that is populated;
+ * neither si_stime nor si_utime is listed.
+ * 3) A sigaction(2) handler can pass a siginfo_t. This is only documented to
+ * occur when SA_SIGINFO is set in sa_flags. The si_stime and si_utime are
+ * documented to only be populated when the signal is SIGCHLD. However,
+ * testing on Linux seems to show that these fields are not consistent
+ * with the corresponding times(2) data for the process, even for the
+ * SIGCHLD sigaction handler case.
+ * 4) Some fields in /proc/stat and /proc/pid/stat. See the Linux proc man
+ * page for references to sysconf(_SC_CLK_TCK).
+ *
+ * Although the siginfo_t si_stime and si_utime data for cases #2 and #3 is not
+ * consistent on Linux, we populate these fields correctly to be on the safe
+ * side.
+ */
+extern uint_t lx_hz_scale;
+#define LX_USERHZ 100
+#define HZ_TO_LX_USERHZ(x) ((x) / lx_hz_scale)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_USERHZ_H */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_access.c b/usr/src/uts/common/brand/lx/syscall/lx_access.c new file mode 100644 index 0000000000..8cf836cd7a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_access.c @@ -0,0 +1,223 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + * + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred_impl.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/file.h> +#include <fs/fs_subr.h> +#include <c2/audit.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +/* + * Determine accessibility of file. + */ + +#define E_OK 010 /* use effective ids */ +#define R_OK 004 +#define W_OK 002 +#define X_OK 001 + +/* + * Convert Linux LX_AT_* flags to SunOS AT_* flags but skip verifying allowed + * flags have been passed. This also allows EACCESS/REMOVEDIR to be translated + * correctly since on linux they have the same value. + * + * Some code can actually pass in other bits in the flag. We may have to simply + * ignore these, as indicated by the enforce parameter. + */ +int +ltos_at_flag(int lflag, int allow, boolean_t enforce) +{ + int sflag = 0; + + if ((lflag & LX_AT_EACCESS) && (allow & AT_EACCESS)) { + lflag &= ~LX_AT_EACCESS; + sflag |= AT_EACCESS; + } + + if ((lflag & LX_AT_REMOVEDIR) && (allow & AT_REMOVEDIR)) { + lflag &= ~LX_AT_REMOVEDIR; + sflag |= AT_REMOVEDIR; + } + + if ((lflag & LX_AT_SYMLINK_NOFOLLOW) && (allow & AT_SYMLINK_NOFOLLOW)) { + lflag &= ~LX_AT_SYMLINK_NOFOLLOW; + sflag |= AT_SYMLINK_NOFOLLOW; + } + + /* right now SunOS doesn't have a _FOLLOW flag, so use a fake one */ + if ((lflag & LX_AT_SYMLINK_FOLLOW) && (allow & LX_AT_SYMLINK_FOLLOW)) { + lflag &= ~LX_AT_SYMLINK_FOLLOW; + sflag |= LX_AT_SYMLINK_FOLLOW; + } + + /* If lflag is not zero than some flags did not hit the above code. */ + if (enforce && lflag) + return (-1); + + return (sflag); +} + +/* + * For illumos, access() does this: + * If the process has appropriate privileges, an implementation may indicate + * success for X_OK even if none of the execute file permission bits are set. + * + * But for Linux, access() does this: + * If the calling process is privileged (i.e., its real UID is zero), then + * an X_OK check is successful for a regular file if execute permission is + * enabled for any of the file owner, group, or other. + * + * Linux used to behave more like illumos on older kernels: + * In kernel 2.4 (and earlier) there is some strangeness in the handling + * of X_OK tests for superuser. If all categories of execute permission + * are disabled for a nondirectory file, then the only access() test that + * returns -1 is when mode is specified as just X_OK; if R_OK or W_OK is + * also specified in mode, then access() returns 0 for such files. + * + * So we need to handle the case where a privileged process is checking for + * X_OK but none of the execute bits are set on the file. 
We'll keep the old + * 2.4 behavior for 2.4 emulation but use the new behavior for any other + * kernel rev. + */ +static int +lx_common_access(char *fname, int fmode, vnode_t *startvp) +{ + vnode_t *vp; + cred_t *tmpcr; + int error; + int mode; + cred_t *cr; + int estale_retry = 0; + + if (fmode & ~(E_OK|R_OK|W_OK|X_OK)) + return (EINVAL); + + mode = ((fmode & (R_OK|W_OK|X_OK)) << 6); + + cr = CRED(); + + /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */ + if ((fmode & E_OK) != 0 || + (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid)) { + tmpcr = cr; + crhold(tmpcr); + } else { + tmpcr = crdup(cr); + tmpcr->cr_uid = cr->cr_ruid; + tmpcr->cr_gid = cr->cr_rgid; + tmpcr->cr_ruid = cr->cr_uid; + tmpcr->cr_rgid = cr->cr_gid; + } + +lookup: + if ((error = lookupnameatcred(fname, UIO_USERSPACE, FOLLOW, NULLVPP, + &vp, startvp, tmpcr)) != 0) { + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto lookup; + crfree(tmpcr); + return (error); + } + + if (mode != 0) { + error = VOP_ACCESS(vp, mode, 0, tmpcr, NULL); + if (error != 0) { + if ((error == ESTALE) && + fs_need_estale_retry(estale_retry++)) { + VN_RELE(vp); + goto lookup; + } + + } else if ((fmode & X_OK) != 0 && cr->cr_ruid == 0 && + lx_kern_release_cmp(curproc->p_zone, "2.4.0") > 0) { + /* check for incorrect execute success */ + vattr_t va; + + va.va_mask = AT_MODE; + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) == 0) { + mode_t m = VTTOIF(va.va_type) | va.va_mode; + + if ((m & S_IFMT) == S_IFREG && + !(m & (S_IXUSR | S_IXGRP | S_IXOTH))) { + /* no execute bits set in the mode */ + error = EACCES; + } + } + } + } + + crfree(tmpcr); + VN_RELE(vp); + return (error); +} + +int +lx_faccessat(int atfd, char *fname, int fmode, int flag) +{ + vnode_t *startvp; + int error; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + if ((flag = ltos_at_flag(flag, AT_EACCESS, B_FALSE)) < 0) + return (set_errno(EINVAL)); + + if (fname == NULL) + return (set_errno(EFAULT)); + if ((error = fgetstartvp(atfd, fname, &startvp)) != 0) + return (set_errno(error)); + if (AU_AUDITING() && startvp != NULL) + audit_setfsat_path(1); + + /* Do not allow E_OK unless AT_EACCESS flag is set */ + if ((flag & AT_EACCESS) == 0) + fmode &= ~E_OK; + + error = lx_common_access(fname, fmode, startvp); + if (startvp != NULL) + VN_RELE(startvp); + if (error) + return (set_errno(error)); + return (0); +} + +int +lx_access(char *fname, int fmode) +{ + return (lx_faccessat(LX_AT_FDCWD, fname, fmode, 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_aio.c b/usr/src/uts/common/brand/lx/syscall/lx_aio.c new file mode 100644 index 0000000000..c821e72538 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_aio.c @@ -0,0 +1,1345 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Linux aio syscall support. + * + * The Linux story around the io_* syscalls is very confusing. The io_* syscalls + * are not exposed via glibc and in fact, glibc seems to implement its own aio + * without using the io_* syscalls at all. 
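Because glibc does not export the io_* syscalls, consumers (including libaio) reach them through syscall(2). A minimal user-land sketch of that calling convention, using only the public Linux kernel ABI headers:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>      /* aio_context_t, struct iocb, struct io_event */

int
main(void)
{
        aio_context_t ctx = 0;  /* must start as 0 or io_setup fails */

        if (syscall(SYS_io_setup, 128, &ctx) != 0) {
                perror("io_setup");
                return (1);
        }
        printf("context handle: %#lx\n", (unsigned long)ctx);
        (void) syscall(SYS_io_destroy, ctx);
        return (0);
}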
However, there is the libaio library + * which uses the io_* syscalls, although its implementation of the io_* + * functions (with the same names!) is different from the syscalls themselves, + * and it uses different definitions for some of the structures involved. + * + * These syscalls are documented to use an aio_context_t for the context + * parameter. On Linux this is a ulong_t. The contexts live in the kernel + * address space and are looked up using the aio_context_t parameter. However, + * the Linux libaio library, which is a consumer of the io_* syscalls, abuses + * the context by assuming it can be used as a pointer into memory that is + * mapped into the process. To accomodate this abomination we map a page of + * anonymous memory and expose the context to user-land as a pointer offset + * into that page. The page itself is never used by our code and our internal + * context ID is simply an integer we calculate based on the page pointer + * offset. + * + * Most applications never use aio, so we don't want an implementation that + * adds overhead to every process, but on the other hand, when an application is + * using aio, it is for performance reasons and we want to be as efficient as + * possible. In particular, we don't want to dynamically allocate resources + * in the paths that enqueue I/O. Instead, we pre-allocate the resources + * we may need when the application performs the io_setup call and keep the + * io_submit and io_getevents calls streamlined. + * + * The general approach here is inspired by the native aio support provided by + * libc in user-land. We have worker threads that pick up pending work from + * the context "lxioctx_pending" list and synchronously issue the operation in + * the control block. When the operation completes, the thread places the + * control block into the context "lxioctx_done" list for later consumption by + * io_getevents. The thread will then attempt to service another pending + * operation or wait for more work to arrive. + * + * The control blocks on the pending or done lists are referenced by an + * lx_io_elem_t struct. This simply holds a pointer to the user-land control + * block and the result of the operation. These elements are pre-allocated at + * io_setup time and stored on the context "lxioctx_free" list. + * + * io_submit pulls elements off of the free list, places them on the pending + * list and kicks a worker thread to run. io_getevents pulls elements off of + * the done list, sets up an event to return, and places the elements back + * onto the free list. + * + * The worker threads are pre-allocated at io_setup time. These are LWP's + * that are part of the process, but never leave the kernel. The number of + * LWP's is allocated based on the nr_events argument to io_setup. Because + * this argument can theoretically be large (up to LX_AIO_MAX_NR), we want to + * pre-allocate enough threads to get good I/O concurrency, but not overdo it. + * For a small nr_events (<= lx_aio_base_workers) we pre-allocate as many + * threads as nr_events so that all of the the I/O can run in parallel. Once + * we exceed lx_aio_base_workers, we scale up the number of threads by 2, until + * we hit the maximum at lx_aio_max_workers. See the code in io_setup for more + * information. + * + * Because the worker threads never leave the kernel, they are marked with the + * TP_KTHREAD bit so that /proc operations essentially ignore them. 
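The worker-count policy sketched below mirrors the description above and the actual computation that appears later in lx_io_setup; the constants stand in for lx_aio_base_workers (16) and lx_aio_max_workers (32).

#include <stdio.h>

static unsigned int
aio_nworkers(unsigned int nr_events)
{
        const unsigned int base = 16, max = 32;
        unsigned int n;

        if (nr_events <= base)
                return (nr_events);             /* one worker per I/O slot */
        n = (nr_events / 2) + (base / 2);       /* taper off... */
        return (n > max ? max : n);             /* ...and cap at the limit */
}

int
main(void)
{
        unsigned int v[] = { 8, 16, 64, 256 };

        for (int i = 0; i < 4; i++)
                printf("nr_events %3u -> %2u workers\n", v[i],
                    aio_nworkers(v[i]));
        return (0);
}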
We also tag + * the brand lwp flags with the BR_AIO_LWP bit so that these threads never + * appear in the lx /proc. Aside from servicing aio submissions, the worker + * threads don't participate in most application-initiated operations. Forking + * is a special case for the workers. The Linux fork(2) and vfork(2) behavior + * always forks only a single thread; the caller. However, during cfork() the + * system attempts to quiesce all threads by calling holdlwps(). The workers + * check for SHOLDFORK and SHOLDFORK1 in their loops and suspend themselves ala + * holdlwp() if the process forks. + * + * It is hard to make any generalized statements about how the aio syscalls + * are used in production. MySQL is one of the more popular consumers of aio + * and in the default configuration it will create 10 contexts with a capacity + * of 256 I/Os (io_setup nr_events) and 1 context with a capacity of 100 I/Os. + * Another application we've seen will create 8 contexts, each with a capacity + * of 128 I/Os. In practice 1-7 was the typical number of in-flight I/Os. + * + * The default configuration for MySQL uses 4 read and 4 write threads. Each + * thread has an associated context. MySQL also allocates 3 additional contexts, + * so in the default configuration it will only use 11, but the number of + * read and write threads can be tuned up to a maximum of 64. We can expand + * a process's number of contexts up to a maximum of LX_IOCTX_CNT_MAX, which + * is significantly more than we've ever seen in use. + * + * According to www.kernel.org/doc/Documentation/sysctl/fs.txt, the + * /proc/sys/fs entries for aio are: + * - aio-nr: The total of all nr_events values specified on the io_setup + * call for every active context. + * - aio-max-nr: The upper limit for aio-nr + * aio-nr is tracked as a zone-wide value. We keep aio-max-nr limited to + * LX_AIO_MAX_NR, which matches Linux and provides plenty of headroom for the + * zone. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/time.h> +#include <sys/brand.h> +#include <sys/sysmacros.h> +#include <sys/sdt.h> +#include <sys/procfs.h> +#include <sys/eventfd.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + +/* These constants match Linux */ +#define LX_IOCB_FLAG_RESFD 0x0001 +#define LX_IOCB_CMD_PREAD 0 +#define LX_IOCB_CMD_PWRITE 1 +#define LX_IOCB_CMD_FSYNC 2 +#define LX_IOCB_CMD_FDSYNC 3 +#define LX_IOCB_CMD_PREADX 4 +#define LX_IOCB_CMD_POLL 5 +#define LX_IOCB_CMD_NOOP 6 +#define LX_IOCB_CMD_PREADV 7 +#define LX_IOCB_CMD_PWRITEV 8 + +#define LX_KIOCB_KEY 0 + +/* + * Base and max. number of contexts/process. Note that we currently map one + * page to manage the user-level context ID, so that code must be adjusted if + * LX_IOCTX_CNT_MAX is ever enlarged. Currently, this is the limit for the + * number of 64-bit pointers in one 4k page. + */ +#define LX_IOCTX_CNT_BASE 16 +#define LX_IOCTX_CNT_MAX 512 + +/* + * Max number of control block pointers, or lx_io_event_t's, to allocate on the + * stack in io_submit or io_getevents. 
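The aio-nr / aio-max-nr accounting described above is the same interface Linux documents under /proc/sys/fs; a trivial reader, assuming the lx procfs exposes those nodes inside the zone as Linux does.

#include <stdio.h>

static long
read_tunable(const char *path)
{
        FILE *f = fopen(path, "r");
        long v = -1;

        if (f != NULL) {
                if (fscanf(f, "%ld", &v) != 1)
                        v = -1;
                (void) fclose(f);
        }
        return (v);
}

int
main(void)
{
        printf("aio-nr     = %ld\n", read_tunable("/proc/sys/fs/aio-nr"));
        printf("aio-max-nr = %ld\n", read_tunable("/proc/sys/fs/aio-max-nr"));
        return (0);
}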
+ */ +#define MAX_ALLOC_ON_STACK 128 +#define alloca(x) __builtin_alloca(x) +extern void *__builtin_alloca(size_t); + +/* The context is an offset within the ctxpage we mapped */ +#define CTXID_TO_PTR(L, I) ((L)->l_io_ctxpage + ((I) * sizeof (uintptr_t))) +#define PTR_TO_CTXID(L, P) ((int)((uintptr_t)(P) - (L)->l_io_ctxpage) / \ + sizeof (uintptr_t)) + +typedef ulong_t lx_aio_context_t; + +uint_t lx_aio_base_workers = 16; /* num threads/context before scaling */ +uint_t lx_aio_max_workers = 32; /* upper limit on threads/context */ + +/* + * Internal representation of an aio context. + */ +typedef struct lx_io_ctx { + boolean_t lxioctx_shutdown; /* context is being destroyed */ + uint_t lxioctx_maxn; /* nr_events from io_setup */ + uint_t lxioctx_in_use; /* reference counter */ + kmutex_t lxioctx_f_lock; /* free list lock */ + uint_t lxioctx_free_cnt; /* num. elements in free list */ + list_t lxioctx_free; /* free list */ + kmutex_t lxioctx_p_lock; /* pending list lock */ + kcondvar_t lxioctx_pending_cv; /* pending list cv */ + list_t lxioctx_pending; /* pending list */ + kmutex_t lxioctx_d_lock; /* done list lock */ + kcondvar_t lxioctx_done_cv; /* done list cv */ + uint_t lxioctx_done_cnt; /* num. elements in done list */ + list_t lxioctx_done; /* done list */ +} lx_io_ctx_t; + +/* + * Linux binary definition of an I/O event. + */ +typedef struct lx_io_event { + uint64_t lxioe_data; /* data payload */ + uint64_t lxioe_object; /* object of origin */ + int64_t lxioe_res; /* result code */ + int64_t lxioe_res2; /* "secondary" result (WTF?) */ +} lx_io_event_t; + +/* + * Linux binary definition of an I/O control block. + */ +typedef struct lx_iocb { + uint64_t lxiocb_data; /* data payload */ + uint32_t lxiocb_key; /* must be LX_KIOCB_KEY (!) */ + uint32_t lxiocb_reserved1; + uint16_t lxiocb_op; /* operation */ + int16_t lxiocb_reqprio; /* request priority */ + uint32_t lxiocb_fd; /* file descriptor */ + uint64_t lxiocb_buf; /* data buffer */ + uint64_t lxiocb_nbytes; /* number of bytes */ + int64_t lxiocb_offset; /* offset in file */ + uint64_t lxiocb_reserved2; + uint32_t lxiocb_flags; /* LX_IOCB_FLAG_* flags */ + uint32_t lxiocb_resfd; /* eventfd fd, if any */ +} lx_iocb_t; + +typedef struct lx_io_elem { + list_node_t lxioelem_link; + uint16_t lxioelem_op; /* operation */ + uint16_t lxioelem_flags; /* bits from lxiocb_flags */ + int lxioelem_fd; /* file descriptor */ + file_t *lxioelem_fp; /* getf() file pointer */ + int lxioelem_resfd; /* RESFD file descriptor */ + file_t *lxioelem_resfp; /* RESFD getf() file pointer */ + void *lxioelem_buf; /* data buffer */ + uint64_t lxioelem_nbytes; /* number of bytes */ + int64_t lxioelem_offset; /* offset in file */ + uint64_t lxioelem_data; + ssize_t lxioelem_res; + void *lxioelem_cbp; /* ptr to iocb in userspace */ +} lx_io_elem_t; + +/* From lx_rw.c */ +extern ssize_t lx_pread_fp(file_t *, void *, size_t, off64_t); +extern ssize_t lx_pwrite_fp(file_t *, void *, size_t, off64_t); + +/* From common/syscall/rw.c */ +extern int fdsync(int, int); +/* From common/os/grow.c */ +extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t); + +/* + * Given an aio_context ID, return our internal context pointer with an + * additional ref. count, or NULL if cp not found. 
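The context-ID scheme above is plain pointer arithmetic against the mapped page. A user-land model of the round trip follows; the page address is a stand-in for the real smmap64() result, and the final line shows why one 4K page of 64-bit slots bounds LX_IOCTX_CNT_MAX at 512.

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGESIZE           4096
#define CTXID_TO_PTR(page, id)  ((page) + ((id) * sizeof (uintptr_t)))
#define PTR_TO_CTXID(page, p)   ((int)(((uintptr_t)(p) - (page)) / \
                                    sizeof (uintptr_t)))

int
main(void)
{
        uintptr_t ctxpage = 0x7f0000000000UL;   /* pretend mapped page (64-bit) */
        int id = 37;
        uintptr_t cid = CTXID_TO_PTR(ctxpage, id);

        printf("id %d -> cid %#lx -> id %d\n", id, (unsigned long)cid,
            PTR_TO_CTXID(ctxpage, cid));
        printf("max contexts per page = %d\n",
            (int)(DEMO_PAGESIZE / sizeof (uintptr_t)));  /* 4096 / 8 = 512 */
        return (0);
}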
+ */ +static lx_io_ctx_t * +lx_io_cp_hold(lx_aio_context_t cid) +{ + int id; + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_io_ctx_t *cp; + + mutex_enter(&lxpd->l_io_ctx_lock); + + if (lxpd->l_io_ctxs == NULL) { + ASSERT(lxpd->l_io_ctx_cnt == 0); + ASSERT(lxpd->l_io_ctxpage == NULL); + goto bad; + } + + id = PTR_TO_CTXID(lxpd, cid); + if (id < 0 || id >= lxpd->l_io_ctx_cnt) + goto bad; + + if ((cp = lxpd->l_io_ctxs[id]) == NULL) + goto bad; + + if (cp->lxioctx_shutdown) + goto bad; + + atomic_inc_32(&cp->lxioctx_in_use); + mutex_exit(&lxpd->l_io_ctx_lock); + return (cp); + +bad: + mutex_exit(&lxpd->l_io_ctx_lock); + return (NULL); +} + +/* + * Release a hold on the context and clean up the context if it was the last + * hold. + */ +static void +lx_io_cp_rele(lx_io_ctx_t *cp) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_zone_data_t *lxzd; + int i; + lx_io_elem_t *ep; + + mutex_enter(&lxpd->l_io_ctx_lock); + ASSERT(cp->lxioctx_in_use >= 1); + if (cp->lxioctx_in_use > 1) { + atomic_dec_32(&cp->lxioctx_in_use); + /* wake all threads waiting on context rele */ + cv_broadcast(&lxpd->l_io_destroy_cv); + mutex_exit(&lxpd->l_io_ctx_lock); + return; + } + + /* + * We hold the last ref. + */ + for (i = 0; i < lxpd->l_io_ctx_cnt; i++) { + if (lxpd->l_io_ctxs[i] == cp) { + lxpd->l_io_ctxs[i] = NULL; + break; + } + } + ASSERT(i < lxpd->l_io_ctx_cnt); + /* wake all threads waiting on context destruction */ + cv_broadcast(&lxpd->l_io_destroy_cv); + ASSERT(cp->lxioctx_shutdown == B_TRUE); + + mutex_exit(&lxpd->l_io_ctx_lock); + + /* can now decrement the zone's overall aio counter */ + lxzd = ztolxzd(curproc->p_zone); + mutex_enter(&lxzd->lxzd_lock); + VERIFY(cp->lxioctx_maxn <= lxzd->lxzd_aio_nr); + lxzd->lxzd_aio_nr -= cp->lxioctx_maxn; + mutex_exit(&lxzd->lxzd_lock); + + /* + * We have the only pointer to the context now. Free all + * elements from all three queues and the context itself. + */ + while ((ep = list_remove_head(&cp->lxioctx_free)) != NULL) { + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + /* + * During io_submit() we use getf() to get/validate the file pointer + * for the file descriptor in each control block. We do not releasef() + * the fd, but instead pass along the fd and file pointer to the worker + * threads. In order to manage this hand-off we use clear_active_fd() + * in the syscall path and then in our thread which takes over the file + * descriptor, we use a combination of set_active_fd() and releasef(). + * Because our thread that is taking ownership of the fd has not called + * getf(), we first call set_active_fd(-1) to reserve a slot in the + * active fd array for ourselves. + */ + set_active_fd(-1); + while ((ep = list_remove_head(&cp->lxioctx_pending)) != NULL) { + set_active_fd(ep->lxioelem_fd); + releasef(ep->lxioelem_fd); + + if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) { + set_active_fd(ep->lxioelem_resfd); + releasef(ep->lxioelem_resfd); + } + + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + while ((ep = list_remove_head(&cp->lxioctx_done)) != NULL) { + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + ASSERT(list_is_empty(&cp->lxioctx_free)); + list_destroy(&cp->lxioctx_free); + ASSERT(list_is_empty(&cp->lxioctx_pending)); + list_destroy(&cp->lxioctx_pending); + ASSERT(list_is_empty(&cp->lxioctx_done)); + list_destroy(&cp->lxioctx_done); + + kmem_free(cp, sizeof (lx_io_ctx_t)); +} + +/* + * Called by a worker thread to perform the operation specified in the control + * block. 
+ * + * Linux returns a negative errno in the event "lxioelem_res" field as the + * result of a failed operation. We do the same. + */ +static void +lx_io_do_op(lx_io_elem_t *ep) +{ + int err; + int64_t res = 0; + + set_active_fd(ep->lxioelem_fd); + + ttolwp(curthread)->lwp_errno = 0; + switch (ep->lxioelem_op) { + case LX_IOCB_CMD_FSYNC: + case LX_IOCB_CMD_FDSYNC: + /* + * Note that Linux always returns EINVAL for these two + * operations. This is apparently because nothing in Linux + * defines the 'aio_fsync' function. Thus, it is unlikely any + * application will actually submit these. + * + * This is basically fdsync(), but we already have the fp. + */ + err = VOP_FSYNC(ep->lxioelem_fp->f_vnode, + (ep->lxioelem_op == LX_IOCB_CMD_FSYNC) ? FSYNC : FDSYNC, + ep->lxioelem_fp->f_cred, NULL); + if (err != 0) { + (void) set_errno(err); + } + + break; + + case LX_IOCB_CMD_PREAD: + res = lx_pread_fp(ep->lxioelem_fp, ep->lxioelem_buf, + ep->lxioelem_nbytes, ep->lxioelem_offset); + break; + + case LX_IOCB_CMD_PWRITE: + res = lx_pwrite_fp(ep->lxioelem_fp, ep->lxioelem_buf, + ep->lxioelem_nbytes, ep->lxioelem_offset); + break; + + default: + /* We validated the op at io_submit syscall time */ + VERIFY(0); + break; + } + if (ttolwp(curthread)->lwp_errno != 0) + res = -lx_errno(ttolwp(curthread)->lwp_errno, EINVAL); + + ep->lxioelem_res = res; + + releasef(ep->lxioelem_fd); + ep->lxioelem_fd = 0; + ep->lxioelem_fp = NULL; +} + +/* + * The operation has either completed or been cancelled. Finalize the handling + * and move the operation onto the "done" queue. + */ +static void +lx_io_finish_op(lx_io_ctx_t *cp, lx_io_elem_t *ep, boolean_t do_event) +{ + boolean_t do_resfd; + int resfd = 0; + file_t *resfp = NULL; + + if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) { + do_resfd = B_TRUE; + resfd = ep->lxioelem_resfd; + resfp = ep->lxioelem_resfp; + } else { + do_resfd = B_FALSE; + } + + ep->lxioelem_flags = 0; + ep->lxioelem_resfd = 0; + ep->lxioelem_resfp = NULL; + + mutex_enter(&cp->lxioctx_d_lock); + list_insert_tail(&cp->lxioctx_done, ep); + cp->lxioctx_done_cnt++; + cv_signal(&cp->lxioctx_done_cv); + mutex_exit(&cp->lxioctx_d_lock); + + /* Update the eventfd if necessary */ + if (do_resfd) { + vnode_t *vp = resfp->f_vnode; + uint64_t val = 1; + + set_active_fd(resfd); + + if (do_event) { + /* + * Eventfd notifications from AIO are special in that + * they are not expected to block. This interface allows + * the eventfd value to reach (but not cross) the + * overflow value. + */ + (void) VOP_IOCTL(vp, EVENTFDIOC_POST, (intptr_t)&val, + FKIOCTL, resfp->f_cred, NULL, NULL); + } + + releasef(resfd); + } +} + +/* + * First check if this worker needs to quit due to shutdown or exit. Return + * true in this case. + * + * Then check if our process is forking. In this case it expects all LWPs to be + * stopped first. For the worker threads, a stop equivalent to holdlwp() is + * necessary before the fork can proceed. + * + * It is common to check p_flag outside of p_lock (see issig) and we want to + * avoid making p_lock any hotter since this is called in the worker main loops. 
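From the submitter's side, the RESFD completion path handled in lx_io_finish_op above looks like the standard Linux eventfd pattern: tie an eventfd to the iocb and read the counter to learn how many completions have been posted. A sketch with error handling trimmed; the file name is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int
main(void)
{
        aio_context_t ctx = 0;
        int efd = eventfd(0, 0);
        struct iocb cb, *cbs[1] = { &cb };
        char buf[512];
        uint64_t completions;

        (void) syscall(SYS_io_setup, 8, &ctx);

        memset(&cb, 0, sizeof (cb));
        cb.aio_lio_opcode = IOCB_CMD_PREAD;
        cb.aio_fildes = open("/etc/hosts", O_RDONLY);
        cb.aio_buf = (uint64_t)(uintptr_t)buf;
        cb.aio_nbytes = sizeof (buf);
        cb.aio_flags = IOCB_FLAG_RESFD;         /* LX_IOCB_FLAG_RESFD above */
        cb.aio_resfd = efd;

        (void) syscall(SYS_io_submit, ctx, 1, cbs);
        (void) read(efd, &completions, sizeof (completions));  /* blocks */
        printf("%llu completion(s) signalled\n",
            (unsigned long long)completions);
        (void) syscall(SYS_io_destroy, ctx);
        return (0);
}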
+ */ +static boolean_t +lx_io_worker_chk_status(lx_io_ctx_t *cp, boolean_t locked) +{ + if (cp->lxioctx_shutdown) + return (B_TRUE); + + if (curproc->p_flag & (SEXITLWPS | SKILLED)) { + cp->lxioctx_shutdown = B_TRUE; + return (B_TRUE); + } + + if (curproc->p_flag & (SHOLDFORK | SHOLDFORK1)) { + if (locked) + mutex_exit(&cp->lxioctx_p_lock); + + mutex_enter(&curproc->p_lock); + stop(PR_SUSPENDED, SUSPEND_NORMAL); + mutex_exit(&curproc->p_lock); + + if (locked) + mutex_enter(&cp->lxioctx_p_lock); + + if (cp->lxioctx_shutdown) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Worker thread - pull work off the pending queue, perform the operation and + * place the result on the done queue. Do this as long as work is pending, then + * wait for more. + */ +static void +lx_io_worker(void *a) +{ + lx_io_ctx_t *cp = (lx_io_ctx_t *)a; + lx_io_elem_t *ep; + + set_active_fd(-1); /* See comment in lx_io_cp_rele */ + + while (!cp->lxioctx_shutdown) { + mutex_enter(&cp->lxioctx_p_lock); + if (list_is_empty(&cp->lxioctx_pending)) { + /* + * This must be cv_wait_sig, as opposed to cv_wait, so + * that pokelwps works correctly on these threads. + * + * The worker threads have all of their signals held, + * so a cv_wait_sig return of 0 here only occurs while + * we're shutting down. + */ + if (cv_wait_sig(&cp->lxioctx_pending_cv, + &cp->lxioctx_p_lock) == 0) + cp->lxioctx_shutdown = B_TRUE; + } + + if (lx_io_worker_chk_status(cp, B_TRUE)) { + mutex_exit(&cp->lxioctx_p_lock); + break; + } + + ep = list_remove_head(&cp->lxioctx_pending); + mutex_exit(&cp->lxioctx_p_lock); + + while (ep != NULL) { + lx_io_do_op(ep); + + lx_io_finish_op(cp, ep, B_TRUE); + + if (lx_io_worker_chk_status(cp, B_FALSE)) + break; + + mutex_enter(&cp->lxioctx_p_lock); + ep = list_remove_head(&cp->lxioctx_pending); + mutex_exit(&cp->lxioctx_p_lock); + } + } + + lx_io_cp_rele(cp); + + ASSERT(curthread->t_lwp != NULL); + mutex_enter(&curproc->p_lock); + lwp_exit(); +} + +/* + * LTP passes -1 for nr_events but we're limited by LX_AIO_MAX_NR anyway. + */ +long +lx_io_setup(uint_t nr_events, void *ctxp) +{ + int i, slot; + proc_t *p = curproc; + lx_proc_data_t *lxpd = ptolxproc(p); + lx_zone_data_t *lxzd = ztolxzd(p->p_zone); + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + uintptr_t cid; + uint_t nworkers; + k_sigset_t hold_set; + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t cid32; + + if (copyin(ctxp, &cid32, sizeof (cid32)) != 0) + return (set_errno(EFAULT)); + cid = (uintptr_t)cid32; + } else +#endif + if (copyin(ctxp, &cid, sizeof (cid)) != 0) + return (set_errno(EFAULT)); + + /* The cid in user-land must be NULL to start */ + if (cid != NULL || nr_events > LX_AIO_MAX_NR) + return (set_errno(EINVAL)); + + mutex_enter(&lxzd->lxzd_lock); + if ((nr_events + lxzd->lxzd_aio_nr) > LX_AIO_MAX_NR) { + mutex_exit(&lxzd->lxzd_lock); + return (set_errno(EAGAIN)); + } + lxzd->lxzd_aio_nr += nr_events; + mutex_exit(&lxzd->lxzd_lock); + + /* Find a free slot */ + mutex_enter(&lxpd->l_io_ctx_lock); + if (lxpd->l_io_ctxs == NULL) { + /* + * First use of aio, allocate a context array and a page + * in our address space to use for context ID handling. 
+ */ + uintptr_t ctxpage; + + ASSERT(lxpd->l_io_ctx_cnt == 0); + ASSERT(lxpd->l_io_ctxpage == NULL); + + ttolwp(curthread)->lwp_errno = 0; + ctxpage = (uintptr_t)smmap64(0, PAGESIZE, PROT_READ, + MAP_SHARED | MAP_ANON, -1, 0); + if (ttolwp(curthread)->lwp_errno != 0) { + mutex_exit(&lxpd->l_io_ctx_lock); + return (set_errno(ENOMEM)); + } + + lxpd->l_io_ctxpage = ctxpage; + lxpd->l_io_ctx_cnt = LX_IOCTX_CNT_BASE; + lxpd->l_io_ctxs = kmem_zalloc(lxpd->l_io_ctx_cnt * + sizeof (lx_io_ctx_t *), KM_SLEEP); + slot = 0; + } else { + ASSERT(lxpd->l_io_ctx_cnt > 0); + for (slot = 0; slot < lxpd->l_io_ctx_cnt; slot++) { + if (lxpd->l_io_ctxs[slot] == NULL) + break; + } + + if (slot == lxpd->l_io_ctx_cnt) { + /* Double our context array up to the max. */ + const uint_t new_cnt = lxpd->l_io_ctx_cnt * 2; + const uint_t old_size = lxpd->l_io_ctx_cnt * + sizeof (lx_io_ctx_t *); + const uint_t new_size = new_cnt * + sizeof (lx_io_ctx_t *); + struct lx_io_ctx **old_array = lxpd->l_io_ctxs; + + if (new_cnt > LX_IOCTX_CNT_MAX) { + mutex_exit(&lxpd->l_io_ctx_lock); + mutex_enter(&lxzd->lxzd_lock); + lxzd->lxzd_aio_nr -= nr_events; + mutex_exit(&lxzd->lxzd_lock); + return (set_errno(ENOMEM)); + } + + /* See big theory comment explaining context ID. */ + VERIFY(PAGESIZE >= new_size); + lxpd->l_io_ctxs = kmem_zalloc(new_size, KM_SLEEP); + + bcopy(old_array, lxpd->l_io_ctxs, old_size); + kmem_free(old_array, old_size); + lxpd->l_io_ctx_cnt = new_cnt; + + /* note: 'slot' is now valid in the new array */ + } + } + + cp = kmem_zalloc(sizeof (lx_io_ctx_t), KM_SLEEP); + list_create(&cp->lxioctx_free, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + list_create(&cp->lxioctx_pending, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + list_create(&cp->lxioctx_done, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + mutex_init(&cp->lxioctx_f_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&cp->lxioctx_p_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&cp->lxioctx_d_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&cp->lxioctx_pending_cv, NULL, CV_DEFAULT, NULL); + cv_init(&cp->lxioctx_done_cv, NULL, CV_DEFAULT, NULL); + + /* Add a hold on this context until we're done setting up */ + cp->lxioctx_in_use = 1; + lxpd->l_io_ctxs[slot] = cp; + + cid = CTXID_TO_PTR(lxpd, slot); + + mutex_exit(&lxpd->l_io_ctx_lock); + + /* + * Finish setting up the context. + * + * The context is in the l_io_ctxs array now, so it is potentially + * visible to other threads. However, we have a hold so it cannot be + * destroyed, and both lxioctx_free_cnt and lxioctx_maxn are still 0, + * so nothing can be submitted to this context yet either. + */ + + /* Setup the free list of internal control block elements */ + for (i = 0; i < nr_events; i++) { + ep = kmem_zalloc(sizeof (lx_io_elem_t), KM_SLEEP); + list_insert_head(&cp->lxioctx_free, ep); + } + + /* + * Pre-allocate the worker threads at setup time. + * + * Based on how much concurrent input we may be given, we want enough + * worker threads to get good parallelism but we also want to taper off + * and cap at our upper limit. Our zone's ZFS I/O limit may also come + * into play when we're pumping lots of I/O in parallel. + * + * Note: a possible enhancement here would be to also limit the number + * of worker threads based on the zone's cpu-cap. That is, if the + * cap is low, we might not want too many worker threads. 
+ */ + if (nr_events <= lx_aio_base_workers) { + nworkers = nr_events; + } else { + /* scale up until hit max */ + nworkers = (nr_events / 2) + (lx_aio_base_workers / 2); + if (nworkers > lx_aio_max_workers) + nworkers = lx_aio_max_workers; + } + + sigfillset(&hold_set); + for (i = 0; i < nworkers; i++) { + klwp_t *l; + kthread_t *t; + + /* + * Note that this lwp will not "stop at sys_rtt" as described + * on lwp_create. This lwp will run entirely in the kernel as + * a worker thread serving aio requests. + */ + l = lwp_create(lx_io_worker, (void *)cp, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (l == NULL) { + if (i == 0) { + /* + * Uh-oh - we can't create a single worker. + * Release our hold which will cleanup. + */ + cp->lxioctx_shutdown = B_TRUE; + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_maxn = nr_events; + mutex_exit(&lxpd->l_io_ctx_lock); + lx_io_cp_rele(cp); + return (set_errno(ENOMEM)); + } else { + /* + * No new lwp but we already have at least 1 + * worker so don't fail entire syscall. + */ + break; + } + } + + atomic_inc_32(&cp->lxioctx_in_use); + + /* + * Mark it as an in-kernel thread, an lx AIO worker LWP, and + * set it running. + */ + t = lwptot(l); + mutex_enter(&curproc->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwptolxlwp(l)->br_lwp_flags |= BR_AIO_LWP; + lwp_create_done(t); + mutex_exit(&curproc->p_lock); + } + + /* + * io_submit can occur once lxioctx_free_cnt and lxioctx_maxn are + * non-zero. + */ + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_maxn = cp->lxioctx_free_cnt = nr_events; + mutex_exit(&lxpd->l_io_ctx_lock); + /* Release our hold, worker thread refs keep ctx alive. */ + lx_io_cp_rele(cp); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t cid32 = (uintptr32_t)cid; + + if (copyout(&cid32, ctxp, sizeof (cid32)) != 0) { + /* Since we did a copyin above, this shouldn't fail */ + (void) lx_io_destroy(cid); + return (set_errno(EFAULT)); + } + } else +#endif + if (copyout(&cid, ctxp, sizeof (cid)) != 0) { + /* Since we did a copyin above, this shouldn't fail */ + (void) lx_io_destroy(cid); + return (set_errno(EFAULT)); + } + + return (0); +} + +long +lx_io_submit(lx_aio_context_t cid, const long nr, uintptr_t **bpp) +{ + uint_t i = 0; + int err = 0; + const size_t sz = nr * sizeof (uintptr_t); + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + lx_iocb_t **iocbpp; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + if (nr == 0) { + lx_io_cp_rele(cp); + return (0); + } + + if (nr < 0 || nr > cp->lxioctx_maxn) { + lx_io_cp_rele(cp); + return (set_errno(EINVAL)); + } + + if (nr > MAX_ALLOC_ON_STACK) { + iocbpp = (lx_iocb_t **)kmem_alloc(sz, KM_NOSLEEP); + if (iocbpp == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EAGAIN)); + } + } else { + iocbpp = (lx_iocb_t **)alloca(sz); + } + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t *iocbpp32; + + if (copyin(bpp, iocbpp, nr * sizeof (uintptr32_t)) != 0) { + lx_io_cp_rele(cp); + err = EFAULT; + goto out; + } + + /* + * Zero-extend the 32-bit pointers to proper size. This is + * performed "in reverse" so it can be done in-place, rather + * than with an additional translation copy. 
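The in-place zero-extension trick used in the 32-bit path below can be modeled in user-land: widen an array of 32-bit values inside the same buffer, walking backwards so no source element is overwritten before it has been read.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* Buffer sized for the wide form, initially holding narrow values. */
        uintptr_t slots[4];
        uint32_t *narrow = (uint32_t *)slots;
        unsigned int i, n = 4;

        for (i = 0; i < n; i++)
                narrow[i] = 0x1000 + i;

        i = n;
        do {
                i--;
                slots[i] = (uintptr_t)narrow[i];        /* widen in place */
        } while (i != 0);

        for (i = 0; i < n; i++)
                printf("slots[%u] = %#lx\n", i, (unsigned long)slots[i]);
        return (0);
}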
+ */ + iocbpp32 = (uintptr32_t *)iocbpp; + i = nr; + do { + i--; + iocbpp[i] = (lx_iocb_t *)(uintptr_t)iocbpp32[i]; + } while (i != 0); + } else +#endif + if (copyin(bpp, iocbpp, nr * sizeof (uintptr_t)) != 0) { + lx_io_cp_rele(cp); + err = EFAULT; + goto out; + } + + /* We need to return an error if not able to process any of them */ + mutex_enter(&cp->lxioctx_f_lock); + if (cp->lxioctx_free_cnt == 0) { + mutex_exit(&cp->lxioctx_f_lock); + lx_io_cp_rele(cp); + err = EAGAIN; + goto out; + } + mutex_exit(&cp->lxioctx_f_lock); + + for (i = 0; i < nr; i++) { + lx_iocb_t cb; + file_t *fp, *resfp = NULL; + + if (cp->lxioctx_shutdown) + break; + + if (copyin(iocbpp[i], &cb, sizeof (lx_iocb_t)) != 0) { + err = EFAULT; + break; + } + + /* There is only one valid flag */ + if (cb.lxiocb_flags & ~LX_IOCB_FLAG_RESFD) { + err = EINVAL; + break; + } + + switch (cb.lxiocb_op) { + case LX_IOCB_CMD_FSYNC: + case LX_IOCB_CMD_FDSYNC: + case LX_IOCB_CMD_PREAD: + case LX_IOCB_CMD_PWRITE: + break; + + /* + * We don't support asynchronous preadv and pwritev (an + * asynchronous scatter/gather being a somewhat odd + * notion to begin with); we return EINVAL for that + * case, which the caller should be able to deal with. + * We also return EINVAL for LX_IOCB_CMD_NOOP or any + * unrecognized opcode. + */ + default: + err = EINVAL; + break; + } + if (err != 0) + break; + + /* Validate fd */ + if ((fp = getf(cb.lxiocb_fd)) == NULL) { + err = EBADF; + break; + } + + if (cb.lxiocb_op == LX_IOCB_CMD_PREAD && + (fp->f_flag & FREAD) == 0) { + err = EBADF; + releasef(cb.lxiocb_fd); + break; + } else if (cb.lxiocb_op == LX_IOCB_CMD_PWRITE && + (fp->f_flag & FWRITE) == 0) { + err = EBADF; + releasef(cb.lxiocb_fd); + break; + } + + /* + * A character device is a bit complicated. Linux seems to + * accept these on some devices (e.g. /dev/zero) but not + * others (e.g. /proc/self/fd/0). This might be related to + * the device being seek-able, but a simple seek-set to the + * current offset will succeed for us on a pty. For now we + * handle this by rejecting the device if it is a stream. + * + * If it is a pipe (VFIFO) or directory (VDIR), we error here + * as does Linux. If it is a socket (VSOCK), it's ok here but + * we will post ESPIPE when processing the I/O CB, as does + * Linux. We also error on our other types: VDOOR, VPROC, + * VPORT, VBAD. + */ + if (fp->f_vnode->v_type == VCHR) { + if (fp->f_vnode->v_stream != NULL) { + err = EINVAL; + releasef(cb.lxiocb_fd); + break; + } + } else if (fp->f_vnode->v_type != VREG && + fp->f_vnode->v_type != VBLK && + fp->f_vnode->v_type != VSOCK) { + err = EINVAL; + releasef(cb.lxiocb_fd); + break; + } + + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + if ((resfp = getf(cb.lxiocb_resfd)) == NULL || + !lx_is_eventfd(resfp)) { + err = EINVAL; + releasef(cb.lxiocb_fd); + if (resfp != NULL) + releasef(cb.lxiocb_resfd); + break; + } + } + + mutex_enter(&cp->lxioctx_f_lock); + if (cp->lxioctx_free_cnt == 0) { + mutex_exit(&cp->lxioctx_f_lock); + releasef(cb.lxiocb_fd); + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + releasef(cb.lxiocb_resfd); + } + if (i == 0) { + /* + * Another thread used all of the free entries + * after the check preceding this loop. Since + * we did nothing, we must return an error. 
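Because io_submit() reports how many control blocks were accepted, and only returns an error when nothing was queued, callers normally loop over the tail. A sketch of that calling convention from user-land (a helper, not part of this change):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/*
 * Submit all 'nr' iocbs, retrying after a partial acceptance. Returns the
 * count queued, or -1 if the very first batch failed. On EAGAIN the caller
 * should reap completions with io_getevents() before submitting the rest.
 */
static long
submit_all(aio_context_t ctx, long nr, struct iocb **cbs)
{
        long done = 0;

        while (done < nr) {
                long r = syscall(SYS_io_submit, ctx, nr - done, cbs + done);

                if (r < 0)
                        return (done > 0 ? done : -1);
                done += r;
        }
        return (done);
}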
+ */ + err = EAGAIN; + } + break; + } + ep = list_remove_head(&cp->lxioctx_free); + cp->lxioctx_free_cnt--; + ASSERT(ep != NULL); + mutex_exit(&cp->lxioctx_f_lock); + + ep->lxioelem_op = cb.lxiocb_op; + ep->lxioelem_fd = cb.lxiocb_fd; + ep->lxioelem_fp = fp; + ep->lxioelem_buf = (void *)(uintptr_t)cb.lxiocb_buf; + ep->lxioelem_nbytes = cb.lxiocb_nbytes; + ep->lxioelem_offset = cb.lxiocb_offset; + ep->lxioelem_data = cb.lxiocb_data; + ep->lxioelem_cbp = iocbpp[i]; + + /* Hang on to the fp but setup to hand it off to a worker */ + clear_active_fd(cb.lxiocb_fd); + + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + ep->lxioelem_flags = LX_IOCB_FLAG_RESFD; + ep->lxioelem_resfd = cb.lxiocb_resfd; + ep->lxioelem_resfp = resfp; + clear_active_fd(cb.lxiocb_resfd); + } + + mutex_enter(&cp->lxioctx_p_lock); + list_insert_tail(&cp->lxioctx_pending, ep); + cv_signal(&cp->lxioctx_pending_cv); + mutex_exit(&cp->lxioctx_p_lock); + } + + lx_io_cp_rele(cp); + +out: + if (nr > MAX_ALLOC_ON_STACK) { + kmem_free(iocbpp, sz); + } + if (i == 0 && err != 0) + return (set_errno(err)); + + return (i); +} + +long +lx_io_getevents(lx_aio_context_t cid, long min_nr, const long nr, + lx_io_event_t *events, timespec_t *timeoutp) +{ + int i; + lx_io_ctx_t *cp; + const size_t sz = nr * sizeof (lx_io_event_t); + timespec_t timeout, *tp; + lx_io_event_t *out; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + if (min_nr < 0 || min_nr > cp->lxioctx_maxn || + nr < 0 || nr > cp->lxioctx_maxn) { + lx_io_cp_rele(cp); + return (set_errno(EINVAL)); + } + + if (nr == 0) { + lx_io_cp_rele(cp); + return (0); + } + + if (events == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EFAULT)); + } + + if (timeoutp == NULL) { + tp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &timeout, sizeof (timestruc_t))) { + lx_io_cp_rele(cp); + return (EFAULT); + } + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(timeoutp, &timeout32, + sizeof (timestruc32_t))) { + lx_io_cp_rele(cp); + return (EFAULT); + } + timeout.tv_sec = (time_t)timeout32.tv_sec; + timeout.tv_nsec = timeout32.tv_nsec; + } +#endif + + if (itimerspecfix(&timeout)) { + lx_io_cp_rele(cp); + return (EINVAL); + } + + tp = &timeout; + if (timeout.tv_sec == 0 && timeout.tv_nsec == 0) { + /* + * A timeout of 0:0 is like a poll; we return however + * many events are ready, irrespective of the passed + * min_nr. + */ + min_nr = 0; + } else { + timestruc_t now; + + /* + * We're given a relative time; add it to the current + * time to derive an absolute time. + */ + gethrestime(&now); + timespecadd(tp, &now); + } + } + + out = kmem_zalloc(sz, KM_SLEEP); + + /* + * A min_nr of 0 is like a poll even if given a NULL timeout; we return + * however many events are ready. + */ + if (min_nr > 0) { + mutex_enter(&cp->lxioctx_d_lock); + while (!cp->lxioctx_shutdown && cp->lxioctx_done_cnt < min_nr) { + int r; + + r = cv_waituntil_sig(&cp->lxioctx_done_cv, + &cp->lxioctx_d_lock, tp, timechanged); + if (r < 0) { + /* timeout */ + mutex_exit(&cp->lxioctx_d_lock); + lx_io_cp_rele(cp); + kmem_free(out, sz); + return (0); + } else if (r == 0) { + /* interrupted */ + mutex_exit(&cp->lxioctx_d_lock); + lx_io_cp_rele(cp); + kmem_free(out, sz); + return (set_errno(EINTR)); + } + + /* + * Signalled that something was queued up. Check if + * there are now enough or if we have to wait for more. 
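The 0:0 timeout "poll" behavior described above looks like this from the caller's side: ask for whatever is already done without blocking, regardless of min_nr. A small user-land helper as a sketch:

#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* Drain completed events without blocking; returns the number reaped. */
static long
reap_ready(aio_context_t ctx, struct io_event *evs, long max)
{
        struct timespec zero = { 0, 0 };        /* zero timeout: poll */

        return (syscall(SYS_io_getevents, ctx, 0, max, evs, &zero));
}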
+ */ + } + ASSERT(cp->lxioctx_done_cnt >= min_nr || cp->lxioctx_shutdown); + mutex_exit(&cp->lxioctx_d_lock); + } + + /* + * For each done control block, move it into the Linux event we return. + * As we're doing this, we also moving it from the done list to the + * free list. + */ + for (i = 0; i < nr && !cp->lxioctx_shutdown; i++) { + lx_io_event_t *lxe; + lx_io_elem_t *ep; + + lxe = &out[i]; + + mutex_enter(&cp->lxioctx_d_lock); + if (cp->lxioctx_done_cnt == 0) { + mutex_exit(&cp->lxioctx_d_lock); + break; + } + + ep = list_remove_head(&cp->lxioctx_done); + cp->lxioctx_done_cnt--; + mutex_exit(&cp->lxioctx_d_lock); + + lxe->lxioe_data = ep->lxioelem_data; + lxe->lxioe_object = (uint64_t)(uintptr_t)ep->lxioelem_cbp; + lxe->lxioe_res = ep->lxioelem_res; + lxe->lxioe_res2 = 0; + + /* Put it back on the free list */ + ep->lxioelem_cbp = NULL; + ep->lxioelem_data = 0; + ep->lxioelem_res = 0; + mutex_enter(&cp->lxioctx_f_lock); + list_insert_head(&cp->lxioctx_free, ep); + cp->lxioctx_free_cnt++; + mutex_exit(&cp->lxioctx_f_lock); + } + + lx_io_cp_rele(cp); + + /* + * Note: Linux seems to push the events back into the queue if the + * copyout fails. Since this error is due to an application bug, it + * seems unlikely we need to worry about it, but we can revisit this + * if it is ever seen to be an issue. + */ + if (i > 0 && copyout(out, events, i * sizeof (lx_io_event_t)) != 0) { + kmem_free(out, sz); + return (set_errno(EFAULT)); + } + + kmem_free(out, sz); + return (i); +} + +/* + * Linux never returns 0 from io_cancel. A successful cancellation will return + * EINPROGRESS and the result for the cancelled operation will be available via + * a normal io_getevents call. The third parameter (the "result") to this + * syscall is unused. Note that currently the Linux man pages are incorrect + * about this behavior. Also note that in Linux, only the USB driver currently + * support aio cancellation, so callers will almost always get EINVAL when they + * attempt to cancel an IO on Linux. + */ +/*ARGSUSED*/ +long +lx_io_cancel(lx_aio_context_t cid, lx_iocb_t *iocbp, lx_io_event_t *result) +{ + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + uint32_t buf; + + /* + * The Linux io_cancel copies in a field from the iocb in order to + * locate the matching kernel-internal structure. To appease the LTP + * test case which exercises this, a similar copy is performed here. 
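From an application's point of view, the io_cancel() contract described above means success arrives as EINPROGRESS rather than 0, and the cancelled operation's result is still collected through io_getevents(). A hedged user-land sketch of handling those outcomes:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static void
try_cancel(aio_context_t ctx, struct iocb *cb)
{
        struct io_event ev;

        if (syscall(SYS_io_cancel, ctx, cb, &ev) == 0) {
                printf("cancelled synchronously\n");    /* not expected here */
        } else if (errno == EINPROGRESS) {
                printf("cancel accepted; reap the result via io_getevents\n");
        } else if (errno == EAGAIN) {
                printf("too late: operation already running or completed\n");
        } else {
                printf("cancel not supported: errno %d\n", errno); /* EINVAL */
        }
}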
+ */ + if (copyin(iocbp, &buf, sizeof (buf)) != 0) { + return (set_errno(EFAULT)); + } + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + /* Try to pull the CB off the pending list */ + mutex_enter(&cp->lxioctx_p_lock); + ep = list_head(&cp->lxioctx_pending); + while (ep != NULL) { + if (ep->lxioelem_cbp == iocbp) { + list_remove(&cp->lxioctx_pending, ep); + break; + } + ep = list_next(&cp->lxioctx_pending, ep); + } + mutex_exit(&cp->lxioctx_p_lock); + + if (ep == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EAGAIN)); + } + + set_active_fd(-1); /* See comment in lx_io_cp_rele */ + set_active_fd(ep->lxioelem_fd); + releasef(ep->lxioelem_fd); + ep->lxioelem_fd = 0; + ep->lxioelem_fp = NULL; + ep->lxioelem_res = -lx_errno(EINTR, EINTR); + + lx_io_finish_op(cp, ep, B_FALSE); + lx_io_cp_rele(cp); + + return (set_errno(EINPROGRESS)); +} + +long +lx_io_destroy(lx_aio_context_t cid) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_io_ctx_t *cp; + int cnt = 0; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_shutdown = B_TRUE; + + /* + * Wait for the worker threads and any blocked io_getevents threads to + * exit. We have a hold and our rele will cleanup after all other holds + * are released. + */ + ASSERT(cp->lxioctx_in_use >= 1); + while (cp->lxioctx_in_use > 1) { + DTRACE_PROBE2(lx__io__destroy, lx_io_ctx_t *, cp, int, cnt); + cv_broadcast(&cp->lxioctx_pending_cv); + cv_broadcast(&cp->lxioctx_done_cv); + + /* + * Each worker has a hold. We want to let those threads finish + * up and exit. + */ + cv_wait(&lxpd->l_io_destroy_cv, &lxpd->l_io_ctx_lock); + cnt++; + } + + mutex_exit(&lxpd->l_io_ctx_lock); + lx_io_cp_rele(cp); + return (0); +} + +/* + * Called at proc fork to clear contexts from child. We don't bother to unmap + * l_io_ctxpage since the vast majority of processes will immediately exec and + * cause an unmapping. If the child does not exec, there will simply be a + * single shared page in its address space, so no additional anonymous memory + * is consumed. + */ +void +lx_io_clear(lx_proc_data_t *cpd) +{ + cpd->l_io_ctxs = NULL; + cpd->l_io_ctx_cnt = 0; + cpd->l_io_ctxpage = NULL; +} + +/* + * Called via lx_proc_exit to cleanup any existing io context array. All + * worker threads should have already exited by this point, so all contexts + * should already be deleted. + */ +void +lx_io_cleanup(proc_t *p) +{ + lx_proc_data_t *lxpd; + int i; + + mutex_enter(&p->p_lock); + VERIFY((lxpd = ptolxproc(p)) != NULL); + mutex_exit(&p->p_lock); + + mutex_enter(&lxpd->l_io_ctx_lock); + if (lxpd->l_io_ctxs == NULL) { + ASSERT(lxpd->l_io_ctx_cnt == 0); + mutex_exit(&lxpd->l_io_ctx_lock); + return; + } + + ASSERT(lxpd->l_io_ctx_cnt > 0); + for (i = 0; i < lxpd->l_io_ctx_cnt; i++) { + ASSERT(lxpd->l_io_ctxs[i] == NULL); + } + + kmem_free(lxpd->l_io_ctxs, lxpd->l_io_ctx_cnt * sizeof (lx_io_ctx_t *)); + lxpd->l_io_ctxs = NULL; + lxpd->l_io_ctx_cnt = 0; + mutex_exit(&lxpd->l_io_ctx_lock); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c new file mode 100644 index 0000000000..d46e442759 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/errno.h> + +/* From usr/src/uts/common/os/grow.c */ +extern intptr_t brk(caddr_t); + +long +lx_brk(caddr_t nva) +{ + if (nva != 0) { + (void) brk(nva); + + /* + * Despite claims to the contrary in the man page, when Linux + * brk(2) fails, errno is left unchanged. + */ + ttolwp(curthread)->lwp_errno = 0; + } + + /* + * When ASLR was integrated, our internal brk(2) was updated to emit + * the current brk when arg0 == 0. Using the function yields an + * equivalent result to manually calculating the brk, but also + * serializes with changes to the process AS. + */ + return ((long)brk((caddr_t)0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c new file mode 100644 index 0000000000..7783b97cb0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/thread.h> +#include <sys/klwp.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +long +lx_vn_chmod(vnode_t *vp, int mode) +{ + vattr_t vattr; + + vattr.va_mode = mode & MODEMASK; + vattr.va_mask = AT_MODE; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +static long +lx_fchmodat_wrapper(int fd, char *path, int mode) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, 0)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchmodat(int fd, char *path, int mode) +{ + return (lx_fchmodat_wrapper(fd, path, mode)); +} + +long +lx_fchmod(int fd, int mode) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchmod cannot leverage + * lx_fchmodat with a NULL path since the desired behavior differs. 
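The O_PATH distinction handled above is visible to applications as well: fchmod() on an O_PATH descriptor fails with EBADF, while a path-based chmod of the same file succeeds. A short probe (the temporary path is arbitrary):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
        const char *path = "/tmp/lx_opath_demo";
        int pfd;

        (void) close(open(path, O_CREAT | O_WRONLY, 0600));
        pfd = open(path, O_PATH);

        if (fchmod(pfd, 0644) != 0)
                printf("fchmod(O_PATH fd): %s\n", strerror(errno)); /* EBADF */
        if (chmod(path, 0644) == 0)
                printf("chmod(path): ok\n");

        (void) close(pfd);
        (void) unlink(path);
        return (0);
}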
+ */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_chmod(char *path, int mode) +{ + return (lx_fchmodat_wrapper(LX_AT_FDCWD, path, mode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c new file mode 100644 index 0000000000..830fba0a73 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c @@ -0,0 +1,180 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> + +long +lx_vn_chown(vnode_t *vp, uid_t uid, gid_t gid) +{ + vattr_t vattr; + zone_t *zone = crgetzone(CRED()); + + if ((uid != (uid_t)-1 && !VALID_UID(uid, zone)) || + (gid != (gid_t)-1 && !VALID_GID(gid, zone))) { + return (EINVAL); + } + vattr.va_uid = uid; + vattr.va_gid = gid; + vattr.va_mask = 0; + if (vattr.va_uid != -1) + vattr.va_mask |= AT_UID; + if (vattr.va_gid != -1) + vattr.va_mask |= AT_GID; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +long +lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, native_flag)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchown_wrapper(int fd, uid_t uid, gid_t gid) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchown cannot leverage + * lx_fchownat with a NULL path since the desired behavior differs. + */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag) +{ + int native_flag = 0; + + if (flag & LX_AT_EMPTY_PATH) { + char c; + + /* + * According to fchownat(2), when AT_EMPTY_PATH is set: "if + * path is an empty string, operate on the file referred to by + * fd". We pass NULL in place of the empty string, which + * causes fchownat() to operate on the fd we passed without an + * additional lookup. 
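The AT_EMPTY_PATH convention handled above, seen from user-land: an empty path string plus the flag makes fchownat() act on the descriptor itself. A minimal sketch, chowning the file to the caller's own IDs so no privilege is required:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tmp/lx_chown_demo", O_CREAT | O_RDWR, 0600);

        if (fd < 0)
                return (1);
        /* Equivalent to fchown(fd, uid, gid); no path lookup is performed. */
        if (fchownat(fd, "", getuid(), getgid(), AT_EMPTY_PATH) != 0)
                perror("fchownat");
        (void) close(fd);
        (void) unlink("/tmp/lx_chown_demo");
        return (0);
}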
+ */ + if (copyin(path, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + path = NULL; + } + + flag &= ~LX_AT_EMPTY_PATH; + } + if (flag & LX_AT_SYMLINK_NOFOLLOW) { + flag &= ~LX_AT_SYMLINK_NOFOLLOW; + native_flag |= AT_SYMLINK_NOFOLLOW; + } + if (flag != 0) { + return (set_errno(EINVAL)); + } + + return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag)); +} + +long +lx_fchown(int fd, uid_t uid, gid_t gid) +{ + return (lx_fchown_wrapper(fd, uid, gid)); +} + +long +lx_lchown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, + AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0)); +} + +long +lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchown_wrapper(fd, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid))); +} + +long +lx_lchown16(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c new file mode 100644 index 0000000000..4e00e90b1a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -0,0 +1,513 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * [This comment omits the 'LX_' prefix on the clone flag names.] + * + * The vast majority of clone calls result in the creation of a new process or + * a new thread. Both of these map easily from Linux to our native code. For + * these calls, the user-level brand library uses a brand call to hook into the + * lx_helper_clone function for the required in-kernel support. + * + * A fork will typically provide these clone flags: + * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID + * + * A new thread will use our SHARED_AS macro which has the flags: + * CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM + * + * In rare cases an application will attempt to use a subset of the SHARED_AS + * flags in order to implement some sharing between two processes without using + * a true thread. Because we do not have native support for this concept, the + * lx brand implements the notion of a 'clone-group'. This is a set of + * processes which share a subset of the allowed SHARED_AS flags. 
The lx brand + * syscalls implement the appropriate sharing for each flag. A clone-group is + * only instantiated in the rare case that a subset of the SHARED_AS flags are + * used with clone. + * + * The following set of flags could theoretically be supported, although most + * are not implemented at this time. The user-level brand library will validate + * that a supported subset of the flags are being used, or error if not. We + * also re-validate in the kernel. + * + * CLONE_FILES: share the file descriptor table + * CLONE_FS: share the filesystem information (root of the filesystem, the + * CWD, and the umask) + * CLONE_SIGHAND: share the table of signal handlers + * CLONE_THREAD: share the thread group + * CLONE_VM: share the address space + * + * At this time, only those flags defined in CLONE_GRP_SUBSET (CLONE_FS) are + * implemented. + * + * When a clone-group is in use, the lx_proc_data_t`l_clone_grps array will + * hold groups of processes sharing the attributes relevant to the clone flag. + * Each supported flag can have an associated group list in the array. + * + * On the first clone, a new lx_clone_grp_t struct will be created. This struct + * holds a pointer to each process in the group. A reference to that group is + * held in the appropriate slot in l_clone_grps. The struct is created for + * the parent process by lx_clone_grp_create() and then the child process will + * associate itself with the group(s) using lx_clone_grp_enter(). + * + * Each syscall acting upon attributes relevant to a clone-group must include + * logic to do so properly. The syscalls will use lx_clone_grp_member() to + * determine if clone-group handling is required, and use lx_clone_grp_walk() + * to walk the list of processes in the group and apply the provided callback + * to each process. + * + * The following example illustrates how a common clone group would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_FS + * When A clones B, a new clone group is created and saved in the LX_CLGRP_FS + * slot in the l_clone_grps array on both A and B. When B clones, since a group + * already exists, C is added to the group and the group is saved in the + * LX_CLGRP_FS slot on C. + * + * The following example illustrates how two common clone groups would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS|CLONE_THREAD + * A new clone group is created and saved in the LX_CLGRP_FS slot in the + * l_clone_grps array on both A and B. A second clone group is created and + * saved in the LX_CLGRP_THREAD slot on both A and B (note that LX_CLGRP_THREAD + * is not implemented at this time). + * + * The following example illustrates how different clone groups would be used, + * as processes clone with different sets of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_THREAD + * C clones D with CLONE_FS + * In this example, only A&B and C&D should share their FS information. B&C + * have to be in two clone groups. When A clones, a new clone group is created + * and saved in the LX_CLGRP_FS slot in the l_clone_grps array on both A and B. + * When B clones, a new clone group is created and saved in the LX_CLGRP_THREAD + * slot on both B and C (note that LX_CLGRP_THREAD is not implemented at this + * time). When C clones, a new clone group is created and saved in the + * LX_CLGRP_FS slot on both C and D. 
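The kind of caller this clone-group machinery exists for looks roughly like the following on Linux: two full processes (not threads) sharing only their FS information via CLONE_FS, so a chdir() in the child is visible in the parent. A user-land sketch; stack size and the /tmp target are arbitrary.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int
child(void *arg)
{
        (void) arg;
        return (chdir("/tmp") != 0);    /* shared with the parent via CLONE_FS */
}

int
main(void)
{
        char *stack = malloc(65536);
        char cwd[1024];
        pid_t pid;

        /* Separate process, separate address space, shared cwd/root/umask. */
        pid = clone(child, stack + 65536, CLONE_FS | SIGCHLD, NULL);
        (void) waitpid(pid, NULL, 0);

        if (getcwd(cwd, sizeof (cwd)) != NULL)
                printf("parent cwd is now %s\n", cwd);  /* "/tmp" */
        free(stack);
        return (0);
}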
+ * + * When a process exits, it removes itself from any groups to which it belongs. + * When the last process exits a group, it is cleaned up. + * + * If clone-groups were commonly used, this implementation would be inefficient + * and unwieldy, but since they are so rare a straightforward list-based + * approach is adequate. + * + * During group creation, the l_clone_grp_lock is first taken to ensure only + * one group is created, otherwise, only the group's lx_clgrp_lock protects the + * list. + * + * Note: Despite the locking, there is still a subtle race that can occur in + * this code. This occurs if a process has two threads and one of them is about + * to execute a clone-group aware syscall (e.g. chdir), while the other thread + * is forking to create a new clone-group. In theory the child process could be + * created, but not yet in the group. The syscall in the first thread could + * thus miss the new process. For example, the first thread might chdir the + * parent, but since the child process was alrady created, but not yet in the + * clone-group, it would not be chdir-ed. + */ + + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> + +/* + * We currently only support a single clone-group (CLONE_FS) but the design + * allows for future expansion by expanding the lx_proc_data+t`l_clone_grps + * array. + */ +static int +lx_clone_flag2grp(uint_t flag) +{ + if (flag & LX_CLONE_FS) + return (LX_CLGRP_FS); + + return (-1); +} + +/* + * Note: this function has the side effect of clearing the flags. + */ +static int +lx_clone_flags_iter(uint_t *fp) +{ + if (*fp & LX_CLONE_FS) { + *fp &= ~LX_CLONE_FS; + return (LX_CLGRP_FS); + } + + return (-1); +} + +/* + * Setup the current process in the proper clone-group(s) and record the + * clone-group flags on the lwp so that we can join the child process to the + * group during lx_forklwp(). + */ +void +lx_clone_grp_create(uint_t flags) +{ + int offset; + lx_proc_data_t *plproc = ttolxproc(curthread); + lx_lwp_data_t *ldp = (lx_lwp_data_t *)ttolwp(curthread)->lwp_brand; + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + if (!LX_IS_CLONE_GRP(flags)) + return; + + ldp->br_clone_grp_flags = flags & LX_CLONE_GRP_SUBSET; + + cgps = plproc->l_clone_grps; + /* + * We take the top-level mutex during create to ensure we only create + * one group per flag. + */ + mutex_enter(&plproc->l_clone_grp_lock); + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * If we already havae a clone-group list for this flag then + * nothing to do. + */ + if (cgp != NULL) + continue; + + /* + * Create a new clone-group. If it ever becomes an issue, we + * could preallocate this memory before taking + * l_clone_grp_lock. 
+ */ + cgp = kmem_alloc(sizeof (lx_clone_grp_t), KM_SLEEP); + mutex_init(&cgp->lx_clgrp_lock, NULL, MUTEX_DEFAULT, NULL); + cgp->lx_clgrp_cnt = 1; + list_create(&cgp->lx_clgrp_members, + sizeof (lx_clone_grp_member_t), + offsetof(lx_clone_grp_member_t, lx_clgrpm_link)); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = curproc; + list_insert_tail(&cgp->lx_clgrp_members, mp); + + /* Attach group to our proc */ + plproc->l_clone_grps[offset] = cgp; + } + mutex_exit(&plproc->l_clone_grp_lock); +} + +/* + * Add the child process to the proper parent clone-group(s). + * + * Called from lx_forklwp, thus there is no need to have any locking for the + * destination proc. This is always run in the thread context of the source + * thread, and the destination thread is always newly created and not referred + * to from anywhere else. The source process should have already created the + * clone group(s) that we need to place the child into via lx_clone_grp_create. + */ +void +lx_clone_grp_enter(uint_t flags, proc_t *srcp, proc_t *dstp) +{ + int offset; + lx_proc_data_t *plproc = ptolxproc(srcp); + lx_proc_data_t *clproc = ptolxproc(dstp); + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + cgps = plproc->l_clone_grps; + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * Parent should already have a clone-group list for this flag. + * The child joins that group. + */ + VERIFY(cgp != NULL); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = dstp; + + mutex_enter(&cgp->lx_clgrp_lock); + list_insert_tail(&cgp->lx_clgrp_members, mp); + cgp->lx_clgrp_cnt++; + clproc->l_clone_grps[offset] = cgp; + mutex_exit(&cgp->lx_clgrp_lock); + } +} + +/* + * The process is exiting or we're exec-ing a native app. In the unlikely event + * it is in a clone-group, remove it from the group and perform any necessary + * cleanup. Normally we're called from lx_proc_exit(), so we know we're the + * last lwp in the process, but we can also be called from lx_clearbrand() when + * exec-ing a native application. In this case we know the lwp(s) are stopped + * (It is possible to have multiple lwps if we branded the process but the + * exec failed. Those lwps were just branded as part of the exec, and will + * be de-branded). + */ +void +lx_clone_grp_exit(proc_t *p, boolean_t lwps_ok) +{ + int i; + lx_proc_data_t *plproc = ptolxproc(p); + lx_clone_grp_t **cgps; + + ASSERT(!MUTEX_HELD(&p->p_lock)); + ASSERT(plproc != NULL); + + if (!lwps_ok) + VERIFY(p->p_lwpcnt <= 1); + + cgps = plproc->l_clone_grps; + for (i = 0; i < LX_CLGRP_MAX; i++) { + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + boolean_t found; + + cgp = cgps[i]; + if (cgp == NULL) + continue; + + /* + * The rare case when this process belongs to a clone-group. + */ + + mutex_enter(&cgp->lx_clgrp_lock); + + /* First remove ourselves from the group. */ + found = B_FALSE; + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + if (mp->lx_clgrpm_pp == p) { + found = B_TRUE; + list_remove(&cgp->lx_clgrp_members, mp); + kmem_free(mp, sizeof (lx_clone_grp_member_t)); + ASSERT(cgp->lx_clgrp_cnt > 0); + cgp->lx_clgrp_cnt--; + plproc->l_clone_grps[i] = NULL; + break; + } + mp = list_next(&cgp->lx_clgrp_members, mp); + } + VERIFY(found); + + if (cgp->lx_clgrp_cnt > 0) { + mutex_exit(&cgp->lx_clgrp_lock); + continue; + } + + /* + * cgp->lx_clgrp_cnt == 0 + * + * We're the sole remaining member; finish cleanup now. 
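+ *
+ * Since each departing process clears its own l_clone_grps slot above, a
+ * count of zero means no other process can still reach this group, so it
+ * is safe to drop the group lock before tearing it down.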
+ */ + ASSERT(plproc->l_clone_grps[i] == NULL); + mutex_exit(&cgp->lx_clgrp_lock); + + /* Delete the group since there are no more references to it. */ + VERIFY(list_is_empty(&cgp->lx_clgrp_members)); + + list_destroy(&cgp->lx_clgrp_members); + mutex_destroy(&cgp->lx_clgrp_lock); + kmem_free(cgp, sizeof (lx_clone_grp_t)); + } +} + +/* + * Return true in the rare case that the process is a member of a clone group + * with the specific flag set. Clone groups are only added to the array + * atomically until this process exits, so we don't need to take + * l_clone_grp_lock. + */ +boolean_t +lx_clone_grp_member(lx_proc_data_t *dp, uint_t flag) +{ + int offset; + + if ((offset = lx_clone_flag2grp(flag)) == -1) + return (B_FALSE); + + if (dp->l_clone_grps[offset] != NULL) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Walk all of the processes in the clone-group list and apply the callback + * to each. Because we're holding the group list lock (lx_clgrp_lock) none of + * the processes can exit, but that is the only locking guarantee made by this + * function itself. + */ +int +lx_clone_grp_walk(lx_proc_data_t *dp, uint_t flag, int (*cb)(proc_t *, void *), + void *arg) +{ + int offset; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + int res, rv = 0; + + + ASSERT(dp != NULL); + /* We should not be called unless we belong to a group */ + VERIFY((offset = lx_clone_flag2grp(flag)) != -1); + VERIFY(dp->l_clone_grps[offset] != NULL); + + cgp = dp->l_clone_grps[offset]; + mutex_enter(&cgp->lx_clgrp_lock); + + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + res = cb(mp->lx_clgrpm_pp, arg); + /* return the first error we see, but try all procs */ + if (res != 0 && rv == 0) + rv = res; + mp = list_next(&cgp->lx_clgrp_members, mp); + } + + mutex_exit(&cgp->lx_clgrp_lock); + + return (rv); +} + + +/* + * Our lwp has already been created at this point, so this routine is + * responsible for setting up all the state needed to track this as a + * linux cloned thread. + */ +/* ARGSUSED */ +int +lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + struct lx_proc_data *lproc = ttolxproc(curthread); + struct ldt_info info; + struct user_desc descr; + int tls_index; + int entry = -1; + int signo; + + signo = flags & LX_CSIGNAL; + if (signo < 0 || signo > LX_NSIG) + return (set_errno(EINVAL)); + + if (!(flags & LX_CLONE_THREAD)) { + lproc->l_signal = signo; + } else { + if (flags & LX_CLONE_SETTLS) { + if (get_udatamodel() == DATAMODEL_ILP32) { + if (copyin((caddr_t)tls, &info, sizeof (info))) + return (set_errno(EFAULT)); + + if (LDT_INFO_EMPTY(&info)) + return (set_errno(EINVAL)); + + entry = info.entry_number; + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + tls_index = entry - GDT_TLSMIN; + + /* + * Convert the user-space structure into a real + * x86 descriptor and copy it into this LWP's + * TLS array. We also load it into the GDT. + */ + LDT_INFO_TO_DESC(&info, &descr); + bcopy(&descr, &lwpd->br_tls[tls_index], + sizeof (descr)); + lx_set_gdt(entry, &lwpd->br_tls[tls_index]); + } else { + /* + * Set the Linux %fsbase for this LWP. We will + * restore it the next time we return to Linux + * via setcontext()/lx_restorecontext(). + */ + lwpd->br_lx_fsbase = (uintptr_t)tls; + } + } + + lwpd->br_clear_ctidp = + (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL; + + if (signo && ! 
(flags & LX_CLONE_DETACH)) + lwpd->br_signal = signo; + else + lwpd->br_signal = 0; + + if (flags & LX_CLONE_THREAD) + lwpd->br_tgid = curthread->t_procp->p_pid; + + if (flags & LX_CLONE_PARENT) + lwpd->br_ppid = 0; + + if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) && + (suword32(ctidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) && + (suword32(ptidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + } + + *rval = lwpd->br_pid; + return (0); +} + +long +lx_set_tid_address(int *tidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + long rv; + + lwpd->br_clear_ctidp = tidp; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + rv = lwpd->br_pid; + } + + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_close.c b/usr/src/uts/common/brand/lx/syscall/lx_close.c new file mode 100644 index 0000000000..5d1a1605c1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_close.c @@ -0,0 +1,30 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> + + +extern int close(int); + +long +lx_close(int fdes) +{ + return (close(fdes)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c new file mode 100644 index 0000000000..b0a92394dc --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * We support neither the second argument (NUMA node), nor the third (obsolete + * pre-2.6.24 caching functionality which was ultimately broken). + */ +/* ARGSUSED1 */ +long +lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3) +{ + unsigned int curcpu = curthread->t_cpu->cpu_id; + + if (copyout(&curcpu, cpu, sizeof (curcpu)) != 0) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_dup.c b/usr/src/uts/common/brand/lx/syscall/lx_dup.c new file mode 100644 index 0000000000..d0f513753c --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_dup.c @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +/* From usr/src/uts/common/syscall/fcntl.c */ +extern int fcntl(int, int, intptr_t); + +long +lx_dup(int fd) +{ + return (fcntl(fd, F_DUPFD, 0)); +} + +long +lx_dup2(int oldfd, int newfd) +{ + return (fcntl(oldfd, F_DUP2FD, newfd)); +} + +long +lx_dup3(int oldfd, int newfd, int flags) +{ + int rc; + + /* The only valid flag is O_CLOEXEC. */ + if (flags & ~LX_O_CLOEXEC) + return (set_errno(EINVAL)); + + /* Only DUP2FD_CLOEXEC returns EINVAL on the same fd's */ + if (oldfd == newfd) + return (set_errno(EINVAL)); + + rc = fcntl(oldfd, (flags == 0) ? F_DUP2FD : F_DUP2FD_CLOEXEC, newfd); + return (rc); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_epoll.c b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c new file mode 100644 index 0000000000..47688dad6a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c @@ -0,0 +1,303 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/epoll.h> +#include <sys/devpoll.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/vnode.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_signal.h> + +static major_t devpoll_major = 0; + +static boolean_t +lx_epoll_isvalid(file_t *fp) +{ + vnode_t *vp = fp->f_vnode; + + if (vp->v_type == VCHR && getmajor(vp->v_rdev) == devpoll_major) + return (B_TRUE); + return (B_FALSE); +} + +long +lx_epoll_create1(int flags) +{ + int err, fd, rv; + int fmode = FREAD | FWRITE; + boolean_t cloexec = B_FALSE; + vnode_t *vp = NULL; + file_t *fp = NULL; + + if (flags & EPOLL_CLOEXEC) { + cloexec = B_TRUE; + flags &= ~EPOLL_CLOEXEC; + } + if (flags != 0) { + /* No other flags accepted at this time */ + return (set_errno(EINVAL)); + } + + if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) { + err = EMFILE; + goto error; + } + if (ldi_vp_from_name("/devices/pseudo/poll@0:poll", &vp) != 0) { + err = ENOENT; + goto error; + } + if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) { + goto error; + } + err = VOP_IOCTL(vp, DP_EPOLLCOMPAT, 0, fmode, CRED(), &rv, NULL); + if (err != 0) { + (void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL); + goto error; + } + + devpoll_major = getmajor(vp->v_rdev); + + fp->f_vnode = vp; + mutex_exit(&fp->f_tlock); + setf(fd, fp); + if (cloexec) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); + +error: + if (fp != NULL) { + setf(fd, NULL); + unfalloc(fp); + } + if (vp != NULL) { + VN_RELE(vp); + } + return (set_errno(err)); +} + +long +lx_epoll_create(int size) +{ + if (size <= 0) { + return (set_errno(EINVAL)); + } + + return 
(lx_epoll_create1(0)); +} + + +/* Match values from libc implementation */ +#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP) +#define EPOLLSWIZZLED \ + (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM) +#define EPOLL_TIMEOUT_CLAMP(t) (((t) < -1) ? -1 : (t)) + +long +lx_epoll_ctl(int fd, int op, int pfd, void *event) +{ + epoll_event_t epevent; + dvpoll_epollfd_t dpevent[2]; + file_t *fp; + iovec_t aiov; + uio_t auio; + uint32_t events, ev = 0; + int error = 0, i = 0; + + dpevent[i].dpep_pollfd.fd = pfd; + switch (op) { + case EPOLL_CTL_DEL: + dpevent[i].dpep_pollfd.events = POLLREMOVE; + break; + + case EPOLL_CTL_MOD: + /* + * In the modify case, we pass down two events: one to + * remove the event and another to add it back. + */ + dpevent[i++].dpep_pollfd.events = POLLREMOVE; + dpevent[i].dpep_pollfd.fd = pfd; + /* FALLTHROUGH */ + + case EPOLL_CTL_ADD: + if (copyin(event, &epevent, sizeof (epevent)) != 0) + return (set_errno(EFAULT)); + + /* + * Mask off the events that we ignore, and then swizzle the + * events for which our values differ from their epoll(7) + * equivalents. + */ + events = epevent.events; + ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED); + + if (events & EPOLLRDHUP) + ev |= POLLRDHUP; + if (events & EPOLLET) + ev |= POLLET; + if (events & EPOLLONESHOT) + ev |= POLLONESHOT; + if (events & EPOLLWRNORM) + ev |= POLLWRNORM; + if (events & EPOLLWRBAND) + ev |= POLLWRBAND; + + dpevent[i].dpep_data = epevent.data.u64; + dpevent[i].dpep_pollfd.events = ev; + break; + + default: + return (set_errno(EINVAL)); + } + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if (!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + + aiov.iov_base = (void *)dpevent; + aiov.iov_len = sizeof (dvpoll_epollfd_t) * (i + 1); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = aiov.iov_len; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_loffset = 0; + auio.uio_fmode = fp->f_flag; + + error = VOP_WRITE(fp->f_vnode, &auio, 1, fp->f_cred, NULL); + + releasef(fd); + + switch (error) { + case 0: + return (0); + + case EBADF: + case EEXIST: + case EINVAL: + case ENOENT: + case ENOMEM: + case ENOSPC: + case EPERM: + /* + * Legal errors should pass straight through. + */ + return (set_errno(error)); + + case ELOOP: + /* + * In the case of descriptor loops, /dev/poll emits a more + * descriptive error than Linux epoll consumers would expect. + */ + return (set_errno(EINVAL)); + + default: + /* + * While devpoll itself should not emit unexpected errors, it + * is possible that a VOP_POLL handler might. There is little + * choice but to map these unexpected errors to something which + * is valid for epoll_ctl. 
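+ * ENOMEM is used because it is already in the set of "legal" errors
+ * passed through above, so Linux consumers are prepared to handle it.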
+ */ + return (set_errno(ENOMEM)); + } +} + +long +lx_epoll_wait(int fd, void *events, int maxevents, int timeout) +{ + struct dvpoll arg; + file_t *fp; + int rv = 0, error, flag; + + if (maxevents <= 0) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if (!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout); + arg.dp_fds = (pollfd_t *)events; + flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, DP_POLL, (uintptr_t)&arg, flag, + fp->f_cred, &rv, NULL); + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (rv); +} + +long +lx_epoll_pwait(int fd, void *events, int maxevents, int timeout, void *sigmask) +{ + struct dvpoll arg; + file_t *fp; + int rv = 0, error, flag; + k_sigset_t ksig; + + if (maxevents <= 0) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } else if (!lx_epoll_isvalid(fp)) { + releasef(fd); + return (set_errno(EINVAL)); + } + if (sigmask != NULL) { + lx_sigset_t lsig; + + if (copyin(sigmask, &lsig, sizeof (lsig)) != 0) { + releasef(fd); + return (set_errno(EFAULT)); + } + lx_ltos_sigset(&lsig, &ksig); + arg.dp_setp = (sigset_t *)&ksig; + } else { + arg.dp_setp = NULL; + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout); + arg.dp_fds = (pollfd_t *)events; + flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, DP_PPOLL, (uintptr_t)&arg, flag, + fp->f_cred, &rv, NULL); + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c new file mode 100644 index 0000000000..21205aa18a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c @@ -0,0 +1,126 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/vnode.h> +#include <sys/eventfd.h> + +static major_t eventfd_major = 0; + +/* io_submit uses this to validate control block eventfd descriptors */ +boolean_t +lx_is_eventfd(file_t *fp) +{ + vnode_t *vp = fp->f_vnode; + + if (vp->v_type == VCHR && getmajor(vp->v_rdev) == eventfd_major) + return (B_TRUE); + return (B_FALSE); +} + +long +lx_eventfd2(uint_t initval, int flags) +{ + int err, fd; + int fmode = FREAD | FWRITE; + vnode_t *vp = NULL; + file_t *fp = NULL; + + if (flags & ~(EFD_NONBLOCK | EFD_CLOEXEC | EFD_SEMAPHORE)) + return (set_errno(EINVAL)); + + if (flags & EFD_NONBLOCK) + fmode |= FNONBLOCK; + + if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) + return (set_errno(EMFILE)); + + if (ldi_vp_from_name("/dev/eventfd", &vp) != 0) { + /* + * If /dev/eventfd is not available then it is less jarring to + * Linux programs to tell them that the system call is not + * supported instead of reporting an error (ENOENT) they are + * not expecting. + */ + err = ENOTSUP; + goto error; + } + if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) { + VN_RELE(vp); + vp = NULL; + goto error; + } + + if (flags & EFD_SEMAPHORE) { + int rv; + + if ((err = VOP_IOCTL(vp, EVENTFDIOC_SEMAPHORE, 0, fmode, CRED(), + &rv, NULL)) != 0) + goto error; + } + + if (initval != 0) { + uint64_t val = initval; + struct uio auio; + struct iovec aiov; + + /* write initial value */ + aiov.iov_base = (caddr_t)&val; + aiov.iov_len = sizeof (val); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = 0; + auio.uio_offset = 0; + auio.uio_resid = sizeof (val); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_fmode = FWRITE; + + if ((err = VOP_WRITE(vp, &auio, FWRITE, CRED(), NULL)) != 0) + goto error; + } + + eventfd_major = getmajor(vp->v_rdev); + + fp->f_vnode = vp; + mutex_exit(&fp->f_tlock); + setf(fd, fp); + if (flags & EFD_CLOEXEC) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); + +error: + if (fp != NULL) { + setf(fd, NULL); + unfalloc(fp); + } + if (vp != NULL) { + (void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL); + VN_RELE(vp); + } + return (set_errno(err)); +} + +long +lx_eventfd(uint_t val) +{ + return (lx_eventfd2(val, 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c new file mode 100644 index 0000000000..61f9b936f2 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c @@ -0,0 +1,103 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/lx_misc.h> + +/* + * Based on illumos posix_fadvise which does nothing. The only difference is + * that on Linux an fd refering to a pipe or FIFO returns EINVAL. The Linux + * POSIX_FADV_* values are the same as the illumos values. See how the 32-bit + * glibc calls fadvise64; the offeset is a 64-bit value, but the length is not. + * fadvise64_64 passes both the offset and length as 64-bit values. 
The 64-bit + * fadvise64 caller always passes 64-bit values for the offset and length. + */ + +/* + * This is the fadvise64 function used by 64-bit callers, and by 32-bit callers + * after they have adjusted their arguments. + */ +/* ARGSUSED */ +int +lx_fadvise64(int fd, off64_t offset, off64_t len, int advice) +{ + file_t *fp; + boolean_t is_fifo; + + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + case POSIX_FADV_NOREUSE: + break; + default: + return (set_errno(EINVAL)); + } + + if (len < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + is_fifo = (fp->f_vnode->v_type == VFIFO); + releasef(fd); + + if (is_fifo) + return (set_errno(ESPIPE)); + + return (0); +} + +/* + * This is the fadvise64 function used by 32-bit callers. Linux passes the + * 64-bit offset by concatenating consecutive arguments. We must perform the + * same conversion here. + */ +long +lx_fadvise64_32(int fd, uint32_t off_lo, uint32_t off_hi, int32_t len, + int advice) +{ + off64_t offset; + + offset = off_hi; + offset = offset << 32; + offset |= off_lo; + + return (lx_fadvise64(fd, offset, (off64_t)len, advice)); +} + +/* + * This function is only used by 32-bit callers. Linux passes the 64-bit offset + * and length by concatenating consecutive arguments. We must perform the same + * conversion here. + */ +long +lx_fadvise64_64(int fd, uint32_t off_lo, uint32_t off_hi, uint32_t len_lo, + uint32_t len_hi, int advice) +{ + off64_t offset; + off64_t len; + + offset = off_hi; + offset = offset << 32; + offset |= off_lo; + len = len_hi; + len = len << 32; + len |= len_lo; + + return (lx_fadvise64(fd, offset, len, advice)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c new file mode 100644 index 0000000000..338e4399fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c @@ -0,0 +1,251 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/nbmlock.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/sdt.h> + +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); + +#define LX_FALLOC_FL_KEEP_SIZE 0x01 +#define LX_FALLOC_FL_PUNCH_HOLE 0x02 +#define LX_FALLOC_FL_NO_HIDE_STALE 0x04 +#define LX_FALLOC_FL_COLLAPSE_RANGE 0x08 +#define LX_FALLOC_FL_ZERO_RANGE 0x10 + +#define LX_FALLOC_VALID (LX_FALLOC_FL_KEEP_SIZE | LX_FALLOC_FL_PUNCH_HOLE | \ + LX_FALLOC_FL_NO_HIDE_STALE | LX_FALLOC_FL_COLLAPSE_RANGE | \ + LX_FALLOC_FL_ZERO_RANGE) + +#define LX_FALLOC_UNSUPP (LX_FALLOC_FL_NO_HIDE_STALE | \ + LX_FALLOC_FL_COLLAPSE_RANGE) + +long +lx_fallocate(int fd, int mode, off_t offset, off_t len) +{ + int error = 0; + file_t *fp; + vnode_t *vp; + int64_t tot; + struct flock64 bf; + vattr_t vattr; + u_offset_t f_offset; + boolean_t in_crit = B_FALSE; + + /* + * Error checking is in a specific order to make LTP happy. + */ + + tot = offset + len; + if (tot > (LLONG_MAX / (int64_t)1024)) + return (set_errno(EFBIG)); + + if (mode & LX_FALLOC_UNSUPP) + return (set_errno(EOPNOTSUPP)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto done; + } + + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = EINVAL; + goto done; + } + + if (offset < 0 || len <= 0) { + error = EINVAL; + goto done; + } + + if (tot < 0LL) { + error = EFBIG; + goto done; + } + + if ((mode & ~LX_FALLOC_VALID) != 0) { + error = EINVAL; + goto done; + } + + /* + * If this is the only flag then we don't actually do any work. + */ + if (mode == LX_FALLOC_FL_KEEP_SIZE) + goto done; + + bzero(&bf, sizeof (bf)); + + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) + goto done; + + if (mode == 0) { + /* Nothing to do if not extending the file */ + if (vattr.va_size >= tot) + goto done; + + /* Extend the file. */ + bf.l_start = (off64_t)tot; + bf.l_len = (off64_t)0; + + } else if (mode & LX_FALLOC_FL_PUNCH_HOLE) { + /* + * Deallocate space in the file. + */ + if ((mode & LX_FALLOC_FL_KEEP_SIZE) == 0) { + /* this flag is required with punch hole */ + error = EINVAL; + goto done; + } + + if (mode & + ~(LX_FALLOC_FL_PUNCH_HOLE | LX_FALLOC_FL_KEEP_SIZE)) { + error = EINVAL; + goto done; + } + + /* Make sure we don't extend since keep_size is set. */ + if (vattr.va_size < tot) { + if (offset > vattr.va_size) + goto done; + len = (off_t)vattr.va_size - offset; + } + + bf.l_start = (off64_t)offset; + bf.l_len = (off64_t)len; + + } else if (mode & LX_FALLOC_FL_ZERO_RANGE) { + /* + * Zero out the space in the file. + */ + if (mode & + ~(LX_FALLOC_FL_ZERO_RANGE | LX_FALLOC_FL_KEEP_SIZE)) { + error = EINVAL; + goto done; + } + + /* Make sure we don't extend when keep_size is set. */ + if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) { + if (offset > vattr.va_size) + goto done; + len = vattr.va_size - offset; + } + + bf.l_start = (off64_t)offset; + bf.l_len = (off64_t)len; + } else { + /* We should have already handled all flags */ + VERIFY(0); + } + + /* + * Check for locks in the range. + */ + f_offset = fp->f_offset; + error = flock_check(vp, &bf, f_offset, MAXOFF_T); + if (error != 0) + goto done; + + /* + * Check for conflicting non-blocking mandatory locks. + * We need to get the size again under nbl_start_crit. 
+ */ + if (nbl_need_check(vp)) { + u_offset_t begin; + ssize_t length; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + vattr.va_mask = AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) + goto done; + + /* + * Make sure we don't extend when keep_size is set. + */ + if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) { + ASSERT(mode & (LX_FALLOC_FL_PUNCH_HOLE | + LX_FALLOC_FL_ZERO_RANGE)); + + /* + * If the size grew we can short-circuit the rest of + * the work, otherwise adjust bf for the vop_space + * call. + */ + if (offset >= vattr.va_size) + goto done; + len = vattr.va_size - offset; + bf.l_len = (off64_t)len; + } + + if (offset > vattr.va_size) { + begin = vattr.va_size; + length = offset - vattr.va_size; + } else { + begin = offset; + length = vattr.va_size - offset; + } + + if (nbl_conflict(vp, NBL_WRITE, begin, length, 0, NULL)) { + error = EACCES; + goto done; + } + } + + error = VOP_SPACE(vp, F_FREESP, &bf, 0, f_offset, fp->f_cred, NULL); + +done: + if (in_crit) + nbl_end_crit(vp); + + releasef(fd); + if (error != 0) + return (set_errno(error)); + + return (0); +} + +long +lx_fallocate32(int fd, int mode, uint32_t offl, uint32_t offh, uint32_t lenl, + uint32_t lenh) +{ + int64_t offset = 0, len = 0; + + /* + * From 32-bit callers, Linux passes the 64-bit offset and len by + * concatenating consecutive arguments. We must perform the same + * conversion here. + */ + offset = offh; + offset = offset << 32; + offset |= offl; + len = lenh; + len = len << 32; + len |= lenl; + + return (lx_fallocate(fd, mode, (off_t)offset, (off_t)len)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c new file mode 100644 index 0000000000..a5406c0a4f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c @@ -0,0 +1,701 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/pathname.h> +#include <sys/policy.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/brand.h> +#include <sys/fs/fifonode.h> +#include <sys/strsubr.h> +#include <sys/stream.h> +#include <sys/flock.h> + +extern int fcntl(int, int, intptr_t); +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +extern int lx_pipe_setsz(stdata_t *, uint_t, boolean_t); + + +int +lx_vp_at(int fd, char *upath, vnode_t **vpp, int flag) +{ + vnode_t *startvp; + int error; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + if ((error = fgetstartvp(fd, upath, &startvp)) != 0) { + return (error); + } + + if (upath != NULL) { + uio_seg_t seg = UIO_USERSPACE; + + error = lookupnameat(upath, seg, + (flag == AT_SYMLINK_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW, + NULLVPP, vpp, startvp); + if (startvp != NULL) { + VN_RELE(startvp); + } + return (error); + } else { + /* VN_HOLD was established in fgetstartvp */ + *vpp = startvp; + VERIFY(*vpp); + return (0); + } +} + +#define LTOS_FLOCK(l, s) \ +{ \ + s->l_type = ltos_type(l->l_type); \ + s->l_whence = l->l_whence; \ + s->l_start = l->l_start; \ + s->l_len = l->l_len; \ + s->l_sysid = 0; /* not defined in linux */ \ + s->l_pid = (pid_t)l->l_pid; \ +} + +#define STOL_FLOCK(s, l) \ +{ \ + l->l_type = stol_type(s->l_type); \ + l->l_whence = s->l_whence; \ + l->l_start = s->l_start; \ + l->l_len = s->l_len; \ + l->l_pid = (int)s->l_pid; \ +} + +static short +ltos_type(short l_type) +{ + switch (l_type) { + case LX_F_RDLCK: + return (F_RDLCK); + case LX_F_WRLCK: + return (F_WRLCK); + case LX_F_UNLCK: + return (F_UNLCK); + default: + return (-1); + } +} + +static short +stol_type(short l_type) +{ + switch (l_type) { + case F_RDLCK: + return (LX_F_RDLCK); + case F_WRLCK: + return (LX_F_WRLCK); + case F_UNLCK: + return (LX_F_UNLCK); + default: + /* can't ever happen */ + return (0); + } +} + +static void +ltos_flock(struct lx_flock *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock(struct flock64 *s, struct lx_flock *l) +{ + STOL_FLOCK(s, l) +} + +static void +ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock64(struct flock64 *s, struct lx_flock64_32 *l) +{ + STOL_FLOCK(s, l) +} + +static int +lx_fcntl_getfl(int fd) +{ + int retval; + int rc; + + retval = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + if ((retval & O_ACCMODE) == O_RDONLY) + rc = LX_O_RDONLY; + else if ((retval & O_ACCMODE) == O_WRONLY) + rc = LX_O_WRONLY; + else + rc = LX_O_RDWR; + /* O_NDELAY != O_NONBLOCK, so we need to check for both */ + if (retval & O_NDELAY) + rc |= LX_O_NDELAY; + if (retval & O_NONBLOCK) + rc |= LX_O_NONBLOCK; + if (retval & O_APPEND) + rc |= LX_O_APPEND; + if (retval & O_SYNC) + rc |= LX_O_SYNC; + if (retval & O_LARGEFILE) + rc |= LX_O_LARGEFILE; + if (retval & FASYNC) + rc |= LX_O_ASYNC; + + return (rc); +} + +#define LX_SETFL_MASK (O_NONBLOCK | O_APPEND | O_SYNC | FASYNC); + +static int +lx_fcntl_setfl(int fd, ulong_t arg) +{ + int flags; + + /* + * When performing fcntl(F_SETFL), only certain flags are + * allowed to be manipulated. A mask is used to preserve + * other flags, such as those which are specified during + * open(2). The mask on Linux excludes O_LARGEFILE from + * being manipulated, whereas illumos expects the flag to + * be set. In order to properly preserve the O_LARGEFILE + * (FOFFMAX) state, we must first query for it via + * fcntl(F_GETFL) so that the value can be carried + * through. 
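+ * For example, a descriptor opened with O_RDWR|O_LARGEFILE on which a
+ * Linux caller performs fcntl(fd, F_SETFL, O_NONBLOCK) must still have
+ * FOFFMAX set afterwards, even though the Linux F_SETFL mask never
+ * carries that flag.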
+ */ + flags = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + flags &= ~LX_SETFL_MASK; + + /* LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one */ + if (arg & LX_O_NDELAY) + flags |= O_NONBLOCK; + if (arg & LX_O_APPEND) + flags |= O_APPEND; + if (arg & LX_O_SYNC) + flags |= O_SYNC; + if (arg & LX_O_ASYNC) + flags |= FASYNC; + + return (fcntl(fd, F_SETFL, flags)); +} + + +static int +lx_fcntl_pipesz(int fd, int cmd, ulong_t arg) +{ + file_t *fp; + vnode_t *vp; + stdata_t *str; + int err = 0, res = 0; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VFIFO || vp->v_op != fifo_vnodeops) { + err = EBADF; + goto out; + } + VERIFY((str = vp->v_stream) != NULL); + + if (cmd == LX_F_SETPIPE_SZ) { + err = lx_pipe_setsz(str, (uint_t)arg, B_FALSE); + } else if (cmd == LX_F_GETPIPE_SZ) { + size_t val; + + err = strqget(RD(str->sd_wrq), QHIWAT, 0, &val); + res = val; + } else { + /* NOTREACHED */ + ASSERT(0); + } + +out: + releasef(fd); + if (err != 0) { + return (set_errno(err)); + } + return (res); +} + +static int +lx_fcntl_common(int fd, int cmd, ulong_t arg) +{ + int rc = 0; + pid_t pid; + int error; + int rv; + int32_t flag; + file_t *fp; + + /* + * We depend on the call to fcntl to set the errno if necessary. + */ + ttolwp(curthread)->lwp_errno = 0; + + switch (cmd) { + case LX_F_SETSIG: + case LX_F_GETSIG: + case LX_F_SETLEASE: + case LX_F_GETLEASE: + case LX_F_NOTIFY: + case LX_F_CANCELLK: + { + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "unsupported fcntl command: %d", cmd); + lx_unsupported(buf); + } + return (set_errno(ENOTSUP)); + + case LX_F_DUPFD: + rc = fcntl(fd, F_DUPFD, arg); + break; + + case LX_F_DUPFD_CLOEXEC: + rc = fcntl(fd, F_DUPFD_CLOEXEC, arg); + break; + + case LX_F_GETFD: + rc = fcntl(fd, F_GETFD, 0); + break; + + case LX_F_SETFD: + rc = fcntl(fd, F_SETFD, arg); + break; + + case LX_F_GETFL: + rc = lx_fcntl_getfl(fd); + break; + + case LX_F_SETFL: + rc = lx_fcntl_setfl(fd, arg); + break; + + case LX_F_SETOWN: + pid = (pid_t)arg; + if (pid == 1) { + /* Setown for the init process uses the real pid. */ + pid = curzone->zone_proc_initpid; + } + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) { + /* + * On illumos F_SETOWN is only defined for sockets, but + * some apps hardcode to do this fcntl on other devices + * (e.g. /dev/tty) to setup signal handling. If the + * app is only setting itself to be the signal + * handler, we pretend to succeed. + */ + if (error != EINVAL || + curthread->t_procp->p_pid != pid) { + return (set_errno(error)); + } + } + + rc = 0; + break; + + case LX_F_GETOWN: + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) + return (set_errno(error)); + + if (pid == curzone->zone_proc_initpid) { + /* Getown for the init process returns 1. 
*/ + pid = 1; + } + + rc = pid; + break; + + case LX_F_SETPIPE_SZ: + case LX_F_GETPIPE_SZ: + rc = lx_fcntl_pipesz(fd, cmd, arg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rc); +} + +static int +lx_fcntl_lock_cmd_to_s(int lx_cmd) +{ + switch (lx_cmd) { + case LX_F_GETLK: + return (F_GETLK); + case LX_F_SETLK: + return (F_SETLK); + case LX_F_SETLKW: + return (F_SETLKW); + case LX_F_GETLK64: + return (F_GETLK64); + case LX_F_SETLK64: + return (F_SETLK64); + case LX_F_SETLKW64: + return (F_SETLKW64); + default: + VERIFY(0); + /*NOTREACHED*/ + return (0); + } +} + +/* + * This is a pain but we can't re-use the fcntl code for locking since it does + * its own copyin/copyout for the flock struct. Since we have to convert the + * struct we have to do our own copyin/out. Thus we replicate the fcntl code for + * these 3 cmds. Luckily it's not much. + */ +static int +lx_fcntl_lock(int fd, int lx_cmd, void *arg) +{ + int cmd; + int error = 0; + file_t *fp; + vnode_t *vp; + int flag; + offset_t maxoffset; + u_offset_t offset; + model_t datamodel; + lx_flock_t lxflk; + lx_flock64_32_t lxflk64; + struct flock64 bf; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + maxoffset = MAXOFF_T; + datamodel = DATAMODEL_NATIVE; +#if defined(_SYSCALL32_IMPL) + if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32) + maxoffset = MAXOFF32_T; +#endif + vp = fp->f_vnode; + flag = fp->f_flag; + offset = fp->f_offset; + + cmd = lx_fcntl_lock_cmd_to_s(lx_cmd); + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + + lxflk.l_type = lxflk32.l_type; + lxflk.l_whence = lxflk32.l_whence; + lxflk.l_start = (off64_t)lxflk32.l_start; + lxflk.l_len = (off64_t)lxflk32.l_len; + lxflk.l_pid = lxflk32.l_pid; + } +#endif /* _SYSCALL32_IMPL */ + + ltos_flock(&lxflk, &bf); + + if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) { + if (cmd == F_SETLKW && error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = + B_TRUE; + } + break; + } + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock(&bf, &lxflk); + + /* + * If no lock is found, only the type field is changed. + */ + if (lxflk.l_type == LX_F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type, + sizeof (lxflk.l_type))) + error = EFAULT; + break; + } + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (datamodel == DATAMODEL_NATIVE) { + if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) { + error = EOVERFLOW; + break; + } + + lxflk32.l_type = lxflk.l_type; + lxflk32.l_whence = lxflk.l_whence; + lxflk32.l_start = lxflk.l_start; + lxflk32.l_len = lxflk.l_len; + lxflk32.l_pid = lxflk.l_pid; + + if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + } +#endif /* _SYSCALL32_IMPL */ + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + /* + * Large File support is only used for ILP32 apps. 
+ */ + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + + if (cmd == F_GETLK64) + cmd = F_GETLK; + else if (cmd == F_SETLK64) + cmd = F_SETLK; + else if (cmd == F_SETLKW64) + cmd = F_SETLKW; + + if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) { + error = EFAULT; + break; + } + + ltos_flock64(&lxflk64, &bf); + + if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) + break; + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock64(&bf, &lxflk64); + + /* + * If no lock is found, only the type field is changed. + */ + if (lxflk64.l_type == LX_F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&lxflk64.l_type, + &((lx_flock64_t *)arg)->l_type, + sizeof (lxflk64.l_type))) + error = EFAULT; + break; + } + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) { + error = EFAULT; + break; + } + break; + } + + releasef(fd); + if (error) + return (set_errno(error)); + + return (0); +} + +long +lx_fcntl(int fd, int cmd, intptr_t arg) +{ + switch (cmd) { + case LX_F_GETLK64: + case LX_F_SETLK64: + case LX_F_SETLKW64: + /* The 64-bit fcntl commands must go through fcntl64(). */ + return (set_errno(EINVAL)); + + case LX_F_GETLK: + case LX_F_SETLK: + case LX_F_SETLKW: + return (lx_fcntl_lock(fd, cmd, (void *)arg)); + + default: + return (lx_fcntl_common(fd, cmd, arg)); + } +} + +long +lx_fcntl64(int fd, int cmd, intptr_t arg) +{ + switch (cmd) { + case LX_F_GETLK: + case LX_F_SETLK: + case LX_F_SETLKW: + case LX_F_GETLK64: + case LX_F_SETLKW64: + case LX_F_SETLK64: + return (lx_fcntl_lock(fd, cmd, (void *)arg)); + + default: + return (lx_fcntl_common(fd, cmd, (ulong_t)arg)); + } +} + +/* + * Apply or remove an advisory lock on the entire file. F_FLOCK and F_FLOCKW + * are OFD-style locks. For more information, see the comment on ofdlock(). + */ +long +lx_flock(int fd, int op) +{ + int cmd; + int error; + flock64_t bf; + file_t *fp; + + if (op & LX_LOCK_NB) { + cmd = F_FLOCK; + op &= ~LX_LOCK_NB; + } else { + cmd = F_FLOCKW; + } + + switch (op) { + case LX_LOCK_UN: + bf.l_type = F_UNLCK; + break; + case LX_LOCK_SH: + bf.l_type = F_RDLCK; + break; + case LX_LOCK_EX: + bf.l_type = F_WRLCK; + break; + default: + return (set_errno(EINVAL)); + } + + bf.l_whence = 0; + bf.l_start = 0; + bf.l_len = 0; + bf.l_sysid = 0; + bf.l_pid = 0; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + /* + * See the locking comment in fcntl.c. In summary, the *_frlock + * functions in the various file systems basically do some validation, + * then funnel everything through the fs_frlock function. For OFD-style + * locks, fs_frlock will do nothing. Once control returns here, we call + * the ofdlock function to do the actual locking. 
+ */ + error = VOP_FRLOCK(fp->f_vnode, cmd, &bf, fp->f_flag, fp->f_offset, + NULL, fp->f_cred, NULL); + if (error != 0) { + releasef(fd); + return (set_errno(error)); + } + error = ofdlock(fp, cmd, &bf, fp->f_flag, fp->f_offset); + if (error != 0) { + if (cmd == F_FLOCKW && error == EINTR) + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + (void) set_errno(error); + } + releasef(fd); + return (error); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c new file mode 100644 index 0000000000..2bf65748c0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c @@ -0,0 +1,1665 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/page.h> +#include <sys/priv.h> +#include <sys/mman.h> +#include <sys/timer.h> +#include <sys/condvar.h> +#include <sys/inttypes.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_futex.h> +#include <sys/lx_impl.h> +#include <sys/sdt.h> + +/* + * Futexes are a Linux-specific implementation of inter-process mutexes. + * They are designed to use shared memory for simple, uncontested + * operations, and rely on the kernel to resolve any contention issues. + * + * Most of the information in this section comes from the paper "Futexes + * Are Tricky", by Ulrich Drepper. This paper is currently available at: + * http://people.redhat.com/~drepper/futex.pdf. + * + * A futex itself a 4-byte integer, which must be 4-byte aligned. The + * value of this integer is expected to be modified using user-level atomic + * operations. For the original, simple futexes, the futex(4) design itself did + * not impose any semantic constraints on the value stored in the futex; it is + * up to the application to define its own protocol. For the newer, + * priority-inheritance (PI) futexes, the value is 0 or the TID of the holder, + * as defined in futex(2). + * + * When the application decides that kernel intervention is required, it + * will use the futex(2) system call. Originally there were 5 different + * operations that could be performed on a futex, using this system call, but + * that has subsequently been extended. Since this interface has evolved over + * time, there are several different prototypes available to the user. 
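+ *
+ * (Glibc does not provide a futex() wrapper; user-level code generally
+ * invokes the raw system call, roughly:
+ *
+ *	syscall(SYS_futex, uaddr, op, val, timeout_or_val2, uaddr2, val3);
+ *
+ * where the timeout slot is reinterpreted as a plain integer by the
+ * requeue operations described below.)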
+ * Fortunately, there is only a single kernel-level interface: + * + * long sys_futex(void *futex1, int cmd, int val1, + * struct timespec *timeout, void *futex2, int val2) + * + * The kernel-level operations that may be performed on a simple futex are: + * + * FUTEX_WAIT + * + * Atomically verify that futex1 contains the value val1. If it + * doesn't, return EWOULDBLOCK. If it does contain the expected + * value, the thread will sleep until somebody performs a FUTEX_WAKE + * on the futex. The caller may also specify a timeout, indicating + * the maximum time the thread should sleep. If the timer expires, + * the call returns ETIMEDOUT. If the thread is awoken with a signal, + * the call returns EINTR. Otherwise, the call returns 0. + * + * FUTEX_WAKE + * + * Wake up val1 processes that are waiting on futex1. The call + * returns the number of blocked threads that were woken up. + * + * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET + * + * Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument + * denoting a bit vector, with wakers will only waking waiters that match + * in one or more bits. These semantics are dubious enough, but the + * interface has an inconsistency that is glaring even by the + * embarrassingly low standards that Linux sets for itself: the timeout + * argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for + * FUTEX_WAIT. And as if that weren't enough unnecessary complexity, + * the caller may specify this absolute timeout to be against either + * CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET, + * of course! + * + * FUTEX_WAKE_OP + * + * The implementation of a conditional variable in terms of futexes + * actually uses two futexes: one to assure sequential access and one to + * represent the condition variable. This implementation gives rise to a + * particular performance problem whereby a thread is awoken on the futex + * that represents the condition variable only to have to (potentially) + * immediately wait on the futex that protects the condition variable. + * (Do not confuse the futex that serves to protect the condition variable + * with the pthread_mutex_t associated with pthread_cond_t -- which + * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP + * was invented, which performs an atomic compare-and-exchange on a + * second address in a specified fashion (that is, with a specified + * operation). Here are the possible operations (OPARG is defined + * to be 12 bit value embedded in the operation): + * + * - FUTEX_OP_SET: Sets the value at the second address to OPARG + * - FUTEX_OP_ADD: Adds the value to OPARG + * - FUTEX_OP_OR: OR's the value with OPARG + * - FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG + * - FUTEX_OP_XOR: XOR's the value with OPARG + * + * After this compare-and-exchange on the second address, a FUTEX_WAKE is + * performed on the first address and -- if the compare-and-exchange + * matches a specified result based on a specified comparison operation -- + * a FUTEX_WAKE is performed on the second address. 
Here are the possible + * comparison operations: + * + * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake + * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake + * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake + * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake + * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake + * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake + * + * As a practical matter, the only way that this is used (or, some might + * argue, is usable) is by the implementation of pthread_cond_signal(), + * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the + * futex that protects the condition variable and wake the futex that + * represents the condition variable. The second wake-up is conditional + * because the futex that protects the condition variable (rather than the + * one that represents it) may or may not have waiters. Given that this + * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for + * five different kinds of operations and six different kinds of + * comparision operations, in practice only one is used. (Namely, setting + * to 0 and waking if the old value is greater than 1 -- which denotes + * that waiters are present and the wakeup should be performed.) Moreover, + * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the + * case that the pthread_mutex_t associated with the pthread_cond_t is + * held at the time of a pthread_cond_signal(), this entire mechanism is + * essentially for naught in this case. As one can imagine (and can + * verify on just about any source base that uses pthread_cond_signal()), + * it is overwhelmingly the common case that the lock associated with the + * pthread_cond_t is held at the time of pthread_cond_signal(), assuring + * that the problem that all of this complexity was designed to solve + * isn't, in fact, solved because the signalled thread simply wakes up + * only to block again on the held mutex. Cue a slow clap! + * + * FUTEX_CMP_REQUEUE + * + * If the value stored in futex1 matches that passed in in val2, wake + * up val1 processes that are waiting on futex1. Otherwise, return + * EAGAIN. + * + * If there are more than val1 threads waiting on the futex, remove + * the remaining threads from this futex, and requeue them on futex2. + * The caller can limit the number of threads being requeued by + * encoding an integral numerical value in the position usually used + * for the timeout pointer. + * + * The call returns the number of blocked threads that were woken up + * or requeued. + * + * FUTEX_REQUEUE + * + * Identical to FUTEX_CMP_REQUEUE except that it does not use val2. + * This command has been declared broken and obsolete, but we still + * need to support it. + * + * FUTEX_FD + * + * Return a file descriptor, which can be used to refer to the futex. + * This operation was broken by design, and was blessedly removed in + * Linux 2.6.26 ("because it was inherently racy"); it should go without + * saying that we don't support this operation. + * + * The kernel-level operations that may be performed on a PI futex are: + * + * FUTEX_LOCK_PI + * + * Called after a user-land attempt to acquire the lock using an atomic + * instruction failed because the futex had a nonzero value (the current + * holder's TID). Once enqueued, the thread sleeps until FUTEX_UNLOCK_PI + * is called on the futex, or the timeout expires. 
The timeout argument to + * FUTEX_LOCK_PI is absolute, unlike FUTEX_WAIT, and cannot be modified + * as with FUTEX_WAIT_BITSET! + * + * FUTEX_TRYLOCK_PI + * + * Similar to FUTEX_LOCK_PI but can be used for error recovery as + * described in futex(2). + * + * FUTEX_UNLOCK_PI + * + * Called when user-land cannot atomically release the lock because + * there are waiting threads. This will wake the highest priority waiting + * thread. + * + * FUTEX_CMP_REQUEUE_PI + * + * Not implemented at this time. + * + * FUTEX_WAIT_REQUEUE_PI + * + * Not implemented at this time. + * + * Priority Inheritance + * + * Our general approach to priority inheritance recognizes the fact that the + * application is almost certainly not a real-time process running on dedicated + * hardware. The zone is most likely running in a multi-tenant environment under + * FSS, in spite of whatever scheduling class the Linux application thinks it is + * using. Thus, we make our best effort to handle priority inheritance. When a + * thread must block on a PI futex, it may increase the scheduling priority of + * the futex holder to match the blocking thread. The futex holder's original + * priority will be restored when it unlocks the futex. + * + * This approach does not always handle transitive priority inheritance. For + * example, three threads at Low, Medium and High priority: + * L holds futex X + * M holds futex Y and became enqueued on X (M bumped L's priority to M) + * H enqueues on Y and bumps priority of M to H, but never bumps L's priority + * (which is currently M) up to H + * In reality this scenario is both uncommon and likely still executes + * reasonably well under a multi-tenant, FSS scenario. Also note that if H + * enqueued on Y before M enqueues on X, then L will have its priority raised + * to H when M enqueues on X. + * + * PI Futex Cleanup + * + * Futex cleanup can occur when a thread exits unexpectedly while holding one + * or more futexes. Normally this done via a "robust" futex and cleanup of a + * robust PI futex works in the same way as a non-PI robust futex (see + * lx_futex_robust_exit). On Linux, in the case of a non-robust PI futex, + * cleanup can still occur because the futex is associated with a real-time + * mutex inside the kernel (see the futex(2) man page for more details). For lx + * we are not using anything similar. When a thread exits, lx_futex_robust_exit + * will be called, but we would have to iterate every hash bucket, and every + * futex in the chain, to look for futexes held by the exiting thread. This + * would be very expensive and would occur whether or not the thread held any + * futexes. Thus, at this time we don't set the FUTEX_OWNER_DIED bit on + * non-robust PI futexes held by a thread when it exits while holding futexes. + * In practice this does not seem to be a serious limitation since user-level + * code generally appears to use robust futexes, but this may need to be + * revisited if it is observed to be an issue. + */ + +/* + * The structure of the robust_list, as set with the set_robust_list() system + * call. See lx_futex_robust_exit(), below, for details. 
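+ *
+ * (This is the emulation's view of what Linux declares, roughly, as
+ * struct robust_list_head, with a list head, a futex_offset and a
+ * list_op_pending pointer; the threading library registers one of these
+ * per thread via set_robust_list(2).)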
+ */ +typedef struct futex_robust_list { + uintptr_t frl_head; /* list of robust locks held */ + uint64_t frl_offset; /* offset of lock word within a lock */ + uintptr_t frl_pending; /* pending operation */ +} futex_robust_list_t; + +#if defined(_SYSCALL32_IMPL) + +#pragma pack(4) +typedef struct futex_robust_list32 { + uint32_t frl_head; /* list of robust locks held */ + uint32_t frl_offset; /* offset of lock word within a lock */ + uint32_t frl_pending; /* pending operation */ +} futex_robust_list32_t; +#pragma pack() + +#endif + +#define MEMID_COPY(s, d) \ + { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; } +#define MEMID_EQUAL(s, d) \ + ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1]) + +/* + * Because collisions on this hash table can be a source of negative + * scalability, we make it pretty large: 4,096 entries -- 64K. If this + * size is found to be insufficient, the size should be made dynamic. + * (Making it dynamic will be delicate because the per-chain locking will + * necessitate memory retiring or similar; see the 2008 ACM Queue article + * "Real-world concurrency" for details on this technique.) + */ +#define HASH_SHIFT_SZ 12 +#define HASH_SIZE (1 << HASH_SHIFT_SZ) +#define HASH_FUNC(id) \ + ((((uintptr_t)((id)->val[1]) >> 3) + \ + ((uintptr_t)((id)->val[1]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[1]) >> (3 + 2 * HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> 3) + \ + ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \ + (HASH_SIZE - 1)) + +/* + * A small, invalid value we can compare against to find the highest scheduling + * priority. + */ +#define BELOW_MINPRI INT_MIN + +/* + * We place the per-chain lock next to the pointer to the chain itself. + * When compared to an array of orthogonal locks, this reduces false sharing + * (though adjacent entries can still be falsely shared -- just not as many), + * while having the additional bonus of increasing locality. + */ +typedef struct futex_hash { + kmutex_t fh_lock; + fwaiter_t *fh_waiters; +} futex_hash_t; + +static futex_hash_t futex_hash[HASH_SIZE]; + +static void +futex_hashin(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + fwp->fw_prev = NULL; + fwp->fw_next = futex_hash[index].fh_waiters; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp; + futex_hash[index].fh_waiters = fwp; +} + +static void +futex_hashout(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + if (fwp->fw_prev) + fwp->fw_prev->fw_next = fwp->fw_next; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp->fw_prev; + if (futex_hash[index].fh_waiters == fwp) + futex_hash[index].fh_waiters = fwp->fw_next; + + fwp->fw_prev = NULL; + fwp->fw_next = NULL; +} + +/* + * Go to sleep until somebody does a WAKE operation on this futex, we get a + * signal, or the timeout expires. + */ +static int +futex_wait(memid_t *memid, caddr_t addr, + int val, timespec_t *timeout, uint32_t bits, boolean_t hrtime) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp = &lwpd->br_fwaiter; + int err, ret; + int32_t curval; + int index; + + /* + * The LMS_USER_LOCK micro state becomes valid if we sleep; otherwise + * our time will accrue against LMS_SYSTEM. Use of this micro state + * is modelled on lwp_mutex_timedlock(), a native analogue of + * futex_wait(). 
+ */ + (void) new_mstate(t, LMS_USER_LOCK); + + fwp->fw_woken = 0; + fwp->fw_bits = bits; + fwp->fw_tid = 0; + + MEMID_COPY(memid, &fwp->fw_memid); + cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL); + + index = HASH_FUNC(&fwp->fw_memid); + mutex_enter(&futex_hash[index].fh_lock); + + if (fuword32(addr, (uint32_t *)&curval)) { + err = set_errno(EFAULT); + goto out; + } + if (curval != val) { + err = set_errno(EWOULDBLOCK); + goto out; + } + + futex_hashin(fwp); + + err = 0; + while ((fwp->fw_woken == 0) && (err == 0)) { + /* + * If hrtime is set, we interpret timeout to be absolute and + * CLOCK_MONOTONIC-based; otherwise we treat it as absolute + * and CLOCK_REALTIME-based. (Strictly speaking -- or at least + * in as much as the term "strictly" means anything in the + * semantic shambles that is Linux -- FUTEX_WAIT defines its + * timeout to be CLOCK_MONOTONIC-based but limited by system + * clock interval; we treat these semantics as effectively + * CLOCK_REALTIME.) + */ + if (hrtime) { + ret = cv_timedwait_sig_hrtime(&fwp->fw_cv, + &futex_hash[index].fh_lock, ts2hrt(timeout)); + } else { + ret = cv_waituntil_sig(&fwp->fw_cv, + &futex_hash[index].fh_lock, timeout, timechanged); + } + + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* + * According to signal(7), a futex(2) call with the + * FUTEX_WAIT operation is restartable. + */ + ttolxlwp(t)->br_syscall_restart = B_TRUE; + err = set_errno(EINTR); + } + } + + /* + * The futex is normally hashed out in wakeup. If we timed out or + * got a signal, we need to hash it out here instead. + */ + if (fwp->fw_woken == 0) + futex_hashout(fwp); + +out: + mutex_exit(&futex_hash[index].fh_lock); + + return (err); +} + +/* + * Wake up to wake_threads threads that are blocked on the futex at memid. + */ +static int +futex_wake(memid_t *memid, int wake_threads, uint32_t mask) +{ + fwaiter_t *fwp, *next; + int index; + int ret = 0; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash[index].fh_lock); + + for (fwp = futex_hash[index].fh_waiters; + fwp != NULL && ret < wake_threads; fwp = next) { + next = fwp->fw_next; + if (MEMID_EQUAL(&fwp->fw_memid, memid)) { + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and + * non-PI usage on the same futex. 
+ */ + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EINVAL)); + } + + if ((fwp->fw_bits & mask)) { + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + ret++; + } + } + } + + mutex_exit(&futex_hash[index].fh_lock); + + return (ret); +} + +static int +futex_wake_op_execute(int32_t *addr, int32_t val3) +{ + int32_t op = FUTEX_OP_OP(val3); + int32_t cmp = FUTEX_OP_CMP(val3); + int32_t cmparg = FUTEX_OP_CMPARG(val3); + int32_t oparg, oldval, newval; + label_t ljb; + int rval; + + if ((uintptr_t)addr >= KERNELBASE) + return (-EFAULT); + + if (on_fault(&ljb)) + return (-EFAULT); + + oparg = FUTEX_OP_OPARG(val3); + + do { + oldval = *addr; + newval = oparg; + + switch (op) { + case FUTEX_OP_SET: + break; + + case FUTEX_OP_ADD: + newval += oparg; + break; + + case FUTEX_OP_OR: + newval |= oparg; + break; + + case FUTEX_OP_ANDN: + newval &= ~oparg; + break; + + case FUTEX_OP_XOR: + newval ^= oparg; + break; + + default: + no_fault(); + return (-EINVAL); + } + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + no_fault(); + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + rval = (oldval == cmparg); + break; + + case FUTEX_OP_CMP_NE: + rval = (oldval != cmparg); + break; + + case FUTEX_OP_CMP_LT: + rval = (oldval < cmparg); + break; + + case FUTEX_OP_CMP_LE: + rval = (oldval <= cmparg); + break; + + case FUTEX_OP_CMP_GT: + rval = (oldval > cmparg); + break; + + case FUTEX_OP_CMP_GE: + rval = (oldval >= cmparg); + break; + + default: + return (-EINVAL); + } + + return (rval); +} + +static int +futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2, + int wake_threads, int wake_threads2, int val3) +{ + kmutex_t *l1, *l2; + int ret = 0, ret2 = 0, wake; + fwaiter_t *fwp, *next; + int index1, index2; + + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(memid2); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + /* LINTED: alignment */ + if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0) { + (void) set_errno(-wake); /* convert back to positive errno */ + ret = -1; + goto out; + } + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and non-PI + * usage on the same futex. + */ + (void) set_errno(EINVAL); + ret = -1; + goto out; + } + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret >= wake_threads) { + break; + } + } + + if (!wake) + goto out; + + for (fwp = futex_hash[index2].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid2)) + continue; + + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and non-PI + * usage on the same futex. + */ + (void) set_errno(EINVAL); + ret = -1; + goto out; + } + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret2 >= wake_threads2) { + break; + } + } + + ret += ret2; +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + return (ret); +} + +/* + * Wake up to wake_threads waiting on the futex at memid. If there are + * more than that many threads waiting, requeue the remaining threads on + * the futex at requeue_memid. 
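For a sense of why requeueing exists at all, here is the shape of the call an older glibc pthread_cond_broadcast() makes (a sketch, not part of this change; the glibc detail is an assumption about pre-2.25 condition variables): wake one waiter and move the rest straight onto the mutex futex so they do not all wake only to collide on the mutex.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>

static long
cond_broadcast_requeue(int *cond_futex, int *mutex_futex, int cond_val)
{
	/*
	 * Wake at most one thread and requeue up to INT_MAX (passed in the
	 * timeout slot) onto mutex_futex; cond_val is the expected value of
	 * *cond_futex, so a concurrent wakeup yields EAGAIN rather than
	 * being lost.
	 */
	return (syscall(SYS_futex, cond_futex, FUTEX_CMP_REQUEUE, 1,
	    INT_MAX, mutex_futex, cond_val));
}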
+ */ +static int +futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads, + ulong_t requeue_threads, caddr_t addr, int *cmpval) +{ + fwaiter_t *fwp, *next; + int index1, index2; + int ret = 0; + int32_t curval; + kmutex_t *l1, *l2; + + /* + * To ensure that we don't miss a wakeup if the value of cmpval + * changes, we need to grab locks on both the original and new hash + * buckets. To avoid deadlock, we always grab the lower-indexed + * lock first. + */ + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(requeue_memid); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + if (cmpval != NULL) { + if (fuword32(addr, (uint32_t *)&curval)) { + ret = -EFAULT; + goto out; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out; + } + } + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + futex_hashout(fwp); + if (ret++ < wake_threads) { + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } else { + MEMID_COPY(requeue_memid, &fwp->fw_memid); + futex_hashin(fwp); + + if ((ret - wake_threads) >= requeue_threads) + break; + } + } + +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + if (ret < 0) + return (set_errno(-ret)); + return (ret); +} + +/* + * Copy in the timeout provided by the application and convert it to an + * absolute timeout. Sadly, this is complicated by the different timeout + * semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET vs. FUTEX_LOCK_PI. (Yes, you + * read that correctly; all three of these have different timeout semantics; + * see the block comment at the top of the file for commentary on this + * inanity.) This function doesn't attempt to clean up all of these + * differences, however; we will only copy the timer value in, perform some + * basic sanity checking, and (if it's an operation operating on a relative + * time, which is to say FUTEX_WAIT) adjust it to be absolute. All other + * nuances (namely, the resolution and clock of the timeout) are left up to + * the caller. + */ +static int +get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd) +{ + timestruc_t now; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(lx_timeout, timeout, sizeof (timestruc_t))) + return (EFAULT); + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t))) + return (EFAULT); + timeout->tv_sec = (time_t)timeout32.tv_sec; + timeout->tv_nsec = timeout32.tv_nsec; + } +#endif + if (itimerspecfix(timeout)) + return (EINVAL); + + if (cmd == FUTEX_WAIT) { + /* + * We've been given a relative time; add it to the current + * time to derive an absolute time. + */ + gethrestime(&now); + timespecadd(timeout, &now); + } + + return (0); +} + +/* + * Attempt to take the futex. If currently held, enqueue (sleep) on the futex + * until a thread performs futex_unlock_pi, we get a signal, or the timeout + * expires. If 'is_trylock' is true and the futex is currently held, return + * EAGAIN immediately. 
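The user-land half of the protocol implemented below, sketched for orientation (not part of this change; SYS_gettid and the futex bit constants are the Linux definitions this file mirrors): the futex word holds the owner's TID, an uncontended acquire is a bare compare-and-swap, and FUTEX_LOCK_PI is only issued once that fails.

#include <linux/futex.h>	/* FUTEX_LOCK_PI, FUTEX_WAITERS, FUTEX_TID_MASK */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static int
pi_mutex_lock(atomic_uint *futexp)
{
	unsigned int expected = 0;
	unsigned int tid = (unsigned int)syscall(SYS_gettid);

	/* fast path: 0 -> our TID acquires the lock with no syscall at all */
	if (atomic_compare_exchange_strong(futexp, &expected, tid))
		return (0);

	/* contended: the kernel sets FUTEX_WAITERS and queues us, as the comment above describes */
	return ((int)syscall(SYS_futex, futexp, FUTEX_LOCK_PI, 0, NULL,
	    NULL, 0));
}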
+ */ +static int +futex_lock_pi(memid_t *memid, uint32_t *addr, timespec_t *timeout, + boolean_t is_trylock) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp = &lwpd->br_fwaiter; + fwaiter_t *f_fwp; + int fpri, mypri; + int err; + int index; + /* volatile to silence gcc clobber warning for longjmp */ + volatile pid_t mytid; + pid_t ftid; /* current futex holder tid */ + proc_t *fproc = NULL; /* current futex holder proc */ + kthread_t *fthrd; /* current futex holder thread */ + volatile uint32_t oldval; + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid); + + /* + * Have to take mutex first to prevent the following race with unlock: + * a) T1 sees a tid in the futex and atomically sets FUTEX_WAITERS. + * b) T2 calls unlock, sees there are waiters, but since nothing is in + * the queue yet, it simply returns with the futex now containing 0. + * c) T1 proceeds to enqueue itself. + * At this point nothing will ever wake T1. + */ + index = HASH_FUNC(memid); + mutex_enter(&futex_hash[index].fh_lock); + + /* It would be very unusual to actually loop here. */ + oldval = 0; + /* CONSTCOND */ + while (1) { + uint32_t curval; + label_t ljb; + + if (on_fault(&ljb)) { + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EFAULT)); + } + + /* + * We optimistically try to set our tid on the off chance that + * the futex was released after we initiated the syscall. That + * may work but it is the unlikely path and is usually just our + * way of getting the current value. This also handles the + * retry in the case when the futex only has the high bits set. + */ + curval = atomic_cas_32(addr, oldval, mytid); + if (oldval == curval) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + + oldval = curval; + ftid = oldval & FUTEX_TID_MASK; + /* high bits were only ones set, so we retry to set our tid */ + if (ftid == 0) { + no_fault(); + continue; + } + + if (ftid == mytid) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EDEADLK)); + } + + /* The futex is currently held by another thread. */ + if (is_trylock) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EAGAIN)); + } + + curval = atomic_cas_32(addr, oldval, oldval | FUTEX_WAITERS); + no_fault(); + if (curval == oldval) { + /* + * We set the WAITERS bit so now we can enqueue our + * thread on the mutex. This is the typical path. + */ + oldval |= FUTEX_WAITERS; + break; + } + + /* + * The rare case when a change snuck into the window between + * first getting the futex value and updating it; retry. + */ + oldval = 0; + } + + /* + * Determine if the current futex holder's priority needs to inherit + * our priority (only if it should be increased). + * + * If a non-branded proc is sharing this futex(!?) then we don't + * interact with it. This seems like it would only occur maliciously. + * That proc will never be able to call futex(2) to unlock the futex. + * We just return ESRCH for this invalid case. + * + * Otherwise, get the holder's priority and if necessary, bump it up to + * our level. 
+ */ + mutex_enter(&curproc->p_lock); + (void) CL_DOPRIO(curthread, kcred, 0, &mypri); + mutex_exit(&curproc->p_lock); + + if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) != 0) { + label_t ljb; + + if (on_fault(&ljb) == 0) { + (void) atomic_cas_32(addr, oldval, + oldval | FUTEX_OWNER_DIED); + } + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(ESRCH)); + } + if (!PROC_IS_BRANDED(fproc)) { + mutex_exit(&fproc->p_lock); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(ESRCH)); + } + + ASSERT(MUTEX_HELD(&fproc->p_lock)); + (void) CL_DOPRIO(fthrd, kcred, 0, &fpri); + + f_fwp = &lwptolxlwp(ttolwp(fthrd))->br_fwaiter; + if (mypri > fpri) { + /* Save holder's current pri if not already bumped up */ + if (!f_fwp->fw_pri_up) + f_fwp->fw_opri = fpri; + f_fwp->fw_pri_up = B_TRUE; + DTRACE_PROBE2(futex__lck__pri, int, mypri, int, fpri); + CL_DOPRIO(fthrd, kcred, mypri - fpri, &fpri); + } + + /* + * If we haven't already been bumped by some other thread then + * record our pri at time of enqueue. + */ + if (!fwp->fw_pri_up) { + fwp->fw_opri = mypri; + } + mutex_exit(&fproc->p_lock); + + /* + * Enqueue our thread on the mutex. This is similar to futex_wait(). + * See futex_wait() for LMS_USER_LOCK state description. + */ + (void) new_mstate(t, LMS_USER_LOCK); + + fwp->fw_woken = 0; + fwp->fw_bits = 0; + fwp->fw_tid = mytid; + MEMID_COPY(memid, &fwp->fw_memid); + cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL); + + futex_hashin(fwp); + + err = 0; + while (fwp->fw_woken == 0 && err == 0) { + int ret; + + ret = cv_waituntil_sig(&fwp->fw_cv, &futex_hash[index].fh_lock, + timeout, timechanged); + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* EINTR is not valid for futex_lock_pi */ + err = set_errno(EAGAIN); + } + } + + /* + * The futex is normally hashed out in futex_unlock_pi. If we timed out + * or got a signal, we need to hash it out here instead. + */ + if (fwp->fw_woken == 0) + futex_hashout(fwp); + + mutex_exit(&futex_hash[index].fh_lock); + return (err); +} + +/* + * This must be a separate function to prevent compiler complaints about + * clobbering variables via longjmp (on_fault). When setting the new owner we + * must preserve the current WAITERS and OWNER_DIED bits. + */ +static int +futex_unlock_pi_waiter(fwaiter_t *fnd_fwp, uint32_t *addr, uint32_t curval) +{ + label_t ljb; + pid_t tid; + + if (on_fault(&ljb)) { + return (EFAULT); + } + + /* No waiter on this futex; again, not normal, but not an error. */ + if (fnd_fwp == NULL) { + int res = 0; + if (atomic_cas_32(addr, curval, + 0 | (curval & FUTEX_OWNER_DIED)) != curval) + res = EINVAL; + no_fault(); + return (res); + } + + tid = fnd_fwp->fw_tid | (curval & (FUTEX_WAITERS | FUTEX_OWNER_DIED)); + if (atomic_cas_32(addr, curval, tid) != curval) { + /* + * The value was changed behind our back, return an error and + * don't dequeue the waiter. + */ + no_fault(); + return (EINVAL); + } + + no_fault(); + + futex_hashout(fnd_fwp); + fnd_fwp->fw_woken = 1; + cv_signal(&fnd_fwp->fw_cv); + + return (0); +} + +/* + * Paired with futex_lock_pi; wake up highest priority thread that is blocked + * on the futex at memid. A non-zero 'clean_tid' argument is used for a PI + * futex during robust or trylock cleanup when the calling thread may not own + * the futex. During cleanup we check that the futex contains the expected + * tid to avoid cleanup races. 
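And the matching release path on the user side (again a sketch, not part of this change): the owner tries to swap its bare TID back to zero and only enters the kernel when FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set, which is why the handler below can insist that a non-cleanup caller actually holds the futex.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static int
pi_mutex_unlock(atomic_uint *futexp)
{
	unsigned int tid = (unsigned int)syscall(SYS_gettid);
	unsigned int expected = tid;

	/* fast path: the word is exactly our TID, so nobody is waiting */
	if (atomic_compare_exchange_strong(futexp, &expected, 0))
		return (0);

	/* high bits are set: let the kernel hand the lock to a waiter */
	return ((int)syscall(SYS_futex, futexp, FUTEX_UNLOCK_PI, 0, NULL,
	    NULL, 0));
}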
+ */ +static int +futex_unlock_pi(memid_t *memid, uint32_t *addr, pid_t clean_tid) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp, *fnd_fwp; + uint32_t curval; + pid_t mytid; + pid_t holder_tid; + int index; + int hipri; + int res; + + if ((uintptr_t)addr >= KERNELBASE) + return (EFAULT); + + mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid); + + /* See comment in futex_lock_pi for why we take the mutex first. */ + index = HASH_FUNC(memid); + mutex_enter(&futex_hash[index].fh_lock); + + if (fuword32(addr, &curval)) { + mutex_exit(&futex_hash[index].fh_lock); + return (EFAULT); + } + + holder_tid = curval & FUTEX_TID_MASK; + if (clean_tid == 0) { + /* Not cleaning up so we must hold the futex */ + if (holder_tid != mytid) { + mutex_exit(&futex_hash[index].fh_lock); + return (EPERM); + } + } else { + /* + * We're doing cleanup but we want to check if another thread + * already did the cleanup due to a race before we took the + * futex_hash.fh_lock. + * + * There are two posible cases here: + * 1) During robust cleanup we already cleared the dead tid + * from the futex and set the FUTEX_OWNER_DIED bit. + * 2) During trylock cleanup we want to be sure the tid we + * saw in the futex before we took the futex_hash lock + * is still there and that we did not race with another + * trylock also doing cleanup. + */ + DTRACE_PROBE2(futex__unl__clean, int, curval, int, clean_tid); + if ((curval & FUTEX_OWNER_DIED) != 0) { + if (holder_tid != 0) { + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + } else if (holder_tid != clean_tid) { + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + } + + /* + * If necessary, restore our old priority. Since we only ever bump up + * the priority, our incr should be negative, but we allow for the + * case where the priority was lowered in some other way while we held + * the futex. Also, we only reset our priority on a true unlock, not + * when cleaning up, as indicated by clean_tid. + */ + if (clean_tid == 0) { + fwp = &lwpd->br_fwaiter; + if (fwp->fw_pri_up) { + int curpri; + int incr; + + mutex_enter(&curproc->p_lock); + CL_DOPRIO(curthread, kcred, 0, &curpri); + DTRACE_PROBE2(futex__unl__pri, int, fwp->fw_opri, + int, curpri); + incr = fwp->fw_opri - curpri; + if (incr < 0) { + CL_DOPRIO(curthread, kcred, incr, &curpri); + } + mutex_exit(&curproc->p_lock); + fwp->fw_pri_up = B_FALSE; + } + } + + /* + * Normally an application wouldn't make the syscall if the WAITERS + * bit is not set, but we also come through here on robust and trylock + * cleanup. Preserve the OWNER_DIED bit even though there are no + * waiters and we're just clearing the tid. + */ + if ((curval & FUTEX_WAITERS) == 0) { + res = 0; + label_t fjb; + + if (on_fault(&fjb)) { + mutex_exit(&futex_hash[index].fh_lock); + return (EFAULT); + } + if (atomic_cas_32(addr, curval, + 0 | (curval & FUTEX_OWNER_DIED)) != curval) { + res = EINVAL; + } + + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (res); + } + + /* Find the highest priority waiter. */ + hipri = BELOW_MINPRI; + fnd_fwp = NULL; + for (fwp = futex_hash[index].fh_waiters; fwp != NULL; + fwp = fwp->fw_next) { + if (MEMID_EQUAL(&fwp->fw_memid, memid)) { + if (fwp->fw_tid == 0) { + /* + * A non-PI waiter. It is invalid to mix PI and + * non-PI usage on the same futex. 
+ */ + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (EINVAL); + } + /* + * Because futex_hashin inserts at the head of the list + * we want to find the oldest entry with the highest + * priority (hence >=). + */ + if (fwp->fw_opri >= hipri) { + fnd_fwp = fwp; + hipri = fwp->fw_opri; + } + } + } + + res = futex_unlock_pi_waiter(fnd_fwp, addr, curval); + mutex_exit(&futex_hash[index].fh_lock); + return (res); +} + +/* + * Handle the case where the futex holder is gone and try to recover. Trylock + * will never enqueue on the futex and must return EAGAIN if it is held by + * a live process. + */ +static int +futex_trylock_pi(memid_t *memid, uint32_t *addr) +{ + uint32_t curval; + pid_t ftid; /* current futex holder tid */ + proc_t *fproc = NULL; /* current futex holder proc */ + kthread_t *fthrd; /* current futex holder thread */ + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + if (fuword32(addr, &curval)) + return (set_errno(EFAULT)); + + /* The futex is free, use the normal flow. */ + if (curval == 0) + return (futex_lock_pi(memid, addr, NULL, B_TRUE)); + + /* Determine if the current futex holder is still alive. */ + ftid = curval & FUTEX_TID_MASK; + if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) == 0) { + mutex_exit(&fproc->p_lock); + } else { + /* + * The current holder is gone. Unlock then take the lock. + * Ignore any error that may result from two threads racing to + * cleanup. + */ + (void) futex_unlock_pi(memid, addr, ftid); + } + return (futex_lock_pi(memid, addr, NULL, B_TRUE)); +} + +long +lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val3) +{ + struct as *as = curproc->p_as; + memid_t memid, memid2; + timestruc_t timeout; + timestruc_t *tptr = NULL; + int val2 = NULL; + int rval = 0; + int cmd = op & FUTEX_CMD_MASK; + int private = op & FUTEX_PRIVATE_FLAG; + char dmsg[32]; + + /* must be aligned on int boundary */ + if (addr & 0x3) + return (set_errno(EINVAL)); + + /* Sanity check the futex command */ + if (cmd < 0 || cmd > FUTEX_MAX_CMD) + return (set_errno(EINVAL)); + + if (cmd == FUTEX_FD) { + /* + * FUTEX_FD was sentenced to death for grievous crimes of + * semantics against humanity; it has been ripped out of Linux + * and will never be supported by us. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + switch (cmd) { + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + /* + * These are operations that we don't currently support, but + * may well need to in the future. For now, callers need to + * deal with these being missing -- but if and as that changes, + * they may well need to be implemented. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) { + /* + * Linux only allows FUTEX_CLOCK_REALTIME to be set on the + * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands. + */ + return (set_errno(ENOSYS)); + } + + /* Copy in the timeout structure from userspace. */ + if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_LOCK_PI) && lx_timeout != NULL) { + rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd); + + if (rval != 0) + return (set_errno(rval)); + tptr = &timeout; + } + + switch (cmd) { + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + /* + * lx_timeout is nominally a pointer to a userspace address. 
+ * For several commands, however, it actually contains + * an additional integer parameter. This is horrible, and + * the people who did this to us should be sorry. + */ + val2 = (int)lx_timeout; + } + + /* + * Translate the process-specific, user-space futex virtual + * address(es) to a universal memid. If the private bit is set, we + * can just use our as plus the virtual address, saving quite a bit + * of effort. + */ + if (private) { + memid.val[0] = (uintptr_t)as; + memid.val[1] = (uintptr_t)addr; + } else { + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) + return (set_errno(rval)); + } + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) { + if (addr2 & 0x3) + return (set_errno(EINVAL)); + + if (private) { + memid2.val[0] = (uintptr_t)as; + memid2.val[1] = (uintptr_t)addr2; + } else { + rval = as_getmemid(as, (void *)addr2, &memid2); + if (rval) + return (set_errno(rval)); + } + } + + switch (cmd) { + case FUTEX_WAIT: + rval = futex_wait(&memid, (void *)addr, val, + tptr, FUTEX_BITSET_MATCH_ANY, B_FALSE); + break; + + case FUTEX_WAIT_BITSET: + rval = futex_wait(&memid, (void *)addr, val, tptr, val3, + (op & FUTEX_CLOCK_REALTIME) ? B_FALSE : B_TRUE); + break; + + case FUTEX_WAKE: + rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAKE_BITSET: + rval = futex_wake(&memid, val, val3); + break; + + case FUTEX_WAKE_OP: + rval = futex_wake_op(&memid, (void *)addr2, &memid2, + val, val2, val3); + break; + + case FUTEX_CMP_REQUEUE: + case FUTEX_REQUEUE: + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, &val3); + + break; + + case FUTEX_LOCK_PI: + rval = futex_lock_pi(&memid, (uint32_t *)addr, tptr, B_FALSE); + break; + + case FUTEX_TRYLOCK_PI: + rval = futex_trylock_pi(&memid, (uint32_t *)addr); + break; + + case FUTEX_UNLOCK_PI: + rval = futex_unlock_pi(&memid, (uint32_t *)addr, 0); + if (rval != 0) + (void) set_errno(rval); + break; + } + + return (rval); +} + +/* + * Wake the next waiter if the thread holding the futex has exited without + * releasing the futex. + */ +static void +futex_robust_wake(memid_t *memid, uint32_t tid) +{ + fwaiter_t *fwp; + int index; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash[index].fh_lock); + + for (fwp = futex_hash[index].fh_waiters; fwp != NULL; + fwp = fwp->fw_next) { + if (MEMID_EQUAL(&fwp->fw_memid, memid)) + break; + } + + if (fwp != NULL) { + if (fwp->fw_tid != 0) { + /* + * This is a PI futex and there is a waiter; unlock the + * futex in cleanup mode. Ignore errors, which are very + * unlikely, but could happen if the futex was in an + * unexpected state due to some other cleanup, such as + * might happen with a concurrent trylock call. + */ + mutex_exit(&futex_hash[index].fh_lock); + (void) futex_unlock_pi(memid, + (uint32_t *)(uintptr_t)memid->val[1], tid); + return; + } + + /* non-PI futex, just wake it */ + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } + + mutex_exit(&futex_hash[index].fh_lock); +} + +/* + * Does the dirty work of actually dropping a held robust lock in the event + * of the untimely death of the owner; see lx_futex_robust_exit(), below. 
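For context on where the list being dropped below comes from: a threading library typically registers one robust_list_head per thread, and the kernel only reads it back at thread exit. A minimal sketch (not part of this change; the self-pointing empty-list convention follows robust-futex-ABI.txt, and per-thread storage is elided for brevity):

#include <linux/futex.h>	/* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

static struct robust_list_head robust_head = {
	.list = { &robust_head.list },	/* empty list points back at itself */
	.futex_offset = 0,		/* set by the library: node -> lock-word offset */
	.list_op_pending = NULL,	/* no acquire/release in flight */
};

static long
register_robust_list(void)
{
	/* the length is validated, much as lx_set_robust_list() does below */
	return (syscall(SYS_set_robust_list, &robust_head,
	    sizeof (robust_head)));
}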
+ */ +static void +lx_futex_robust_drop(uintptr_t addr, uint32_t tid) +{ + memid_t memid; + uint32_t oldval, newval; + + VERIFY(addr + sizeof (uint32_t) < KERNELBASE); + + do { + fuword32_noerr((void *)addr, &oldval); + + if ((oldval & FUTEX_TID_MASK) != tid) + return; + + newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + /* + * We have now denoted that this lock's owner is dead; we need to + * wake any waiters. + */ + if (as_getmemid(curproc->p_as, (void *)addr, &memid) != 0) + return; + + futex_robust_wake(&memid, tid); +} + +/* + * Called when a thread is exiting. The role of the kernel is very clearly + * spelled out in the Linux design document entitled robust-futex-ABI.txt: + * we must (carefully!) iterate over the list of held locks pointed to by + * the robust list head; for each lock, we'll check to see if the calling + * (exiting) thread is the owner, and if so, denote that the lock is dead + * and wake any waiters. (The "pending" field of the head points to a lock + * that is in transition; it should be dropped if held.) If there are any + * errors through here at all (including memory operations), we abort the + * entire operation. + */ +void +lx_futex_robust_exit(uintptr_t addr, uint32_t tid) +{ + futex_robust_list_t list; + uintptr_t entry, next; + model_t model = get_udatamodel(); + int length = 0; + label_t ljb; + + if (on_fault(&ljb)) + return; + + if (addr + sizeof (futex_robust_list_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + copyin_noerr((void *)addr, &list, sizeof (list)); + } +#if defined(_SYSCALL32_IMPL) + else { + futex_robust_list32_t list32; + + copyin_noerr((void *)addr, &list32, sizeof (list32)); + list.frl_head = list32.frl_head; + list.frl_offset = list32.frl_offset; + list.frl_pending = list32.frl_pending; + } +#endif + + /* + * Strip off the PI bit, if any. + */ + entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI; + + while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) { + if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + fulword_noerr((void *)entry, &next); + } +#if defined(_SYSCALL32_IMPL) + else { + uint32_t next32; + fuword32_noerr((void *)entry, &next32); + next = next32; + } +#endif + + /* + * Drop the robust mutex -- but only if our pending lock didn't + * somehow sneak on there. + */ + if (entry != list.frl_pending) + lx_futex_robust_drop(entry + list.frl_offset, tid); + + entry = next & ~FUTEX_LOCK_PI; + } + + /* + * Finally, drop the pending lock if there is one. + */ + if (list.frl_pending != NULL && list.frl_pending + + list.frl_offset + sizeof (uint32_t) < KERNELBASE) + lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid); + +out: + no_fault(); +} + +long +lx_set_robust_list(void *listp, size_t len) +{ + proc_t *p = curproc; + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (len != sizeof (futex_robust_list_t)) + return (set_errno(EINVAL)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (len != sizeof (futex_robust_list32_t)) + return (set_errno(EINVAL)); + } +#endif + + /* + * To assure that we are serialized with respect to any racing call + * to lx_get_robust_list(), we lock ourselves to set the value. (Note + * that sprunlock() drops p_lock.) 
+ */ + mutex_enter(&p->p_lock); + sprlock_proc(p); + lwpd->br_robust_list = listp; + sprunlock(p); + + return (0); +} + +long +lx_get_robust_list(pid_t pid, void **listp, size_t *lenp) +{ + model_t model = get_udatamodel(); + proc_t *rproc; + kthread_t *rthr; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + void *list; + int err = 0; + + if (pid == 0) { + /* + * A pid of 0 denotes the current thread; we lock the current + * process even though it isn't strictly necessary (we can't + * race with set_robust_list() because a thread may only set + * its robust list on itself). + */ + rproc = curproc; + rlwpd = lwptolxlwp(ttolwp(curthread)); + mutex_enter(&curproc->p_lock); + sprlock_proc(rproc); + } else { + if (lx_lpid_lock(pid, curzone, LXP_PRLOCK, &rproc, + &rthr) != 0) { + return (set_errno(ESRCH)); + } + + if (rproc->p_model != model || + (rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL) { + /* + * The target process does not match our data model, or + * we couldn't find the LWP, or the target process is + * not branded. + */ + err = ESRCH; + goto out; + } + } + + if (curproc != rproc && + priv_proc_cred_perm(curproc->p_cred, rproc, NULL, VREAD) != 0) { + /* + * We don't have the permission to examine the target. + */ + err = EPERM; + goto out; + } + + list = rlwpd->br_robust_list; + +out: + sprunlock(rproc); + + if (err != 0) + return (set_errno(err)); + + if (model == DATAMODEL_NATIVE) { + if (sulword(listp, (uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (sulword(lenp, sizeof (futex_robust_list_t)) != 0) + return (set_errno(EFAULT)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (suword32(listp, (uint32_t)(uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0) + return (set_errno(EFAULT)); + } +#endif + + return (0); +} + +void +lx_futex_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + mutex_init(&futex_hash[i].fh_lock, NULL, MUTEX_DEFAULT, NULL); +} + +int +lx_futex_fini(void) +{ + int i, err; + + err = 0; + for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { + mutex_enter(&futex_hash[i].fh_lock); + if (futex_hash[i].fh_waiters != NULL) + err = EBUSY; + mutex_exit(&futex_hash[i].fh_lock); + } + return (err); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c new file mode 100644 index 0000000000..275a781fa0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/pathname.h> + +/* + * getcwd() - Linux syscall semantics are slightly different; we need to return + * the length of the pathname copied (+ 1 for the terminating NULL byte.) 
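A quick user-land illustration of the difference being matched here (a sketch, not part of this change): the raw Linux getcwd syscall returns a byte count, unlike the libc wrapper, which returns a pointer.

#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>
#include <string.h>
#include <assert.h>

int
main(void)
{
	char buf[PATH_MAX];
	long n = syscall(SYS_getcwd, buf, sizeof (buf));

	/* on success the raw syscall returns strlen(buf) + 1 */
	assert(n > 0 && (size_t)n == strlen(buf) + 1);

	/* a too-small buffer fails with ERANGE, as in lx_getcwd() below */
	assert(syscall(SYS_getcwd, buf, 1) == -1);
	return (0);
}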
+ */ +long +lx_getcwd(char *buf, int size) +{ + int len; + int error; + vnode_t *vp; + char path[MAXPATHLEN + 1]; + + mutex_enter(&curproc->p_lock); + vp = PTOU(curproc)->u_cdir; + VN_HOLD(vp); + mutex_exit(&curproc->p_lock); + if ((error = vnodetopath(NULL, vp, path, sizeof (path), CRED())) != 0) { + VN_RELE(vp); + return (set_errno(error)); + } + VN_RELE(vp); + + len = strlen(path) + 1; + if (len > size) + return (set_errno(ERANGE)); + + if (copyout(path, buf, len) != 0) + return (set_errno(EFAULT)); + + return (len); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c new file mode 100644 index 0000000000..5bde892aea --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/vnode.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> + +#include <sys/lx_types.h> +#include <sys/lx_misc.h> + +#define LX_NAMEMAX 256 + +#define LX_GETDENTS_MAX_BUFSZ 65536 + +/* + * See the comment in our lx_sysfs VFS code for a detailed explanation around + * the handling of 'd_type' here. + */ +#define LX_DT_UNKNOWN 0 +#define LX_DT_FIFO 1 +#define LX_DT_CHR 2 +#define LX_DT_DIR 4 +#define LX_DT_BLK 6 +#define LX_DT_REG 8 +#define LX_DT_LNK 10 +#define LX_DT_SOCK 12 + +/* + * Set by lx_sysfs when it loads. lx_sysfs depends on the lx_brand module, + * so our module has to load first and define the variables that lx_sysfs will + * set when it loads. + */ +int lx_sysfs_vfs_type; +int (*lx_sysfs_vtype)(ino_t); + +/* + * Because the Linux dirent has an extra field (d_type), it's possible that + * each entry will be 8 bytes larger (and aligned to 8 bytes) due to padding. + * To prevent overrun during translation, the illumos-native buffer is sized + * pessimistically. + */ +#define LTOS_GETDENTS_BUFSZ(bufsz, datasz) \ + (((bufsz) / (((datasz) + 15) & ~7)) * sizeof (struct dirent)) + +/* + * Linux d_type offset is at (d_reclen - 1). See the Linux getdents(2) man page. + * This macro assumes d_reclen is already set correctly. + */ +#define LX_DTYPE(l) *(((char *)l) + (l->d_reclen - 1)) + +/* + * Record must be long enough to house d_name string, null terminator and + * d_type field. It's then padded to nearest 8-byte boundary + */ +#define LX_RECLEN(l, t) \ + ((offsetof(t, d_name) + 2 + (l) + 7) & ~7) + +/* + * Bytes after d_name string until d_reclen should be zeroed. 
+ * Includes zero-terminating d_name + */ +#define LX_ZEROLEN(l, t) \ + (LX_RECLEN(l, t) - \ + ((offsetof(t, d_name) + (l)))) + +/* The output format of getdents differs if the caller is 32 or 64 bit. */ +struct lx_dirent_32 { + uint32_t d_ino; + int32_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +struct lx_dirent_64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +static long +lx_getdents_common(int fd, caddr_t uptr, size_t count, + unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int, boolean_t)) +{ + vnode_t *vp; + boolean_t is_sysfs = B_FALSE; + file_t *fp; + struct uio auio; + struct iovec aiov; + int error, at_eof; + int sbufsz, lbufsz, bufsz; + void *lbuf, *sbuf; + size_t outb = 0; + + if (count < lx_size) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VDIR) { + releasef(fd); + return (set_errno(ENOTDIR)); + } + if (!(fp->f_flag & FREAD)) { + releasef(fd); + return (set_errno(EBADF)); + } + + if (vp->v_vfsp->vfs_fstype == lx_sysfs_vfs_type) { + is_sysfs = B_TRUE; + } + + if (count > LX_GETDENTS_MAX_BUFSZ) { + /* + * If the target buffer passed to us is huge, keep the + * translation buffers moderate in size. Iteration will be + * used to fill the request. + */ + lbufsz = LX_GETDENTS_MAX_BUFSZ; + sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size); + } else if (count < (lx_size + MAXPATHLEN)) { + /* + * If the target buffer is tiny, allocate a Linux-format buffer + * big enough to hold at least one max-length row while keeping + * the illumos-format buffer pesimistic in size. + * + * Assuming the buffer is truely tiny, it's likely that the + * result will not fit and an EINVAL will be tossed. + */ + lbufsz = (lx_size + MAXPATHLEN); + sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)), + sizeof (struct dirent)); + } else { + lbufsz = count; + sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size); + } + bufsz = sbufsz; + lbuf = kmem_alloc(lbufsz, KM_SLEEP); + sbuf = kmem_alloc(sbufsz, KM_SLEEP); + + aiov.iov_base = sbuf; + aiov.iov_len = sbufsz; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = sbufsz; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + + /* + * Since we use a conservative buffer allocation for the differing + * struct sizing and Linux places fewer limits on getdents buffers in + * general, there's a chance we'll undershoot on the record count. + * When this happens, we can simply repeat the READDIR operation until + * the available records are exhausted or we've filled the user buffer. + */ + do { + int res; + + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); + error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0); + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); + if (error != 0 || auio.uio_resid == sbufsz) { + break; + } + res = outcb(sbuf, lbuf, bufsz - auio.uio_resid, is_sysfs); + VERIFY(res <= lbufsz); + if (res == 0) { + /* no records to copyout from this batch */ + break; + } else if (res > count) { + /* + * For very small buffer sizes, it's possible that a + * single record is too large due to a long filename. + */ + error = EINVAL; + break; + } + + VERIFY(outb + res <= count); + if (copyout(lbuf, (void *)(uptr + outb), res) != 0) { + error = EFAULT; + break; + } + outb += res; + + /* + * We undershot the request buffer. + * Reset for another READDIR, taking care not to overshoot. 
+ */ + bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size)); + auio.uio_resid = bufsz; + aiov.iov_len = bufsz; + aiov.iov_base = sbuf; + + /* + * Continued progress is allowed only if EOF has not been + * reached and there is enough remaining buffer space to hold + * an entry with a max-length filename. + */ + } while (at_eof == 0 && (count - outb) >= (lx_size + MAXPATHLEN)); + + kmem_free(lbuf, lbufsz); + kmem_free(sbuf, sbufsz); + + if (error) { + releasef(fd); + return (set_errno(error)); + } + + fp->f_offset = auio.uio_loffset; + releasef(fd); + return (outb); +} + +static int +lx_get_sysfs_dtype(ino_t ino) +{ + vtype_t vt; + + vt = lx_sysfs_vtype(ino); + + switch (vt) { + case VREG: return (LX_DT_REG); + case VDIR: return (LX_DT_DIR); + case VBLK: return (LX_DT_BLK); + case VCHR: return (LX_DT_CHR); + case VLNK: return (LX_DT_LNK); + case VFIFO: return (LX_DT_FIFO); + case VSOCK: return (LX_DT_SOCK); + default: return (LX_DT_UNKNOWN); + } +} + +static int +lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent_32 *ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent_32 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_32); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_32)); + + if (is_sysfs) { + LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +static int +lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent_64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent_64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_64); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_64)); + + if (is_sysfs) { + LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +long +lx_getdents_32(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_32), lx_getdents_format32)); +} + +long +lx_getdents_64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_64), lx_getdents_format64)); +} + +struct lx_dirent64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + uchar_t d_type; + char d_name[1]; +}; + +#define LX_RECLEN64(namelen) \ + ((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7) + +#define LX_ZEROLEN64(namelen) \ + (LX_RECLEN64(namelen) - \ + ((offsetof(struct lx_dirent64, d_name) + (namelen)))) + +static int +lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent64 
*ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + ld->d_type = LX_DT_UNKNOWN; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN64(namelen); + /* Zero out any alignment padding */ + bzero(ld->d_name + namelen, LX_ZEROLEN64(namelen)); + + if (is_sysfs) { + ld->d_type = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + + +long +lx_getdents64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent64), lx_getdents64_format)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c new file mode 100644 index 0000000000..0ebd93304e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * return the pid + */ +long +lx_getpid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + long rv; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + VERIFY(lwpd != NULL); + + rv = lwpd->br_tgid; + } + + return (rv); +} + +/* + * return the parent pid + */ +long +lx_getppid(void) +{ + return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL)); +} + +/* + * return the thread id + */ +long +lx_gettid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + return (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c new file mode 100644 index 0000000000..acc4073483 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * From "uts/common/syscall/getrandom.c": + */ +extern int getrandom(void *, size_t, int); + +long +lx_getrandom(void *bufp, size_t buflen, int flags) +{ + /* + * According to signal(7), calls to getrandom(2) are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (getrandom(bufp, buflen, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c new file mode 100644 index 0000000000..67f0fc9e5e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -0,0 +1,509 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/zone.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/policy.h> +#include <sys/lx_types.h> + +#define LX_NGROUPS_MAX 32 + +/* From usr/src/uts/common/syscall/gid.c & uid.c */ +extern int setgid(gid_t); +extern int setregid(gid_t, gid_t); +extern int setreuid(uid_t, uid_t); +extern int setuid(uid_t); + +/* From usr/src/uts/common/syscall/groups.c */ +extern int setgroups(int, gid_t *); + +long +lx_getegid(void) +{ + return (crgetgid(CRED())); +} + +long +lx_getegid16(void) +{ + return ((int)LX_GID32_TO_GID16(crgetgid(CRED()))); +} + +long +lx_geteuid(void) +{ + return (crgetuid(CRED())); +} + +long +lx_geteuid16(void) +{ + return ((int)LX_UID32_TO_UID16(crgetuid(CRED()))); +} + +long +lx_getgid(void) +{ + return (crgetrgid(CRED())); +} + +long +lx_getgid16(void) +{ + return ((int)LX_GID32_TO_GID16(crgetrgid(CRED()))); +} + +long +lx_getuid(void) +{ + return (crgetruid(CRED())); +} + +long +lx_getuid16(void) +{ + return ((int)LX_UID32_TO_UID16(crgetruid(CRED()))); +} + +long +lx_setgid(gid_t gid) +{ + return (setgid(gid)); +} + +long +lx_setgid16(lx_gid16_t gid) +{ + return (setgid(LX_GID16_TO_GID32(gid))); +} + +long +lx_setregid(gid_t rgid, gid_t egid) +{ + return (setregid(rgid, egid)); +} + +long +lx_setregid16(lx_gid16_t rgid, lx_gid16_t egid) +{ + return (setregid(LX_UID16_TO_UID32(rgid), LX_UID16_TO_UID32(egid))); +} + +long +lx_setreuid(uid_t ruid, uid_t euid) +{ + return (setreuid(ruid, euid)); +} + +long +lx_setreuid16(lx_uid16_t ruid, lx_uid16_t euid) +{ + return (setreuid(LX_UID16_TO_UID32(ruid), LX_UID16_TO_UID32(euid))); +} + +long +lx_setuid(uid_t uid) +{ + return (setuid(uid)); +} + +long +lx_setuid16(lx_uid16_t uid) +{ + return (setuid(LX_UID16_TO_UID32(uid))); +} + +/* + * This function is based on setreuid in common/syscall/uid.c and 
exists + * because illumos does not have a way to explicitly set the saved uid (suid) + * from any other system call. + */ +long +lx_setresuid(lx_uid_t ruid, lx_uid_t euid, lx_uid_t suid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + int uidchge = 0; + uid_t oldruid = ruid; + cred_t *cr, *newcr; + zoneid_t zoneid = getzoneid(); + + if ((ruid != -1 && (ruid > MAXUID)) || + (euid != -1 && (euid > MAXUID)) || + (suid != -1 && (suid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (ruid != -1 && + ruid != cr->cr_ruid && ruid != cr->cr_uid && + ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) { + error = EPERM; + } else if (euid != -1 && + euid != cr->cr_ruid && euid != cr->cr_uid && + euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) { + error = EPERM; + } else if (suid != -1 && + suid != cr->cr_ruid && suid != cr->cr_uid && + suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) { + error = EPERM; + } else { + if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) { + /* + * The ruid of the process is going to change. In order + * to avoid a race condition involving the + * process count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. + */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(ruid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (euid != -1) + newcr->cr_uid = euid; + if (suid != -1) + newcr->cr_suid = suid; + if (ruid != -1) { + oldruid = newcr->cr_ruid; + newcr->cr_ruid = ruid; + ASSERT(ruid != oldruid ? uidchge : 1); + } + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_uid != newcr->cr_uid || + cr->cr_ruid != newcr->cr_ruid || + cr->cr_suid != newcr->cr_suid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + /* + * We decrement the number of processes associated with the oldruid + * to match the increment above, even if the ruid of the process + * did not change or an error occurred (oldruid == uid). 
+ */ + if (uidchge) { + ASSERT(oldruid != -1 && ruid != -1); + mutex_enter(&pidlock); + upcount_dec(oldruid, zoneid); + mutex_exit(&pidlock); + } + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresuid16(lx_uid16_t ruid16, lx_uid16_t euid16, lx_uid16_t suid16) +{ + long rval; + + rval = lx_setresuid( + LX_UID16_TO_UID32(ruid16), + LX_UID16_TO_UID32(euid16), + LX_UID16_TO_UID32(suid16)); + + return (rval); +} + +/* + * This function is based on setregid in common/syscall/gid.c + */ +long +lx_setresgid(lx_gid_t rgid, lx_gid_t egid, lx_gid_t sgid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + cred_t *cr, *newcr; + + if ((rgid != -1 && (rgid > MAXUID)) || + (egid != -1 && (egid > MAXUID)) || + (sgid != -1 && (sgid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (rgid != -1 && + rgid != cr->cr_rgid && rgid != cr->cr_gid && + rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (egid != -1 && + egid != cr->cr_rgid && egid != cr->cr_gid && + egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (sgid != -1 && + sgid != cr->cr_rgid && sgid != cr->cr_gid && + sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else { + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (egid != -1) + newcr->cr_gid = egid; + if (sgid != -1) + newcr->cr_sgid = sgid; + if (rgid != -1) + newcr->cr_rgid = rgid; + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_gid != newcr->cr_gid || + cr->cr_rgid != newcr->cr_rgid || + cr->cr_sgid != newcr->cr_sgid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresgid16(lx_gid16_t rgid16, lx_gid16_t egid16, lx_gid16_t sgid16) +{ + long rval; + + rval = lx_setresgid( + LX_GID16_TO_GID32(rgid16), + LX_GID16_TO_GID32(egid16), + LX_GID16_TO_GID32(sgid16)); + + return (rval); +} + +/* + * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ + * the terrible hack below so that tests may proceed, if only on DEBUG kernels. 
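Taken together, the id handlers above are what a conventional Linux privilege drop exercises. A user-land sketch for orientation (not part of this change): -1 means "leave this id unchanged" for the setres*() calls, which is why each id above is checked against -1 independently, and the supplementary group list is truncated before the ids are dropped.

#define _GNU_SOURCE
#include <grp.h>
#include <unistd.h>
#include <err.h>

static void
drop_privileges(uid_t uid, gid_t gid)
{
	/* shed supplementary groups first, while still privileged */
	if (setgroups(1, &gid) != 0)
		err(1, "setgroups");

	/* real, effective and saved ids all change; -1 would mean "keep" */
	if (setresgid(gid, gid, gid) != 0)
		err(1, "setresgid");
	if (setresuid(uid, uid, uid) != 0)
		err(1, "setresuid");
}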
+ */ +int +lx_helper_setgroups(int ngroups, gid_t *grouplist) +{ +#ifdef DEBUG + if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) + ngroups = ngroups_max; +#endif /* DEBUG */ + + return (setgroups(ngroups, grouplist)); +} + +long +lx_getresuid(lx_uid_t *ruid, lx_uid_t *euid, lx_uid_t *suid) +{ + lx_uid_t lx_ruid, lx_euid, lx_suid; + cred_t *cr = CRED(); + + lx_ruid = (lx_uid_t)crgetruid(cr); + lx_euid = (lx_uid_t)crgetuid(cr); + lx_suid = (lx_uid_t)crgetsuid(cr); + + if (copyout(&lx_ruid, (void *)ruid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_euid, (void *)euid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_suid, (void *)suid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_getresuid16(lx_uid16_t *ruid16, lx_uid16_t *euid16, lx_uid16_t *suid16) +{ + lx_uid16_t lx_ruid16, lx_euid16, lx_suid16; + cred_t *cr = CRED(); + + lx_ruid16 = LX_UID32_TO_UID16((lx_uid_t)crgetruid(cr)); + lx_euid16 = LX_UID32_TO_UID16((lx_uid_t)crgetuid(cr)); + lx_suid16 = LX_UID32_TO_UID16((lx_uid_t)crgetsuid(cr)); + + if (copyout(&lx_ruid16, (void *)ruid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_euid16, (void *)euid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_suid16, (void *)suid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_getresgid(lx_gid_t *rgid, lx_gid_t *egid, lx_gid_t *sgid) +{ + lx_gid_t lx_rgid, lx_egid, lx_sgid; + cred_t *cr = CRED(); + + lx_rgid = (lx_gid_t)crgetrgid(cr); + lx_egid = (lx_gid_t)crgetgid(cr); + lx_sgid = (lx_gid_t)crgetsgid(cr); + + if (copyout(&lx_rgid, (void *)rgid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_egid, (void *)egid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_sgid, (void *)sgid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_getresgid16(lx_gid16_t *rgid16, lx_gid16_t *egid16, lx_gid16_t *sgid16) +{ + lx_gid16_t lx_rgid16, lx_egid16, lx_sgid16; + cred_t *cr = CRED(); + + lx_rgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetrgid(cr)); + lx_egid16 = LX_GID32_TO_GID16((lx_gid_t)crgetgid(cr)); + lx_sgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetsgid(cr)); + + if (copyout(&lx_rgid16, (void *)rgid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_egid16, (void *)egid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_sgid16, (void *)sgid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * The lx brand cannot support the setfs[ug]id16/setfs[ug]id calls as that + * would require significant rework of the illumos privilege mechanisms, so + * instead return the current effective [ug]id. + * + * In Linux, fsids track effective IDs, so returning the effective IDs works + * as a substitute; returning the current value also denotes failure of the + * call if the caller had specified something different. We don't need to + * worry about setting error codes because the Linux calls don't set any. 
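Editorial aside, not part of this change: the handlers above implement the Linux setresuid/getresuid family, in which any argument of -1 leaves that credential field untouched. A minimal user-space illustration of those semantics:

#define	_GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	uid_t r, e, s;

	/* Drop only the effective uid; ruid and suid stay as-is (-1). */
	if (setresuid(-1, getuid(), -1) != 0)
		perror("setresuid");

	if (getresuid(&r, &e, &s) == 0)
		printf("ruid=%d euid=%d suid=%d\n", (int)r, (int)e, (int)s);

	/* The saved uid still permits restoring the previous euid later. */
	return (0);
}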
+ */ +/*ARGSUSED*/ +long +lx_setfsuid16(uid_t fsuid16) +{ + return ((int)LX_UID32_TO_UID16(crgetuid(CRED()))); +} + +/*ARGSUSED*/ +long +lx_setfsgid16(gid_t fsgid16) +{ + return ((int)LX_GID32_TO_GID16(crgetgid(CRED()))); +} + +/*ARGSUSED*/ +long +lx_setfsuid(uid_t fsuid) +{ + return (crgetuid(CRED())); +} + +/*ARGSUSED*/ +long +lx_setfsgid(gid_t fsgid) +{ + return (crgetgid(CRED())); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c new file mode 100644 index 0000000000..9d8d88d6f6 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c @@ -0,0 +1,1865 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/termio.h> +#include <sys/termios.h> +#include <sys/ptyvar.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <sys/sockio.h> +#include <sys/stropts.h> +#include <sys/ptms.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/sysmacros.h> +#include <sys/lx_misc.h> +#include <sys/lx_ptm.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/kmem.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if_arp.h> +#include <sys/ioccom.h> +#include <sys/dtrace.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/lx_autofs.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> +#include <sys/dkio.h> +#include <sys/sdt.h> + +/* + * Linux ioctl types + */ +#define LX_IOC_TYPE_HD 0x03 +#define LX_IOC_TYPE_BLK 0x12 +#define LX_IOC_TYPE_FD 0x54 +#define LX_IOC_TYPE_DTRACE 0x68 +#define LX_IOC_TYPE_SOCK 0x89 +#define LX_IOC_TYPE_AUTOFS 0x93 + +/* + * Supported ioctls + */ +#define LX_HDIO_GETGEO 0x0301 +#define LX_BLKGETSIZE 0x1260 +#define LX_BLKSSZGET 0x1268 +#define LX_BLKGETSIZE64 0x80081272 +#define LX_TCGETS 0x5401 +#define LX_TCSETS 0x5402 +#define LX_TCSETSW 0x5403 +#define LX_TCSETSF 0x5404 +#define LX_TCGETA 0x5405 +#define LX_TCSETA 0x5406 +#define LX_TCSETAW 0x5407 +#define LX_TCSETAF 0x5408 +#define LX_TCSBRK 0x5409 +#define LX_TCXONC 0x540a +#define LX_TCFLSH 0x540b +#define LX_TIOCEXCL 0x540c +#define LX_TIOCNXCL 0x540d +#define LX_TIOCSCTTY 0x540e +#define LX_TIOCGPGRP 0x540f +#define LX_TIOCSPGRP 0x5410 +#define LX_TIOCOUTQ 0x5411 +#define LX_TIOCSTI 0x5412 +#define LX_TIOCGWINSZ 0x5413 +#define LX_TIOCSWINSZ 0x5414 +#define LX_TIOCMGET 0x5415 +#define LX_TIOCMBIS 0x5416 +#define LX_TIOCMBIC 0x5417 +#define LX_TIOCMSET 0x5418 +#define LX_TIOCGSOFTCAR 0x5419 +#define LX_TIOCSSOFTCAR 0x541a +#define LX_FIONREAD 0x541b +#define LX_TIOCPKT 0x5420 +#define LX_FIONBIO 0x5421 +#define LX_TIOCNOTTY 0x5422 +#define LX_TIOCSETD 0x5423 +#define LX_TIOCGETD 0x5424 +#define LX_TCSBRKP 0x5425 +#define LX_TIOCGSID 0x5429 +#define LX_TIOCGPTN 0x80045430 +#define LX_TIOCSPTLCK 0x40045431 +#define LX_FIONCLEX 0x5450 +#define LX_FIOCLEX 0x5451 +#define LX_FIOASYNC 0x5452 +#define LX_FIOSETOWN 0x8901 +#define 
LX_SIOCSPGRP 0x8902 +#define LX_FIOGETOWN 0x8903 +#define LX_SIOCGPGRP 0x8904 +#define LX_SIOCATMARK 0x8905 +#define LX_SIOCGSTAMP 0x8906 +#define LX_SIOCADDRT 0x890b +#define LX_SIOCDELRT 0x890c +#define LX_SIOCRTMSG 0x890d +#define LX_SIOCGIFNAME 0x8910 +#define LX_SIOCSIFLINK 0x8911 +#define LX_SIOCGIFCONF 0x8912 +#define LX_SIOCGIFFLAGS 0x8913 +#define LX_SIOCSIFFLAGS 0x8914 +#define LX_SIOCGIFADDR 0x8915 +#define LX_SIOCSIFADDR 0x8916 +#define LX_SIOCGIFDSTADDR 0x8917 +#define LX_SIOCSIFDSTADDR 0x8918 +#define LX_SIOCGIFBRDADDR 0x8919 +#define LX_SIOCSIFBRDADDR 0x891a +#define LX_SIOCGIFNETMASK 0x891b +#define LX_SIOCSIFNETMASK 0x891c +#define LX_SIOCGIFMETRIC 0x891d +#define LX_SIOCSIFMETRIC 0x891e +#define LX_SIOCGIFMEM 0x891f +#define LX_SIOCSIFMEM 0x8920 +#define LX_SIOCGIFMTU 0x8921 +#define LX_SIOCSIFMTU 0x8922 +#define LX_SIOCSIFNAME 0x8923 +#define LX_SIOCSIFHWADDR 0x8924 +#define LX_SIOCGIFENCAP 0x8925 +#define LX_SIOCSIFENCAP 0x8926 +#define LX_SIOCGIFHWADDR 0x8927 +#define LX_SIOCGIFSLAVE 0x8929 +#define LX_SIOCSIFSLAVE 0x8930 +#define LX_SIOCADDMULTI 0x8931 +#define LX_SIOCDELMULTI 0x8932 +#define LX_SIOCGIFINDEX 0x8933 +#define LX_SIOCSIFPFLAGS 0x8934 +#define LX_SIOCGIFPFLAGS 0x8935 +#define LX_SIOCDIFADDR 0x8936 +#define LX_SIOCSIFHWBROADCAST 0x8937 +#define LX_SIOCGIFCOUNT 0x8938 +#define LX_SIOCGIFBR 0x8940 +#define LX_SIOCSIFBR 0x8941 +#define LX_SIOCGIFTXQLEN 0x8942 +#define LX_SIOCSIFTXQLEN 0x8943 +#define LX_SIOCETHTOOL 0x8946 +#define LX_SIOCGMIIPHY 0x8947 +#define LX_SIOCGMIIREG 0x8948 +#define LX_SIOCSMIIREG 0x8949 +#define LX_SIOCWANDEV 0x894a +#define LX_SIOCOUTQNSD 0x894b +#define LX_SIOCDARP 0x8953 +#define LX_SIOCGARP 0x8954 +#define LX_SIOCSARP 0x8955 +#define LX_SIOCDRARP 0x8960 +#define LX_SIOCGRARP 0x8961 +#define LX_SIOCSRARP 0x8962 +#define LX_SIOCGIFMAP 0x8970 +#define LX_SIOCSIFMAP 0x8971 +#define LX_SIOCADDDLCI 0x8980 +#define LX_SIOCDELDLCI 0x8981 +#define LX_SIOCGIFVLAN 0x8982 +#define LX_SIOCSIFVLAN 0x8983 +#define LX_SIOCBONDENSLAVE 0x8990 +#define LX_SIOCBONDRELEASE 0x8991 +#define LX_SIOCBONDSETHWADDR 0x8992 +#define LX_SIOCBONDSLAVEINFOQUERY 0x8993 +#define LX_SIOCBONDINFOQUERY 0x8994 +#define LX_SIOCBONDCHANGEACTIVE 0x8995 +#define LX_SIOCBRADDBR 0x89a0 +#define LX_SIOCBRDELBR 0x89a1 +#define LX_SIOCBRADDIF 0x89a2 +#define LX_SIOCBRDELIF 0x89a3 +#define LX_SIOCSHWTSTAMP 0x89b0 +#define LX_SIOCGHWTSTAMP 0x89b1 +#define LX_SIOCDEVPRIVATE 0x89f0 +#define LX_SIOCPROTOPRIVATE 0x89e0 + +#define FLUSER(fp) fp->f_flag | get_udatamodel() +#define FLFAKE(fp) fp->f_flag | FKIOCTL + +/* + * LX_NCC must be different from LX_NCCS since while the termio and termios + * structures may look similar they are fundamentally different sizes and + * have different members. 
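Editorial aside, not part of this change: the command values listed above follow the historical Linux ioctl numbering, in which the "type" group occupies bits 8-15; the dispatch in lx_ioctl() later keys off exactly that byte. A small illustration of the decoding:

#include <stdio.h>

int
main(void)
{
	unsigned int cmd = 0x5401;		/* LX_TCGETS */

	printf("type 0x%02x, number 0x%02x\n",
	    (cmd >> 8) & 0xff,			/* 0x54: terminal group */
	    cmd & 0xff);			/* 0x01 */
	return (0);
}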
+ */ +#define LX_NCC 8 +#define LX_NCCS 19 + +struct lx_termio { + unsigned short c_iflag; /* input mode flags */ + unsigned short c_oflag; /* output mode flags */ + unsigned short c_cflag; /* control mode flags */ + unsigned short c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCC]; /* control characters */ +}; + +struct lx_termios { + uint32_t c_iflag; /* input mode flags */ + uint32_t c_oflag; /* output mode flags */ + uint32_t c_cflag; /* control mode flags */ + uint32_t c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCCS]; /* control characters */ +}; + +/* + * c_cc characters which are valid for lx_termio and lx_termios + */ +#define LX_VINTR 0 +#define LX_VQUIT 1 +#define LX_VERASE 2 +#define LX_VKILL 3 +#define LX_VEOF 4 +#define LX_VTIME 5 +#define LX_VMIN 6 +#define LX_VSWTC 7 + +/* + * c_cc characters which are valid for lx_termios + */ +#define LX_VSTART 8 +#define LX_VSTOP 9 +#define LX_VSUSP 10 +#define LX_VEOL 11 +#define LX_VREPRINT 12 +#define LX_VDISCARD 13 +#define LX_VWERASE 14 +#define LX_VLNEXT 15 +#define LX_VEOL2 16 + +/* + * Defaults needed for SunOS to Linux format conversion. + * See INIT_C_CC in linux-stable/include/asm-generic/termios.h + */ +#define LX_DEF_VTIME 0 +#define LX_DEF_VMIN 1 +#define LX_DEF_VEOF '\004' +#define LX_DEF_VEOL 0 + +/* VSD key for lx_cc information */ +static uint_t lx_ioctl_vsd = 0; + + +/* Terminal helpers */ + +static void +l2s_termios(struct lx_termios *l_tios, struct termios *s_tios) +{ + ASSERT((l_tios != NULL) && (s_tios != NULL)); + + bzero(s_tios, sizeof (*s_tios)); + + s_tios->c_iflag = l_tios->c_iflag; + s_tios->c_oflag = l_tios->c_oflag; + s_tios->c_cflag = l_tios->c_cflag; + s_tios->c_lflag = l_tios->c_lflag; + + if (s_tios->c_lflag & ICANON) { + s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF]; + s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL]; + } else { + s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN]; + s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME]; + } + + s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2]; + s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE]; + s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL]; + s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT]; + s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT]; + s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE]; + s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR]; + s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT]; + s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC]; + s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART]; + s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP]; + s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP]; + s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD]; +} + +static void +l2s_termio(struct lx_termio *l_tio, struct termio *s_tio) +{ + ASSERT((l_tio != NULL) && (s_tio != NULL)); + + bzero(s_tio, sizeof (*s_tio)); + + s_tio->c_iflag = l_tio->c_iflag; + s_tio->c_oflag = l_tio->c_oflag; + s_tio->c_cflag = l_tio->c_cflag; + s_tio->c_lflag = l_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF]; + } else { + s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN]; + s_tio->c_cc[VTIME] = l_tio->c_cc[LX_VTIME]; + } + + s_tio->c_cc[VINTR] = l_tio->c_cc[LX_VINTR]; + s_tio->c_cc[VQUIT] = l_tio->c_cc[LX_VQUIT]; + s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE]; + s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL]; + s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC]; +} + +static void +termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio) +{ + ASSERT((l_tios != NULL) && (lio != NULL)); + + 
bzero(lio, sizeof (*lio)); + + lio->veof = l_tios->c_cc[LX_VEOF]; + lio->veol = l_tios->c_cc[LX_VEOL]; + lio->vmin = l_tios->c_cc[LX_VMIN]; + lio->vtime = l_tios->c_cc[LX_VTIME]; +} + +static void +termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio) +{ + ASSERT((l_tio != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tio->c_cc[LX_VEOF]; + lio->veol = 0; + lio->vmin = l_tio->c_cc[LX_VMIN]; + lio->vtime = l_tio->c_cc[LX_VTIME]; +} + +static void +s2l_termios(struct termios *s_tios, struct lx_termios *l_tios) +{ + ASSERT((s_tios != NULL) && (l_tios != NULL)); + + bzero(l_tios, sizeof (*l_tios)); + + l_tios->c_iflag = s_tios->c_iflag; + l_tios->c_oflag = s_tios->c_oflag; + l_tios->c_cflag = s_tios->c_cflag; + l_tios->c_lflag = s_tios->c_lflag; + + /* + * Since use of the VMIN/VTIME and VEOF/VEOL control characters is + * mutually exclusive (determined by ICANON), SunOS aliases them in the + * c_cc field in termio/termios. Linux does not perform this aliasing, + * so it expects that the default values are present regardless of + * ICANON status. + * + * These defaults can be overridden later by any values stored via the + * lx_cc mechanism. + */ + if (s_tios->c_lflag & ICANON) { + l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF]; + l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL]; + l_tios->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tios->c_cc[LX_VMIN] = LX_DEF_VMIN; + + } else { + l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN]; + l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME]; + l_tios->c_cc[LX_VEOF] = LX_DEF_VEOF; + l_tios->c_cc[LX_VEOL] = LX_DEF_VEOL; + } + + l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2]; + l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE]; + l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL]; + l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT]; + l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT]; + l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE]; + l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR]; + l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT]; + l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH]; + l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART]; + l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP]; + l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP]; + l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD]; +} + +static void +s2l_termio(struct termio *s_tio, struct lx_termio *l_tio) +{ + ASSERT((s_tio != NULL) && (l_tio != NULL)); + + bzero(l_tio, sizeof (*l_tio)); + + l_tio->c_iflag = s_tio->c_iflag; + l_tio->c_oflag = s_tio->c_oflag; + l_tio->c_cflag = s_tio->c_cflag; + l_tio->c_lflag = s_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF]; + l_tio->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tio->c_cc[LX_VMIN] = LX_DEF_VMIN; + } else { + l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN]; + l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME]; + l_tio->c_cc[LX_VEOF] = LX_DEF_VEOF; + } + + l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR]; + l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT]; + l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE]; + l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL]; + l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH]; +} + +static void +set_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + /* + * Linux expects that the termio/termios control characters are + * preserved more strictly than illumos supports. In order to preserve + * the illusion that the characters are maintained, they are stored as + * vnode-specific data. 
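Editorial aside, not part of this change: the conversions above exist because illumos overlays VMIN/VTIME on the VEOF/VEOL slots of c_cc depending on ICANON, while Linux keeps all four slots distinct and expects sane values in each. A user-space sketch of the idiom that must keep working under emulation:

#include <termios.h>

/*
 * A Linux program switching to raw mode touches VMIN/VTIME, yet still
 * expects VEOF/VEOL to read back with their usual values afterwards.
 */
void
demo_raw(int fd)
{
	struct termios t;

	(void) tcgetattr(fd, &t);
	t.c_lflag &= ~(ICANON | ECHO);
	t.c_cc[VMIN] = 1;	/* return after one byte */
	t.c_cc[VTIME] = 0;	/* no read timeout */
	(void) tcsetattr(fd, TCSANOW, &t);

	/* On Linux, a later tcgetattr() still shows c_cc[VEOF] == 0x04. */
}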
+ */ + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur == NULL) { + cur = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP); + bcopy(lio, cur, sizeof (struct lx_cc)); + (void) vsd_set(vp, lx_ioctl_vsd, cur); + } else { + bcopy(lio, cur, sizeof (struct lx_cc)); + } + mutex_exit(&vp->v_vsd_lock); +} + +static int +get_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + int rv = 1; + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur != NULL) { + bcopy(cur, lio, sizeof (*lio)); + rv = 0; + } + mutex_exit(&vp->v_vsd_lock); + return (rv); +} + +/* Socket helpers */ + +typedef struct lx_ifreq32 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + } ifr_ifrn; +} lx_ifreq32_t; + +typedef struct lx_ifreq64 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + /* pad this out to the Linux size */ + uint64_t ifmap[3]; + } ifr_ifrn; +} lx_ifreq64_t; + +typedef struct lx_ifconf32 { + int32_t if_len; + caddr32_t if_buf; +} lx_ifconf32_t; + +typedef struct lx_ifconf64 { + int32_t if_len; + caddr_t if_buf; +} lx_ifconf64_t; + + +/* Generic translators */ + +/* ARGSUSED */ +static int +ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error = 0; + int rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + int32_t iflag, flags; + int error; + + if (copyin((caddr_t)arg, &iflag, sizeof (iflag))) + return (set_errno(EFAULT)); + + mutex_enter(&fp->f_tlock); + vp = fp->f_vnode; + flags = fp->f_flag; + /* Linux sets NONBLOCK instead of FIONBIO */ + if (iflag) + flags |= FNONBLOCK; + else + flags &= ~FNONBLOCK; + /* push the flag down */ + error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL); + fp->f_flag = flags; + mutex_exit(&fp->f_tlock); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + struct vattr vattr; + int error = 0; + int rv; + /* + * offset is int32_t because that is what FIONREAD is defined in terms + * of. We cap at INT_MAX as in other cases for this ioctl. + */ + int32_t offset; + + vp = fp->f_vnode; + + if (vp->v_type == VREG || vp->v_type == VDIR) { + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred, NULL); + if (error != 0) + return (set_errno(error)); + offset = MIN(vattr.va_size - fp->f_offset, INT_MAX); + if (copyout(&offset, (caddr_t)arg, sizeof (offset))) + return (set_errno(EFAULT)); + } else { + error = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred, + &rv, NULL); + if (error) + return (set_errno(error)); + } + return (0); +} + +/* + * hard disk-related translators + * + * Note that the normal disk ioctls only work for VCHR devices. See spec_ioctl + * which will return ENOTTY for a VBLK device. However, fdisk, etc. expect to + * work with block devices. + * + * We expect a zvol to be the primary block device we're interacting with and + * we use the zone's lxzd_vdisks list to handle zvols specifically. + */ + +typedef struct lx_hd_geom { + unsigned char heads; + unsigned char sectors; + unsigned short cylinders; + unsigned long start; +} lx_hd_geom_t; + +/* + * Return the volsize and blksize for the correct virtual "disk" for the zone. + * Only these two values are returned in 'vdp' within this code. 
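Editorial aside, not part of this change: for the /dev/zfs (minor 0) case described above, the volsize/blksize values come straight out of a statvfs of the zone root. The same arithmetic, done from user space against "/":

#include <sys/statvfs.h>
#include <stdio.h>

int
main(void)
{
	struct statvfs sv;

	if (statvfs("/", &sv) != 0)
		return (1);

	printf("volsize=%llu bytes, blksize=%lu\n",
	    (unsigned long long)sv.f_blocks * sv.f_frsize,
	    (unsigned long)sv.f_frsize);
	return (0);
}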
+ * + * A virtual "disk" can be a zvol visible within the zone, but most zones are + * not configured with a delegated dataset necessary to make zvols visible. + * + * To make various applications happy, lx also pretends that our root filesystem + * (normally within the zone's dataset) lives on a virtual disk. We have a + * /dev/zfsds0 symlink which points at /dev/zfs. This appears in various places + * to give the illusion of root's disk. For example, see: + * /proc/partitions + * /sys/block/zfsds0 + * /sys/devices/zfs/zfsds0 + * If an application issues the various LX_HDIO_GETGEO, LX_BLKGETSIZE*, or + * LX_BLKSSZGET ioctls on /dev/zfs (that is, minor number 0), we want to return + * something sane. In this case, we return the total size (which is normally + * limited by a quota) of the dataset that the zone root lives on. + */ +static boolean_t +lx_lookup_zdsk_info(lx_zone_data_t *lxzd, dev_t dev, lx_virt_disk_t *vdp) +{ + lx_virt_disk_t *vd; + + /* Handle /dev/zfs */ + if (getminor(dev) == 0) { + struct statvfs64 sv; + + if (VFS_STATVFS(curzone->zone_rootvp->v_vfsp, &sv) == 0) { + vdp->lxvd_volsize = sv.f_blocks * sv.f_frsize; + vdp->lxvd_blksize = sv.f_frsize; + } else { + vdp->lxvd_volsize = 0; + /* always set to prevent potential divide-by-zero */ + vdp->lxvd_blksize = 512; + } + + return (B_TRUE); + } + + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZVOL && vd->lxvd_real_dev == dev) { + bzero(vdp, sizeof (*vdp)); + vdp->lxvd_volsize = vd->lxvd_volsize; + vdp->lxvd_blksize = vd->lxvd_blksize; + return (B_TRUE); + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + return (B_FALSE); +} + +/* + * See zvol_ioctl() which always fails for DKIOCGGEOM. The geometry for a + * zvol (or really any modern disk) is made up, so we do that here as well. + */ +/* ARGSUSED */ +static int +ict_hdgetgeo(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_hd_geom_t lx_geom; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd) || + vd.lxvd_volsize == 0 || vd.lxvd_blksize == 0) { + /* should only happen if new zvol */ + bzero(&lx_geom, sizeof (lx_geom)); + } else { + const diskaddr_t blks = + MAX(1, vd.lxvd_volsize / vd.lxvd_blksize); + + /* + * Attempt to conjure up a Cylinder-Head-Sector + * geometry for the given virtual disk size. + */ + if (blks <= (63*16*65535)) { + /* + * Use traditional BIOS-style geometry for + * adequately small disks. + */ + lx_geom.sectors = 63; + lx_geom.heads = 16; + lx_geom.cylinders = MAX(1, (blks / (63 * 16))); + } else if (blks <= (64*32*65535)) { + /* 1MB per cylinder for 512-byte sectors */ + lx_geom.sectors = 64; + lx_geom.heads = 32; + lx_geom.cylinders = (blks / (64 * 32)); + } else { + /* + * Max out the geometry sizing for large disks. + * This may not be adequate for truely huge + * volumes (maxing out at a little under 2TB + * for those with a 512-byte blocksize), but it + * is the best we can do with the given struct. 
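Editorial aside, not part of this change: two worked cases for the geometry branches above, assuming 512-byte blocks:

/*
 *   20 GiB zvol: blks = 41943040 <= 63*16*65535, so
 *	sectors = 63, heads = 16,
 *	cylinders = 41943040 / (63 * 16) = 41610
 *
 *  100 GiB zvol: blks = 209715200 > 64*32*65535, so
 *	sectors = 255, heads = 255,
 *	cylinders = MIN(65535, 209715200 / (255 * 255)) = 3225
 */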
+ */ + lx_geom.sectors = 255; + lx_geom.heads = 255; + lx_geom.cylinders = MIN(65535, + (blks / (255*255))); + } + lx_geom.start = 0; + } + } else { + int res, rv; + struct dk_geom geom; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGGEOM, (intptr_t)&geom, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + lx_geom.heads = geom.dkg_nhead; + lx_geom.sectors = geom.dkg_nsect; + lx_geom.cylinders = geom.dkg_ncyl; + lx_geom.start = 0; + } + + if (copyout(&lx_geom, (caddr_t)arg, sizeof (lx_geom))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Per the Linux sd(4) man page, get the number of sectors. The linux/fs.h + * header says its 512 byte blocks. + */ +/* ARGSUSED */ +static int +ict_blkgetsize(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + diskaddr_t tot; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) { + /* should only happen if new zvol */ + tot = 0; + } else { + tot = vd.lxvd_volsize / 512; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + tot = minfo.dki_capacity; + if (minfo.dki_lbsize > 512) { + uint_t bsize = minfo.dki_lbsize / 512; + + tot *= bsize; + } + } + + if (copyout(&tot, (caddr_t)arg, sizeof (long))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Get the sector size (i.e. the logical block size). + */ +/* ARGSUSED */ +static int +ict_blkgetssize(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint_t bsize; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) { + /* should only happen if new zvol */ + bsize = 0; + } else { + bsize = (uint_t)vd.lxvd_blksize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + bsize = (uint_t)minfo.dki_lbsize; + } + + if (copyout(&bsize, (caddr_t)arg, sizeof (bsize))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Get the size. The linux/fs.h header says its in bytes. 
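Editorial aside, not part of this change: the three block ioctls emulated here are related to one another, and a Linux program can cross-check them. A user-space sketch (the device path is a placeholder; ioctl names per <linux/fs.h>):

#include <sys/ioctl.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	unsigned long sectors;
	uint64_t bytes;
	int ssz;
	int fd = open("/dev/DISK", O_RDONLY);	/* placeholder path */

	if (fd < 0)
		return (1);
	if (ioctl(fd, BLKGETSIZE, &sectors) == 0 &&
	    ioctl(fd, BLKSSZGET, &ssz) == 0 &&
	    ioctl(fd, BLKGETSIZE64, &bytes) == 0) {
		/* BLKGETSIZE counts 512-byte sectors regardless of ssz. */
		printf("%lu sectors, %d-byte blocks, %llu bytes\n",
		    sectors, ssz, (unsigned long long)bytes);
	}
	(void) close(fd);
	return (0);
}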
+ */ +/* ARGSUSED */ +static int +ict_blkgetsize64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint64_t tot; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) { + /* should only happen if new zvol */ + tot = 0; + } else { + tot = vd.lxvd_volsize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + tot = minfo.dki_capacity * minfo.dki_lbsize; + } + + if (copyout(&tot, (caddr_t)arg, sizeof (uint64_t))) + return (set_errno(EFAULT)); + return (0); +} + +/* ARGSUSED */ +/* Terminal-related translators */ + +/* ARGSUSED */ +static int +ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF); + + if (copyin((struct lx_termios *)arg, &l_tios, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + termios2lx_cc(&l_tios, &lio); + l2s_termios(&l_tios, &s_tios); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF); + + if (copyin((struct lx_termio *)arg, &l_tio, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + l2s_termio(&l_tio, &s_tio); + termio2lx_cc(&l_tio, &lio); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios, *s_tiosd; + uint_t s_tiosl; + + /* get termios defaults */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd, + &s_tiosl) != DDI_SUCCESS) + return (EIO); + ASSERT(s_tiosl == sizeof (*s_tiosd)); + bcopy(s_tiosd, &s_tios, sizeof (s_tios)); + ddi_prop_free(s_tiosd); + + /* Now munge the data to how Linux wants it. */ + s2l_termios(&s_tios, &l_tios); + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + /* Now munge the data to how Linux wants it. 
*/ + s2l_termios(&s_tios, &l_tios); + + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tios.c_cc[LX_VEOF] = lio.veof; + l_tios.c_cc[LX_VEOL] = lio.veol; + l_tios.c_cc[LX_VMIN] = lio.vmin; + l_tios.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (getmajor(fp->f_vnode->v_rdev) == ddi_name_to_major(LX_PTM_DRV)) + return (ict_tcgets_ptm(fp, cmd, arg, lxcmd)); + else + return (ict_tcgets_native(fp, cmd, arg, lxcmd)); +} + +/* ARGSUSED */ +static int +ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + s2l_termio(&s_tio, &l_tio); + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tio.c_cc[LX_VEOF] = lio.veof; + l_tio.c_cc[LX_VMIN] = lio.vmin; + l_tio.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tio, (struct lx_termios *)arg, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t lpid, spid, tid; + int error, rv; + + /* Converting to the illumos pid is necessary */ + if (copyin((pid_t *)arg, &lpid, sizeof (lpid)) < 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(lpid, &spid, &tid) < 0) + return (set_errno(EPERM)); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spid, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int rv, error; + /* use null duration to emulate TCSBRKP */ + int dur = 0; + error = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&dur, + FLFAKE(fp), fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t spgrp; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spgrp, FLFAKE(fp), + fp->f_cred, &rv, NULL); + if (error == 0) { + if (spgrp == curproc->p_zone->zone_proc_initpid) { + spgrp = 1; + } + if (copyout(&spgrp, (caddr_t)arg, sizeof (spgrp))) { + return (set_errno(EFAULT)); + } + } + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + int error, rv; + + istr.ic_cmd = UNLKPT; + istr.ic_len = 0; + istr.ic_timout = 0; + istr.ic_dp = NULL; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + /* + * The success/fail return values are different between Linux + * and illumos. Linux expects 0 or -1. Illumos can return + * positive number on success. + */ + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + cred_t *cr; + pt_own_t pto; + int error, rv; + int ptyno; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + + /* This operation is only valid for the lx_ptm device. 
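Editorial aside, not part of this change: ict_sptlock and ict_gptn back the Linux side of the unlockpt()/ptsname() dance on the pty master. Roughly what a Linux program does (ioctl names per the Linux uapi headers):

#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	int unlock = 0, ptyno;
	int mfd = open("/dev/ptmx", O_RDWR | O_NOCTTY);

	if (mfd < 0)
		return (1);

	/* TIOCSPTLCK(0) is unlockpt(); TIOCGPTN is the ptsname() lookup. */
	if (ioctl(mfd, TIOCSPTLCK, &unlock) == 0 &&
	    ioctl(mfd, TIOCGPTN, &ptyno) == 0)
		printf("subsidiary is /dev/pts/%d\n", ptyno);
	return (0);
}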
*/ + if (getmajor(fp->f_vnode->v_rdev) != ddi_name_to_major(LX_PTM_DRV)) + return (set_errno(ENOTTY)); + + cr = CRED(); + pto.pto_ruid = cr->cr_uid; + /* + * Both Linux and our native code (see grantpt() in native libc) + * prefer assigning the "tty" gid to the new pty. On Linux this is + * done by udev. Since we're in the kernel we cannot lookup the gid, so + * we rely on the lx_support program to initialize the value in the + * zone data at boot time. + */ + if (lxzd->lxzd_ttygrp == 0) { + pto.pto_rgid = cr->cr_gid; + } else { + pto.pto_rgid = lxzd->lxzd_ttygrp; + } + + istr.ic_cmd = OWNERPT; + istr.ic_len = sizeof (pto); + istr.ic_timout = 0; + istr.ic_dp = (char *)&pto; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (error) + return (set_errno((error == ENOTTY) ? error: EACCES)); + + ptyno = getminor(fp->f_vnode->v_rdev) - 1; + if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno))) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + + /* + * A few Linux libc's (e.g. musl) have chosen to implement isatty() + * using the TIOCGWINSZ ioctl. Some apps also do the same thing + * directly. On Linux that ioctl will return a size of 0x0 for dumb + * terminals but on illumos see the handling for TIOCGWINSZ in ptem's + * ptioc(). We fail if the winsize is all zeros. To emulate the Linux + * behavior use the native ioctl check that we do for isatty and return + * a size of 0x0 if that succeeds. + */ + if (error == EINVAL) { + int err; + struct termio s_tio; + + err = VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (err == 0) { + struct winsize w; + + bzero(&w, sizeof (w)); + if (copyout(&w, (struct winsize *)arg, sizeof (w)) != 0) + return (set_errno(EFAULT)); + return (0); + } + } + + if (error != 0) + return (set_errno(error)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t ttysid, mysid; + int error, rv; + proc_t *p = curproc; + + /* getsid */ + mutex_enter(&p->p_splock); + mysid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + /* + * Report success if we already control the tty. + * If no one controls it, TIOCSCTTY will change that later. + */ + error = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&ttysid, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error == 0 && ttysid == mysid) + return (0); + + /* + * Need to make sure we're a session leader, otherwise the + * TIOCSCTTY ioctl will fail. + */ + mutex_enter(&pidlock); + if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + sess_create(); + } else { + mutex_exit(&pidlock); + } + + error = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp), + fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* Socket-related translators */ + +/* ARGSUSED */ +static int +ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp = fp->f_vnode; + int error, rv; + /* + * Linux expects a SIOCATMARK of a UDP socket to return ENOTTY, while + * Illumos allows it. Linux prior to 2.6.39 returned EINVAL for this. 
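Editorial aside, not part of this change: the TIOCGWINSZ fallback above exists because some Linux libcs implement isatty() with this ioctl. The user-space idiom being catered to looks like:

#include <sys/ioctl.h>

/* musl-style isatty() check: success of TIOCGWINSZ means "is a tty". */
static int
isatty_by_winsz(int fd)
{
	struct winsize wsz;

	return (ioctl(fd, TIOCGWINSZ, &wsz) == 0);
}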
+ */ + if (vp->v_type != VSOCK || VTOSO(vp)->so_type != SOCK_STREAM) + return (set_errno(ENOTTY)); + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (error) + return (set_errno(error)); + + return (0); +} + +static int +ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred) +{ + int error, rv; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + ksocket_t ks; + + ASSERT(lxzd != NULL); + + /* + * For ioctls of this type, we are strict about address family + * whereas Linux is lenient. This strictness can be avoided by using + * an internal AF_INET ksocket, which we use if the family is anything + * but AF_PACKET. + */ + if (vn->v_type == VSOCK && VTOSO(vn)->so_family == AF_PACKET) + return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL)); + + mutex_enter(&lxzd->lxzd_lock); + ks = lxzd->lxzd_ioctl_sock; + if (ks == NULL) { + /* + * Linux is not at all picky about address family when it comes + * to supporting interface-related ioctls. To mimic this + * behavior, we'll attempt those ioctls against a ksocket + * configured for that purpose. + */ + (void) ksocket_socket(&lxzd->lxzd_ioctl_sock, AF_INET, + SOCK_DGRAM, 0, 0, curproc->p_zone->zone_kcred); + ks = lxzd->lxzd_ioctl_sock; + } + mutex_exit(&lxzd->lxzd_lock); + + if (ks != NULL) { + error = ksocket_ioctl(ks, cmd, arg, &rv, cred); + } else { + error = VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL); + } + + return (error); +} + +static int +ict_sioghwaddr(file_t *fp, struct lifreq *lreq) +{ + struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr; + struct sockaddr hwaddr; + int error, size; + + error = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq, + FLFAKE(fp), fp->f_cred); + + if (error == EADDRNOTAVAIL && + strncmp(lreq->lifr_name, "lo", 2) == 0) { + /* Emulate success on suspected loopbacks */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + bzero(LLADDR(sdl), sdl->sdl_alen); + error = 0; + } + + if (error == 0) { + bzero(&hwaddr, sizeof (hwaddr)); + lx_stol_hwaddr(sdl, &hwaddr, &size); + bcopy(&hwaddr, &lreq->lifr_addr, + size + sizeof (sdl->sdl_family)); + } + + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifname(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + int len; + char name[LIFNAMSIZ]; + netstack_t *ns; + ip_stack_t *ipst; + phyint_t *phyi; + + if (fp->f_vnode->v_type != VSOCK) { + return (set_errno(EINVAL)); + } + + len = (curproc->p_model == DATAMODEL_LP64) ? sizeof (lx_ifreq64_t) : + sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) { + return (set_errno(EFAULT)); + } + + /* + * Since Linux calls this ioctl on all sorts of sockets, perform the + * interface name lookup manually. 
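Editorial aside, not part of this change: the ksocket fallback above exists because Linux programs issue interface ioctls on whatever socket happens to be handy. The common pattern (interface name is a placeholder):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq req;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	(void) memset(&req, 0, sizeof (req));
	(void) strncpy(req.ifr_name, "eth0", IFNAMSIZ - 1);	/* placeholder */
	if (ioctl(s, SIOCGIFHWADDR, &req) == 0) {
		const unsigned char *mac =
		    (const unsigned char *)req.ifr_hwaddr.sa_data;
		printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
		    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	}
	(void) close(s);
	return (0);
}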
+ */ + if ((ns = netstack_get_current()) == NULL) { + return (set_errno(EINVAL)); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &req.ifr_index, NULL); + if (phyi != NULL) { + (void) strncpy(name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + if (strlen(name) != 0) { + /* Truncate for ifreq and copyout */ + (void) strncpy(req.ifr_name, name, IFNAMSIZ); + if (copyout(&req, (struct ifreq *)arg, len) != 0) { + return (set_errno(EFAULT)); + } + return (0); + } + + return (set_errno(EINVAL)); +} + +/* ARGSUSED */ +static int +ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + struct lifreq lreq; + int error, len; + + /* Convert from Linux ifreq to illumos lifreq */ + if (curproc->p_model == DATAMODEL_LP64) + len = sizeof (lx_ifreq64_t); + else + len = sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) + return (set_errno(EFAULT)); + bzero(&lreq, sizeof (lreq)); + (void) strncpy(lreq.lifr_name, req.ifr_name, IFNAMSIZ); + bcopy(&req.ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ); + lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE); + + switch (cmd) { + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMTU: + case SIOCSIFMTU: + /* + * Convert cmd from SIO*IF* to SIO*LIF*. + * This is needed since Linux allows ifreq operations on ipv6 + * sockets where illumos does not. + */ + cmd = ((cmd & IOC_INOUT) | + _IOW('i', ((cmd & 0xff) + 100), struct lifreq)); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFINDEX: + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFFLAGS: + cmd = SIOCGLIFFLAGS; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE); + break; + case SIOCSIFFLAGS: + cmd = SIOCSLIFFLAGS; + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFHWADDR: + error = ict_sioghwaddr(fp, &lreq); + break; + case LX_SIOCGIFTXQLEN: + /* + * Illumos lacks the notion of txqlen. Confirm the provided + * interface is valid with SIOCGLIFINDEX and return a fake + * txqlen of 1. Loopback devices will report txqlen of 0. + */ + if (strncmp(lreq.lifr_name, "lo", 2) == 0) { + lreq.lifr_index = 0; + error = 0; + break; + } + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) { + /* lifr_index aliases to the qlen field */ + lreq.lifr_index = 1; + } + break; + case LX_SIOCSIFHWADDR: + /* + * We're not going to support SIOCSIFHWADDR, but we need to be + * able to check the result of the copyin first to see if the + * command should have returned EFAULT. 
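Editorial aside, not part of this change: for the commands handled in the switch above, the cmd rewrite works because each illumos SIOC?IF* command has a SIOC?LIF* counterpart whose ordinal in the 'i' ioctl group is 100 higher and whose payload is a struct lifreq. In sketch form:

/*
 *	SIOCGIFMTU  -> SIOCGLIFMTU
 *	SIOCSIFADDR -> SIOCSLIFADDR
 *
 * hence: keep the direction bits (IOC_INOUT), add 100 to the low command
 * byte, and substitute the lifreq size via _IOW('i', n + 100, struct lifreq).
 */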
+ */ + default: + error = EINVAL; + } + + if (error != 0) + return (set_errno(error)); + + /* Convert back to a Linux ifreq */ + lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE); + bzero(&req, sizeof (req)); + (void) strncpy(req.ifr_name, lreq.lifr_name, IFNAMSIZ); + bcopy(&lreq.lifr_lifru, &req.ifr_ifru, len - IFNAMSIZ); + + if (copyout(&req, (struct lifreq *)arg, len) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf32_t conf; + lx_ifreq32_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq32_t); + + if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq32_t); + } + + /* Get interface configuration list. */ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq32_t); + oreq = (lx_ifreq32_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq32_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf64_t conf; + lx_ifreq64_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq64_t); + + if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq64_t); + } + + /* Get interface configuration list. 
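Editorial aside, not part of this change: the probe branch above supports the standard two-pass SIOCGIFCONF idiom, where a NULL buffer asks only for the required length. A user-space sketch:

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ifconf conf;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);

	/* Pass 1: NULL buffer, ask how much space is needed. */
	conf.ifc_len = 0;
	conf.ifc_buf = NULL;
	if (ioctl(s, SIOCGIFCONF, &conf) != 0)
		return (1);

	/* Pass 2: provide the buffer and read the interface list. */
	conf.ifc_buf = malloc(conf.ifc_len);
	if (conf.ifc_buf != NULL && ioctl(s, SIOCGIFCONF, &conf) == 0)
		printf("%zu interfaces\n",
		    (size_t)conf.ifc_len / sizeof (struct ifreq));

	free(conf.ifc_buf);
	(void) close(s);
	return (0);
}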
*/ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq64_t); + oreq = (lx_ifreq64_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq64_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (curproc->p_model == DATAMODEL_LP64) + return (ict_siocgifconf64(fp, cmd, arg, lxcmd)); + else + return (ict_siocgifconf32(fp, cmd, arg, lxcmd)); +} + +/* + * Unfortunately some of the autofs ioctls want to return a positive integer + * result which does not indicate an error. To minimize disruption in the + * rest of the code, we'll treat a positive return as an errno and a negative + * return as the non-error return (which we then negate). + */ +/* ARGSUSED */ +static int +ict_autofs(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int res = 0; + int rv; + + res = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (res > 0) + return (set_errno(res)); + if (res == 0) + return (0); + return (-res); +} + +/* Structure used to define an ioctl translator. 
*/ +typedef struct lx_ioc_cmd_translator { + int lict_lxcmd; + int lict_cmd; + int (*lict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd); +} lx_ioc_cmd_translator_t; + +#define LX_IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_END \ + {0, 0, NULL} + +static lx_ioc_cmd_translator_t lx_ioc_xlate_fd[] = { + LX_IOC_CMD_TRANSLATOR_FILTER(FIONBIO, ict_fionbio) + LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread) + LX_IOC_CMD_TRANSLATOR_PASS(FIOASYNC) + + /* streams related */ + LX_IOC_CMD_TRANSLATOR_PASS(TCXONC) + LX_IOC_CMD_TRANSLATOR_PASS(TCFLSH) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCEXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSTI) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIS) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIC) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMSET) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSETD) + LX_IOC_CMD_TRANSLATOR_PASS(TCSBRK) + + /* terminal related */ + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGETD) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGSID) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCPKT) + + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETS, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSW, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSF, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETA, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAW, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAF, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETS, ict_tcgets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETA, ict_tcgeta) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ, ict_tiocgwinsz) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP, ict_tcsbrkp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP, ict_tiocspgrp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP, ict_tiocgpgrp) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK, ict_sptlock) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN, ict_gptn) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY, ict_tiocsctty) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_socket[] = { + LX_IOC_CMD_TRANSLATOR_PASS(FIOGETOWN) + + LX_IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK, ict_siocatmark) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR, 
ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF, ict_siocgifconf) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFNAME, ict_siocgifname) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_dtrace[] = { + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_autofs[] = { + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_READY) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_FAIL) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_CATATONIC) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_SETTIMEOUT) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE_MULTI) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOSUBVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_ASKUMOUNT) + + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_VERSION_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_READY_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_FAIL_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CATATONIC_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_TIMEOUT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_REQUESTER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_EXPIRE_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD, + ict_autofs) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_hd[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_HDIO_GETGEO, ict_hdgetgeo) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_blk[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE, ict_blkgetsize) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKSSZGET, ict_blkgetssize) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE64, ict_blkgetsize64) + + LX_IOC_CMD_TRANSLATOR_END +}; + +/* + * Linux only restarts ioctls for "slow" devices. This includes terminals, + * pipes, and sockets. If additional "slow" devices are discovered in the + * future, they can be added here as well. + */ +static boolean_t +lx_ioctl_is_slow_dev(file_t *fp) +{ + int rv; + struct termio s_tio; + vtype_t vt = fp->f_vnode->v_type; + + if (vt == VFIFO || vt == VSOCK) + return (B_TRUE); + + /* Check if it's a terminal using the isatty() approach. 
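Editorial aside, not part of this change: the tables above are assembled from the translator macros defined earlier, so supporting an additional command is a one-line addition before LX_IOC_CMD_TRANSLATOR_END. A hypothetical illustration (the handler name below is made up):

/*
 * A pass-through entry for a command whose Linux and native forms agree:
 *
 *	LX_IOC_CMD_TRANSLATOR_PASS(TIOCOUTQ)
 *
 * and an entry whose argument needs conversion by a handler:
 *
 *	LX_IOC_CMD_TRANSLATOR_FILTER(TIOCMGET, ict_example_handler)
 */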
*/ + if (vt == VCHR && VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +static void +lx_ioctl_vsd_free(void *data) +{ + kmem_free(data, sizeof (struct lx_cc)); +} + +void +lx_ioctl_init() +{ + vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free); +} + +void +lx_ioctl_fini() +{ + vsd_destroy(&lx_ioctl_vsd); +} + +long +lx_ioctl(int fdes, int cmd, intptr_t arg) +{ + file_t *fp; + int res = 0, error = ENOTTY; + lx_ioc_cmd_translator_t *ict = NULL; + + if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) { + res = f_setfd_error(fdes, (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0); + return ((res != 0) ? set_errno(res) : 0); + } + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + switch ((cmd & 0xff00) >> 8) { + case LX_IOC_TYPE_FD: + ict = lx_ioc_xlate_fd; + break; + + case LX_IOC_TYPE_DTRACE: + ict = lx_ioc_xlate_dtrace; + break; + + case LX_IOC_TYPE_SOCK: + ict = lx_ioc_xlate_socket; + error = EOPNOTSUPP; + break; + + case LX_IOC_TYPE_AUTOFS: + ict = lx_ioc_xlate_autofs; + break; + + case LX_IOC_TYPE_BLK: + ict = lx_ioc_xlate_blk; + break; + + case LX_IOC_TYPE_HD: + ict = lx_ioc_xlate_hd; + break; + + default: + releasef(fdes); + return (set_errno(ENOTTY)); + } + + /* + * Today, none of the ioctls supported by the emulation possess + * overlapping cmd values. Because of that, no type interrogation of + * the fd is done before executing specific ioctl emulation. It's + * assumed that the vnode-specific logic called by the emulation + * function will reject ioctl commands not supported by the fd. + */ + VERIFY(ict != NULL); + while (ict->lict_func != NULL) { + if (ict->lict_lxcmd == cmd) + break; + ict++; + } + if (ict->lict_func == NULL) { + releasef(fdes); + return (set_errno(error)); + } + + res = ict->lict_func(fp, ict->lict_cmd, arg, ict->lict_lxcmd); + + if (ttolwp(curthread)->lwp_errno == EINTR && lx_ioctl_is_slow_dev(fp)) + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + releasef(fdes); + return (res); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c new file mode 100644 index 0000000000..13397e199e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c @@ -0,0 +1,66 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/lx_brand.h> + +/* 'which' values. */ +#define LX_IOPRIO_WHO_PROCESS 1 +#define LX_IOPRIO_WHO_PGRP 2 +#define LX_IOPRIO_WHO_USER 3 + +/* + * The possible values for the class. We report best effort (BE) as the class + * in use. + */ +#define LX_IOPRIO_CLASS_RT 1 +#define LX_IOPRIO_CLASS_BE 2 +#define LX_IOPRIO_CLASS_IDLE 3 + +/* Macro to determine the class from the input mask */ +#define LX_IOPRIO_PRIO_CLASS(m) ((m) >> 13) + +/* ARGSUSED */ +long +lx_ioprio_get(int which, int who) +{ + if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER) + return (set_errno(EINVAL)); + + return (LX_IOPRIO_CLASS_BE); +} + +/* + * We allow setting any valid class, even though it's ignored. 
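Editorial aside, not part of this change: the 13-bit shift above mirrors the Linux ioprio encoding, with the class in the top bits above a 13-bit priority-data field. In sketch form:

/*
 *	mask = (class << 13) | data
 *
 * so an ioprio_set(IOPRIO_WHO_PROCESS, 0, (2 << 13) | 4) call (via
 * syscall(2), there is no libc wrapper) requests class best-effort (2)
 * at level 4; the emulation validates the class and ignores the rest.
 */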
+ * We ignore the 'who' parameter which means that we're not searching for + * the specified target in order to return a specific errno in the case that + * the target does not exist. + */ +/* ARGSUSED */ +long +lx_ioprio_set(int which, int who, int mask) +{ + int class; + + if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER) + return (set_errno(EINVAL)); + + class = LX_IOPRIO_PRIO_CLASS(mask); + if (class < LX_IOPRIO_CLASS_RT || class > LX_IOPRIO_CLASS_IDLE) + return (set_errno(EINVAL)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c new file mode 100644 index 0000000000..6fefbde705 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -0,0 +1,408 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/thread.h> +#include <sys/signal.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <lx_signum.h> +#include <sys/contract/process_impl.h> + +extern int kill(pid_t, int); + +/* + * Check if it is legal to send this signal to the init process. Linux + * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid + * 1. + */ +static int +lx_init_sig_check(int sig, pid_t pid) +{ + proc_t *p; + int rv = 0; + + mutex_enter(&pidlock); + if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) { + rv = ESRCH; + } else if (sig != 0) { + if (sigismember(&cantmask, sig)) { + rv = EPERM; + } else { + mutex_enter(&p->p_lock); + if (PTOU(p)->u_signal[sig-1] == SIG_DFL || + PTOU(p)->u_signal[sig-1] == SIG_IGN) { + rv = EPERM; + } + mutex_exit(&p->p_lock); + } + } + mutex_exit(&pidlock); + + return (rv); +} + +static long +lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill) +{ + kthread_t *t; + proc_t *pp, *cp = curproc; + sigqueue_t *sqp; + int sig, rv; + + /* + * Unlike kill(2), Linux tkill(2) doesn't allow signals to + * be sent to process IDs <= 0 as it doesn't overlay any special + * semantics on the pid. + */ + if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * If the Linux pid is 1, translate the pid to the actual init + * pid for the zone. Note that Linux dictates that no unhandled + * signals may be sent to init, so check for that, too. + * + * Otherwise, extract the tid and real pid from the Linux pid. 
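For context, the tkill/tgkill paths below also honor the Linux convention that a signal number of 0 only probes for existence. A small illustrative Linux userland sketch of that probe, using raw syscall() since the gettid()/tgkill() wrappers only appear in newer glibc; not part of the patch.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int
main(void)
{
	pid_t tgid = getpid();
	pid_t tid = (pid_t)syscall(SYS_gettid);

	/* Signal 0: nothing is delivered, only existence/permission checks run. */
	if (syscall(SYS_tgkill, tgid, tid, 0) == 0)
		printf("thread %d in group %d exists\n", (int)tid, (int)tgid);
	else if (errno == ESRCH)
		printf("no such thread\n");
	return (0);
}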
+ */ + if (pid == 1) { + pid_t initpid; + + initpid = cp->p_zone->zone_proc_initpid; + if ((rv = lx_init_sig_check(sig, initpid)) != 0) { + return (set_errno(rv)); + } + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * Find the process for the passed pid... + */ + if (lx_lpid_lock(pid, curzone, 0, &pp, &t) != 0) { + rv = set_errno(ESRCH); + goto free_and_exit; + } + + /* + * Make sure the thread group matches the thread. + */ + if (tgkill) { + if ((pid == 1 && tgid != 1) || + (pid != 1 && tgid != pp->p_pid)) { + mutex_exit(&pp->p_lock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + } + + /* + * Deny permission to send the signal if either of the following + * is true: + * + * + The signal is SIGCONT and the target pid is not in the same + * session as the sender + * + * + prochasprocperm() shows the user lacks sufficient permission + * to send the signal to the target pid + */ + if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) || + (!prochasprocperm(pp, cp, CRED()))) { + mutex_exit(&pp->p_lock); + rv = set_errno(EPERM); + goto free_and_exit; + } + + /* a signal of 0 means just check for the existence of the thread */ + if (lx_sig == 0) { + mutex_exit(&pp->p_lock); + rv = 0; + goto free_and_exit; + } + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = cp->p_pid; + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(pp, t, sqp); + + mutex_exit(&pp->p_lock); + + return (0); + +free_and_exit: + kmem_free(sqp, sizeof (sigqueue_t)); + return (rv); +} + +long +lx_tgkill(pid_t tgid, pid_t pid, int lx_sig) +{ + return (lx_thrkill(tgid, pid, lx_sig, B_TRUE)); +} + +long +lx_tkill(pid_t pid, int lx_sig) +{ + return (lx_thrkill(0, pid, lx_sig, B_FALSE)); +} + +long +lx_kill(pid_t lx_pid, int lx_sig) +{ + pid_t s_pid, initpid; + sigsend_t v; + zone_t *zone = curzone; + struct proc *p; + int err, sig, nfound; + + if ((lx_sig < 0) || (lx_sig > LX_NSIG) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + initpid = zone->zone_proc_initpid; + if (lx_pid == 0 || lx_pid == -1) { + s_pid = 0; + } else if (lx_pid > 0) { + /* + * Translations for individual processes (including pid 1) is + * all handled by lx_lpid_to_spair. + */ + if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } else { + ASSERT(lx_pid < 0); + if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid it means that the + * process group leader doesn't exist in this zone. + * In this case assuming that the Linux pid is + * the same as the Solaris pid will get us the + * correct behavior. + */ + s_pid = -lx_pid; + } + } + + /* + * Check that it is legal for this signal to be sent to init + */ + if (s_pid == initpid && (err = lx_init_sig_check(sig, s_pid)) != 0) + return (set_errno(err)); + + /* + * For individual processes, kill() semantics are the same between + * Solaris and Linux. + */ + if (lx_pid >= 0) + return (kill(s_pid, sig)); + + /* + * In Solaris, sending a signal to -pid means "send a signal to + * everyone in process group pid." In Linux it means "send a + * signal to everyone in the group other than init." Sending a + * signal to -1 means "send a signal to every process except init + * and myself." + */ + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + err = 0; + + mutex_enter(&pidlock); + + p = (lx_pid == -1) ? 
practive : pgfind(s_pid); + nfound = 0; + while (err == 0 && p != NULL) { + if ((p->p_zone == zone) && (p->p_stat != SIDL) && + (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) { + nfound++; + err = sigsendproc(p, &v); + } + + p = (lx_pid == -1) ? p->p_next : p->p_pglink; + } + mutex_exit(&pidlock); + + /* + * If we found no processes, we'll return ESRCH -- but unlike our + * native kill(2), we do not return EPERM if processes are found but + * we did not have permission to send any of them a signal. + */ + if (nfound == 0) + err = ESRCH; + + return (err ? set_errno(err) : 0); +} + +/* + * This handles the unusual case where the user sends a non-queueable signal + * through rt_sigqueueinfo. Signals sent with codes that indicate they are + * queuable are sent through the sigqueue syscall via the user level function + * lx_rt_sigqueueinfo(). + */ +int +lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +{ + proc_t *target_proc; + pid_t s_pid; + zone_t *zone = curproc->p_zone; + sigsend_t send; + int err; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) + return (set_errno(EFAULT)); + /* Unlike in lx_kill, this process id must be exact, no negatives. */ + if (tgid == 0) + return (set_errno(ESRCH)); + if (tgid < 0) + return (set_errno(EINVAL)); + /* + * Translate init directly, otherwise use the convenient utility + * function to translate. Since we're sending to the whole group, we + * only need the solaris pid, and not the lwp id. + */ + if (tgid == 1) { + s_pid = zone->zone_proc_initpid; + } else { + if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } + /* + * We shouldn't have queuable signals here, those are sent elsewhere by + * the usermode handler for this emulated call. + */ + if (!SI_CANQUEUE(kinfo.si_code)) { + return (set_errno(EINVAL)); + } + /* Since our signal shouldn't queue, we just call sigsendproc(). */ + bzero(&send, sizeof (send)); + send.sig = sig; + send.checkperm = 1; + send.sicode = kinfo.si_code; + send.value = kinfo.si_value; + + mutex_enter(&pidlock); + target_proc = prfind(s_pid); + err = 0; + if (target_proc != NULL) { + err = sigsendproc(target_proc, &send); + if (err == 0 && send.perm == 0) + err = EPERM; + } else { + err = ESRCH; + } + mutex_exit(&pidlock); + + return (err ? set_errno(err) : 0); +} + +/* + * Unlike the above function, this handles all system calls to rt_tgsigqueue + * regardless of si_code. + */ +int +lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) +{ + int err; + proc_t *p = NULL; + kthread_t *t; + sigqueue_t *sqp; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) { + return (set_errno(EFAULT)); + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + + if (lx_lpid_lock(tid, curzone, 0, &p, &t) != 0) { + err = ESRCH; + goto errout; + } + + /* + * For group leaders, the SunOS pid == Linux pid, so the SunOS leader + * pid should be the same as the tgid. Because the tgid comes in via + * the syscall, we need to check for an invalid value. + */ + if (p->p_pid != tgid) { + err = EINVAL; + goto errout; + } + + /* + * In order to match the Linux behavior of emitting ESRCH errors before + * confirming that the signal is valid, this check _must_ be performed + * after the target process/thread is located. 
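An illustrative aside: the queueable cases that the rt_sigqueueinfo helper above deliberately does not handle are the ones a Linux program generates with sigqueue(), which arrive with si_code set to SI_QUEUE and carry a value. A minimal sketch of that path, receiving the queued signal synchronously with sigwaitinfo(); standard POSIX calls, not part of the patch.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t si;
	union sigval val = { .sival_int = 42 };

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep it pending until we wait */

	sigqueue(getpid(), SIGUSR1, val);	/* queueable: arrives with SI_QUEUE */

	if (sigwaitinfo(&set, &si) == SIGUSR1)
		printf("si_code=%d value=%d\n", si.si_code, si.si_value.sival_int);
	return (0);
}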
+ */ + if (sig < 0 || sig >= NSIG) { + err = EINVAL; + goto errout; + } + + /* + * To merely check for the existence of a thread, the caller will pass + * a signal value of 0. + */ + if (sig != 0) { + ASSERT(sqp != NULL); + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = kinfo.si_code; + sqp->sq_info.si_pid = p->p_pid; + sqp->sq_info.si_ctid = PRCTID(p); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + } + mutex_exit(&p->p_lock); + return (0); + +errout: + if (p != NULL) { + mutex_exit(&p->p_lock); + } + kmem_free(sqp, sizeof (sigqueue_t)); + return (set_errno(err)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_link.c b/usr/src/uts/common/brand/lx/syscall/lx_link.c new file mode 100644 index 0000000000..4ebf491d23 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_link.c @@ -0,0 +1,194 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/systm.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +#define LX_LINK_ALLOWED (LX_AT_SYMLINK_FOLLOW | LX_AT_EMPTY_PATH) + +/* From "uts/common/syscall/stat.c" */ +extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **); +/* From uts/common/syscall/unlink.c */ +extern int unlinkat(int, char *, int); +/* From uts/common/syscall/symlink.c */ +extern int symlinkat(char *, int, char *); +/* From uts/common/syscall/readlink.c */ +extern ssize_t readlinkat(int, char *, char *, size_t); + +static long +lx_link_common(int ffd, char *from, int tfd, char *to, int flags) +{ + int error; + vnode_t *fsvp = NULL, *tsvp = NULL; + enum symfollow follow = NO_FOLLOW; + + if ((flags & ~LX_LINK_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flags & LX_AT_EMPTY_PATH) == 0) { + char c; + + /* + * Check that both 'from' and 'to' names are non-empty if + * AT_EMPTY_PATH is not set. + */ + if (copyin(from, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + if (copyin(to, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + + /* + * XXX: When our support for LX capabilities improves, ENOENT + * should be thrown when a process lacking CAP_DAC_READ_SEARCH + * attempts to use the AT_EMPTY_PATH flag. + */ + } + if ((flags & LX_AT_SYMLINK_FOLLOW) != 0) { + follow = FOLLOW; + } + + if ((error = fgetstartvp(ffd, from, &fsvp)) != 0) { + goto out; + } + if ((error = fgetstartvp(tfd, to, &tsvp)) != 0) { + goto out; + } + error = vn_linkat(fsvp, from, follow, tsvp, to, UIO_USERSPACE); + +out: + if (fsvp != NULL) { + VN_RELE(fsvp); + } + if (tsvp != NULL) { + VN_RELE(tsvp); + } + if (error) { + return (set_errno(error)); + } + return (0); +} + +long +lx_link(char *from, char *to) +{ + return (lx_link_common(AT_FDCWD, from, AT_FDCWD, to, 0)); +} + +long +lx_linkat(int ffd, char *from, int tfd, char *to, int flags) +{ + ffd = (ffd == LX_AT_FDCWD) ? AT_FDCWD : ffd; + tfd = (tfd == LX_AT_FDCWD) ? 
AT_FDCWD : tfd; + + return (lx_link_common(ffd, from, tfd, to, flags)); +} + +static boolean_t +lx_isdir(int atfd, char *path) +{ + cred_t *cr = NULL; + vnode_t *vp = NULL; + boolean_t is_dir; + + if (cstatat_getvp(atfd, path, NO_FOLLOW, &vp, &cr) != 0) + return (B_FALSE); + + is_dir = (vp->v_type == VDIR); + VN_RELE(vp); + + return (is_dir); +} + +long +lx_unlink(char *path) +{ + int err; + + if ((err = unlinkat(AT_FDCWD, path, 0)) == EPERM) { + /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */ + if (lx_isdir(AT_FDCWD, path)) + return (set_errno(EISDIR)); + } + + return (err); +} + +long +lx_unlinkat(int atfd, char *path, int flag) +{ + int err; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + if ((flag = ltos_at_flag(flag, AT_REMOVEDIR, B_TRUE)) < 0) + return (set_errno(EINVAL)); + + err = unlinkat(atfd, path, flag); + if (err == EPERM && !(flag & AT_REMOVEDIR)) { + /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */ + if (lx_isdir(atfd, path)) + return (set_errno(EISDIR)); + } + + return (err); +} + +long +lx_symlink(char *name1, char *name2) +{ + return (symlinkat(name1, AT_FDCWD, name2)); +} + +long +lx_symlinkat(char *name1, int atfd, char *name2) +{ + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + return (symlinkat(name1, atfd, name2)); +} + +long +lx_readlink(char *path, char *buf, size_t bufsize) +{ + if (bufsize <= 0) + return (set_errno(EINVAL)); + + return (readlinkat(AT_FDCWD, path, buf, bufsize)); +} + +long +lx_readlinkat(int atfd, char *path, char *buf, size_t bufsize) +{ + if (bufsize <= 0) + return (set_errno(EINVAL)); + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + return (readlinkat(atfd, path, buf, bufsize)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_lseek.c b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c new file mode 100644 index 0000000000..3ac32a2faf --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/errno.h> +#include <sys/debug.h> + + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +/* from uts/common/syscalls/lseek.c */ +extern offset_t llseek32(int32_t, uint32_t, uint32_t, int); +extern off32_t lseek32(int32_t, off32_t, int32_t); + +long +lx_llseek(int fd, uint32_t off_high, uint32_t off_low, void *out, int whence) +{ + offset_t res; + + ASSERT(get_udatamodel() == DATAMODEL_ILP32); + res = llseek32(fd, off_low, off_high, whence); + if (ttolwp(curthread)->lwp_errno == 0) { + if (copyout(&res, out, sizeof (offset_t)) != 0) { + return (set_errno(EFAULT)); + } + } + return (ttolwp(curthread)->lwp_errno); +} + + +long +lx_lseek32(int fd, off32_t offset, int whence) +{ + offset_t res; + const uint32_t hival = (offset < 0) ? (uint32_t)-1 : 0; + + /* + * When returning EOVERFLOW for an offset which is outside the bounds + * of an off32_t, Linux will still perform the actual seek before + * yielding EOVERFLOW. + * + * In order to emulate that behavior, an llseek bound to the 64-bit + * boundary is used. 
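A small illustrative sketch of the 32-bit calling convention handled below: llseek passes the 64-bit offset as separate high and low words, which lx_llseek() receives as off_high/off_low and recombines. This is purely arithmetic and not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t offset = 0x123456789aULL;		/* > 2^32, needs llseek */
	uint32_t off_high = (uint32_t)(offset >> 32);	/* high word */
	uint32_t off_low = (uint32_t)offset;		/* low word */
	uint64_t merged = ((uint64_t)off_high << 32) | off_low;

	assert(merged == offset);
	printf("high=0x%x low=0x%x merged=0x%llx\n",
	    off_high, off_low, (unsigned long long)merged);

	/*
	 * A plain 32-bit lseek() whose result lands beyond the off32_t range
	 * fails with EOVERFLOW, but (as the surrounding comment notes) Linux
	 * performs the seek first and only then reports the overflow.
	 */
	return (0);
}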
The overflow can then be reported after the + * successful seek. + */ + ASSERT(get_udatamodel() == DATAMODEL_ILP32); + res = llseek32(fd, (uint32_t)offset, hival, whence); + if (ttolwp(curthread)->lwp_errno == 0 && res > MAXOFF32_T) { + return (set_errno(EOVERFLOW)); + } + return (res); + +} +#endif /* defined(_SYSCALL32_IMPL) || defined(_ILP32) */ + +#if defined(_LP64) + +/* from uts/common/syscalls/lseek.c */ +extern off_t lseek64(int, off_t, int); + +long +lx_lseek64(int fd, off_t offset, int whence) +{ + ASSERT(get_udatamodel() == DATAMODEL_LP64); + return (lseek64(fd, offset, whence)); +} + +#endif /* defined(_LP64) */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mem.c b/usr/src/uts/common/brand/lx/syscall/lx_mem.c new file mode 100644 index 0000000000..cc756717f1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mem.c @@ -0,0 +1,1118 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/mman.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/lx_brand.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> +#include <sys/shm_impl.h> +#include <vm/as.h> + +/* From uts/common/os/grow.c */ +extern int mprotect(caddr_t, size_t, int); +extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t); +extern int munmap(caddr_t, size_t); +/* From uts/common/syscall/close.c */ +extern int close(int); +/* From uts/common/fs/proc/prsubr.c */ +extern uint_t pr_getprot(struct seg *, int, void **, caddr_t *, caddr_t *, + caddr_t); +/* From uts/common/vm/seg_spt.c */ +extern struct seg_ops segspt_shmops; +/* From uts/common/syscall/memcntl.c */ +extern int memcntl(caddr_t, size_t, int, caddr_t, int, int); +/* From uts/common/os/grow.c */ +extern int smmap_common(caddr_t *, size_t, int, int, struct file *, offset_t); + +/* + * After Linux 2.6.8, an unprivileged process can lock memory up to its + * RLIMIT_MEMLOCK resource limit. + * + * Within memcntl() it assumes we have PRIV_PROC_LOCK_MEMORY, or the check in + * secpolicy_lock_memory() will fail when we attempt to lock memory. Thus, + * to support the Linux semantics, we bypass memcntl() and perform the locking + * operations directly. 
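As an illustrative aside, the Linux-side behavior being emulated here can be observed from userland: an unprivileged process may mlock() memory as long as it stays under RLIMIT_MEMLOCK. A minimal sketch using only getrlimit(), posix_memalign() and mlock(); not part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;
	long pgsz = sysconf(_SC_PAGESIZE);
	void *buf;

	getrlimit(RLIMIT_MEMLOCK, &rl);
	printf("RLIMIT_MEMLOCK cur=%llu\n", (unsigned long long)rl.rlim_cur);

	if (posix_memalign(&buf, pgsz, pgsz) != 0)
		return (1);

	/* Unprivileged mlock() succeeds as long as the limit is not exceeded. */
	if (mlock(buf, pgsz) == 0) {
		printf("locked one page\n");
		munlock(buf, pgsz);
	} else {
		perror("mlock");
	}
	free(buf);
	return (0);
}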
+ */ + +#define LX_MADV_NORMAL 0 +#define LX_MADV_RANDOM 1 +#define LX_MADV_SEQUENTIAL 2 +#define LX_MADV_WILLNEED 3 +#define LX_MADV_DONTNEED 4 +#define LX_MADV_FREE 8 +#define LX_MADV_REMOVE 9 +#define LX_MADV_DONTFORK 10 +#define LX_MADV_DOFORK 11 +#define LX_MADV_MERGEABLE 12 +#define LX_MADV_UNMERGEABLE 13 +#define LX_MADV_HUGEPAGE 14 +#define LX_MADV_NOHUGEPAGE 15 +#define LX_MADV_DONTDUMP 16 +#define LX_MADV_DODUMP 17 + +#define LX_VALID_MSYNC (MS_ASYNC|MS_INVALIDATE|MS_SYNC) + +#define LX_PROT_GROWSDOWN 0x01000000 +#define LX_PROT_GROWSUP 0x02000000 + +/* Internal segment map flags */ +#define LX_SM_READ 0x01 +#define LX_SM_WRITE 0x02 +#define LX_SM_EXEC 0x04 +#define LX_SM_SHM 0x08 +#define LX_SM_ANON 0x10 +#define LX_SM_SHARED 0x20 +#define LX_SM_NORESERVE 0x40 + +/* For convenience */ +#define LX_PROT_GROWMASK (LX_PROT_GROWSUP|LX_PROT_GROWSDOWN) + +/* From lx_rlimit.c */ +extern void lx_get_rctl(char *, struct rlimit64 *); + +static int +lx_mlock_common(int op, uintptr_t addr, size_t len) +{ + int err; + struct as *as = curproc->p_as; + const uintptr_t align_addr = addr & (uintptr_t)PAGEMASK; + const size_t align_len = P2ROUNDUP(len + (addr & PAGEOFFSET), PAGESIZE); + + if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((align_addr + align_len) <= align_addr) { + /* Catch overflow (including when aligning len) */ + return (set_errno(EINVAL)); + } + + err = as_ctl(as, (caddr_t)align_addr, align_len, op, 0, 0, NULL, 0); + if (err == EAGAIN) + err = ENOMEM; + return (err == 0 ? 0 : set_errno(err)); +} + +int +lx_mlock(uintptr_t addr, size_t len) +{ + int err; + + /* + * If the caller is not privileged and either the limit is 0, or + * the kernel version is earlier than 2.6.9, then fail with EPERM. See + * LTP mlock2.c. + */ + if ((err = secpolicy_lock_memory(CRED())) != 0) { + struct rlimit64 rlim64; + + lx_get_rctl("process.max-locked-memory", &rlim64); + if (rlim64.rlim_cur == 0 || + lx_kern_release_cmp(curzone, "2.6.9") < 0) + return (set_errno(err)); + } + + return (lx_mlock_common(MC_LOCK, addr, len)); +} + +int +lx_munlock(uintptr_t addr, size_t len) +{ + return (lx_mlock_common(MC_UNLOCK, addr, len)); +} + +int +lx_mlockall(int flags) +{ + int err; + struct as *as = curproc->p_as; + + /* + * If the caller is not privileged and either the limit is 0, or + * the kernel version is earlier than 2.6.9, then fail with EPERM. See + * LTP mlockall2.c. + */ + if ((err = secpolicy_lock_memory(CRED())) != 0) { + struct rlimit64 rlim64; + + lx_get_rctl("process.max-locked-memory", &rlim64); + if (rlim64.rlim_cur == 0 || + lx_kern_release_cmp(curzone, "2.6.9") < 0) + return (set_errno(err)); + } + + if ((flags & ~(MCL_FUTURE | MCL_CURRENT)) || flags == 0) + return (set_errno(EINVAL)); + + err = as_ctl(as, 0, 0, MC_LOCKAS, 0, (uintptr_t)flags, NULL, 0); + if (err == EAGAIN) + err = ENOMEM; + return (err == 0 ? 0 : set_errno(err)); +} + +int +lx_munlockall(void) +{ + int err; + struct as *as = curproc->p_as; + + if (lx_kern_release_cmp(curzone, "2.6.9") < 0) { + if ((err = secpolicy_lock_memory(CRED())) != 0) + return (set_errno(err)); + } + + err = as_ctl(as, 0, 0, MC_UNLOCKAS, 0, 0, NULL, 0); + return (err == 0 ?
0 : set_errno(err)); +} + +int +lx_msync(uintptr_t addr, size_t len, int flags) +{ + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + if ((addr & PAGEOFFSET) != 0 || + (flags & ~LX_VALID_MSYNC) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) < addr) { + /* Catch overflow (including when aligning len) */ + return (set_errno(ENOMEM)); + } + + return (memcntl((caddr_t)addr, align_len, MC_SYNC, + (caddr_t)(uintptr_t)flags, 0, 0)); +} + +int +lx_madvise(uintptr_t addr, size_t len, int advice) +{ + int err; + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + switch (advice) { + case LX_MADV_REMOVE: + /* approximately similar */ + advice = MADV_FREE; + break; + + case LX_MADV_DONTNEED: + /* + * On Linux, MADV_DONTNEED implies an immediate purge of the + * specified region. This is spuriously different from + * (nearly) every other Unix, having apparently been done to + * mimic the semantics on Digital Unix (!). This is bad enough + * (MADV_FREE both has better semantics and results in better + * performance), but it gets worse: Linux applications (and + * notably, jemalloc) have managed to depend on the busted + * semantics of MADV_DONTNEED on Linux. We implement these + * semantics via MADV_PURGE -- and we translate our advice + * accordingly. + */ + advice = MADV_PURGE; + break; + + case LX_MADV_FREE: + advice = MADV_FREE; + break; + + case LX_MADV_NORMAL: + case LX_MADV_RANDOM: + case LX_MADV_SEQUENTIAL: + case LX_MADV_WILLNEED: + /* These map directly to the illumos values */ + break; + + case LX_MADV_DONTFORK: + case LX_MADV_DOFORK: + case LX_MADV_HUGEPAGE: + case LX_MADV_NOHUGEPAGE: + case LX_MADV_DONTDUMP: + case LX_MADV_DODUMP: + /* harmless to pretend these work */ + return (0); + default: + return (set_errno(EINVAL)); + } + + if ((addr & PAGEOFFSET) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) <= addr) { + /* + * Catch overflow (including when aligning len). Unlike + * similar syscalls, this is an EINVAL failure for madvise(2). + */ + return (set_errno(EINVAL)); + } + + err = memcntl((caddr_t)addr, align_len, MC_ADVISE, + (caddr_t)(intptr_t)advice, 0, 0); + if (err == EBUSY) { + if (advice != MADV_PURGE) { + return (set_errno(EINVAL)); + } + /* + * If we received an EBUSY from a MADV_PURGE, we will now try + * again with a MADV_DONTNEED: there are conditions (namely, + * with locked mappings that haven't yet been faulted in) where + * MADV_PURGE will fail but MADV_DONTNEED will succeed. If + * this succeeds, we'll call the operation a success; if not, + * we'll kick back EINVAL. + */ + advice = MADV_DONTNEED; + err = memcntl((caddr_t)addr, align_len, MC_ADVISE, + (caddr_t)(intptr_t)advice, 0, 0); + if (err != 0) { + return (set_errno(EINVAL)); + } + /* Clear the old errno since success was eventually achieved. */ + ttolwp(curthread)->lwp_errno = 0; + } + return (err); +} + +int +lx_mprotect(uintptr_t addr, size_t len, int prot) +{ + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + /* + * The flags for native mprotect(2) are essentially the same as those + * on Linux, with the exception of PROT_GROWSUP/PROT_GROWSDOWN, for + * which there is no native analog. Those flags are presently ignored, + * unless they are both present, which represents an invalid argument. 
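An illustrative aside on why the MADV_DONTNEED translation above matters: on Linux the call discards private anonymous pages immediately, so the region reads back as zeroes afterwards, and allocators such as jemalloc rely on exactly that. The sketch below only demonstrates the Linux-visible effect from userland; the MADV_PURGE mapping is the kernel-side detail and is not exercised here.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t len = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return (1);

	memset(p, 0xab, len);
	madvise(p, len, MADV_DONTNEED);

	/* On Linux the page was thrown away; the next touch faults in zeroes. */
	printf("first byte after MADV_DONTNEED: 0x%02x\n", (unsigned char)p[0]);
	munmap(p, len);
	return (0);
}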
+ */ + if ((prot & LX_PROT_GROWMASK) == LX_PROT_GROWMASK) { + return (set_errno(EINVAL)); + } + prot &= ~(LX_PROT_GROWMASK); + + if ((addr & PAGEOFFSET) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) <= addr) { + /* Catch overflow (including when aligning len) */ + return (set_errno(ENOMEM)); + } + + return (mprotect((void *)addr, align_len, prot)); +} + +/* + * There are two forms of mmap, mmap() and mmap2(). The only difference is that + * the final argument to mmap2() specifies the number of pages, not bytes. Also, + * mmap2 is 32-bit only. + * + * Linux has a number of additional flags, but they are all deprecated. We also + * ignore the MAP_GROWSDOWN flag, which has no equivalent on Solaris. + * + * The Linux mmap() returns ENOMEM in some cases where illumos returns + * EOVERFLOW, so we translate the errno as necessary. + */ + +#define LX_MAP_ANONYMOUS 0x00020 +#define LX_MAP_LOCKED 0x02000 +#define LX_MAP_NORESERVE 0x04000 +#define LX_MAP_32BIT 0x00040 + +#define ONE_GB 0x40000000 + +static void lx_remap_anoncache_invalidate(uintptr_t, size_t); + +static int +lx_ltos_mmap_flags(int flags) +{ + int new_flags; + + new_flags = flags & (MAP_TYPE | MAP_FIXED); + + if (flags & LX_MAP_ANONYMOUS) + new_flags |= MAP_ANONYMOUS; + if (flags & LX_MAP_NORESERVE) + new_flags |= MAP_NORESERVE; + +#if defined(_LP64) + if (flags & LX_MAP_32BIT) + new_flags |= MAP_32BIT; +#endif + + return (new_flags); +} + +static void * +lx_mmap_common(void *addr, size_t len, int prot, int flags, int fd, off64_t off) +{ + caddr_t ret; + lx_proc_data_t *lxpd = ptolxproc(curproc); + + /* + * Under Linux, the file descriptor is ignored when mapping zfod + * anonymous memory, On illumos, we want the fd set to -1 for the + * same functionality. + */ + if (flags & LX_MAP_ANONYMOUS) + fd = -1; + + /* + * We refuse, as a matter of principle, to overcommit memory. + * Unfortunately, several bits of important and popular software expect + * to be able to pre-allocate large amounts of virtual memory but then + * probably never use it. One particularly bad example of this + * practice is golang. Another is the JVM. + * + * In the interest of running software, unsafe or not, we fudge + * something vaguely similar to overcommit by permanently enabling + * MAP_NORESERVE unless MAP_LOCKED was requested: + */ + if (!(flags & LX_MAP_LOCKED)) { + flags |= LX_MAP_NORESERVE; + } + + /* + * This is totally insane. The NOTES section in the linux mmap(2) man + * page claims that on some architectures, read protection may + * automatically include exec protection. It has been observed on a + * native linux system that the /proc/<pid>/maps file does indeed + * show that segments mmap'd from userland (such as libraries mapped in + * by the dynamic linker) all have exec the permission set, even for + * data segments. + * + * This insanity is tempered by the fact that the behavior is disabled + * for ELF binaries bearing a PT_GNU_STACK header which lacks PF_X + * (which most do). Such a header will clear the READ_IMPLIES_EXEC + * flag from the process personality. 
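For reference, the personality bit discussed above can be inspected from Linux userland; when READ_IMPLIES_EXEC is set, PROT_READ mappings silently gain PROT_EXEC, which is the behavior reproduced just below. An illustrative sketch using personality() with the conventional 0xffffffff query value; not part of the patch.

#include <stdio.h>
#include <sys/personality.h>

int
main(void)
{
	/* 0xffffffff queries the current personality without changing it. */
	int per = personality(0xffffffff);

	if (per == -1) {
		perror("personality");
		return (1);
	}
	printf("READ_IMPLIES_EXEC is %s\n",
	    (per & READ_IMPLIES_EXEC) ?
	    "set (PROT_READ gains PROT_EXEC)" : "clear");
	return (0);
}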
+ */ + if (prot & PROT_READ) { + if ((lxpd->l_personality & LX_PER_READ_IMPLIES_EXEC) != 0) { + prot |= PROT_EXEC; + } + } + + ret = smmap64(addr, len, prot, lx_ltos_mmap_flags(flags), fd, off); + if (ttolwp(curthread)->lwp_errno != 0) { + if (ttolwp(curthread)->lwp_errno == EOVERFLOW) + (void) set_errno(ENOMEM); + return ((void *)-1); + } + + if (flags & LX_MAP_LOCKED) { + (void) lx_mlock_common(MC_LOCK, (uintptr_t)ret, len); + /* clear any errno from mlock */ + ttolwp(curthread)->lwp_errno = 0; + } + + /* + * We have a new mapping; invalidate any cached anonymous regions that + * overlap(ped) with it. + */ + mutex_enter(&lxpd->l_remap_anoncache_lock); + lx_remap_anoncache_invalidate((uintptr_t)ret, len); + mutex_exit(&lxpd->l_remap_anoncache_lock); + + return (ret); +} + +long +lx_mmap(void *addr, size_t len, int prot, int flags, int fd, off64_t off) +{ + return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, off)); +} + +long +lx_mmap2(void *addr, size_t len, int prot, int flags, + int fd, off_t off) +{ + return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, + (off64_t)off * PAGESIZE)); +} + +long +lx_munmap(void *addr, size_t len) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + + /* + * Invalidate any cached anonymous regions that overlap(ped) with it. + */ + mutex_enter(&lxpd->l_remap_anoncache_lock); + lx_remap_anoncache_invalidate((uintptr_t)addr, len); + mutex_exit(&lxpd->l_remap_anoncache_lock); + + return (munmap(addr, len)); +} + +#define LX_MREMAP_MAYMOVE 1 /* mapping can be moved */ +#define LX_MREMAP_FIXED 2 /* address is fixed */ + +/* + * Unfortunately, the Linux mremap() manpage contains a statement that is, at + * best, grossly oversimplified: that mremap() "can be used to implement a + * very efficient realloc(3)." To the degree this is true at all, it is only + * true narrowly (namely, when large buffers are being expanded but can't be + * expanded in place due to virtual address space restrictions) -- but + * apparently, someone took this very literally, because variants of glibc + * appear to simply implement realloc() in terms of mremap(). This is + * unfortunate because absent intelligent usage, it forces realloc() to have + * an unnecessary interaction with the VM system for small expansions -- and if + * realloc() is itself abused (e.g., if a consumer repeatedly expands and + * contracts the same memory buffer), the net result can be less efficient + * than a much more naive realloc() implementation. And if native Linux is + * suboptimal in this case, we are deeply pathological, having not + * historically supported mremap() for anonymous mappings at all. To make + * this at least palatable, we not only support remap for anonymous mappings + * (see lx_remap_anon(), below), we also cache the metadata associated with + * these anonymous remappings to reduce the need to search our address space. + * We implement the anonymous metadata cache with l_remap_anoncache, an LRU + * cache of lx_segmap_t's that correspond to anonymous segments that have been + * resized (only anonymous mappings that have been remapped are cached). The + * cache is part of the process's lx-brand-specific data. + */ + +/* + * Search our address space (as) mappings to find the specified mapping. This + * is derived from the procfs prgetmap() code. We implement the "reserved" + * behavior on the segment so as to accommodate the case where an mmap()'d and + * then ftruncate()'d file is being mremap()'d: we use the size of the + * mapping (which we need to validate old_size).
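The realloc-via-mremap pattern criticized above looks like this from Linux userland: grow an anonymous mapping and let the kernel relocate it when it cannot be extended in place. A minimal illustrative sketch (assumes _GNU_SOURCE for MREMAP_MAYMOVE); not part of the patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t oldsz = 4096, newsz = 1024 * 1024;
	char *p = mmap(NULL, oldsz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *q;

	if (p == MAP_FAILED)
		return (1);
	strcpy(p, "payload");

	/* MREMAP_MAYMOVE allows the kernel to relocate the mapping if needed. */
	q = mremap(p, oldsz, newsz, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return (1);
	}
	printf("%s mapping, data still reads \"%s\"\n",
	    (q == p) ? "grew in place" : "relocated", q);
	munmap(q, newsz);
	return (0);
}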
+ * + * Return 0 if mapping is found, errno if there is a problem or if mapping + * not found. If the mapping is found, we populate the mp parameter, vpp and + * offp with the results. + */ +static int +lx_get_mapping(uintptr_t find_addr, size_t find_size, lx_segmap_t *mp, + vnode_t **vpp, offset_t *offp) +{ + struct as *as = curproc->p_as; + struct seg *seg; + uint_t prot; + caddr_t saddr, eaddr, naddr; + + /* pr_getprot asserts that the as is held as a writer */ + AS_LOCK_ENTER(as, RW_WRITER); + + seg = as_segat(as, (caddr_t)find_addr); + if (seg == NULL || (seg->s_flags & S_HOLE) != 0) { + AS_LOCK_EXIT(as); + return (EFAULT); + } + + /* + * We're interested in the reserved space, so we use the size of the + * segment itself. + */ + eaddr = seg->s_base + seg->s_size; + for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { + uintptr_t vaddr; + size_t size; + struct vnode *vp; + void *tmp = NULL; + + prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr); + if (saddr == naddr) + continue; + + vaddr = (uintptr_t)saddr; + size = (uintptr_t)naddr - (uintptr_t)saddr; + + if (vaddr == find_addr && find_size < size && + (find_size & PAGEOFFSET) != 0) { + /* + * We found a mapping but the size being requested is + * less than the mapping and not a multiple of our page + * size. If it is an anonymous mapping, that likely + * means the application did the initial mmap with this + * odd size. We'll round up to the next page boundary + * in this case. + */ + if (seg->s_ops == &segspt_shmops || + (seg->s_ops == &segvn_ops && + (SEGOP_GETVP(seg, saddr, &vp) != 0 || + vp == NULL))) { + /* + * It's anonymous, round up the size. + */ + find_size = ptob(btopr(find_size)); + } + } + + /* Check if mapping matches our arguments */ + if (vaddr == find_addr && size == find_size) { + struct vattr vattr; + + mp->lxsm_vaddr = vaddr; + mp->lxsm_size = size; + mp->lxsm_flags = 0; + + *offp = SEGOP_GETOFFSET(seg, saddr); + + if (prot & PROT_READ) + mp->lxsm_flags |= LX_SM_READ; + if (prot & PROT_WRITE) + mp->lxsm_flags |= LX_SM_WRITE; + if (prot & PROT_EXEC) + mp->lxsm_flags |= LX_SM_EXEC; + if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED) + mp->lxsm_flags |= LX_SM_SHARED; + if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE) + mp->lxsm_flags |= LX_SM_NORESERVE; + if (seg->s_ops == &segspt_shmops || + (seg->s_ops == &segvn_ops && + (SEGOP_GETVP(seg, saddr, &vp) != 0 || + vp == NULL))) + mp->lxsm_flags |= LX_SM_ANON; + + if (seg->s_ops == &segspt_shmops) { + mp->lxsm_flags |= LX_SM_SHM; + } else if ((mp->lxsm_flags & LX_SM_SHARED) && + curproc->p_segacct && shmgetid(curproc, + seg->s_base) != SHMID_NONE) { + mp->lxsm_flags |= LX_SM_SHM; + } + + vattr.va_mask = AT_FSID | AT_NODEID; + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, saddr, &vp) == 0 && + vp != NULL && vp->v_type == VREG && + VOP_GETATTR(vp, &vattr, 0, CRED(), + NULL) == 0) { + VN_HOLD(vp); + *vpp = vp; + } else { + *vpp = NULL; + } + + AS_LOCK_EXIT(as); + return (0); + } + + if (vaddr <= find_addr && + find_addr + find_size < vaddr + size) { + /* + * We have a mismatch, but our specified range is a + * subset of the actual segment; this is EINVAL. 
+ */ + AS_LOCK_EXIT(as); + DTRACE_PROBE2(lx__mremap__badsubset, caddr_t, + vaddr, size_t, size); + return (EINVAL); + } + } + + AS_LOCK_EXIT(as); + return (EFAULT); +} + +static void +lx_remap_anoncache_invalidate(uintptr_t addr, size_t size) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + uint_t i; + + ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock)); + + if (lxpd->l_remap_anoncache_generation == 0) + return; + + for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) { + lx_segmap_t *map = &lxpd->l_remap_anoncache[i]; + + /* + * If the ranges overlap at all, we zap it. + */ + if (addr < map->lxsm_vaddr + map->lxsm_size && + map->lxsm_vaddr < addr + size) { + bzero(map, sizeof (lx_segmap_t)); + } + } +} + +static void +lx_remap_anoncache_load(lx_segmap_t *map, size_t size) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + uint64_t oldest = UINT64_MAX; + lx_segmap_t *evict = NULL; + uint_t i; + + ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock)); + + for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) { + lx_segmap_t *cp = &lxpd->l_remap_anoncache[i]; + + if (cp->lxsm_vaddr == map->lxsm_vaddr) { + /* + * We're already in the cache -- we just need to update + * our LRU field and size to reflect the hit. + */ + cp->lxsm_lru = lxpd->l_remap_anoncache_generation++; + cp->lxsm_size = size; + return; + } + + if (cp->lxsm_vaddr == 0) { + evict = cp; + break; + } + + if (cp->lxsm_lru < oldest) { + oldest = cp->lxsm_lru; + evict = cp; + } + } + + /* Update the entry we're evicting */ + ASSERT(evict != NULL); + evict->lxsm_vaddr = map->lxsm_vaddr; + evict->lxsm_size = size; + evict->lxsm_flags = map->lxsm_flags; + evict->lxsm_lru = lxpd->l_remap_anoncache_generation++; +} + +static int lx_u2u_copy(void *, void *, size_t); + +/* + * As part of lx_remap() (see below) and to accommodate heavy realloc() use + * cases (see the discussion of the l_remap_anoncache, above), we allow + * anonymous segments to be "remapped" in that we are willing to truncate them + * or append to them (as much as that's allowed by virtual address space + * usage). If we fall out of these cases, we take the more expensive option + * of actually copying the data to a new segment -- but we locate the address + * in a portion of the address space that should give us plenty of VA space to + * expand. + * + * We return the address of the mapping or set errno if there is a problem. + */ +static long +lx_remap_anon(lx_segmap_t *mapin, size_t new_size, uint_t flags, + uintptr_t new_addr) +{ + lx_segmap_t m; + int mflags = MAP_ANON; + int prot = 0; + void *addr, *hint = NULL; + + ASSERT(MUTEX_HELD(&ptolxproc(curproc)->l_remap_anoncache_lock)); + + /* + * Make a copy of the input lx_segmap_t argument since it might be + * a reference into the anon cache, and we're manipulating cache + * entries during this function. + */ + m = *mapin; + + /* + * If our new size is less than our old size and we're either not + * being ordered to move it or the address we're being ordered to + * move it to is our current address, we can just act as Procrustes + * and chop off anything larger than the new size. 
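A hedged aside: the truncation case described above has a direct userland analog, since shrinking an anonymous mapping amounts to unmapping its tail, which is what the code just below does with munmap(). A minimal sketch; not part of the patch.

#include <stdio.h>
#include <sys/mman.h>

int
main(void)
{
	size_t oldsz = 8 * 4096, newsz = 4096;
	char *p = mmap(NULL, oldsz, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return (1);

	/* "Procrustes": chop the mapping down to newsz by unmapping the rest. */
	if (munmap(p + newsz, oldsz - newsz) != 0) {
		perror("munmap");
		return (1);
	}
	printf("mapping now spans [%p, %p)\n", (void *)p, (void *)(p + newsz));
	munmap(p, newsz);
	return (0);
}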
+ */ + if (new_size < m.lxsm_size && (!(flags & LX_MREMAP_FIXED) || + new_addr == m.lxsm_vaddr)) { + if (munmap((void *)(m.lxsm_vaddr + new_size), + m.lxsm_size - new_size) != 0) { + return (set_errno(EINVAL)); + } + + lx_remap_anoncache_load(&m, new_size); + return (m.lxsm_vaddr); + } + + if (m.lxsm_flags & LX_SM_SHM) + return (set_errno(EINVAL)); + + if (m.lxsm_flags & LX_SM_WRITE) + prot |= PROT_WRITE; + + if (m.lxsm_flags & LX_SM_READ) + prot |= PROT_READ; + + if (m.lxsm_flags & LX_SM_EXEC) + prot |= PROT_EXEC; + + mflags |= (m.lxsm_flags & LX_SM_SHARED) ? MAP_SHARED : MAP_PRIVATE; + + if (m.lxsm_flags & LX_SM_NORESERVE) + mflags |= MAP_NORESERVE; + + /* + * If we're not being told where to move it, let's try to expand our + * mapping in place by adding a fixed mapping after it. + */ + if (!(flags & LX_MREMAP_FIXED)) { + void *tmp_addr = (void *)(m.lxsm_vaddr + m.lxsm_size); + + ASSERT(new_size > m.lxsm_size); + addr = smmap64(tmp_addr, new_size - m.lxsm_size, prot, + mflags, -1, 0); + if (ttolwp(curthread)->lwp_errno != 0) { + /* There is no place to mmap some extra anon */ + return (set_errno(EINVAL)); + } + + if (addr == tmp_addr) { + /* The expansion worked */ + lx_remap_anoncache_load(&m, new_size); + return (m.lxsm_vaddr); + } + + /* + * Our advisory address was not followed -- which, as a + * practical matter, means that the range conflicted with an + * extant mapping. Unmap wherever our attempted expansion + * landed, and drop into the relocation case. + */ + (void) munmap(addr, new_size - m.lxsm_size); + } + + lx_remap_anoncache_invalidate(m.lxsm_vaddr, m.lxsm_size); + + /* + * If we're here, we actually need to move this mapping -- so if we + * can't move it, we're done. + */ + if (!(flags & LX_MREMAP_MAYMOVE)) + return (set_errno(ENOMEM)); + + /* + * If this is a shared private mapping, we can't remap it. + */ + if (m.lxsm_flags & LX_SM_SHARED) + return (set_errno(EINVAL)); + + if (flags & LX_MREMAP_FIXED) { + mflags |= MAP_FIXED; + hint = (void *)new_addr; + } else { + /* + * Search our address space for a gap to remap into. To give + * ourselves plenty of room for further mremap() expansion, + * we'll multiply our new size by 16 and look for a gap at + * least that big. Historically we looked for an empty gap + * around the 2GB region, so we start our search for the lowest + * gap in that vicinity. + */ + caddr_t base; + size_t upper; + + base = (caddr_t)ONE_GB; + upper = (uintptr_t)USERLIMIT - (uintptr_t)base; + + if (as_gap(curproc->p_as, (new_size << 4UL), &base, &upper, + AH_LO, NULL) != -1) + hint = base; + } + + addr = smmap64(hint, new_size, prot, mflags, -1, 0); + if (ttolwp(curthread)->lwp_errno != 0) { + return (ttolwp(curthread)->lwp_errno); + } + + if (lx_u2u_copy((void *)m.lxsm_vaddr, addr, m.lxsm_size) != 0) { + /* We couldn't complete the relocation, backout & fail */ + (void) munmap(addr, new_size); + return (set_errno(ENOMEM)); + } + + (void) munmap((void *)m.lxsm_vaddr, m.lxsm_size); + + /* + * Add the relocated mapping to the cache. + */ + m.lxsm_vaddr = (uintptr_t)addr; + lx_remap_anoncache_load(&m, new_size); + + return ((long)addr); +} + +/* + * We don't have a native mremap() (nor do we particularly want one), so + * we emulate it strictly in lx. The idea is simple: we just want to + * mmap() the underlying object with the new size and rip down the old mapping. + * However, this is slightly complicated because we don't actually have the + * file descriptor that corresponds to the resized mapping. 
So to get a file + * descriptor, we may have to search our address space for the mapping and use + * the associated vnode to create a file descriptor. Assuming that this + * succeeds, we then mmap() it and rip down the original mapping. There are + * clearly many reasons why this might fail; absent a more apt errno (e.g., + * ENOMEM in some cases), we return EINVAL to denote these cases. + */ +long +lx_mremap(uintptr_t old_addr, size_t old_size, size_t new_size, int flags, + uintptr_t new_addr) +{ + int prot = 0, oflags, mflags = 0, i, res; + lx_segmap_t map, *mp; + int rval = 0; + lx_proc_data_t *lxpd; + offset_t off; + struct vnode *vp = NULL; + file_t *fp; + caddr_t naddr; + + if (flags & LX_MREMAP_FIXED) { + /* MREMAP_FIXED requires MREMAP_MAYMOVE */ + if ((flags & LX_MREMAP_MAYMOVE) == 0) + return (set_errno(EINVAL)); + + if (new_addr & PAGEOFFSET) + return (set_errno(EINVAL)); + + mflags |= MAP_FIXED; + } else { + if (new_size == old_size) + return (old_addr); + + /* new_addr is optional and only valid when LX_MREMAP_FIXED. */ + new_addr = NULL; + } + + if (old_addr & PAGEOFFSET) + return (set_errno(EINVAL)); + + if (new_size == 0) + return (set_errno(EINVAL)); + + /* + * First consult the anoncache; if we find the segment there, we'll + * drop straight into lx_remap_anon() and save ourself the pain of + * searching our address space. + */ + lxpd = ptolxproc(curproc); + mutex_enter(&lxpd->l_remap_anoncache_lock); + + for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) { + long rv; + + mp = &lxpd->l_remap_anoncache[i]; + + if (mp->lxsm_vaddr != old_addr) + continue; + + if (mp->lxsm_size != old_size) + continue; + + /* + * lx_remap_anon will either: + * a) expand/contract in place, returning old_addr + * b) relocate & expand the mapping, returning a new address + * c) there will be an error of some sort and errno will be set + */ + rv = lx_remap_anon(mp, new_size, flags, new_addr); + mutex_exit(&lxpd->l_remap_anoncache_lock); + return (rv); + } + + mutex_exit(&lxpd->l_remap_anoncache_lock); + + /* + * Search our address space to find the specified mapping. + */ + if ((res = lx_get_mapping(old_addr, old_size, &map, &vp, &off)) > 0) + return (set_errno(res)); + + /* + * We found the mapping. + */ + mp = ↦ + DTRACE_PROBE1(lx__mremap__seg, lx_segmap_t *, mp); + + if (mp->lxsm_flags & LX_SM_SHM) { + /* + * If this is either ISM or System V shared memory, we're not + * going to remap it. + */ + rval = set_errno(EINVAL); + goto out; + } + + if (mp->lxsm_flags & LX_SM_ANON) { + /* + * This is an anonymous mapping -- which is the one case in + * which we perform something that approaches a true remap. + */ + long rv; + + if (vp != NULL) + VN_RELE(vp); + mutex_enter(&lxpd->l_remap_anoncache_lock); + rv = lx_remap_anon(mp, new_size, flags, new_addr); + mutex_exit(&lxpd->l_remap_anoncache_lock); + return (rv); + } + + /* The rest of the code is for a 'named' mapping */ + + if (!(flags & LX_MREMAP_MAYMOVE)) { + /* + * If we're not allowed to move this mapping, we're going to + * act as if we can't expand it. + */ + rval = set_errno(ENOMEM); + goto out; + } + + if (!(mp->lxsm_flags & LX_SM_SHARED)) { + /* + * If this is a private mapping, we're not going to remap it. + */ + rval = set_errno(EINVAL); + goto out; + } + + oflags = (mp->lxsm_flags & LX_SM_WRITE) ? (FWRITE | FREAD) : FREAD; + if (vp == NULL) { + /* + * If vp is NULL, the path might not exist. We're going to kick + * it back with EINVAL. + */ + rval = set_errno(EINVAL); + goto out; + } + + /* falloc cannot fail with a NULL fdp. 
*/ + VERIFY0(falloc(vp, oflags, &fp, NULL)); + mutex_exit(&fp->f_tlock); + + if (mp->lxsm_flags & LX_SM_WRITE) + prot |= PROT_WRITE; + + if (mp->lxsm_flags & LX_SM_READ) + prot |= PROT_READ; + + if (mp->lxsm_flags & LX_SM_EXEC) + prot |= PROT_EXEC; + + mflags |= MAP_SHARED; + + /* + * We're using smmap_common to pass the fp directly, instead of + * initializing a temporary file descriptor for smmap64(), so as to + * prevent any inadvertent use of that temporary fd within the + * application. + */ + naddr = (caddr_t)new_addr; + rval = smmap_common(&naddr, new_size, prot, mflags, fp, off); + + mutex_enter(&fp->f_tlock); + unfalloc(fp); + + if (rval != 0) { + rval = set_errno(ENOMEM); + goto out; + } + + /* + * Our mapping succeeded; we're now going to rip down the old mapping. + */ + (void) munmap((void *)old_addr, old_size); + +out: + if (vp != NULL) + VN_RELE(vp); + + if (rval == 0) + return ((long)naddr); + return ((long)rval); +} + +#pragma GCC diagnostic ignored "-Wclobbered" +/* + * During mremap we had to relocate the initial anonymous mapping to a new + * location (a new anonymous mapping). Copy the user-level data from the first + * mapping to the second mapping. + * + * We have to lock both sides to ensure there is no fault. We do this in 16MB + * chunks at a time and we do not concern ourselves with the zone's + * max-locked-memory rctl. + * + * Keep this function at the end since we're disabling the compiler's "clobber" + * check due to the on_fault call. + */ +static int +lx_u2u_copy(void *src, void *dst, size_t len) +{ + size_t mlen; + caddr_t sp, dp; + int err; + page_t **ppa_src, **ppa_dst; + label_t ljb; + struct as *p_as = curproc->p_as; + + /* Both sides should be page aligned since they're from smmap64 */ + ASSERT(((uintptr_t)src & PAGEOFFSET) == 0); + ASSERT(((uintptr_t)dst & PAGEOFFSET) == 0); + /* Both came from mmap, so they should be valid user pointers */ + ASSERT((uintptr_t)src < USERLIMIT && (uintptr_t)dst < USERLIMIT); + + sp = src; + dp = dst; + + do { + mlen = MIN(len, 16 * 1024 * 1024); + + err = as_pagelock(p_as, &ppa_src, sp, mlen, S_READ); + if (err != 0) { + return (err); + } + err = as_pagelock(p_as, &ppa_dst, dp, mlen, S_WRITE); + if (err != 0) { + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + return (err); + } + + DTRACE_PROBE3(lx__mremap__copy, void *, sp, void *, dp, + size_t, mlen); + + /* on_fault calls smap_disable */ + if (on_fault(&ljb)) { + /* + * Given that the pages are locked and smap is disabled, + * we really should never get here. If we somehow do + * get here, the copy fails just as if we could not + * lock the pages to begin with. + */ + as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE); + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + return (EFAULT); + } + ucopy(sp, dp, mlen); + no_fault(); /* calls smap_enable */ + + as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE); + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + + len -= mlen; + sp += mlen; + dp += mlen; + } while (len > 0); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c new file mode 100644 index 0000000000..5245b32870 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c @@ -0,0 +1,495 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/systeminfo.h> +#include <sys/fcntl.h> +#include <sys/resource.h> +#include <sys/uadmin.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> + +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 672274793 +#define LINUX_REBOOT_MAGIC2A 85072278 +#define LINUX_REBOOT_MAGIC2B 369367448 +#define LINUX_REBOOT_MAGIC2C 537993216 + +#define LINUX_REBOOT_CMD_RESTART 0x1234567 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_CAD_ON 0x89abcdef +#define LINUX_REBOOT_CMD_CAD_OFF 0 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART2 0xa1b2c3d4 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543 + +#define LX_RUSAGE_SELF 0 +#define LX_RUSAGE_CHILDREN (-1) +#define LX_RUSAGE_BOTH (-2) +#define LX_RUSAGE_THREAD 1 + +#define LX_SWAP_PRIOMASK 0x7fff +#define LX_SWAP_PREFER 0x8000 +#define LX_SWAP_DISCARD 0x10000 +#define LX_SWAP_DISCARD_ONCE 0x20000 +#define LX_SWAP_DISCARD_PAGES 0x40000 + +#define LX_SWAP_ALL (LX_SWAP_DISCARD_PAGES | \ + LX_SWAP_DISCARD_ONCE | \ + LX_SWAP_DISCARD | \ + LX_SWAP_PREFER | LX_SWAP_PRIOMASK) + +/* From uts/common/fs/vfs.c */ +extern void vfs_sync(int); +/* From uts/common/os/grow.c */ +extern int mincore(caddr_t, size_t, char *); +extern int munmap(caddr_t, size_t); +/* From uts/common/os/session.c */ +extern int vhangup(); +/* From uts/common/syscall/alarm.c */ +extern int alarm(int); +/* From uts/common/syscall/chdir.c */ +extern int chdir(char *); +extern int chroot(char *); +extern int fchdir(int); +/* From uts/common/syscall/nice.c */ +extern int nice(int); +/* From uts/common/syscall/open.c */ +extern int open(char *, int, int); +/* From uts/common/syscall/pause.c */ +extern int pause(); +/* From uts/common/syscall/rusagesys.c */ +extern int rusagesys(int, void *, void *, void *, void *); +/* From uts/common/syscall/systeminfo.c */ +extern long systeminfo(int, char *, long); +/* From uts/common/syscall/timers.c */ +extern int getitimer(uint_t, struct itimerval *); +/* From uts/common/syscall/time.c */ +extern int stime(time_t); +/* From uts/common/syscall/uadmin.c */ +extern int uadmin(int, int, uintptr_t); +/* From uts/common/syscall/chdir.c */ +extern int chdir_proc(proc_t *, vnode_t *, boolean_t, boolean_t); +/* From uts/common/fs/lookup.c */ +extern int lookupname(char *, enum uio_seg, int, vnode_t **, vnode_t **); +/* From uts/common/fs/fs_subr.c */ +extern int fs_need_estale_retry(int); +/* From uts/common/os/acct.c */ +extern int sysacct(char *); + +/* The callback arguments when handling a FS clone group. */ +typedef struct { + vnode_t *lcfa_vp; + boolean_t lcfa_type; + boolean_t lcfa_traverse; +} lx_clone_fs_arg_t; + +long +lx_alarm(int seconds) +{ + return (alarm(seconds)); +} + +static int +lx_clone_fs_cb(proc_t *pp, void *arg) +{ + lx_clone_fs_arg_t *ap = (lx_clone_fs_arg_t *)arg; + int err; + + /* + * Either: + * A) The initial lookupname() from lx_clone_fs_do_group() will have + * added a hold on the vnode to ensure its existence throughout the + * walk. + * B) We added a hold in fchdir. + * We need to add another hold for each process in the group. 
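An illustrative aside on why the group walk below is needed: CLONE_FS makes the members share one cwd/root/umask, so on real Linux a chdir() by any member is immediately visible to the others. The sketch demonstrates that sharing from userland with the glibc clone() wrapper (assumes _GNU_SOURCE and that /tmp exists); not part of the patch.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int
child(void *arg)
{
	(void) arg;
	return (chdir("/tmp") != 0);	/* shared fs info: parent sees this */
}

int
main(void)
{
	char cwd[256];
	char *stack = malloc(64 * 1024);
	pid_t pid;

	if (stack == NULL)
		return (1);
	/* CLONE_FS shares cwd/root/umask; the stack grows down, so pass its top. */
	pid = clone(child, stack + 64 * 1024, CLONE_FS | SIGCHLD, NULL);
	if (pid == -1) {
		perror("clone");
		return (1);
	}
	waitpid(pid, NULL, 0);
	if (getcwd(cwd, sizeof (cwd)) != NULL)
		printf("parent cwd is now %s\n", cwd);
	return (0);
}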
+ */ + VN_HOLD(ap->lcfa_vp); + if ((err = chdir_proc(pp, ap->lcfa_vp, ap->lcfa_type, + ap->lcfa_traverse)) != 0) { + /* if we failed, chdir_proc already did a rele on vp */ + return (err); + } + + return (0); +} + +/* + * Check to see if the process is in a CLONE_FS clone group. Return false + * if not (the normal case), otherwise perform the setup, do the group walk + * and return true. + */ +static boolean_t +lx_clone_fs_do_group(char *path, boolean_t is_chroot, int *errp) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + vnode_t *vp; + lx_clone_fs_arg_t arg; + int err; + int estale_retry = 0; + + if (!lx_clone_grp_member(lproc, LX_CLONE_FS)) + return (B_FALSE); + + /* Handle the rare case of being in a CLONE_FS clone group */ + +retry: + err = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (err != 0) { + if (err == ESTALE && fs_need_estale_retry(estale_retry++)) + goto retry; + *errp = err; + return (B_TRUE); + } + + arg.lcfa_vp = vp; + arg.lcfa_type = is_chroot; + arg.lcfa_traverse = B_TRUE; + + /* + * We use the VN_HOLD from the lookup to guarantee vp exists for the + * entire walk. + */ + err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb, + (void *)&arg); + VN_RELE(vp); + *errp = err; + return (B_TRUE); +} + +long +lx_chdir(char *path) +{ + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_FALSE, &err)) + return ((err != 0) ? set_errno(err) : 0); + + return (chdir(path)); +} + +long +lx_chroot(char *path) +{ + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_TRUE, &err)) + return ((err != 0) ? set_errno(err) : 0); + + return (chroot(path)); +} + +long +lx_creat(char *path, mode_t mode) +{ + return (open(path, O_WRONLY | O_CREAT | O_TRUNC, mode)); +} + +long +lx_fchdir(int fd) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + + if (lx_clone_grp_member(lproc, LX_CLONE_FS)) { + /* Handle the rare case of being in a CLONE_FS clone group */ + file_t *fp; + vnode_t *vp; + lx_clone_fs_arg_t arg; + int err; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + arg.lcfa_vp = vp; + arg.lcfa_type = B_FALSE; + arg.lcfa_traverse = B_FALSE; + + /* + * We use the VN_HOLD above to guarantee vp exists for the + * entire walk. + */ + err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb, + (void *)&arg); + VN_RELE(vp); + if (err) + return (set_errno(err)); + return (0); + } + + return (fchdir(fd)); +} + +long +lx_getitimer(int which, struct itimerval *value) +{ + return (getitimer(which, value)); +} + +/* Linux and illumos have the same rusage structures. */ +long +lx_getrusage(int who, struct rusage *rup) +{ + int code; + + switch (who) { + case LX_RUSAGE_SELF: + code = _RUSAGESYS_GETRUSAGE; + break; + case LX_RUSAGE_CHILDREN: + code = _RUSAGESYS_GETRUSAGE_CHLD; + break; + case LX_RUSAGE_THREAD: + code = _RUSAGESYS_GETRUSAGE_LWP; + break; + default: + return (set_errno(EINVAL)); + } + + return (rusagesys(code, rup, NULL, NULL, NULL)); +} + +long +lx_mincore(caddr_t addr, size_t len, char *vec) +{ + int r; + + r = mincore(addr, len, vec); + if (r == EINVAL) { + /* + * LTP mincore01 expects mincore with a huge len to fail with + * ENOMEM on a modern kernel, although on Linux 2.6.11 and + * earlier, it will return EINVAL. 
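For context, mincore() fills one byte per page, with bit 0 indicating residency; the huge-length corner case handled below is only about which errno comes back. A small illustrative use of the interface (note the Linux prototype takes an unsigned char vector, while illumos declares it as char); not part of the patch.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	size_t npages = 4, len = npages * pgsz;
	unsigned char vec[4];
	size_t i;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return (1);

	p[0] = 1;			/* fault in only the first page */
	if (mincore(p, len, vec) != 0) {
		perror("mincore");
		return (1);
	}
	for (i = 0; i < npages; i++)
		printf("page %zu: %s\n", i,
		    (vec[i] & 1) ? "resident" : "not resident");
	munmap(p, len);
	return (0);
}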
+ */ + if (lx_kern_release_cmp(curzone, "2.6.11") > 0 && (long)len < 0) + return (set_errno(ENOMEM)); + } + return (r); +} + +long +lx_nice(int incr) +{ + return (nice(incr)); +} + +long +lx_pause(void) +{ + return (pause()); +} + +/*ARGSUSED*/ +long +lx_reboot(int magic1, int magic2, uint_t flag, uintptr_t p4) +{ + if (magic1 != LINUX_REBOOT_MAGIC1) + return (set_errno(EINVAL)); + + switch (magic2) { + case LINUX_REBOOT_MAGIC2: + case LINUX_REBOOT_MAGIC2A: + case LINUX_REBOOT_MAGIC2B: + case LINUX_REBOOT_MAGIC2C: + break; + default: + return (set_errno(EINVAL)); + } + + /* + * Once we have better Linux capabilities(7) support we should check + * CAP_SYS_BOOT instead. + */ + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + switch (flag) { + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + /* ignored */ + return (0); + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + return (uadmin(A_SHUTDOWN, AD_HALT, NULL)); + + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_RESTART2: + /* RESTART2 may need more work */ + return (uadmin(A_SHUTDOWN, AD_BOOT, NULL)); + + default: + return (set_errno(EINVAL)); + } +} + +long +lx_setdomainname(char *name, long len) +{ + if (len < 0 || len >= LX_SYS_UTS_LN) + return (set_errno(EINVAL)); + + ttolwp(curthread)->lwp_errno = 0; + (void) systeminfo(SI_SET_SRPC_DOMAIN, name, len); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + return (0); +} + +long +lx_sethostname(char *name, size_t len) +{ + ttolwp(curthread)->lwp_errno = 0; + (void) systeminfo(SI_SET_HOSTNAME, name, len); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + return (0); +} + +long +lx_stime(time_t *tp) +{ + time_t time; + + if (copyin(tp, &time, sizeof (time)) != 0) + return (set_errno(EFAULT)); + + return (stime(time)); +} + +long +lx_sync(void) +{ + vfs_sync(0); + return (0); +} + +/* + * For syslog, since there is no Linux kernel and nothing to log, we simply + * emulate a kernel buffer (LOG_BUF_LEN) of 0 bytes and only handle errors for + * bad input. All actions except 3 and 10 require CAP_SYS_ADMIN or CAP_SYSLOG + * so without full capabilities support, for now we just perform an euid check. + */ +long +lx_syslog(int type, char *bufp, int len) +{ + if (type < 0 || type > 10) + return (set_errno(EINVAL)); + + if (type != 3 && type != 10 && crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + if (type >= 2 && type <= 4 && (bufp == NULL || len < 0)) + return (set_errno(EINVAL)); + + if (type == 8 && (len < 1 || len > 8)) + return (set_errno(EINVAL)); + + return (0); +} + +long +lx_vhangup(void) +{ + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * The native vhangup code does nothing except check for the sys_config + * privilege. Eventually we'll first want to check our emulation for the + * Linux CAP_SYS_TTY_CONFIG capability, but currently, since we've + * already checked that our process is root, just succeed. + */ + return (0); +} + +long +lx_acct(char *p) +{ + return (sysacct(p)); +} + +/* + * Support for Linux namespaces is not yet implemented. Normally we would + * simply return ENOSYS for this. However, "systemd" uses mount namespaces to + * provide the PrivateTmp feature for some services. Use of this feature is + * becoming common and these services will fail to run without namespace + * support. "systemd" has a fallback to allow these types of services to run if + * it sees either EACCES or EPERM when it tries to setup the namespace. 
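
A hedged sketch of the caller-side pattern this comment describes: systemd-style code treats EPERM or EACCES from unshare(2) as "namespaces unavailable" and falls back, which is what the EPERM return below satisfies. Names are illustrative.

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>

int
setup_private_tmp(void)
{
	if (unshare(CLONE_NEWNS) == 0)
		return (1);		/* namespace created */
	if (errno == EPERM || errno == EACCES)
		return (0);		/* fall back gracefully */
	return (-1);			/* hard failure */
}
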
Until + * we have namespace support, we return EPERM to workaround this issue. + */ +/*ARGSUSED*/ +long +lx_unshare(int flags) +{ + return (set_errno(EPERM)); +} + +/* + * The whole idea of "swap space" within a zone is a complete fabrication. + * However, some apps expect to be able to see swap space data in the /proc + * files, while other apps actually don't want there to be any swap space + * configured. We use the swapon/off syscalls to allow this visibility to be + * controlled from within the zone iself. Note that the "swapon" CLI tends to + * do a lot of additional validation which will fail within a zone. + * + * Once we have better Linux capabilities(7) support we should check + * CAP_SYS_ADMIN instead of uid == 0. + */ +long +lx_swapoff(char *path) +{ + char buf[MAXPATHLEN]; + size_t len; + lx_zone_data_t *lxzd; + + /* Simple validaton of the argument */ + if (copyinstr(path, buf, sizeof (buf), &len) != 0) + return (set_errno(EFAULT)); + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + lxzd = ztolxzd(curzone); + ASSERT(lxzd != NULL); + + lxzd->lxzd_swap_disabled = B_TRUE; + return (0); +} + +long +lx_swapon(char *path, int flags) +{ + char buf[MAXPATHLEN]; + size_t len; + lx_zone_data_t *lxzd; + + /* Simple validaton of the arguments */ + if (copyinstr(path, buf, sizeof (buf), &len) != 0) + return (set_errno(EFAULT)); + if (flags & ~LX_SWAP_ALL) + return (set_errno(EINVAL)); + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + lxzd = ztolxzd(curzone); + ASSERT(lxzd != NULL); + + lxzd->lxzd_swap_disabled = B_FALSE; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c new file mode 100644 index 0000000000..2f29f56d5f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/lx_fcntl.h> + +/* + * From "uts/common/syscall/mkdir.c": + */ +extern int mkdirat(int, char *, int); + +long +lx_mkdirat(int fd, char *dname, int dmode) +{ + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + return (mkdirat(fd, dname, dmode)); +} + +long +lx_mkdir(char *dname, int dmode) +{ + return (mkdirat(AT_FDCWD, dname, dmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c new file mode 100644 index 0000000000..aa6e12a7d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/segments.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/sysi86.h> +#include <sys/cmn_err.h> +#include <sys/lx_ldt.h> + +/* + * Read the ldt_info structure in from the Linux app, convert it to an ssd + * structure, and then call setdscr() to do all the heavy lifting. + */ +static int +write_ldt(void *data, ulong_t count) +{ + user_desc_t usd; + struct ssd ssd; + struct ldt_info ldt_inf; + proc_t *pp = curthread->t_procp; + int err; + + if (count != sizeof (ldt_inf)) + return (set_errno(EINVAL)); + + if (copyin(data, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + if (ldt_inf.entry_number >= MAXNLDT) + return (set_errno(EINVAL)); + + LDT_INFO_TO_DESC(&ldt_inf, &usd); + usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number)); + + /* + * Get everyone into a safe state before changing the LDT. + */ + if (!holdlwps(SHOLDFORK1)) + return (set_errno(EINTR)); + + err = setdscr(&ssd); + + /* + * Release the hounds! + */ + mutex_enter(&pp->p_lock); + continuelwps(pp); + mutex_exit(&pp->p_lock); + + return (err ? set_errno(err) : 0); +} + +static int +read_ldt(void *uptr, ulong_t count) +{ + proc_t *pp = curproc; + int bytes; + + if (pp->p_ldt == NULL) + return (0); + + bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + if (bytes > count) + bytes = count; + + if (copyout(pp->p_ldt, uptr, bytes)) + return (set_errno(EFAULT)); + + return (bytes); +} + +long +lx_modify_ldt(int op, void *data, ulong_t count) +{ + int rval; + + switch (op) { + case 0: + rval = read_ldt(data, count); + break; + + case 1: + rval = write_ldt(data, count); + break; + + default: + rval = set_errno(ENOSYS); + break; + } + + return (rval); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mount.c b/usr/src/uts/common/brand/lx/syscall/lx_mount.c new file mode 100644 index 0000000000..2524e9044a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mount.c @@ -0,0 +1,675 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/ctype.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/types.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_autofs.h> + +#define tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) + +/* + * mount(2) is significantly different between Linux and illumos. One of the + * main differences is between the set of flags. Some flags on Linux can be + * translated to an illumos equivalent, some are converted to a + * filesystem-specific option, while others have no equivalent whatsoever. 
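
As a concrete point of reference, a typical Linux-side mount(2) call combining flags and filesystem-specific options of the kind translated below; the target path and option values are purely illustrative.

#include <sys/mount.h>

int
example_mount_tmpfs(const char *target)
{
	/* MS_* flags map to the LX_MS_* defines below; the data string carries fs options */
	return (mount("tmpfs", target, "tmpfs",
	    MS_NOSUID | MS_NODEV | MS_NOEXEC,
	    "size=64m,mode=1777,uid=0,gid=0"));
}
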
+ * + * Another big difference is that mounting NFS is fully handled in the kernel on + * Linux whereas on illumos a lot of preliminary work is done by the NFS mount + * command before calling mount(2). As a simplification, we forward NFS + * mount calls back out to the user-level library which does the same kind of + * preliminary processing that is done by the native user-level NFS mount code. + */ +#define LX_MS_MGC_VAL 0xC0ED0000 +#define LX_MS_RDONLY 0x00000001 +#define LX_MS_NOSUID 0x00000002 +#define LX_MS_NODEV 0x00000004 +#define LX_MS_NOEXEC 0x00000008 +#define LX_MS_SYNCHRONOUS 0x00000010 +#define LX_MS_REMOUNT 0x00000020 +#define LX_MS_MANDLOCK 0x00000040 +#define LX_MS_NOATIME 0x00000400 +#define LX_MS_NODIRATIME 0x00000800 +#define LX_MS_BIND 0x00001000 +#define LX_MS_MOVE 0x00002000 +#define LX_MS_REC 0x00004000 +#define LX_MS_SILENT 0x00008000 +#define LX_MS_POSIXACL 0x00010000 +#define LX_MS_UNBINDABLE 0x00020000 +#define LX_MS_PRIVATE 0x00040000 +#define LX_MS_SLAVE 0x00080000 +#define LX_MS_SHARED 0x00100000 +#define LX_MS_RELATIME 0x00200000 +#define LX_MS_KERNMOUNT 0x00400000 +#define LX_MS_I_VERSION 0x00800000 +#define LX_MS_STRICTATIME 0x01000000 +#define LX_MS_LAZYTIME 0x02000000 + +/* Linux kernel-internal flags - ignored if passed in */ +#define LX_MS_NOSEC 0x10000000 +#define LX_MS_BORN 0x20000000 +#define LX_MS_ACTIVE 0x40000000 +#define LX_MS_NOUSER 0x80000000 + +#define LX_MS_SUPPORTED (LX_MS_MGC_VAL | \ + LX_MS_RDONLY | LX_MS_NOSUID | \ + LX_MS_NODEV | LX_MS_NOEXEC | \ + LX_MS_REMOUNT | LX_MS_NOATIME | \ + LX_MS_BIND | LX_MS_SILENT | \ + LX_MS_STRICTATIME | LX_MS_NOSEC | \ + LX_MS_BORN | LX_MS_ACTIVE | LX_MS_NOUSER) + +/* + * support definitions + */ +typedef enum mount_opt_type { + MOUNT_OPT_INVALID = 0, + MOUNT_OPT_NORMAL = 1, /* option value: none */ + MOUNT_OPT_UINT = 2, /* option value: unsigned int */ + MOUNT_OPT_PASSTHRU = 3 /* option value: validated downstream */ +} mount_opt_type_t; + +typedef struct mount_opt { + char *mo_name; + mount_opt_type_t mo_type; +} mount_opt_t; + +/* From uts/common/syscall/umount.c */ +extern int umount2(char *, int); + +/* From lx_chown.c */ +extern long lx_vn_chown(vnode_t *, uid_t, gid_t); + +/* + * Globals + */ +static mount_opt_t lofs_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_proc_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_sysfs_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_tmpfs_options[] = { + { "size", MOUNT_OPT_PASSTHRU }, + { "mode", MOUNT_OPT_UINT }, + { "uid", MOUNT_OPT_UINT }, + { "gid", MOUNT_OPT_UINT }, + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_autofs_options[] = { + { LX_MNTOPT_FD, MOUNT_OPT_UINT }, + { LX_MNTOPT_PGRP, MOUNT_OPT_UINT }, + { LX_MNTOPT_MINPROTO, MOUNT_OPT_UINT }, + { LX_MNTOPT_MAXPROTO, MOUNT_OPT_UINT }, + { LX_MNTOPT_INDIRECT, MOUNT_OPT_NORMAL }, + { LX_MNTOPT_DIRECT, MOUNT_OPT_NORMAL }, + { LX_MNTOPT_OFFSET, MOUNT_OPT_NORMAL }, + { NULL, MOUNT_OPT_INVALID } +}; + +static const char *lx_common_mnt_opts[] = { + "exec", + "noexec", + "devices", + "nodevices", + "dev", + "nodev", + "suid", + "nosuid", + NULL +}; + +/* + * Check the mount options. + * + * On illumos all mount option verification is done by the user-level mount + * command. Invalid options are simply ignored by domount(). Thus, we check + * here for invalid/unsupported options. 
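
A user-space sketch of the MOUNT_OPT_UINT rule that lx_mnt_opt_verify() enforces for option/value pairs: the value after '=' must be a non-empty run of decimal digits. The helper name is hypothetical.

#include <ctype.h>
#include <string.h>

static int
opt_uint_ok(const char *opt, const char *name)
{
	size_t nlen = strlen(name);

	if (strncmp(opt, name, nlen) != 0 || opt[nlen] != '=')
		return (0);	/* not this option */
	opt += nlen + 1;
	if (*opt == '\0')
		return (0);	/* empty value is rejected */
	for (; *opt != '\0'; opt++) {
		if (!isdigit((unsigned char)*opt))
			return (0);
	}
	return (1);
}
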
+ */ +static int +lx_mnt_opt_verify(char *opts, mount_opt_t *mop) +{ + int opts_len = strlen(opts); + char *opt, *tp; + int opt_len, i; + boolean_t last = B_FALSE; + + ASSERT((opts != NULL) && (mop != NULL)); + + /* If no options were specified, nothing to do. */ + if (opts_len == 0) + return (0); + + /* If no options are allowed, fail. */ + if (mop[0].mo_name == NULL) + return (ENOTSUP); + + /* Don't accept leading or trailing ','. */ + if ((opts[0] == ',') || (opts[opts_len] == ',')) + return (EINVAL); + + /* Don't accept sequential ','. */ + for (i = 1; i < opts_len; i++) { + if ((opts[i - 1] == ',') && (opts[i] == ',')) + return (EINVAL); + } + + /* + * Verify each prop one at a time. There is no strtok in the kernel but + * it's easy to tokenize the entry ourselves. + */ + opt = opts; + for (tp = opt; *tp != ',' && *tp != '\0'; tp++) + ; + if (*tp == ',') { + *tp = '\0'; + } else { + last = B_TRUE; + } + for (;;) { + opt_len = strlen(opt); + + /* Check common options we support on all filesystems */ + for (i = 0; lx_common_mnt_opts[i] != NULL; i++) { + if (strcmp(opt, lx_common_mnt_opts[i]) == 0) + goto next_opt; + } + + /* Check for matching option/value pair. */ + for (i = 0; mop[i].mo_name != NULL; i++) { + char *ovalue; + int ovalue_len, mo_len; + + /* If the options is too short don't bother comparing */ + mo_len = strlen(mop[i].mo_name); + if (opt_len < mo_len) { + /* Keep trying to find a match. */ + continue; + } + + /* Compare the option to an allowed option. */ + if (strncmp(mop[i].mo_name, opt, mo_len) != 0) { + /* Keep trying to find a match. */ + continue; + } + + if (mop[i].mo_type == MOUNT_OPT_NORMAL) { + /* The option doesn't take a value. */ + if (opt_len == mo_len) { + /* This option is ok. */ + break; + } else { + /* Keep trying to find a match. */ + continue; + } + } + + /* This options takes a value. */ + if ((opt_len == mo_len) || (opt[mo_len] != '=')) { + /* Keep trying to find a match. */ + continue; + } + + /* We have an option match. Verify option value. */ + ovalue = &opt[mo_len] + 1; + ovalue_len = strlen(ovalue); + + /* Value can't be zero length string. */ + if (ovalue_len == 0) { + goto bad; + } + + if (mop[i].mo_type == MOUNT_OPT_UINT) { + int j; + /* Verify that value is an unsigned int. */ + for (j = 0; j < ovalue_len; j++) { + if (!ISDIGIT(ovalue[j])) { + goto bad; + } + } + } else if (mop[i].mo_type == MOUNT_OPT_PASSTHRU) { + /* Filesystem will do its own validation. */ + break; + } else { + /* Unknown option type specified. */ + goto bad; + } + + /* The option is ok. */ + break; + } + + /* If there were no matches this is an unsupported option. */ + if (mop[i].mo_name == NULL) { + goto bad; + } + +next_opt: + /* + * This option is ok, either we're done or move on to the next + * option. + */ + if (last) + break; + + *tp = ','; + opt = tp + 1; + for (tp = opt; *tp != ',' && *tp != '\0'; tp++) + ; + if (*tp == ',') { + *tp = '\0'; + } else { + last = B_TRUE; + } + }; + + /* We verified all the options. */ + return (0); + +bad: + if (!last) { + *tp = ','; + } + return (EINVAL); +} + +/* + * Remove an option from the string and save it in the provided buffer. + * The option string should have already been verified as valid. + * Return 0 if not present, -1 if error, and 1 if present and fine. 
+ */ +static int +lx_mnt_opt_rm(char *opts, char *rmopt, char *retstr, int retlen) +{ + int opts_len = strlen(opts); + char *optstart, *optend; + int optlen; + + ASSERT((opts != NULL) && (rmopt != NULL)); + + retstr[0] = '\0'; + + /* If no options were specified, there's no problem. */ + if (opts_len == 0) + return (0); + + if ((optstart = strstr(opts, rmopt)) == NULL) + return (0); + + for (optend = optstart; *optend != ',' && *optend != '\0'; optend++) + ; + + /*LINTED*/ + optlen = optend - optstart; + if (optlen >= retlen) + return (-1); + (void) strncpy(retstr, optstart, optlen); + retstr[optlen] = '\0'; + + if (*optend == ',') + optend++; + + optlen = strlen(optend) + 1; + bcopy(optend, optstart, optlen); + + if (*optstart == '\0' && optstart != opts) { + /* removed last opt and it had a preceeding opt, remove comma */ + *(optstart - 1) = '\0'; + } + + return (1); +} + +static int +lx_mnt_opt_val(char *opt, int *valp) +{ + char *op, *ep; + long lval; + + if ((op = strchr(opt, '=')) == NULL) + return (-1); + + op++; + if (!ISDIGIT(*op)) + return (-1); + + if (ddi_strtoul(op, &ep, 10, (ulong_t *)&lval) != 0 || lval > INT_MAX) { + return (-1); + } + + if (*ep != '\0') + return (-1); + + *valp = (int)lval; + return (0); +} + +static int +lx_mnt_add_opt(char *option, char *buf, size_t buf_size) +{ + char *fmt_str = NULL; + size_t len; + + ASSERT((option != NULL) && (strlen(option) > 0)); + ASSERT((buf != NULL) && (buf_size > 0)); + + if (buf[0] == '\0') { + fmt_str = "%s"; + } else { + fmt_str = ",%s"; + } + + len = strlen(buf); + VERIFY(len <= buf_size); + buf_size -= len; + buf += len; + + if (snprintf(buf, buf_size, fmt_str, option) > (buf_size - 1)) + return (EOVERFLOW); + return (0); +} + +static int +lx_mnt_copyin_arg(const char *from, char *to, size_t len) +{ + size_t slen; + int rv; + + rv = copyinstr(from, to, len, &slen); + if (rv == ENAMETOOLONG || slen == len) + return (ENAMETOOLONG); + if (rv != 0) + return (EFAULT); + + return (0); +} + +long +lx_mount(const char *sourcep, const char *targetp, const char *fstypep, + uint_t flags, const void *datap) +{ + char fstype[16]; + char source[MAXPATHLEN]; + char target[MAXPATHLEN]; + char options[MAX_MNTOPT_STR]; + int sflags, rv; + struct mounta ma, *map = &ma; + vfs_t *vfsp; + vnode_t *vp = NULL; + int uid = -1; + int gid = -1; + + if ((rv = lx_mnt_copyin_arg(fstypep, fstype, sizeof (fstype))) != 0) { + if (rv == ENAMETOOLONG) + return (set_errno(ENODEV)); + return (set_errno(rv)); + } + + /* + * Vector back out to userland emulation for NFS. + */ + if (strcmp(fstype, "nfs") == 0 || strcmp(fstype, "nfs4") == 0) { + uintptr_t uargs[5] = {(uintptr_t)sourcep, (uintptr_t)targetp, + (uintptr_t)fstypep, (uintptr_t)flags, (uintptr_t)datap}; + + /* The userspace emulation will do the lx_syscall_return() */ + ttolxlwp(curthread)->br_eosys = JUSTRETURN; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(ttolwp(curthread), LX_SYS32_mount, + uargs); + } else +#endif + { + lx_emulate_user(ttolwp(curthread), LX_SYS_mount, uargs); + } + return (0); + } + + sflags = MS_SYSSPACE | MS_OPTIONSTR; + options[0] = '\0'; + + /* Copy in parameters that are always present. */ + if ((rv = lx_mnt_copyin_arg(sourcep, source, sizeof (source))) != 0) + return (set_errno(rv)); + + if ((rv = lx_mnt_copyin_arg(targetp, target, sizeof (target))) != 0) + return (set_errno(rv)); + + /* + * While SunOS is picky about mount(2) target paths being absolute, + * Linux is not so strict. 
In order to facilitate this looser + * requirement we must lookup the full path. + */ + if (target[0] != '/') { + vnode_t *vp; + + if ((rv = lookupnameatcred(target, UIO_SYSSPACE, FOLLOW, + NULLVPP, &vp, NULL, CRED())) != 0) + return (set_errno(rv)); + + rv = vnodetopath(NULL, vp, target, MAXPATHLEN, CRED()); + VN_RELE(vp); + if (rv != 0) + return (set_errno(rv)); + } + + /* Make sure we support the requested mount flags. */ + if ((flags & ~LX_MS_SUPPORTED) != 0) + return (set_errno(ENOTSUP)); + + /* Copy in Linux mount options. */ + if (datap != NULL && + (rv = lx_mnt_copyin_arg(datap, options, sizeof (options))) != 0) + return (set_errno(rv)); + + /* Do filesystem specific mount work. */ + if (flags & LX_MS_BIND) { + /* If MS_BIND is set, we turn this into a lofs mount. */ + (void) strcpy(fstype, "lofs"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lofs_options)) != 0) + return (set_errno(rv)); + } else if (strcmp(fstype, "tmpfs") == 0) { + char idstr[64]; + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_tmpfs_options)) != 0) + return (set_errno(rv)); + + /* + * Linux defaults to mode=1777 for tmpfs mounts. + */ + if (strstr(options, "mode=") == NULL) { + if (options[0] != '\0') + (void) strlcat(options, ",", sizeof (options)); + (void) strlcat(options, "mode=1777", sizeof (options)); + } + + switch (lx_mnt_opt_rm(options, "uid=", idstr, sizeof (idstr))) { + case 0: + uid = -1; + break; + case 1: + if (lx_mnt_opt_val(idstr, &uid) < 0) + return (set_errno(EINVAL)); + break; + default: + return (set_errno(E2BIG)); + } + switch (lx_mnt_opt_rm(options, "gid=", idstr, sizeof (idstr))) { + case 0: + gid = -1; + break; + case 1: + if (lx_mnt_opt_val(idstr, &gid) < 0) + return (set_errno(EINVAL)); + break; + default: + return (set_errno(E2BIG)); + } + + /* + * Linux seems to always allow overlay mounts. We allow this + * everywhere except under /dev where it interferes with device + * emulation. + */ + if (strcmp(target, "/dev") != 0 && + strncmp(target, "/dev/", 5) != 0) + sflags |= MS_OVERLAY; + } else if (strcmp(fstype, "proc") == 0) { + /* Translate proc mount requests to lx_proc requests. */ + (void) strcpy(fstype, "lx_proc"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_proc_options)) != 0) + return (set_errno(rv)); + } else if (strcmp(fstype, "sysfs") == 0) { + /* Translate sysfs mount requests to lx_sysfs requests. */ + (void) strcpy(fstype, "lx_sysfs"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_sysfs_options)) != 0) + return (set_errno(rv)); + } else if (strcmp(fstype, "cgroup") == 0) { + /* Translate cgroup mount requests to lx_cgroup requests. */ + (void) strcpy(fstype, "lx_cgroup"); + + /* + * Currently don't verify Linux mount options since we can + * have a subsystem string provided. + */ + } else if (strcmp(fstype, "autofs") == 0) { + /* Translate autofs mount requests to lxautofs requests. */ + (void) strcpy(fstype, LX_AUTOFS_NAME); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_autofs_options)) != 0) + return (set_errno(rv)); + + /* Linux seems to always allow overlay mounts */ + sflags |= MS_OVERLAY; + } else { + return (set_errno(ENODEV)); + } + + /* Convert some Linux flags to illumos flags. */ + if (flags & LX_MS_RDONLY) + sflags |= MS_RDONLY; + if (flags & LX_MS_NOSUID) + sflags |= MS_NOSUID; + if (flags & LX_MS_REMOUNT) + sflags |= MS_REMOUNT; + + /* + * Convert some Linux flags to illumos option strings. 
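
For comparison, the Linux-side bind mount that the MS_BIND handling above converts into a lofs mount; this is an illustrative wrapper, and "none" is merely the conventional fstype string for bind mounts.

#include <stddef.h>
#include <sys/mount.h>

int
example_bind_mount(const char *src, const char *dst)
{
	return (mount(src, dst, "none", MS_BIND, NULL));
}
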
+ */ + if (flags & LX_MS_STRICTATIME) { + /* + * The "strictatime" mount option ensures that none of the + * weaker atime-related mode options are in effect. + */ + flags &= ~(LX_MS_RELATIME | LX_MS_NOATIME); + } + if ((flags & LX_MS_NODEV) && + (rv = lx_mnt_add_opt("nodev", options, sizeof (options))) != 0) + return (set_errno(rv)); + if ((flags & LX_MS_NOEXEC) && + (rv = lx_mnt_add_opt("noexec", options, sizeof (options))) != 0) + return (set_errno(rv)); + if ((flags & LX_MS_NOATIME) && + (rv = lx_mnt_add_opt("noatime", options, sizeof (options))) != 0) + return (set_errno(rv)); + + if ((rv = lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) + return (set_errno(rv)); + + /* If mounting proc over itself, just return ok */ + if (strcmp(fstype, "lx_proc") == 0 && strcmp("lx_proc", + vfssw[vp->v_vfsp->vfs_fstype].vsw_name) == 0) { + VN_RELE(vp); + return (0); + } + + map->spec = source; + map->dir = target; + map->flags = sflags; + map->fstype = fstype; + map->dataptr = NULL; + map->datalen = 0; + map->optptr = options; + map->optlen = sizeof (options); + + rv = domount(NULL, map, vp, CRED(), &vfsp); + VN_RELE(vp); + if (rv != 0) + return (set_errno(rv)); + + VFS_RELE(vfsp); + if (strcmp(fstype, "tmpfs") == 0 && (uid != -1 || gid != -1)) { + /* Handle tmpfs uid/gid mount options. */ + if (lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP, + &vp) == 0) { + (void) lx_vn_chown(vp, (uid_t)uid, (gid_t)gid); + VN_RELE(vp); + } + } + + return (0); +} + +/* + * umount() is identical to illumos, though implemented on top of umount2(). + */ +long +lx_umount(char *path) +{ + return (umount2(path, 0)); +} + +/* + * The Linux umount2() system call is identical to illumos but has a different + * value for MNT_FORCE (the logical equivalent to MS_FORCE). + */ +#define LX_MNT_FORCE 0x1 + +long +lx_umount2(char *path, int flg) +{ + int flags = 0; + + if (flg & ~LX_MNT_FORCE) + return (set_errno(EINVAL)); + + if (flg & LX_MNT_FORCE) + flags |= MS_FORCE; + + return (umount2(path, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c new file mode 100644 index 0000000000..4ee355eb70 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/mutex.h> + +#include <sys/lx_types.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> + +extern int fcntl(int, int, intptr_t); +extern int openat(int, char *, int, int); +extern int open(char *, int, int); +extern int close(int); +extern int cioctl(file_t *, int, intptr_t, int *); +extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **, + vnode_t *); + + +static int +ltos_open_flags(int input) +{ + int flags; + + if (input & LX_O_PATH) { + input &= (LX_O_DIRECTORY | LX_O_NOFOLLOW | LX_O_CLOEXEC); + } + + /* This depends on the Linux ACCMODE flags being the same as SunOS. */ + flags = (input & LX_O_ACCMODE); + + if (input & LX_O_CREAT) { + flags |= O_CREAT; + } + + if (input & LX_O_EXCL) + flags |= O_EXCL; + if (input & LX_O_NOCTTY) + flags |= O_NOCTTY; + if (input & LX_O_TRUNC) + flags |= O_TRUNC; + if (input & LX_O_APPEND) + flags |= O_APPEND; + if (input & LX_O_NONBLOCK) + flags |= O_NONBLOCK; + if (input & LX_O_SYNC) + flags |= O_SYNC; + if (input & LX_O_LARGEFILE) + flags |= O_LARGEFILE; + if (input & LX_O_NOFOLLOW) + flags |= O_NOFOLLOW; + if (input & LX_O_CLOEXEC) + flags |= O_CLOEXEC; + + /* + * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the + * device backing the fd in question. Illumos doesn't have similar + * functionality, but we can attempt to simulate it using the flags + * (O_RSYNC|O_SYNC) and directio(3C). + * + * The LX_O_DIRECT flag also requires that the transfer size and + * alignment of I/O buffers be a multiple of the logical block size for + * the underlying file system, but frankly there isn't an easy way to + * support that functionality without doing something like adding an + * fcntl(2) flag to denote LX_O_DIRECT mode. + * + * Since LX_O_DIRECT is merely a performance advisory, we'll just + * emulate what we can and trust that the only applications expecting + * an error when performing I/O from a misaligned buffer or when + * passing a transfer size is not a multiple of the underlying file + * system block size will be test suites. + */ + if (input & LX_O_DIRECT) + flags |= (O_RSYNC|O_SYNC); + + return (flags); +} + +#define LX_POSTPROCESS_OPTS (LX_O_DIRECT | LX_O_ASYNC | LX_O_PATH) + +static int +lx_open_postprocess(int fd, int fmode) +{ + file_t *fp; + int rv, error = 0; + + if ((fmode & LX_POSTPROCESS_OPTS) == 0) { + /* Skip out early, if possible */ + return (0); + } + + if ((fp = getf(fd)) == NULL) { + /* + * It is possible that this fd was closed by the time we + * arrived here if some one is hammering away with close(). + */ + return (EIO); + } + + if (fmode & LX_O_DIRECT && error == 0) { + (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, + fp->f_flag, fp->f_cred, &rv, NULL); + } + + if (fmode & LX_O_ASYNC && error == 0) { + if ((error = VOP_SETFL(fp->f_vnode, fp->f_flag, FASYNC, + fp->f_cred, NULL)) == 0) { + mutex_enter(&fp->f_tlock); + fp->f_flag |= FASYNC; + mutex_exit(&fp->f_tlock); + } + } + + if (fmode & LX_O_PATH && error == 0) { + /* + * While the O_PATH flag has no direct analog in SunOS, it is + * emulated by removing both FREAD and FWRITE from f_flag. + * This causes read(2) and write(2) result in EBADF and can be + * checked for in other syscalls to trigger the correct behavior + * there. 
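
A small user-space demonstration of the O_PATH semantics being emulated here, assuming a Linux libc that exposes O_PATH; the function name is illustrative.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int
example_o_path(const char *path)
{
	char buf[1];
	int fd = open(path, O_PATH | O_CLOEXEC);

	if (fd < 0)
		return (-1);
	/* An O_PATH descriptor rejects plain I/O; read(2) fails with EBADF. */
	if (read(fd, buf, sizeof (buf)) == -1 && errno == EBADF) {
		(void) close(fd);
		return (0);
	}
	(void) close(fd);
	return (-1);
}
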
+ */ + mutex_enter(&fp->f_tlock); + fp->f_flag &= ~(FREAD|FWRITE); + mutex_exit(&fp->f_tlock); + } + + releasef(fd); + if (error != 0) { + (void) closeandsetf(fd, NULL); + } + return (error); +} + +long +lx_openat(int atfd, char *path, int fmode, int cmode) +{ + int flags, fd, error; + mode_t mode = 0; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + flags = ltos_open_flags(fmode); + + /* + * We use the FSEARCH flag to make sure this is a directory. We have to + * explicitly add 1 to emulate the FREAD/FWRITE mapping of the OPENMODE + * macro since it won't get set via OPENMODE when FSEARCH is used. + */ + if (fmode & LX_O_DIRECTORY) { + flags |= FSEARCH; + flags++; + } + + if (flags & O_CREAT) + mode = (mode_t)cmode; + + ttolwp(curthread)->lwp_errno = 0; + fd = openat(atfd, path, flags, mode); + if (ttolwp(curthread)->lwp_errno != 0) { + if ((fmode & LX_O_DIRECTORY) && + ttolwp(curthread)->lwp_errno != ENOTDIR) { + /* + * We got an error trying to open a file as a directory. + * We need to determine if we should return the original + * error or ENOTDIR. + */ + vnode_t *startvp; + vnode_t *vp; + int oerror, error = 0; + + oerror = ttolwp(curthread)->lwp_errno; + + if (atfd == AT_FDCWD) { + /* regular open */ + startvp = NULL; + } else { + char startchar; + + if (copyin(path, &startchar, sizeof (char))) + return (set_errno(oerror)); + + /* if startchar is / then startfd is ignored */ + if (startchar == '/') { + startvp = NULL; + } else { + file_t *startfp; + + if ((startfp = getf(atfd)) == NULL) + return (set_errno(oerror)); + startvp = startfp->f_vnode; + VN_HOLD(startvp); + releasef(atfd); + } + } + + if (lookupnameat(path, UIO_USERSPACE, + (fmode & LX_O_NOFOLLOW) ? NO_FOLLOW : FOLLOW, + NULLVPP, &vp, startvp) != 0) { + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(oerror)); + } + + if (startvp != NULL) + VN_RELE(startvp); + + if (vp->v_type != VDIR) + error = ENOTDIR; + + VN_RELE(vp); + if (error != 0) + return (set_errno(ENOTDIR)); + + (void) set_errno(oerror); + } else if ((fmode & LX_O_NOFOLLOW) && (fmode & LX_O_PATH) && + ttolwp(curthread)->lwp_errno == ELOOP) { + /* + * On Linux, if O_NOFOLLOW and O_PATH are set together + * and the target is a symbolic link, then openat + * should return a file descriptor referring to the + * symbolic link. + * + * This file descriptor can be used with fchownat(2), + * fstatat(2), linkat(2), and readlinkat(2) alongside + * an empty pathname. + * + * We do not have a way to return such a file + * descriptor in illumos so open it without NO_FOLLOW + * and allow the postprocess to emulate O_PATH by + * removing the read and write flags. + * This is enough to keep recent systemd happy + * although any attempt to use the fd for the above + * listed calls without a pathname will fail or modify + * the symlink target. 
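
A hedged example of the Linux usage pattern described above: open a symlink with O_PATH|O_NOFOLLOW and stat it through the empty-pathname form. Under this emulation the descriptor refers to the link target instead, so the result can differ as noted.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int
example_stat_symlink(const char *linkpath, struct stat *st)
{
	int fd = open(linkpath, O_PATH | O_NOFOLLOW | O_CLOEXEC);
	int rv;

	if (fd < 0)
		return (-1);
	/* On Linux this stats the symlink itself via AT_EMPTY_PATH. */
	rv = fstatat(fd, "", st, AT_EMPTY_PATH);
	(void) close(fd);
	return (rv);
}
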
+ */ + return (lx_openat(atfd, path, fmode & ~LX_O_NOFOLLOW, + cmode)); + } + + if (ttolwp(curthread)->lwp_errno == EINTR) + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (ttolwp(curthread)->lwp_errno); + } + + if ((error = lx_open_postprocess(fd, fmode)) != 0) { + return (set_errno(error)); + } + return (fd); +} + +long +lx_open(char *path, int fmode, int cmode) +{ + return (lx_openat(LX_AT_FDCWD, path, fmode, cmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_personality.c b/usr/src/uts/common/brand/lx/syscall/lx_personality.c new file mode 100644 index 0000000000..e7aa945b50 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_personality.c @@ -0,0 +1,112 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + + +/* + * These flags are for what Linux calls "bug emulation". + * (Descriptions from the personality(2) Linux man page.) + * + * Flags which are currently actionable in LX: + * - READ_IMPLIES_EXEC (since Linux 2.6.8) + * With this flag set, PROT_READ implies PROT_EXEC for mmap(2). + * + * Flags which are current accepted but ignored: + * - UNAME26 (since Linux 3.1) + * Have uname(2) report a 2.6.40+ version number rather than a 3.x version + * number. Added as a stopgap measure to support broken applications that + * could not handle the kernel version- numbering switch from 2.6.x to 3.x. + * + * - ADDR_NO_RANDOMIZE (since Linux 2.6.12) + * With this flag set, disable address-space-layout randomization. + * + * - FDPIC_FUNCPTRS (since Linux 2.6.11) + * User-space function pointers to signal handlers point (on certain + * architectures) to descriptors. + * + * - MMAP_PAGE_ZERO (since Linux 2.4.0) + * Map page 0 as read-only (to support binaries that depend on this SVr4 + * behavior). + * + * - ADDR_COMPAT_LAYOUT (since Linux 2.6.9) + * With this flag set, provide legacy virtual address space layout. + * + * - ADDR_LIMIT_32BIT (since Linux 2.2) + * Limit the address space to 32 bits. + * + * - SHORT_INODE (since Linux 2.4.0) + * No effects(?). + * + * - WHOLE_SECONDS (since Linux 1.2.0) + * No effects(?). + * + * - STICKY_TIMEOUTS (since Linux 1.2.0) + * With this flag set, select(2), pselect(2), and ppoll(2) do not modify the + * returned timeout argument when interrupted by a signal handler. + * + * - ADDR_LIMIT_3GB (since Linux 2.4.0) + * With this flag set, use 0xc0000000 as the offset at which to search a + * virtual memory chunk on mmap(2); otherwise use 0xffffe000. + */ + +#define LX_PER_GET 0xffffffff + +long +lx_personality(unsigned int arg) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + unsigned int result = 0; + + mutex_enter(&curproc->p_lock); + result = lxpd->l_personality; + + if (arg == LX_PER_GET) { + mutex_exit(&curproc->p_lock); + return (result); + } + + /* + * Prevent changes to the personality if the process is undergoing an + * exec. This will allow elfexec and friends to manipulate the + * personality without hinderance. 
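
For context, a typical Linux-side use of personality(2) that this emulation services: query the current value, then set one of the actionable flags listed above. The helper name is hypothetical.

#include <sys/personality.h>

int
example_read_implies_exec(void)
{
	/* 0xffffffff (LX_PER_GET) queries without changing the persona. */
	int cur = personality(0xffffffff);

	if (cur == -1)
		return (-1);
	return (personality(cur | READ_IMPLIES_EXEC));
}
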
+ */ + if ((curproc->p_flag & P_PR_EXEC) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EINVAL)); + } + + /* + * Keep tabs when a non-Linux personality is set. This is silently + * allowed to succeed, even though the emulation required is almost + * certainly missing. + */ + if ((arg & LX_PER_MASK) != LX_PER_LINUX) { + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "invalid personality: %02X", + arg & LX_PER_MASK); + lx_unsupported(buf); + } + + lxpd->l_personality = arg; + mutex_exit(&curproc->p_lock); + return (result); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c new file mode 100644 index 0000000000..2acd9d431e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c @@ -0,0 +1,189 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/lx_misc.h> + +#define LX_INIT_PGID 1 +#define LX_INIT_SID 1 + +/* From uts/common/syscall/pgrpsys.c */ +extern int setpgrp(int, int, int); + +long +lx_getpgrp(void) +{ + int pg; + + /* getpgrp() */ + pg = setpgrp(0, 0, 0); + + /* + * If the pgrp is that of the init process, return the value Linux + * expects. + */ + if (pg == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + return (pg); +} + +long +lx_getpgid(int pid) +{ + pid_t spid; + int tid; + int pg; + + if (pid < 0) + return (set_errno(ESRCH)); + + /* + * If the supplied pid matches that of the init process, return the pgid + * Linux expects. + */ + if (pid == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* getpgid() */ + ttolwp(curthread)->lwp_errno = 0; + pg = setpgrp(4, spid, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + /* + * If the pgid is that of the init process, return the value Linux + * expects. + */ + if (pg == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + return (pg); +} + +long +lx_setpgid(pid_t pid, pid_t pgid) +{ + pid_t spid, spgid; + int tid; + int pg; + int ret; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (pgid < 0) + return (set_errno(EINVAL)); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + if (pgid == 0) { + spgid = spid; + } else if (lx_lpid_to_spair(pgid, &spgid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* setpgid() */ + ret = setpgrp(5, spid, spgid); + + if (ret == EPERM) { + /* + * On Linux, when calling setpgid with a desired pgid that is + * equal to the current pgid of the process, no error is + * emitted. This differs slightly from illumos which would + * return EPERM. To emulate the Linux behavior, we check + * specifically for matching pgids. 
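
The Linux behavior referred to here, as seen from user space; an illustrative call that is expected to succeed even though the pgid is unchanged.

#include <unistd.h>

int
example_setpgid_noop(void)
{
	/* Re-asserting the current process group returns 0 on Linux. */
	return (setpgid(0, getpgrp()));
}
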
+ */ + + /* getpgid() */ + ttolwp(curthread)->lwp_errno = 0; + pg = setpgrp(4, spid, 0); + if (ttolwp(curthread)->lwp_errno == 0 && spgid == pg) + return (0); + return (set_errno(EPERM)); + } + + return (ret); +} + +long +lx_getsid(int pid) +{ + pid_t spid; + int tid; + int sid; + + if (pid < 0) + return (set_errno(ESRCH)); + + /* + * If the supplied pid matches that of the init process, return the sid + * Linux expects. + */ + if (pid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* getsid() */ + ttolwp(curthread)->lwp_errno = 0; + sid = setpgrp(2, spid, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + + /* + * If the sid is that of the init process, return the value Linux + * expects. + */ + if (sid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + return (sid); +} + +long +lx_setsid(void) +{ + int sid; + + /* setsid() */ + ttolwp(curthread)->lwp_errno = 0; + sid = setpgrp(3, 0, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + /* + * If the sid is that of the init process, return the value Linux + * expects. + */ + if (sid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + return (sid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c new file mode 100644 index 0000000000..96959e40df --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c @@ -0,0 +1,309 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved. + * + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <sys/fs/fifonode.h> +#include <sys/fcntl.h> +#include <sys/policy.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/sysmacros.h> + +#define LX_DEFAULT_PIPE_SIZE 65536 + +/* + * Our default value for fs.pipe-size-max mirrors Linux. The enforced maximum + * is meant to provide some sort of upper bound on pipe buffer sizing. Its + * value was chosen somewhat arbitrarily. 
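
A user-space sketch of how these limits are exercised on Linux: growing a pipe buffer with fcntl(F_SETPIPE_SZ), which is capped for unprivileged callers by the per-zone maximum described above. The function name and requested size are illustrative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int
example_grow_pipe(void)
{
	int fds[2];
	int rv;

	if (pipe(fds) != 0)
		return (-1);
	/* Request a 1 MiB buffer; unprivileged callers are clamped/denied above the limit. */
	rv = fcntl(fds[1], F_SETPIPE_SZ, 1048576);
	(void) close(fds[0]);
	(void) close(fds[1]);
	return (rv);
}
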
+ */ +uint_t lx_pipe_max_default = 1048576; +uint_t lx_pipe_max_limit = 8388608; + +int +lx_pipe_setsz(stdata_t *str, uint_t size, boolean_t is_init) +{ + int err; + stdata_t *mate; + lx_zone_data_t *lxzd = ztolxzd(curzone); + uint_t max_size = lxzd->lxzd_pipe_max_sz; + fifonode_t *fnp1, *fnp2; + + size = P2ROUNDUP(size, PAGESIZE); + if (size == 0) { + return (EINVAL); + } else if (size > max_size && secpolicy_resource(CRED()) != 0) { + if (!is_init) { + return (EPERM); + } + /* + * If the size limit is breached during initial pipe setup, + * simply clamp it to the maximum. On Linux kernels prior to + * 4.9, this clamping would not occur and it would be possible + * to open a pipe with the default buffer size even if it + * exceeded the sysctl limit. Rather than trigger behavior + * here based on the configured kernel version, it is applied + * to all callers. + */ + size = max_size; + ASSERT(max_size <= lx_pipe_max_limit); + } else if (size > lx_pipe_max_limit) { + /* + * Unlike Linux, we do maintain a global hard cap on pipe + * buffer limits. + */ + return (EPERM); + } + + if (!STRMATED(str)) { + err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size); + if (err == 0) { + fnp1 = VTOF(str->sd_vnode); + mutex_enter(&fnp1->fn_lock->flk_lock); + fnp1->fn_hiwat = size; + mutex_exit(&fnp1->fn_lock->flk_lock); + } + return (err); + } + + /* + * Ensure consistent order so the set operation is always attempted on + * the "higher" stream first. + */ + if (str > str->sd_mate) { + VERIFY((mate = str->sd_mate) != NULL); + } else { + mate = str; + VERIFY((str = mate->sd_mate) != NULL); + } + + /* + * While it is unfortunate that an error could occur for the latter + * half of the stream pair, there is little to be done about it aside + * from reporting the failure. + */ + if ((err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size)) == 0) { + err = strqset(RD(mate->sd_wrq), QHIWAT, 0, (intptr_t)size); + } + + if (err == 0) { + fnp1 = VTOF(str->sd_vnode); + fnp2 = VTOF(str->sd_mate->sd_vnode); + + /* + * See fnode_constructor. Both sides should have the same + * lock. We expect our callers to ensure that the vnodes + * are VFIFO and have v_op == fifovnops. + */ + ASSERT(str->sd_vnode->v_type == VFIFO); + ASSERT(str->sd_mate->sd_vnode->v_type == VFIFO); + ASSERT(fnp1->fn_lock == fnp2->fn_lock); + + mutex_enter(&fnp1->fn_lock->flk_lock); + + fnp1->fn_hiwat = size; + fnp2->fn_hiwat = size; + + mutex_exit(&fnp1->fn_lock->flk_lock); + } + + return (err); +} + +/* + * Based on native pipe(2) system call, except that the pipe is half-duplex. + */ +static int +lx_hd_pipe(intptr_t arg, int flags) +{ + vnode_t *vp1, *vp2; + struct file *fp1, *fp2; + int error = 0; + int flag1, flag2, iflags; + int fd1, fd2; + stdata_t *str; + + /* + * Validate allowed flags. + */ + if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) { + return (set_errno(EINVAL)); + } + /* + * Allocate and initialize two vnodes. + */ + makepipe(&vp1, &vp2); + + /* + * Allocate and initialize two file table entries and two + * file pointers. The first file pointer is open for read and the + * second is open for write. + */ + if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) { + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); + } + + if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0) + goto out2; + + /* + * Create two stream heads and attach to each vnode. 
+ */ + if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0) + goto out; + + if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) { + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, + fp1->f_cred, NULL); + goto out; + } + + strmate(vp1, vp2); + + VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid(); + + /* + * Attempt to set pipe buffer sizes to expected value. + */ + VERIFY((str = vp1->v_stream) != NULL); + (void) lx_pipe_setsz(str, LX_DEFAULT_PIPE_SIZE, B_TRUE); + + /* + * Set the O_NONBLOCK flag if requested. + */ + if (flags & FNONBLOCK) { + flag1 = fp1->f_flag; + flag2 = fp2->f_flag; + iflags = flags & FNONBLOCK; + + if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp1->f_flag |= iflags; + + if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp2->f_flag |= iflags; + } + + /* + * Return the file descriptors to the user. They now + * point to two different vnodes which have different + * stream heads. + */ + if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) || + copyout(&fd2, &((int *)arg)[1], sizeof (int))) { + error = EFAULT; + goto out_vop_close; + } + + /* + * Now fill in the entries that falloc reserved + */ + mutex_exit(&fp1->f_tlock); + mutex_exit(&fp2->f_tlock); + setf(fd1, fp1); + setf(fd2, fp2); + + /* + * Optionally set the FCLOEXEC flag + */ + if ((flags & FCLOEXEC) != 0) { + f_setfd(fd1, FD_CLOEXEC); + f_setfd(fd2, FD_CLOEXEC); + } + + return (0); +out_vop_close: + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL); + (void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL); +out: + setf(fd2, NULL); + unfalloc(fp2); +out2: + setf(fd1, NULL); + unfalloc(fp1); + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); +} + +/* + * pipe(2) system call. + */ +long +lx_pipe(intptr_t arg) +{ + return (lx_hd_pipe(arg, 0)); +} + +/* + * pipe2(2) system call. + */ +long +lx_pipe2(intptr_t arg, int lxflags) +{ + int flags = 0; + + /* + * Validate allowed flags. + */ + if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) { + return (set_errno(EINVAL)); + } + + /* + * Convert from Linux flags to illumos flags. + */ + if (lxflags & LX_O_NONBLOCK) { + flags |= FNONBLOCK; + } + if (lxflags & LX_O_CLOEXEC) { + flags |= FCLOEXEC; + } + + return (lx_hd_pipe(arg, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_poll.c b/usr/src/uts/common/brand/lx/syscall/lx_poll.c new file mode 100644 index 0000000000..e54130aff1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_poll.c @@ -0,0 +1,786 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/poll_impl.h> +#include <sys/schedctl.h> +#include <sys/lx_signal.h> + +/* + * Max number of FDs that can be given to poll() or select() before we return + * EINVAL (the Linux man page documents this value as {OPEN_MAX}, and defaults + * it to this value). + */ +int lx_poll_max_fds = 1048576; + +/* From uts/common/syscall/poll.c */ +extern int poll_copyin(pollstate_t *, pollfd_t *, nfds_t); +extern int poll_common(pollstate_t *, pollfd_t *, nfds_t, timespec_t *, int *); + +/* + * These events are identical between Linux and SunOS + */ +#define LX_POLLIN 0x001 +#define LX_POLLPRI 0x002 +#define LX_POLLOUT 0x004 +#define LX_POLLERR 0x008 +#define LX_POLLHUP 0x010 +#define LX_POLLNVAL 0x020 +#define LX_POLLRDNORM 0x040 +#define LX_POLLRDBAND 0x080 + +#define LX_POLL_COMMON_EVENTS (LX_POLLIN | LX_POLLPRI | LX_POLLOUT | \ + LX_POLLERR | LX_POLLHUP | LX_POLLNVAL | LX_POLLRDNORM | LX_POLLRDBAND) + +/* + * These events differ between Linux and SunOS + */ +#define LX_POLLWRNORM 0x0100 +#define LX_POLLWRBAND 0x0200 +#define LX_POLLRDHUP 0x2000 + + +#define LX_POLL_SUPPORTED_EVENTS \ + (LX_POLL_COMMON_EVENTS | LX_POLLWRNORM | LX_POLLWRBAND | LX_POLLRDHUP) + + +static int +lx_poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i, error = 0; + pollfd_t *pollfdp; + + if ((error = poll_copyin(ps, fds, nfds)) != 0) { + return (error); + } + pollfdp = ps->ps_pollfd; + + /* Convert the Linux events bitmask into SunOS equivalent. */ + for (i = 0; i < nfds; i++) { + short lx_events = pollfdp[i].events; + short events; + + /* + * If the caller is polling for an unsupported event, we + * have to bail out. + */ + if (lx_events & ~LX_POLL_SUPPORTED_EVENTS) { + return (ENOTSUP); + } + + events = lx_events & LX_POLL_COMMON_EVENTS; + if (lx_events & LX_POLLWRNORM) + events |= POLLWRNORM; + if (lx_events & LX_POLLWRBAND) + events |= POLLWRBAND; + if (lx_events & LX_POLLRDHUP) + events |= POLLRDHUP; + pollfdp[i].events = events; + oldevt[i] = lx_events; + } + return (0); +} + +static int +lx_poll_copyout(pollfd_t *pollfdp, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i; + + /* + * Convert SunOS revents bitmask into Linux equivalent and restore + * cached events field which was swizzled by lx_poll_copyin. + */ + for (i = 0; i < nfds; i++) { + short revents = pollfdp[i].revents; + short lx_revents = revents & LX_POLL_COMMON_EVENTS; + short orig_events = oldevt[i]; + + if (revents & POLLWRBAND) + lx_revents |= LX_POLLWRBAND; + if (revents & POLLRDHUP) + lx_revents |= LX_POLLRDHUP; + /* + * Because POLLOUT and POLLWRNORM are native defined as the + * same value, care must be taken when translating them to + * Linux where they differ. 
+ */ + if (revents & POLLOUT) { + if ((orig_events & LX_POLLOUT) == 0) + lx_revents &= ~LX_POLLOUT; + if (orig_events & LX_POLLWRNORM) + lx_revents |= LX_POLLWRNORM; + } + + pollfdp[i].revents = lx_revents; + pollfdp[i].events = orig_events; + } + + if (copyout(pollfdp, fds, sizeof (pollfd_t) * nfds) != 0) + return (EFAULT); + + return (0); +} + +static long +lx_poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + short *oldevt = NULL; + int error = 0, fdcnt = 0; + + /* + * Reset our signal mask, if requested. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + */ + if (nfds != 0) { + /* + * Cap the number of FDs they can give us so we don't go + * allocating a huge chunk of memory. Note that this is *not* + * the RLIMIT_NOFILE rctl. + */ + if (nfds > lx_poll_max_fds) { + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + /* + * Certain event types which are distinct on Linux are aliased + * against each other on illumos. In order properly translate + * back into the Linux format, the original events of interest + * are stored in 'oldevt' for use during lx_poll_copyout. + */ + oldevt = kmem_alloc(nfds * sizeof (short), KM_SLEEP); + if ((error = lx_poll_copyin(ps, fds, nfds, oldevt)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + + /* + * The Linux poll(2) implicitly polls for POLLERR and POLLHUP + * in addition to any other events specified for the file + * descriptors in question. It does not modify pollfd_t`events + * to reflect that fact when performing a later copyout. + */ + ps->ps_implicit_ev = POLLERR | POLLHUP; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + + /* + * Clear implicit event interest, if needed. + */ + if (ps != NULL) { + ps->ps_implicit_ev = 0; + } + + +pollout: + /* + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + /* + * Copy out the events and return the fdcnt to the user. 
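
For reference, a Linux-side ppoll(2) call of the sort serviced here, blocking SIGINT for the duration of the poll; the descriptor, events, and timeout are illustrative values.

#define _GNU_SOURCE
#include <poll.h>
#include <signal.h>
#include <time.h>

int
example_ppoll(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	sigset_t mask;

	(void) sigemptyset(&mask);
	(void) sigaddset(&mask, SIGINT);
	/* The signal mask is swapped in for the duration of the wait. */
	return (ppoll(&pfd, 1, &ts, &mask));
}
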
+ */ + if (nfds != 0 && error == 0) { + error = lx_poll_copyout(pollfdp, fds, nfds, oldevt); + } + if (oldevt != NULL) { + kmem_free(oldevt, nfds * sizeof (short)); + } + if (error) { + return (set_errno(error)); + } + return (fdcnt); +} + +long +lx_poll(pollfd_t *fds, nfds_t nfds, int timeout) +{ + timespec_t ts, *tsp = NULL; + + if (timeout >= 0) { + ts.tv_sec = timeout / MILLISEC; + ts.tv_nsec = (timeout % MILLISEC) * MICROSEC; + tsp = &ts; + } + + return (lx_poll_common(fds, nfds, tsp, NULL)); +} + +long +lx_ppoll(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, lx_sigset_t *setp) +{ + timespec_t ts, *tsp = NULL; + k_sigset_t kset, *ksetp = NULL; + + /* + * Copy in timeout and sigmask. + */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset; + + if (copyin(setp, &lset, sizeof (lset))) + return (set_errno(EFAULT)); + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + + return (lx_poll_common(fds, nfds, tsp, ksetp)); +} + +typedef struct lx_select_buf_s { + long *lsb_rfds; + long *lsb_wfds; + long *lsb_efds; + unsigned int lsb_size; +} lx_select_buf_t; + +/* + * Size (in bytes) of buffer appropriate for fd_set copyin/copyout. + * Linux uses buffers of 'long' to accomplish this. + */ +#define LX_FD_SET_BYTES (sizeof (long)) +#define LX_FD_SET_BITS (8 * LX_FD_SET_BYTES) +#define LX_FD_SET_SIZE(nfds) \ + ((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES) + +static int +lx_select_copyin(pollstate_t *ps, lx_select_buf_t *sbuf, int nfds, + long *rfds, long *wfds, long *efds) +{ + int n; + long *in, *out, *ex; + long absent = 0; + pollfd_t *pfd; + nfds_t old_nfds; + + /* + * Just like pollsys and lx_poll, attempt to reuse ps_pollfd if it is + * appropriately sized. See poll_copyin for more detail. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pfd = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pfd; + ps->ps_nfds = nfds; + } else { + pfd = ps->ps_pollfd; + } + + if (rfds != NULL) { + if (copyin(rfds, sbuf->lsb_rfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (wfds != NULL) { + if (copyin(wfds, sbuf->lsb_wfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (efds != NULL) { + if (copyin(efds, sbuf->lsb_efds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + + /* + * For each fd, if any bits are set convert them into the appropriate + * pollfd struct. (Derived from libc's select logic) + */ + in = (rfds != NULL) ? sbuf->lsb_rfds : &absent; + out = (wfds != NULL) ? sbuf->lsb_wfds : &absent; + ex = (efds != NULL) ? 
sbuf->lsb_efds : &absent; + for (n = 0; n < nfds; n += LX_FD_SET_BITS) { + unsigned long b, m, j; + + b = (unsigned long)(*in | *out | *ex); + m = 1; + for (j = 0; j < LX_FD_SET_BITS; j++) { + int fd = n + j; + + if (fd >= nfds) + return (0); + pfd->events = 0; + if (b & 1) { + pfd->fd = fd; + if (*in & m) + pfd->events |= POLLRDNORM; + if (*out & m) + pfd->events |= POLLWRNORM; + if (*ex & m) + pfd->events |= POLLRDBAND; + } else { + pfd->fd = -1; + } + pfd++; + b >>= 1; + m <<= 1; + } + + if (rfds != NULL) + in++; + if (wfds != NULL) + out++; + if (efds != NULL) + ex++; + } + return (0); +} + +static int +lx_select_copyout(pollfd_t *pollfdp, lx_select_buf_t *sbuf, int nfds, + long *rfds, long *wfds, long *efds, int *fdcnt) +{ + int n; + pollfd_t *pfd; + long rv = 0; + + /* + * If poll did not find any fds of interest, we can just zero out the + * fd_set fields for copyout. + */ + if (*fdcnt == 0) { + if (rfds != NULL) { + bzero(sbuf->lsb_rfds, sbuf->lsb_size); + } + if (wfds != NULL) { + bzero(sbuf->lsb_wfds, sbuf->lsb_size); + } + if (efds != NULL) { + bzero(sbuf->lsb_efds, sbuf->lsb_size); + } + goto copyout; + } + + /* + * For each fd, if any bits are set convert them into the appropriate + * pollfd struct. (Derived from libc's select logic) + */ + pfd = pollfdp; + for (n = 0; n < nfds; n += LX_FD_SET_BITS) { + unsigned long m, j; + long in = 0, out = 0, ex = 0; + + m = 1; + for (j = 0; j < LX_FD_SET_BITS; j++) { + if ((n + j) >= nfds) + break; + if (pfd->revents != 0) { + if (pfd->revents & POLLNVAL) { + return (EBADF); + } + if (pfd->revents & POLLRDNORM) { + in |= m; + rv++; + } + if (pfd->revents & POLLWRNORM) { + out |= m; + rv++; + } + if (pfd->revents & POLLRDBAND) { + ex |= m; + rv++; + } + /* + * Only set this bit on return if we asked + * about input conditions. + */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLRDNORM)) { + if ((in & m) == 0) { + /* wasn't already set */ + rv++; + } + in |= m; + } + /* + * Only set this bit on return if we asked + * about output conditions. + */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLWRNORM)) { + if ((out & m) == 0) { + /* wasn't already set */ + rv++; + } + out |= m; + } + /* + * Only set this bit on return if we asked + * about output conditions. + */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLRDBAND)) { + if ((ex & m) == 0) { + /* wasn't already set */ + rv++; + } + ex |= m; + } + } + m <<= 1; + pfd++; + } + if (rfds != NULL) + sbuf->lsb_rfds[n / LX_FD_SET_BITS] = in; + if (wfds != NULL) + sbuf->lsb_wfds[n / LX_FD_SET_BITS] = out; + if (efds != NULL) + sbuf->lsb_efds[n / LX_FD_SET_BITS] = ex; + } + +copyout: + if (rfds != NULL) { + if (copyout(sbuf->lsb_rfds, rfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (wfds != NULL) { + if (copyout(sbuf->lsb_wfds, wfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (efds != NULL) { + if (copyout(sbuf->lsb_efds, efds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + *fdcnt = rv; + return (0); +} + + +static long +lx_select_common(int nfds, long *rfds, long *wfds, long *efds, + timespec_t *tsp, k_sigset_t *ksetp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL, *fake_fds = NULL; + lx_select_buf_t sbuf = {0}; + int error = 0, fdcnt = 0; + + if (nfds < 0) { + return (set_errno(EINVAL)); + } + + /* + * Reset our signal mask, if requested. 
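The fd_set handling above relies on Linux select(2) treating each set as a packed array of longs, with read bits mapped to POLLRDNORM, write bits to POLLWRNORM and exception bits to POLLRDBAND. A stand-alone sketch of the sizing macro (copied from the definitions above) and what it yields on an LP64 system:

#include <stdio.h>

#define	LX_FD_SET_BYTES		(sizeof (long))
#define	LX_FD_SET_BITS		(8 * LX_FD_SET_BYTES)
#define	LX_FD_SET_SIZE(nfds)	\
	((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES)

int
main(void)
{
	/* On LP64: 1 fd -> 8 bytes, 100 fds -> 16 bytes, 1024 fds -> 128 bytes */
	printf("%zu %zu %zu\n", LX_FD_SET_SIZE(1), LX_FD_SET_SIZE(100),
	    LX_FD_SET_SIZE(1024));
	return (0);
}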
+ */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto out; + } + mutex_exit(&p->p_lock); + } + + /* + * Because poll caching uses the userspace pollfd_t pointer to verify + * cache reuse validity, a simulated value must be supplied when + * emulating Linux select(2). The first non-NULL pointer from + * rfds/wfds/efds is used for this purpose. + */ + if (rfds != NULL) { + fake_fds = (pollfd_t *)rfds; + } else if (wfds != NULL) { + fake_fds = (pollfd_t *)wfds; + } else if (efds != NULL) { + fake_fds = (pollfd_t *)efds; + } else { + /* + * A non-zero nfds was supplied but all three fd_set pointers + * were null. Fall back to doing a simple timeout. + */ + nfds = 0; + } + + /* + * Initialize pollstate and copy in pollfd data if present. + */ + if (nfds != 0) { + /* + * Cap the number of FDs they can give us so we don't go + * allocating a huge chunk of memory. Note that this is *not* + * the RLIMIT_NOFILE rctl. + */ + if (nfds > lx_poll_max_fds) { + error = EINVAL; + goto out; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + sbuf.lsb_size = LX_FD_SET_SIZE(nfds); + if (rfds != NULL) + sbuf.lsb_rfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (wfds != NULL) + sbuf.lsb_wfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (efds != NULL) + sbuf.lsb_efds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + + error = lx_select_copyin(ps, &sbuf, nfds, rfds, wfds, efds); + if (error != 0) { + goto out; + } + + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fake_fds, (nfds_t)nfds, tsp, &fdcnt); + +out: + /* + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + /* + * Copy out the events and return the fdcnt to the user. 
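The signal-mask dance above (save t_hold, install the caller's mask, probe for already-pending signals, then restore the mask unless a signal is being delivered) is what gives ppoll(2)/pselect(2) their atomic mask-swap semantics. From user space the feature looks like this; a minimal Linux sketch assuming the caller normally keeps SIGINT blocked (wait_readable is an illustrative name):

#define	_GNU_SOURCE
#include <poll.h>
#include <signal.h>
#include <stddef.h>

/*
 * Wait for input with an empty signal mask installed only for the duration
 * of the wait; the kernel swaps the mask in and out atomically.
 */
int
wait_readable(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	sigset_t waitmask;

	(void) sigemptyset(&waitmask);
	return (ppoll(&pfd, 1, NULL, &waitmask));
}

int
main(void)
{
	return (wait_readable(0) > 0 ? 0 : 1);	/* wait for stdin */
}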
+ */ + if (error == 0 && nfds != 0) { + error = lx_select_copyout(pollfdp, &sbuf, nfds, rfds, wfds, + efds, &fdcnt); + } + if (sbuf.lsb_size != 0) { + if (sbuf.lsb_rfds != NULL) + kmem_free(sbuf.lsb_rfds, sbuf.lsb_size); + if (sbuf.lsb_wfds != NULL) + kmem_free(sbuf.lsb_wfds, sbuf.lsb_size); + if (sbuf.lsb_efds != NULL) + kmem_free(sbuf.lsb_efds, sbuf.lsb_size); + } + if (error) { + return (set_errno(error)); + } + return (fdcnt); +} + +long +lx_select(int nfds, long *rfds, long *wfds, long *efds, + struct timeval *timeoutp) +{ + timespec_t ts, *tsp = NULL; + + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + struct timeval tv; + + if (copyin(timeoutp, &tv, sizeof (tv))) + return (set_errno(EFAULT)); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * (NANOSEC / MICROSEC); + } else { + struct timeval32 tv32; + + if (copyin(timeoutp, &tv32, sizeof (tv32))) + return (set_errno(EFAULT)); + ts.tv_sec = tv32.tv_sec; + ts.tv_nsec = tv32.tv_usec * (NANOSEC / MICROSEC); + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, NULL)); +} + + +typedef struct { + uintptr_t lpsa_addr; + unsigned long lpsa_len; +} lx_pselect_sig_arg_t; + +#if defined(_LP64) +typedef struct { + caddr32_t lpsa_addr; + uint32_t lpsa_len; +} lx_pselect_sig_arg32_t; +#endif /* defined(_LP64) */ + +long +lx_pselect(int nfds, long *rfds, long *wfds, long *efds, + timespec_t *timeoutp, void *setp) +{ + timespec_t ts, *tsp = NULL; + k_sigset_t kset, *ksetp = NULL; + + /* + * Copy in timeout and sigmask. + */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset, *sigaddr = NULL; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_pselect_sig_arg_t lpsa; + + if (copyin(setp, &lpsa, sizeof (lpsa)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)lpsa.lpsa_addr; + } +#if defined(_LP64) + else { + lx_pselect_sig_arg32_t lpsa32; + + if (copyin(setp, &lpsa32, sizeof (lpsa32)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa32.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)(uint64_t)lpsa32.lpsa_addr; + } +#endif /* defined(_LP64) */ + + /* This is where we check if the sigset is *really* NULL. */ + if (sigaddr != NULL) { + if (copyin(sigaddr, &lset, sizeof (lset)) != 0) + return (set_errno(EFAULT)); + + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, ksetp)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_prctl.c b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c new file mode 100644 index 0000000000..a8b3c3422c --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c @@ -0,0 +1,288 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/user.h> +#include <sys/priv.h> +#include <sys/brand.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> + +#define LX_PR_SET_PDEATHSIG 1 +#define LX_PR_GET_PDEATHSIG 2 +#define LX_PR_GET_DUMPABLE 3 +#define LX_PR_SET_DUMPABLE 4 +#define LX_PR_GET_UNALIGN 5 +#define LX_PR_SET_UNALIGN 6 +#define LX_PR_GET_KEEPCAPS 7 +#define LX_PR_SET_KEEPCAPS 8 +#define LX_PR_GET_FPEMU 9 +#define LX_PR_SET_FPEMU 10 +#define LX_PR_GET_FPEXC 11 +#define LX_PR_SET_FPEXC 12 +#define LX_PR_GET_TIMING 13 +#define LX_PR_SET_TIMING 14 +#define LX_PR_SET_NAME 15 +#define LX_PR_GET_NAME 16 +#define LX_PR_GET_ENDIAN 19 +#define LX_PR_SET_ENDIAN 20 +#define LX_PR_GET_SECCOMP 21 +#define LX_PR_SET_SECCOMP 22 +#define LX_PR_CAPBSET_READ 23 +#define LX_PR_CAPBSET_DROP 24 +#define LX_PR_GET_TSC 25 +#define LX_PR_SET_TSC 26 +#define LX_PR_GET_SECUREBITS 27 +#define LX_PR_SET_SECUREBITS 28 +#define LX_PR_SET_TIMERSLACK 29 +#define LX_PR_GET_TIMERSLACK 30 +#define LX_PR_TASK_PERF_EVENTS_DISABLE 31 +#define LX_PR_TASK_PERF_EVENTS_ENABLE 32 +#define LX_PR_MCE_KILL 33 +#define LX_PR_MCE_KILL_GET 34 +#define LX_PR_SET_MM 35 +#define LX_PR_SET_CHILD_SUBREAPER 36 +#define LX_PR_GET_CHILD_SUBREAPER 37 +#define LX_PR_SET_NO_NEW_PRIVS 38 +#define LX_PR_GET_NO_NEW_PRIVS 39 +#define LX_PR_GET_TID_ADDRESS 40 +#define LX_PR_SET_THP_DISABLE 41 +#define LX_PR_GET_THP_DISABLE 42 + +long +lx_prctl(int opt, uintptr_t data) +{ + long err; + char ebuf[64]; + + switch (opt) { + case LX_PR_GET_DUMPABLE: { + /* Only track in brand data - could hook into SNOCD later */ + lx_proc_data_t *lxpd; + int val; + + mutex_enter(&curproc->p_lock); + VERIFY((lxpd = ptolxproc(curproc)) != NULL); + val = lxpd->l_flags & LX_PROC_NO_DUMP; + mutex_exit(&curproc->p_lock); + + return (val == 0); + } + + case LX_PR_SET_DUMPABLE: { + lx_proc_data_t *lxpd; + + if (data != 0 && data != 1) { + return (set_errno(EINVAL)); + } + + mutex_enter(&curproc->p_lock); + VERIFY((lxpd = ptolxproc(curproc)) != NULL); + if (data == 0) { + lxpd->l_flags |= LX_PROC_NO_DUMP; + } else { + lxpd->l_flags &= ~LX_PROC_NO_DUMP; + } + mutex_exit(&curproc->p_lock); + + return (0); + } + + case LX_PR_GET_SECUREBITS: { + /* Our bits are always 0 */ + return (0); + } + + case LX_PR_SET_SECUREBITS: { + /* Ignore setting any bits from arg2 */ + return (0); + } + + case LX_PR_SET_KEEPCAPS: { + /* + * The closest illumos analog to SET_KEEPCAPS is the PRIV_AWARE + * flag. There are probably some cases where it's not exactly + * the same, but this will do for a first try. + */ + if (data == 0) { + err = setpflags(PRIV_AWARE_RESET, 1, NULL); + } else { + err = setpflags(PRIV_AWARE, 1, NULL); + } + + if (err != 0) { + return (set_errno(err)); + } + return (0); + } + + case LX_PR_GET_NAME: { + /* + * We allow longer thread names than Linux for compatibility + * with other OSes (Solaris, NetBSD) that also allow larger + * names. We just truncate (with NUL termination) if + * the name is longer. 
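For reference, the Linux interface being emulated here passes the name through arg2 of prctl(2), and PR_GET_NAME expects a buffer of at least 16 bytes (LX_PR_SET_NAME_NAMELEN). A minimal user-space sketch; the thread name chosen is arbitrary:

#include <sys/prctl.h>
#include <stdio.h>

int
main(void)
{
	char name[16];			/* PR_GET_NAME needs >= 16 bytes */

	(void) prctl(PR_SET_NAME, "worker-0");
	(void) prctl(PR_GET_NAME, name);
	printf("thread name: %s\n", name);	/* "worker-0" */
	return (0);
}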
+ */ + char name[LX_PR_SET_NAME_NAMELEN] = { 0 }; + kthread_t *t = curthread; + + mutex_enter(&ttoproc(t)->p_lock); + if (t->t_name != NULL) { + (void) strlcpy(name, t->t_name, sizeof (name)); + } + mutex_exit(&ttoproc(t)->p_lock); + + /* + * FWIW, the prctl(2) manpage says that the user-supplied + * buffer should be at least 16 (LX_PR_SET_NAME_NAMELEN) bytes + * long. + */ + if (copyout(name, (void *)data, LX_PR_SET_NAME_NAMELEN) != 0) { + return (set_errno(EFAULT)); + } + return (0); + } + + case LX_PR_SET_NAME: { + char name[LX_PR_SET_NAME_NAMELEN] = { 0 }; + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + int ret; + + ret = copyinstr((const char *)data, name, sizeof (name), NULL); + /* + * prctl(2) explicitly states that over length strings are + * silently truncated + */ + if (ret != 0 && ret != ENAMETOOLONG) { + return (set_errno(EFAULT)); + } + name[LX_PR_SET_NAME_NAMELEN - 1] = '\0'; + + if ((ret = thread_setname(t, name)) != 0) { + return (set_errno(ret)); + } + + /* + * In Linux, PR_SET_NAME sets the name of the thread, not the + * process. Due to the historical quirks of Linux's asinine + * thread model, this name is effectively the name of the + * process (as visible via ps(1)) if the thread is the first of + * its task group. The first thread is therefore special, and + * to best mimic Linux semantics we set the thread name, and if + * we are setting LWP 1, we also update the name of the process. + */ + if (t->t_tid != 1) { + return (0); + } + + /* + * We are currently choosing to not allow an empty thread + * name to clear p->p_user.u_comm and p->p_user.u_psargs. + * This is a slight divergence from linux behavior (which + * allows this) so that we can preserve the original command. + */ + if (strlen(name) == 0) { + return (0); + } + + /* + * We explicitly use t->t_name here instead of name in case + * a thread has come in between the above thread_setname() + * call and the setting of u_comm/u_psargs below. On Linux, + * one can also change the name of a thread (either itself or + * another thread in the same process) via writing to /proc, so + * while racy, this is no worse than what might happen on + * Linux. + */ + mutex_enter(&p->p_lock); + (void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1); + (void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ); + mutex_exit(&p->p_lock); + return (0); + } + + case LX_PR_GET_PDEATHSIG: { + int sig; + lx_proc_data_t *lxpd; + + mutex_enter(&curproc->p_lock); + VERIFY((lxpd = ptolxproc(curproc)) != NULL); + sig = lxpd->l_parent_deathsig; + mutex_exit(&curproc->p_lock); + + return (sig); + } + + case LX_PR_SET_PDEATHSIG: { + int sig = lx_ltos_signo((int)data, 0); + proc_t *pp = NULL; + lx_proc_data_t *lxpd; + + if (sig == 0 && data != 0) { + return (set_errno(EINVAL)); + } + + mutex_enter(&pidlock); + /* Set signal on our self */ + mutex_enter(&curproc->p_lock); + VERIFY((lxpd = ptolxproc(curproc)) != NULL); + lxpd->l_parent_deathsig = sig; + pp = curproc->p_parent; + mutex_exit(&curproc->p_lock); + + /* Configure parent to potentially signal children on death */ + mutex_enter(&pp->p_lock); + if (PROC_IS_BRANDED(pp)) { + VERIFY((lxpd = ptolxproc(pp)) != NULL); + /* + * Mark the parent as having children which wish to be + * signaled on death of parent. + */ + lxpd->l_flags |= LX_PROC_CHILD_DEATHSIG; + } else { + /* + * If the parent is not a branded process, the needed + * hooks to facilitate this mechanism will not fire + * when it dies. We lie about success in this case. 
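The PR_SET_PDEATHSIG case handled here is typically used by a freshly forked child that wants to be torn down when its parent goes away. A minimal Linux usage sketch (note the inherent race: the parent may already have exited by the time the child calls prctl, which careful callers re-check with getppid()):

#include <sys/types.h>
#include <sys/prctl.h>
#include <signal.h>
#include <unistd.h>

int
main(void)
{
	pid_t child = fork();

	if (child == 0) {
		/* Deliver SIGTERM to this process when the parent exits. */
		(void) prctl(PR_SET_PDEATHSIG, SIGTERM);
		(void) pause();		/* terminated once the parent dies */
		_exit(0);
	}
	sleep(1);
	return (0);			/* parent exits; child gets SIGTERM */
}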
+ */ + /* EMPTY */ + } + mutex_exit(&pp->p_lock); + mutex_exit(&pidlock); + return (0); + } + + case LX_PR_CAPBSET_DROP: { + /* + * On recent versions of Linux the login svc drops capabilities + * and if that fails the svc dies and is restarted by systemd. + * For now we pretend dropping capabilities succeeded. + */ + return (0); + } + + default: + break; + } + + (void) snprintf(ebuf, 64, "prctl option %d", opt); + lx_unsupported(ebuf); + return (set_errno(EINVAL)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_priority.c b/usr/src/uts/common/brand/lx/syscall/lx_priority.c new file mode 100644 index 0000000000..44c60b66bf --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_priority.c @@ -0,0 +1,192 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/procset.h> +#include <sys/resource.h> +#include <sys/priocntl.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* From uts/common/disp/priocntl.c */ +extern int donice(procset_t *, pcnice_t *); + +/* + * The Linux syscall returns priorities in the range (highest) 40-1 (lowest) + * and then glibc adjusts these to the range -20 - 19. + */ +long +lx_getpriority(int which, id_t who) +{ + int rval; + idtype_t idtype; + id_t id, lid; + pcnice_t pcnice; + procset_t procset; + + switch (which) { + case PRIO_PROCESS: + idtype = P_PID; + if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0) + return (set_errno(ESRCH)); + break; + case PRIO_PGRP: + idtype = P_PGID; + break; + case PRIO_USER: + idtype = P_UID; + break; + default: + return (set_errno(EINVAL)); + } + + /* Linux fails with a different errno on a negative id */ + if (who < 0) + return (set_errno(ESRCH)); + + id = (who == 0 ? P_MYID : who); + + pcnice.pc_val = 0; + pcnice.pc_op = PC_GETNICE; + + setprocset(&procset, POP_AND, idtype, id, P_ALL, 0); + + rval = donice(&procset, &pcnice); + if (rval != 0) { + if (which == PRIO_PROCESS && + (who == curproc->p_pid || who == 0) && + strcmp(sclass[curthread->t_cid].cl_name, "RT") == 0) { + /* + * donice() will always return EINVAL if we're in the + * RT class. The zone won't be able to put itself or any + * of its processes into RT, but if we put the whole + * zone into RT via the scheduling-class property, then + * getpriority would always fail. This breaks pam and + * prevents any login. Just pretend to be the highest + * priority. + */ + return (40); + } + + /* + * Linux does not return EINVAL for invalid 'who' values, it + * returns ESRCH instead. We already validated 'which' above. + */ + if (rval == EINVAL) + rval = ESRCH; + return (set_errno(rval)); + } + + /* + * The return value of the getpriority syscall is biased by 20 to avoid + * returning negative values when successful (-20 internally is our + * highest priority and 19 is our lowest). + */ + return (20 - pcnice.pc_val); +} + +/* + * Return EPERM if the current process is not allowed to operate on the target + * process (which is part of the procset for setpriority). 
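A concrete instance of the bias described above: a process at nice -20 gets 40 back from the raw getpriority syscall (the same value the RT-class fallback returns), and glibc converts the result back into the familiar -20..19 range before handing it to the caller. A quick Linux check of the two views, using the standard SYS_getpriority number:

#include <sys/syscall.h>
#include <sys/resource.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);
	int adj = getpriority(PRIO_PROCESS, 0);

	/* For an unmodified process: raw = 20, adjusted = 0. */
	printf("raw = %ld, glibc-adjusted = %d\n", raw, adj);
	return (0);
}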
+ */ +/* ARGSUSED */ +static int +lx_chk_pripriv(proc_t *pp, char *dummy) +{ + ASSERT(MUTEX_HELD(&pidlock)); + mutex_enter(&pp->p_lock); + if (!prochasprocperm(pp, curproc, CRED())) { + mutex_exit(&pp->p_lock); + return (EPERM); + } + mutex_exit(&pp->p_lock); + return (0); +} + +long +lx_setpriority(int which, id_t who, int prio) +{ + int rval; + idtype_t idtype; + id_t id, lid; + pcnice_t pcnice; + procset_t procset; + + switch (which) { + case PRIO_PROCESS: + idtype = P_PID; + if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0) + return (set_errno(ESRCH)); + break; + case PRIO_PGRP: + idtype = P_PGID; + break; + case PRIO_USER: + idtype = P_UID; + break; + default: + return (set_errno(EINVAL)); + } + + /* Linux fails with a different errno on a negative id */ + if (who < 0) + return (set_errno(ESRCH)); + + id = (who == 0 ? P_MYID : who); + + if (prio > NZERO - 1) { + prio = NZERO - 1; + } else if (prio < -NZERO) { + prio = -NZERO; + } + + pcnice.pc_val = prio; + pcnice.pc_op = PC_SETNICE; + + setprocset(&procset, POP_AND, idtype, id, P_ALL, 0); + + rval = donice(&procset, &pcnice); + if (rval != 0) { + /* + * Once we fully support Linux capabilities, we should update + * the following check to look at the CAP_SYS_NICE capability. + */ + if (rval == EPERM && crgetuid(CRED()) != 0) { + /* + * donice() returns EPERM under two conditions: + * 1) if either the real or eff. uid don't match + * 2) we lack the privileges to raise the priority + * + * However, setpriority() must return a different errno + * based on the following: + * EPERM - real or eff. uid did not match + * EACCES - trying to increase priority + * + * We use lx_chk_pripriv to determine which case we hit. + * + * Note that the native setpriority(3C) code has the + * same race on re-checking. + */ + if (dotoprocs(&procset, lx_chk_pripriv, NULL) != EPERM) + rval = EACCES; + } + + return (set_errno(rval)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rename.c b/usr/src/uts/common/brand/lx/syscall/lx_rename.c new file mode 100644 index 0000000000..2fad627771 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rename.c @@ -0,0 +1,39 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/lx_fcntl.h> + +/* From uts/common/syscall/rename.c */ +extern int rename(char *, char *); +extern int renameat(int, char *, int, char *); + +long +lx_rename(char *p1, char *p2) +{ + return (rename(p1, p2)); +} + +long +lx_renameat(int atfd1, char *p1, int atfd2, char *p2) +{ + if (atfd1 == LX_AT_FDCWD) + atfd1 = AT_FDCWD; + + if (atfd2 == LX_AT_FDCWD) + atfd2 = AT_FDCWD; + + return (renameat(atfd1, p1, atfd2, p2)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c new file mode 100644 index 0000000000..30fa996615 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c @@ -0,0 +1,587 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/zone.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#define LX_RLIMIT_CPU 0 +#define LX_RLIMIT_FSIZE 1 +#define LX_RLIMIT_DATA 2 +#define LX_RLIMIT_STACK 3 +#define LX_RLIMIT_CORE 4 +#define LX_RLIMIT_RSS 5 +#define LX_RLIMIT_NPROC 6 +#define LX_RLIMIT_NOFILE 7 +#define LX_RLIMIT_MEMLOCK 8 +#define LX_RLIMIT_AS 9 +#define LX_RLIMIT_LOCKS 10 /* NA limit on locks, early 2.4 only */ +#define LX_RLIMIT_SIGPENDING 11 +#define LX_RLIMIT_MSGQUEUE 12 +#define LX_RLIMIT_NICE 13 /* NA ceiling for nice */ +#define LX_RLIMIT_RTPRIO 14 /* NA ceiling on the RT priority */ +#define LX_RLIMIT_RTTIME 15 /* NA cpu limit for RT proc. */ + +#define LX_RLIMIT_NLIMITS 16 + +#define RCTL_INFINITE(x) \ + ((x->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + (x->rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct { + ulong_t rlim_cur; + ulong_t rlim_max; +} lx_rlimit_t; + +typedef struct { + uint32_t rlim_cur; + uint32_t rlim_max; +} lx_rlimit32_t; + +/* + * Linux supports many of the same resources that we do, but on illumos these + * are rctls. Instead of using rlimit, we use rctls for all of the limits. + * This table is used to translate Linux rlimit keys into the illumos legacy + * rlimit. We then primarily use the rctl/rlimit compatability code to + * manage these. + */ +static int l_to_r[LX_RLIMIT_NLIMITS] = { + RLIMIT_CPU, /* 0 CPU */ + RLIMIT_FSIZE, /* 1 FSIZE */ + RLIMIT_DATA, /* 2 DATA */ + RLIMIT_STACK, /* 3 STACK */ + RLIMIT_CORE, /* 4 CORE */ + -1, /* 5 RSS */ + -1, /* 6 NPROC */ + RLIMIT_NOFILE, /* 7 NOFILE */ + -1, /* 8 MEMLOCK */ + RLIMIT_AS, /* 9 AS */ + -1, /* 10 LOCKS */ + -1, /* 11 SIGPENDING */ + -1, /* 12 MSGQUEUE */ + -1, /* 13 NICE */ + -1, /* 14 RTPRIO */ + -1 /* 15 RTTIME */ +}; + +/* + * Magic value Linux uses to indicate infinity + */ +#define LX_RLIM_INFINITY_N ULONG_MAX + +void +lx_get_rctl(char *nm, struct rlimit64 *rlp64) +{ + rctl_hndl_t hndl; + rctl_val_t *oval, *nval; + + rlp64->rlim_cur = RLIM_INFINITY; + rlp64->rlim_max = RLIM_INFINITY; + + nval = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP); + mutex_enter(&curproc->p_lock); + + hndl = rctl_hndl_lookup(nm); + oval = NULL; + while ((hndl != -1) && rctl_local_get(hndl, oval, nval, curproc) == 0) { + oval = nval; + switch (nval->rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_cur = nval->rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_max = nval->rcv_value; + break; + } + } + + mutex_exit(&curproc->p_lock); + kmem_free(nval, sizeof (rctl_val_t)); + + if (rlp64->rlim_cur == RLIM_INFINITY && + rlp64->rlim_max != RLIM_INFINITY) + rlp64->rlim_cur = rlp64->rlim_max; +} + +static int +lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int resource; + int64_t cur = -1; + boolean_t cur_inf = B_FALSE; + int64_t max = -1; + boolean_t max_inf = B_FALSE; + struct rlimit64 rlim64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + rlim64.rlim_cur = 
pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max; + break; + + case LX_RLIMIT_NICE: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max; + break; + + case LX_RLIMIT_RTTIME: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max; + break; + + case LX_RLIMIT_RSS: + /* zone.max-physical-memory */ + zone_get_physmem_data(curzone->zone_id, + (pgcnt_t *)&rlim64.rlim_cur, + (pgcnt_t *)&rlim64.rlim_max); /* max is dummy variable */ + rlim64.rlim_cur = rlim64.rlim_max = ptob(rlim64.rlim_cur); + + break; + + case LX_RLIMIT_NPROC: + /* zone.max-lwps */ + rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_nlwps_ctl; + break; + + case LX_RLIMIT_MEMLOCK: + lx_get_rctl("process.max-locked-memory", &rlim64); + + /* If unlimited, use zone.max-locked-memory */ + if (rlim64.rlim_max == RLIM64_INFINITY) + rlim64.rlim_max = curzone->zone_locked_mem_ctl; + if (rlim64.rlim_cur == RLIM64_INFINITY) + rlim64.rlim_cur = curzone->zone_locked_mem_ctl; + break; + + case LX_RLIMIT_SIGPENDING: + lx_get_rctl("process.max-sigqueue-size", &rlim64); + break; + + case LX_RLIMIT_MSGQUEUE: + lx_get_rctl("process.max-msg-messages", &rlim64); + break; + + default: + resource = l_to_r[lx_resource]; + + mutex_enter(&curproc->p_lock); + (void) rctl_rlimit_get(rctlproc_legacy[resource], curproc, + &rlim64); + mutex_exit(&curproc->p_lock); + break; + } + + + if (rlim64.rlim_cur == RLIM64_INFINITY) { + cur = LX_RLIM_INFINITY_N; + } else { + cur = rlim64.rlim_cur; + } + if (rlim64.rlim_max == RLIM64_INFINITY) { + max = LX_RLIM_INFINITY_N; + } else { + max = rlim64.rlim_max; + } + + if (lx_resource == LX_RLIMIT_STACK && cur > INT_MAX) { + /* + * Stunningly, Linux has somehow managed to confuse the concept + * of a "limit" with that of a "default" -- and the value of + * RLIMIT_STACK is used by NPTL as the _default_ stack size if + * it isn't specified. (!!) Even for a system that prides + * itself on slapdash castles of junk, this is an amazingly + * willful act of incompetence -- and one that is gleefully + * confessed in the pthread_create() man page: "if the + * RLIMIT_STACK soft resource limit at the time the program + * started has any value other than 'unlimited', then it + * determines the default stack size of new threads." A + * typical stack limit for us is 32TB; if it needs to be said, + * setting the default stack size to be 32TB doesn't work so + * well! Of course, glibc dropping a deuce in its pants + * becomes our problem -- so to prevent smelly accidents we + * tell Linux that any stack limit over the old (32-bit) values + * for infinity are just infinitely large. + */ + cur_inf = B_TRUE; + max_inf = B_TRUE; + } + + if (cur_inf) { + *rlim_curp = LX_RLIM64_INFINITY; + } else { + *rlim_curp = cur; + } + + if (max_inf) { + *rlim_maxp = LX_RLIM64_INFINITY; + } else { + *rlim_maxp = max; + } + + return (0); +} + +/* + * This is the 'new' getrlimit, variously called getrlimit or ugetrlimit + * in Linux headers and code. The only difference between this and the old + * getrlimit (variously called getrlimit or old_getrlimit) is the value of + * RLIM_INFINITY, which is smaller for the older version. Modern code will + * use this version by default. 
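The difference called out above is the sentinel: the legacy 32-bit RLIM_INFINITY was 0x7fffffff while the modern one is ULONG_MAX, so anything larger than the old infinity has to be squashed to INT_MAX when reported through the old interface. A stand-alone sketch of that clamp, with illustrative names:

#include <stdint.h>
#include <limits.h>
#include <stdio.h>

static uint32_t
old_rlim(uint64_t val)
{
	/* The old 32-bit RLIM_INFINITY was 0x7fffffff; clamp anything above. */
	return (val > INT_MAX ? (uint32_t)INT_MAX : (uint32_t)val);
}

int
main(void)
{
	printf("%u %u\n", old_rlim(UINT64_MAX), old_rlim(8192));	/* 2147483647 8192 */
	return (0);
}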
+ */
+long
+lx_getrlimit(int resource, lx_rlimit_t *rlp)
+{
+ int rv;
+ lx_rlimit_t rl;
+ uint64_t rlim_cur, rlim_max;
+
+ rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (rlim_cur == LX_RLIM64_INFINITY)
+ rl.rlim_cur = LX_RLIM_INFINITY_N;
+ else if (rlim_cur > LX_RLIM_INFINITY_N)
+ rl.rlim_cur = LX_RLIM_INFINITY_N;
+ else
+ rl.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max == LX_RLIM64_INFINITY)
+ rl.rlim_max = LX_RLIM_INFINITY_N;
+ else if (rlim_max > LX_RLIM_INFINITY_N)
+ rl.rlim_max = LX_RLIM_INFINITY_N;
+ else
+ rl.rlim_max = (ulong_t)rlim_max;
+
+ if (copyout(&rl, rlp, sizeof (rl)) != 0)
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ lx_rlimit32_t rl32;
+
+ if (rlim_cur > UINT_MAX)
+ rl.rlim_cur = UINT_MAX;
+ else
+ rl.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max > UINT_MAX)
+ rl.rlim_max = UINT_MAX;
+ else
+ rl.rlim_max = (ulong_t)rlim_max;
+
+ rl32.rlim_cur = rl.rlim_cur;
+ rl32.rlim_max = rl.rlim_max;
+
+ if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+ return (set_errno(EFAULT));
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * This is the 'old' getrlimit, variously called getrlimit or old_getrlimit
+ * in Linux headers and code. The only difference between this and the new
+ * getrlimit (variously called getrlimit or ugetrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version.
+ *
+ * This is only used for 32-bit code.
+ */
+long
+lx_oldgetrlimit(int resource, lx_rlimit_t *rlp)
+{
+ int rv;
+ lx_rlimit32_t rl32;
+ uint64_t rlim_cur, rlim_max;
+
+ rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+ if (rv != 0)
+ return (set_errno(rv));
+
+ if (rlim_cur > INT_MAX)
+ rl32.rlim_cur = INT_MAX;
+ else
+ rl32.rlim_cur = (ulong_t)rlim_cur;
+
+ if (rlim_max > INT_MAX)
+ rl32.rlim_max = INT_MAX;
+ else
+ rl32.rlim_max = (ulong_t)rlim_max;
+
+ if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+lx_set_rctl(char *nm, struct rlimit64 *rlp64)
+{
+ int err;
+ rctl_hndl_t hndl;
+ rctl_alloc_gp_t *gp;
+
+ gp = rctl_rlimit_set_prealloc(1);
+
+ mutex_enter(&curproc->p_lock);
+
+ hndl = rctl_hndl_lookup(nm);
+
+ /*
+ * We're not supposed to do this but since we want all our rctls to
+ * behave like rlimits, we take advantage of this function to set up
+ * this way.
+ */ + err = rctl_rlimit_set(hndl, curproc, rlp64, gp, RCTL_LOCAL_DENY, 0, + CRED()); + + mutex_exit(&curproc->p_lock); + + rctl_prealloc_destroy(gp); + + return (err); +} + +static int +lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int err; + int resource; + rctl_alloc_gp_t *gp; + struct rlimit64 rl64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max; + break; + + case LX_RLIMIT_NICE: + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTTIME: + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RSS: + /* + * zone.max-physical-memory + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_NPROC: + /* + * zone.max-lwps + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_MEMLOCK: + /* + * We allow setting to unlimited (LX_RLIM_INFINITY_N). The zone + * limit will always apply. + */ + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + err = lx_set_rctl("process.max-locked-memory", &rl64); + if (err != 0) + return (set_errno(err)); + break; + + case LX_RLIMIT_SIGPENDING: + /* + * On Ubuntu at least, the login and sshd processes expect to + * set this limit to 16k and login will fail if this fails. On + * illumos we have a system limit of 8k and normally the + * privileged limit is 512. We simply pretend this works to + * allow login to work. + */ + if (rlim_max > 8192) + return (0); + + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64)) + != 0) + return (set_errno(err)); + break; + + case LX_RLIMIT_MSGQUEUE: + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0) + return (set_errno(err)); + break; + + default: + resource = l_to_r[lx_resource]; + + /* + * Linux limits the max number of open files to 1m and there is + * a test for this. 
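The 1m cap mentioned above corresponds to Linux's fs.nr_open sysctl (1048576 by default): attempts to push the RLIMIT_NOFILE hard limit past it fail with EPERM, which is what a user-space probe like the following would observe on Linux (the limit values chosen here are arbitrary):

#include <sys/resource.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

int
main(void)
{
	struct rlimit rl = { .rlim_cur = 2048, .rlim_max = 2 * 1024 * 1024 };

	if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
		printf("setrlimit: %s\n", strerror(errno));	/* EPERM expected */
	return (0);
}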
+ */ + if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024)) + return (EPERM); + + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + gp = rctl_rlimit_set_prealloc(1); + + mutex_enter(&curproc->p_lock); + err = rctl_rlimit_set(rctlproc_legacy[resource], curproc, + &rl64, gp, rctlproc_flags[resource], + rctlproc_signals[resource], CRED()); + mutex_exit(&curproc->p_lock); + + rctl_prealloc_destroy(gp); + if (err != 0) + return (set_errno(err)); + break; + } + + return (0); +} + +long +lx_setrlimit(int resource, lx_rlimit_t *rlp) +{ + int rv; + lx_rlimit_t rl; + uint64_t rlim_cur, rlim_max; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(rlp, &rl, sizeof (rl)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + lx_rlimit32_t rl32; + + if (copyin(rlp, &rl32, sizeof (rl32)) != 0) + return (set_errno(EFAULT)); + + rl.rlim_cur = rl32.rlim_cur; + rl.rlim_max = rl32.rlim_max; + } +#endif + + if ((rl.rlim_max != LX_RLIM_INFINITY_N && + rl.rlim_cur == LX_RLIM_INFINITY_N) || + rl.rlim_cur > rl.rlim_max) + return (set_errno(EINVAL)); + + if (rl.rlim_cur == LX_RLIM_INFINITY_N) + rlim_cur = LX_RLIM64_INFINITY; + else + rlim_cur = rl.rlim_cur; + + if (rl.rlim_max == LX_RLIM_INFINITY_N) + rlim_max = LX_RLIM64_INFINITY; + else + rlim_max = rl.rlim_max; + + rv = lx_setrlimit_common(resource, rlim_cur, rlim_max); + if (rv != 0) + return (set_errno(rv)); + return (0); +} + +/* + * From the man page: + * The Linux-specific prlimit() system call combines and extends the + * functionality of setrlimit() and getrlimit(). It can be used to both set + * and get the resource limits of an arbitrary process. + * + * If pid is 0, then the call applies to the calling process. + */ +long +lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp) +{ + int rv; + lx_rlimit64_t nrl, orl; + + if (pid != 0) { + /* XXX TBD if needed */ + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "setting prlimit %d for another process\n", resource); + lx_unsupported(buf); + return (ENOTSUP); + } + + if (orlp != NULL) { + /* we first get the current limits */ + rv = lx_getrlimit_common(resource, &orl.rlim_cur, + &orl.rlim_max); + if (rv != 0) + return (set_errno(rv)); + } + + if (nrlp != NULL) { + if (copyin(nrlp, &nrl, sizeof (nrl)) != 0) + return (set_errno(EFAULT)); + + if ((nrl.rlim_max != LX_RLIM64_INFINITY && + nrl.rlim_cur == LX_RLIM64_INFINITY) || + nrl.rlim_cur > nrl.rlim_max) + return (set_errno(EINVAL)); + + rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max); + if (rv != 0) + return (set_errno(rv)); + } + + if (orlp != NULL) { + /* now return the original limits, if necessary */ + if (copyout(&orl, orlp, sizeof (orl)) != 0) + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c new file mode 100644 index 0000000000..34aafcaf5d --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c @@ -0,0 +1,956 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/nbmlock.h> +#include <sys/limits.h> + +/* uts/common/syscall/rw.c */ +extern size_t copyout_max_cached; + + +/* Common routines */ + +static int +lx_iovec_copyin(void *uiovp, int iovcnt, iovec_t *kiovp, ssize_t *count) +{ +#ifdef _SYSCALL32_IMPL + /* + * 32-bit callers need to have their iovec expanded, while ensuring + * that they can't move more than 2Gbytes of data in a single call. + */ + if (get_udatamodel() == DATAMODEL_ILP32) { + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len = 0; + ssize32_t total32 = 0; + int i; + + if (iovcnt > IOV_MAX_STACK) { + aiov32len = iovcnt * sizeof (iovec32_t); + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + } + + if (copyin(uiovp, aiov32, iovcnt * sizeof (iovec32_t))) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32 = aiov32[i].iov_len; + total32 += iovlen32; + if (iovlen32 < 0 || total32 < 0) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EINVAL); + } + kiovp[i].iov_len = iovlen32; + kiovp[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT32) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + } + *count = total32; + + if (aiov32len != 0) + kmem_free(aiov32, aiov32len); + } else +#endif + { + ssize_t total = 0; + int i; + + if (copyin(uiovp, kiovp, iovcnt * sizeof (iovec_t))) + return (EFAULT); + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = kiovp[i].iov_len; + total += iovlen; + if (iovlen < 0 || total < 0) { + return (EINVAL); + } + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT) { + return (EFAULT); + } + } + *count = total; + } + return (0); +} + +int +lx_read_common(file_t *fp, uio_t *uiop, size_t *nread, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 0, ioflag; + ssize_t count = uiop->uio_resid; + size_t rcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + if (fp->f_vnode->v_type == VDIR) { + return (EISDIR); + } + if (positioned && + (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) { + return (ESPIPE); + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, uiop->uio_offset, count, svmand, + NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + /* + * For non-positioned reads, recheck offset/count validity inside + * VOP_WRLOCK to prevent filesize from changing during validation. + */ + if (!positioned) { + u_offset_t uoffset = (u_offset_t)(ulong_t)fp->f_offset; + + if ((vp->v_type == VREG) && (uoffset >= OFFSET_MAX(fp))) { + struct vattr va; + + va.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL); + VOP_RWUNLOCK(vp, rwflag, NULL); + if (error != 0) + goto out; + /* We have to return EOF if fileoff is >= file size. 
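lx_iovec_copyin() above rejects vectors whose individual lengths or running total would overflow a signed 32-bit count for ILP32 callers, capping a single call at 2 GB. An equivalent stand-alone formulation of that check, done in 64-bit arithmetic to avoid relying on wrap-around (names are illustrative):

#include <stdint.h>
#include <stdio.h>

static int
total_ok(const uint32_t *lens, int cnt, int64_t *totalp)
{
	int64_t total = 0;
	int i;

	for (i = 0; i < cnt; i++) {
		if (lens[i] > INT32_MAX)
			return (0);	/* single iov_len too large */
		total += lens[i];
		if (total > INT32_MAX)
			return (0);	/* total would exceed 2GB */
	}
	*totalp = total;
	return (1);
}

int
main(void)
{
	uint32_t lens[2] = { 0x7fffffff, 1 };
	int64_t total;

	printf("%s\n", total_ok(lens, 2, &total) ? "ok" : "rejected");	/* rejected */
	return (0);
}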
*/ + if (uoffset >= va.va_size) + goto out; + /* + * File is greater than or equal to maxoff and + * therefore we return EOVERFLOW. + */ + error = EOVERFLOW; + goto out; + } + if ((vp->v_type == VREG) && + (uoffset + count > OFFSET_MAX(fp))) { + count = (ssize_t)(OFFSET_MAX(fp) - uoffset); + uiop->uio_resid = count; + } + uiop->uio_offset = uoffset; + } + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, uiop, ioflag, fp->f_cred, NULL); + rcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)rcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)rcount; + /* Store offset for non-positioned reads */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = rcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX */ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nread = rcount; + return (error); +} + +int +lx_write_common(file_t *fp, uio_t *uiop, size_t *nwrite, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 1, ioflag; + ssize_t count = uiop->uio_resid; + size_t wcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + if (positioned && + (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) { + return (ESPIPE); + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset, count, + svmand, NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + if (!positioned) { + /* + * For non-positioned writes, the value of fp->f_offset is + * re-queried while inside VOP_RWLOCK. This ensures that other + * writes which alter the filesize will be taken into account. + */ + uiop->uio_loffset = fp->f_offset; + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } else { + /* + * In a senseless departure from POSIX, positioned write calls + * on Linux do _not_ ignore the O_APPEND flag. 
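The departure noted above is easy to observe from user space on Linux: with O_APPEND set, pwrite(2) ignores the supplied offset and appends, which is why the emulation deliberately keeps FAPPEND in ioflag for positioned writes. A small demonstration (the temp-file path is arbitrary):

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int fd = open("/tmp/pwrite-append", O_RDWR | O_CREAT | O_TRUNC | O_APPEND,
	    0600);

	if (fd < 0)
		return (1);
	(void) write(fd, "aaaa", 4);
	(void) pwrite(fd, "bb", 2, 0);	/* on Linux this lands at offset 4, not 0 */
	printf("size = %lld\n", (long long)lseek(fd, 0, SEEK_END));	/* 6 */
	(void) close(fd);
	return (0);
}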
+ */ + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } + if (vp->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)(ulong_t)uiop->uio_loffset; + + if (fileoff >= curproc->p_fsz_ctl) { + VOP_RWUNLOCK(vp, rwflag, NULL); + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + error = EFBIG; + goto out; + } + if (fileoff >= OFFSET_MAX(fp)) { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EFBIG; + goto out; + } + if (fileoff + count > OFFSET_MAX(fp)) { + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + uiop->uio_resid = count; + } + } + + error = VOP_WRITE(vp, uiop, ioflag, fp->f_cred, NULL); + wcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)wcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)wcount; + + /* Store offset for non-positioned writes */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = wcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX */ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nwrite = wcount; + return (error); +} + +/* + * The Linux routines for reading and writing data from file descriptors behave + * differently from their SunOS counterparts in a few key ways: + * + * - Passing an iovcnt of 0 to the vectored functions results in an error on + * SunOS, but on Linux it yields return value of 0. + * + * - If any data is successfully read or written, Linux will return a success. + * This is unlike SunOS which would return an error code for the entire + * operation in cases where vectors had gone unprocessed. + * + * - Breaking from POSIX, Linux positioned writes (pwrite/pwritev) on Linux + * will obey the O_APPEND flag if it is set on the descriptor. 
+ */ + +ssize_t +lx_read(int fdes, void *cbuf, size_t ccount) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return ((ssize_t)nread); +} + +ssize_t +lx_write(int fdes, void *cbuf, size_t ccount) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return (nwrite); +} + +ssize_t +lx_readv(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error != 0) { + if (nread != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + 
releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nread); +} + +ssize_t +lx_writev(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nwrite = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error != 0) { + if (nwrite != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_pread_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write + * a file beyond the maximum offset representable in + * an off_t structure. 
+ */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff); + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_read_common(fp, &auio, &nread, B_TRUE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + if (error) { + return (set_errno(error)); + } + return ((ssize_t)nread); + +} + +ssize_t +lx_pread(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + file_t *fp; + size_t nread; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + nread = lx_pread_fp(fp, cbuf, ccount, offset); + releasef(fdes); + return (nread); +} + +ssize_t +lx_pwrite_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if (((fflag = fp->f_flag) & (FWRITE)) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * return EINVAL for offsets that cannot be + * represented in an off_t. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwrite to cause file sizes to exceed maxoffset. 
+ */ + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_write_common(fp, &auio, &nwrite, B_TRUE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + if (error) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_pwrite(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + file_t *fp; + size_t nwrite; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + nwrite = lx_pwrite_fp(fp, cbuf, ccount, offset); + releasef(fdes); + return (nwrite); +} + +ssize_t +lx_pread32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pread(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_pwrite32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pwrite(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_preadv(int fdes, void *iovp, int iovcnt, off64_t offset) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write a file beyond + * the maximum offset representable in an off_t structure. 
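The *32 entry points above receive the 64-bit file offset split across two 32-bit arguments, which LX_32TO64() presumably reassembles as the low word plus the high word shifted up. A quick stand-alone equivalent (combine32 is an illustrative name):

#include <stdint.h>
#include <stdio.h>

static uint64_t
combine32(uint32_t lo, uint32_t hi)
{
	return ((uint64_t)lo | ((uint64_t)hi << 32));
}

int
main(void)
{
	/* hi = 0x12, lo = 0x3456789a -> 0x123456789a */
	printf("0x%llx\n", (unsigned long long)combine32(0x3456789aU, 0x12U));
	return (0);
}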
+ */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff); + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_TRUE); + + if (error != 0) { + if (nread != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nread); +} + +ssize_t +lx_pwritev(int fdes, void *iovp, int iovcnt, off64_t offset) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nwrite = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwritev to cause file sizes to exceed maxoffset. + */ + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + /* + * Limit offset such that we don't read or write a file beyond + * the maximum offset representable in an off_t structure. 
+ */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_TRUE); + + if (error != 0) { + if (nwrite != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_preadv32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, uint32_t off_hi) +{ + return (lx_preadv(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_pwritev32(int fdes, void *iovp, int iovcnt, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pwritev(fdes, iovp, iovcnt, LX_32TO64(off_lo, off_hi))); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c new file mode 100644 index 0000000000..6d4904a5fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c @@ -0,0 +1,1161 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Emulation for scheduling related syscalls. + * + * Under a typical zone configuration the zones will always be running under + * FSS so that no single zone can monopolize the system. Zones do not have the + * privilege to leave FSS (for the obvious reason that this would violate the + * global zone resource management policies). Thus, for the sched_* syscalls + * we typically will never be able to emulate those using our other native + * scheduling classes. Under this common case we simply track the scheduler + * settings on the lwp's lx brand structure and we also try to adjust the + * lwp priority within the valid range to approximate the intended effect. 
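+ *
+ * The lx_emul_pri_map table below captures that approximation: SCHED_FIFO,
+ * SCHED_RR and SCHED_DEADLINE threads are nudged toward the top of the
+ * user priority range, SCHED_BATCH and SCHED_IDLE toward the bottom, and
+ * SCHED_OTHER is left at the default of 0.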
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/processor.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/procset.h> +#include <sys/priocntl.h> + +typedef int l_pid_t; + +extern int yield(); +extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); + +static int lx_sched_setprocset(procset_t *, l_pid_t); +static long lx_do_priocntlsys(int, procset_t *, void *); + +#define BITS_PER_BYTE 8 + +/* + * Linux scheduler policies. + */ +#define LX_SCHED_OTHER 0 +#define LX_SCHED_FIFO 1 +#define LX_SCHED_RR 2 +#define LX_SCHED_BATCH 3 +#define LX_SCHED_IDLE 5 +#define LX_SCHED_DEADLINE 6 + +/* + * Linux scheduler priority ranges. + */ +#define LX_SCHED_PRIORITY_MIN_OTHER 0 +#define LX_SCHED_PRIORITY_MAX_OTHER 0 +#define LX_SCHED_PRIORITY_MIN_RRFIFO 1 +#define LX_SCHED_PRIORITY_MAX_RRFIFO 99 + +#define MAXPRI 60 /* See FSS_MAXUPRI */ + +/* + * When emulating scheduling priorities (e.g. under FSS) we'll do the best we + * can by adjusting the thread's priority within our range. + */ +static int lx_emul_pri_map[] = { + 0, /* LX_SCHED_OTHER */ + MAXPRI, /* LX_SCHED_FIFO */ + MAXPRI - 1, /* LX_SCHED_RR */ + -MAXPRI + 1, /* LX_SCHED_BATCH */ + 0, /* UNUSED */ + -MAXPRI, /* LX_SCHED_IDLE */ + MAXPRI /* LX_SCHED_DEADLINE */ +}; + +/* + * Determine if we should emulate the sched_* syscalls. A zone is almost always + * going to be running under FSS in any kind of production configuration, and + * FSS is currently the only class which zone processes won't have the privilege + * to leave. Instead of checking for FSS explicitly, we generalize our check + * using CL_CANEXIT. + */ +#define EMUL_SCHED() (CL_CANEXIT(curthread, CRED()) != 0) + +struct lx_sched_param { + int lx_sched_prio; +}; + +typedef struct lx_sched_attr { + uint32_t lx_size; + + uint32_t lx_sched_policy; + uint64_t lx_sched_flags; + + /* For LX_SCHED_OTHER or LX_SCHED_BATCH */ + int lx_sched_nice; + + /* For LX_SCHED_FIFO or LX_SCHED_RR */ + uint32_t lx_sched_priority; + + /* For LX_SCHED_DEADLINE */ + uint64_t lx_sched_runtime; + uint64_t lx_sched_deadline; + uint64_t lx_sched_period; +} lx_sched_attr_t; + +long +lx_sched_yield(void) +{ + yield(); + + return (0); +} + +static void +ltos_cpuset(lx_affmask_t *lmask, cpuset_t *smask) +{ + /* NOTE: fix this code if NCPU is ever made > LX_NCPU */ + + cpuset_zero(smask); + for (int i = 0; i < NCPU; i++) { + if (BT_TEST(*lmask, i)) { + cpuset_add(smask, i); + } + } +} + +static void +stol_cpuset(cpuset_t *smask, lx_affmask_t *lmask) +{ + /* NOTE: fix this code if NCPU is ever made > LX_NCPU */ + + bzero(lmask, sizeof (*lmask)); + for (int i = 0; i < NCPU; i++) { + if (cpu_in_set(smask, i)) { + BT_SET(*lmask, i); + } + } +} + +/* + * Find and lock a process for lx_sched_* operations. + * Sets 'pp' and 'tp' on success, with P_PR_LOCK set and p_lock held. + * The target process must be branded. 
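+ * On success the process is returned with p_lock held and P_PR_LOCK set;
+ * the caller is expected to drop both via sprunlock() once it is done
+ * operating on the thread.  On failure no locks are held.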
+ */ +static int +lx_sched_pidlock(l_pid_t pid, proc_t **pp, kthread_t **tp, boolean_t is_write) +{ + proc_t *p; + kthread_t *t = NULL; + int err = 0; + + if (pid < 0) { + return (EINVAL); + } + if (pid == 0) { + p = curproc; + ASSERT(PROC_IS_BRANDED(p)); + mutex_enter(&p->p_lock); + sprlock_proc(p); + + *tp = curthread; + *pp = p; + return (0); + } + + if (lx_lpid_lock((pid_t)pid, curzone, LXP_PRLOCK, &p, &t) != 0) { + return (ESRCH); + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + if (!(PROC_IS_BRANDED(p))) { + sprunlock(p); + return (EPERM); + } + + if (is_write) { + cred_t *cr = CRED(); + + /* + * To perform a sched_* operation on a thread outside of the + * current process, either the euid/egid of the target must + * match, or the calling process must hold CAP_SYS_NICE. + * (PRIV_PROC_PRIOUP maps to CAP_SYS_NICE) + */ + err = 0; + if (secpolicy_raisepriority(cr) != 0) { + err = 0; + mutex_exit(&p->p_lock); + mutex_enter(&p->p_crlock); + if (crgetuid(cr) != crgetuid(p->p_cred) || + crgetgid(cr) != crgetgid(p->p_cred)) { + err = EPERM; + } + mutex_exit(&p->p_crlock); + mutex_enter(&p->p_lock); + if (err != 0) { + sprunlock(p); + return (err); + } + } + } + *pp = p; + *tp = t; + ASSERT(MUTEX_HELD(&p->p_lock)); + return (0); +} + +long +lx_sched_getaffinity(l_pid_t pid, unsigned int len, void *maskp) +{ + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + int err; + unsigned int pmin, pmax, compare_size; + lx_affmask_t lmask; + cpuset_t *smask; + + /* + * The length boundary requirement is to match Linux's behavior. + */ + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + compare_size = sizeof (uint32_t); + break; + default: + compare_size = sizeof (ulong_t); + break; + } + if ((len & (compare_size - 1)) != 0) { + return (set_errno(EINVAL)); + } + + smask = cpuset_alloc(KM_SLEEP); + if ((err = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) { + cpuset_free(smask); + return (set_errno(err)); + } + + mutex_exit(&p->p_lock); + mutex_enter(&cpu_lock); + mutex_enter(&p->p_lock); + /* + * Grab the existing affinity mask and constrain it by the current set + * of active CPUs (which may have changed since it was assigned. + */ + lwpd = ttolxlwp(tp); + cpuset_or(smask, lwpd->br_affinitymask); + cpuset_and(smask, &cpu_active_set); + sprunlock(p); + mutex_exit(&cpu_lock); + + cpuset_bounds(smask, &pmin, &pmax); + stol_cpuset(smask, &lmask); + cpuset_free(smask); + + /* + * It is out of convenience that this check is performed so late. If + * the need arises, it could be altered to be done earlier in order to + * match Linux error ordering. + */ + if (pmax >= (len * BITS_PER_BYTE)) { + return (set_errno(EINVAL)); + } + + len = MIN(len, sizeof (lx_affmask_t)); + if (copyout(&lmask, maskp, len) != 0) { + return (set_errno(EFAULT)); + } + return (len); +} + +long +lx_sched_setaffinity(l_pid_t pid, unsigned int len, void *maskp) +{ + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + int err; + unsigned int pmin, pmax; + lx_affmask_t lmask; + cpuset_t *smask; + + if (pid < 0) { + return (set_errno(EINVAL)); + } + + if (len < sizeof (lmask)) { + bzero(&lmask, sizeof (lmask)); + } else if (len > sizeof (lmask)) { + len = sizeof (lmask); + } + if (copyin(maskp, &lmask, len) != 0) { + return (set_errno(EFAULT)); + } + smask = cpuset_alloc(KM_SLEEP); + ltos_cpuset(&lmask, smask); + if ((err = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) { + cpuset_free(smask); + return (set_errno(err)); + } + + /* + * Constrain the mask to currently active CPUs. 
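+ *
+ * For illustration, a Linux caller typically builds the mask with the
+ * glibc CPU_ZERO()/CPU_SET() macros (declared in <sched.h> under
+ * _GNU_SOURCE) before making the syscall:
+ *
+ *	cpu_set_t set;
+ *	CPU_ZERO(&set);
+ *	CPU_SET(1, &set);
+ *	(void) sched_setaffinity(0, sizeof (set), &set);
+ *
+ * If the requested mask has no overlap with the active CPU set we fail
+ * with EINVAL below, matching the Linux behavior for masks that contain
+ * no online processors.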
+ */ + mutex_exit(&p->p_lock); + mutex_enter(&cpu_lock); + mutex_enter(&p->p_lock); + lwpd = ttolxlwp(tp); + + cpuset_and(smask, &cpu_active_set); + if (cpuset_isnull(smask)) { + err = EINVAL; + goto out; + } + if (cpuset_isequal(lwpd->br_affinitymask, smask)) { + err = 0; + goto out; + } + + /* + * If one (and only one) CPU is selected in the affinity mask, bind the + * thread to that CPU. + */ + cpuset_bounds(smask, &pmin, &pmax); + VERIFY(pmin != CPUSET_NOTINSET); + if (pmin == pmax) { + processorid_t obind; + + (void) cpu_bind_thread(tp, pmin, &obind, &err); + if (err != 0) { + goto out; + } + } else { + /* + * If the thread transitions away from a single-CPU mask, it + * should be unbound from that processor. + */ + cpuset_bounds(lwpd->br_affinitymask, &pmin, &pmax); + if (pmin == pmax) { + processorid_t obind; + (void) cpu_bind_thread(tp, PBIND_NONE, &obind, &err); + } + } + cpuset_zero(lwpd->br_affinitymask); + cpuset_or(lwpd->br_affinitymask, smask); + err = 0; + +out: + mutex_exit(&cpu_lock); + sprunlock(p); + cpuset_free(smask); + if (err != 0) { + return (set_errno(err)); + } + return (0); +} + +void +lx_affinity_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + proc_t *pp = lwptoproc(srclwp); + lx_lwp_data_t *slwpd = lwptolxlwp(srclwp); + lx_lwp_data_t *dlwpd = lwptolxlwp(dstlwp); + + /* + * Copy over the affinity mask. This could be enhanced in the future + * to perform single-CPU binding like sched_setaffinity. + */ + mutex_enter(&pp->p_lock); + cpuset_zero(dlwpd->br_affinitymask); + cpuset_or(dlwpd->br_affinitymask, slwpd->br_affinitymask); + mutex_exit(&pp->p_lock); +} + +long +lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int prio, maxupri; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + int incr; + lx_lwp_data_t *lwpd; + + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + if (prio != LX_SCHED_PRIORITY_MIN_OTHER) + return (set_errno(EINVAL)); + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO || + prio > LX_SCHED_PRIORITY_MAX_RRFIFO) + return (set_errno(EINVAL)); + break; + default: + return (set_errno(EINVAL)); + } + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + lwpd = lwptolxlwp(ttolwp(tp)); + if (lwpd->br_schd_class == LX_SCHED_IDLE && + policy != LX_SCHED_IDLE && crgetuid(CRED()) != 0) { + + sprunlock(p); + return (set_errno(EPERM)); + } + + lwpd->br_schd_class = policy; + lwpd->br_schd_pri = prio; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + CL_DOPRIO(tp, CRED(), incr, &rv); + + sprunlock(p); + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* get the class id */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* get the current policy */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (policy < 0) { + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + } else { + return (set_errno(EINVAL)); + } + } + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getscheduler(l_pid_t pid) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int policy; + int rv; + + if (pid < 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + sprunlock(p); + + return (policy); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + } else { + policy = set_errno(EINVAL); + } + + return (policy); +} + +long +lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int policy; + int prio, maxupri; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + int incr; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + if (prio != LX_SCHED_PRIORITY_MIN_OTHER) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO || + prio > LX_SCHED_PRIORITY_MAX_RRFIFO) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + default: + /* this shouldn't happen */ + ASSERT(0); + sprunlock(p); + return (set_errno(EINVAL)); + } + + lwptolxlwp(ttolwp(tp))->br_schd_pri = prio; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + CL_DOPRIO(tp, CRED(), incr, &rv); + sprunlock(p); + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_sched_param local_param; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + tsinfo_t *tsi; + int prio, scale; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + local_param.lx_sched_prio = lwptolxlwp(ttolwp(tp))->br_schd_pri; + sprunlock(p); + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + bzero(&local_param, sizeof (local_param)); + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + /* + * I don't know if we need to do this, coz it can't be + * changed from zero anyway..... 
+ */ + tsi = (tsinfo_t *)pcinfo.pc_clinfo; + prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + scale = tsi->ts_maxupri; + if (scale == 0) + local_param.lx_sched_prio = 0; + else + local_param.lx_sched_prio = -(prio * 20) / scale; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + local_param.lx_sched_prio = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else { + rv = set_errno(EINVAL); + } + + if (rv == 0) + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (rv); +} + +long +lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival) +{ + klwp_t *lwp = ttolwp(curthread); + struct timespec interval; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int rv; + + if (pid < 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + int policy; + proc_t *p; + kthread_t *tp = NULL; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + sprunlock(p); + + interval.tv_sec = 0; + if (policy == LX_SCHED_RR) { + /* Use a made-up value similar to Linux */ + interval.tv_nsec = 100000000; + } else { + interval.tv_nsec = 0; + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + /* + * A timespec may overflow for 32-bit but EOVERFLOW + * is not documented as an acceptable error for + * sched_rr_get_interval. Such an occurance would be + * exceptionally weird for the RR interval. + */ + TIMESPEC_TO_TIMESPEC32(&t32, &interval); + + if (copyout(&t32, ival, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + } + else +#endif + { + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + } + + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0); + bzero(&pcinfo, sizeof (pcinfo)); + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * Contrary to what the man page says, you don't have to be in RR to + * get this interval. + */ + if (((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) { + interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs; + interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + /* + * Like above, the 32-bit EOVERFLOW check is not + * appropriate here. + */ + TIMESPEC_TO_TIMESPEC32(&t32, &interval); + + if (copyout(&t32, ival, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + } + else +#endif + { + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + } + + return (0); + } + + return (set_errno(EINVAL)); +} + +long +lx_sched_get_priority_min(uintptr_t policy) +{ + /* + * Linux scheduling priorities are not alterable, so there is no + * illumos translation necessary. 
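+ * On Linux, sched_get_priority_min(2) and sched_get_priority_max(2)
+ * report 1 and 99 for SCHED_FIFO and SCHED_RR, and 0 for every other
+ * policy, which is exactly what the constants returned below encode.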
+ */ + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + return (LX_SCHED_PRIORITY_MIN_RRFIFO); + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + return (LX_SCHED_PRIORITY_MIN_OTHER); + default: + break; + } + return (set_errno(EINVAL)); +} + +long +lx_sched_get_priority_max(uintptr_t policy) +{ + /* + * Linux scheduling priorities are not alterable, so there is no + * illumos translation necessary. + */ + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + return (LX_SCHED_PRIORITY_MAX_RRFIFO); + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + return (LX_SCHED_PRIORITY_MAX_OTHER); + default: + break; + } + return (set_errno(EINVAL)); +} + +long +lx_sched_setattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t flags) +{ + int rv; + uint32_t lx_size; + lx_sched_attr_t local_attr; + uint64_t flg; + + if (pid < 0 || attr == NULL || flags != 0) + return (set_errno(EINVAL)); + + if (copyin(attr, &lx_size, sizeof (lx_size))) + return (set_errno(EFAULT)); + + if (lx_size > sizeof (local_attr)) + return (set_errno(E2BIG)); + + bzero(&local_attr, sizeof (local_attr)); + if (copyin(attr, &local_attr, lx_size)) + return (set_errno(EFAULT)); + + flg = local_attr.lx_sched_flags; + if ((flg & ~LX_SCHED_FLAG_RESET_ON_FORK) != 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + int policy; + proc_t *p; + kthread_t *tp = NULL; + int incr; + lx_lwp_data_t *lwpd; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + policy = local_attr.lx_sched_policy; + + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + if (local_attr.lx_sched_priority < + LX_SCHED_PRIORITY_MIN_RRFIFO || + local_attr.lx_sched_priority > + LX_SCHED_PRIORITY_MAX_RRFIFO) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + + case LX_SCHED_DEADLINE: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + break; + default: + sprunlock(p); + return (set_errno(EINVAL)); + } + + lwpd = lwptolxlwp(ttolwp(tp)); + lwpd->br_schd_class = policy; + lwpd->br_schd_flags = flg; + lwpd->br_schd_pri = local_attr.lx_sched_priority; + + lwpd->br_schd_runtime = local_attr.lx_sched_runtime; + lwpd->br_schd_deadline = local_attr.lx_sched_deadline; + lwpd->br_schd_period = local_attr.lx_sched_period; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + CL_DOPRIO(tp, CRED(), incr, &rv); + sprunlock(p); + return (0); + } + + /* Currently not supported under other classes */ + return (set_errno(ENOSYS)); +} + +long +lx_sched_getattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t size, + uint32_t flags) +{ + lx_sched_attr_t local_attr; + int rv; + + if (pid < 0 || attr == NULL || flags != 0 || size < sizeof (local_attr)) + return (set_errno(EINVAL)); + + bzero(&local_attr, sizeof (local_attr)); + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + lwpd = lwptolxlwp(ttolwp(tp)); + local_attr.lx_sched_policy = lwpd->br_schd_class; + local_attr.lx_sched_priority = lwpd->br_schd_pri; + local_attr.lx_sched_flags = lwpd->br_schd_flags; + + local_attr.lx_sched_runtime = lwpd->br_schd_runtime; + local_attr.lx_sched_deadline = lwpd->br_schd_deadline; + local_attr.lx_sched_period = lwpd->br_schd_period; + + sprunlock(p); + + local_attr.lx_size = sizeof (lx_sched_attr_t); + + if (copyout(&local_attr, attr, sizeof (local_attr))) + return (set_errno(EFAULT)); + + return (0); + } + + /* Currently not supported under other classes */ + return (set_errno(ENOSYS)); +} + +static int +lx_sched_setprocset(procset_t *procset, l_pid_t pid) +{ + id_t lid, rid; + idtype_t lidtype, ridtype; + + /* + * define the target lwp + */ + if (pid == 0) + pid = curproc->p_pid; + + if (lx_lpid_to_spair(pid, &pid, &lid) < 0) + return (set_errno(ESRCH)); + rid = 0; + ridtype = P_ALL; + lidtype = P_LWPID; + + setprocset(procset, POP_AND, lidtype, lid, ridtype, rid); + + return (0); +} + +static long +lx_do_priocntlsys(int cmd, procset_t *procset, void *arg) +{ + return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0, + UIO_SYSSPACE)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_socket.c b/usr/src/uts/common/brand/lx/syscall/lx_socket.c new file mode 100644 index 0000000000..a95e220ea2 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_socket.c @@ -0,0 +1,4537 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/sockio.h> +#include <sys/thread.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kmem.h> +#include <sys/un.h> +#include <sys/sunddi.h> +#include <sys/cred.h> +#include <sys/ucred.h> +#include <sys/model.h> +#include <sys/brand.h> +#include <sys/vmsystm.h> +#include <sys/limits.h> +#include <sys/fcntl.h> +#include <sys/sysmacros.h> +#include <netpacket/packet.h> +#include <sockcommon.h> +#include <socktpi_impl.h> +#include <netinet/udp.h> +#include <sys/sdt.h> +#include <netinet/tcp.h> +#include <netinet/igmp.h> +#include <netinet/icmp6.h> +#include <inet/tcp_impl.h> +#include <lx_errno.h> + +#include <sys/lx_brand.h> +#include <sys/lx_socket.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> + +/* From uts/common/fs/sockfs/socksyscalls.c */ +extern int listen(int, int, int); +extern int shutdown(int, int, int); + +typedef struct lx_ucred { + pid_t lxu_pid; + lx_uid_t lxu_uid; + lx_gid_t lxu_gid; +} lx_ucred_t; + +typedef struct lx_socket_aux_data +{ + kmutex_t lxsad_lock; + enum lxsad_status_t { + LXSS_NONE = 0, + LXSS_CONNECTING, + LXSS_CONNECTED + } lxsad_status; + uint_t lxsad_flags; +} lx_socket_aux_data_t; + +#define LX_SS_MAXSIZE 128 + +typedef struct lx_sockaddr_storage { + unsigned short lxss_family; + char lxdata[LX_SS_MAXSIZE - sizeof (unsigned short)]; +} lx_sockaddr_storage_t; + +typedef struct lx_group_req { + uint32_t lxgr_interface; +#ifdef _LP64 + /* On 64-bit linux kernels, gr_interface is padded by 4 bytes. */ + uint32_t _lxgr_pad; +#endif + lx_sockaddr_storage_t lxgr_group; +} lx_group_req_t; + +#if defined(_SYSCALL32_IMPL) + +typedef struct lx_group_req32 { + uint32_t lxgr_interface; + lx_sockaddr_storage_t lxgr_group; +} lx_group_req32_t; + +#endif /* defined(_SYSCALL32_IMPL) */ + +/* lxsad_flags */ +#define LXSAD_FL_STRCRED 0x1 +#define LXSAD_FL_EMULSEQPKT 0x2 + +static lx_socket_aux_data_t *lx_sad_acquire(vnode_t *); + +/* VSD key for lx-specific socket information */ +static uint_t lx_socket_vsd = 0; + +/* Convenience enum to enforce translation direction */ +typedef enum lx_xlate_dir { + SUNOS_TO_LX, + LX_TO_SUNOS +} lx_xlate_dir_t; + +/* enum for getpeername/getsockname handling */ +typedef enum lx_getname_type { + LX_GETPEERNAME, + LX_GETSOCKNAME +} lx_getname_type_t; + +/* + * What follows are a series of tables we use to translate Linux constants + * into equivalent Illumos constants and back again. I wish this were + * cleaner, more programmatic, and generally nicer. Sadly, life is messy, + * and Unix networking even more so. 
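+ *
+ * Families that are recognized but not emulated map to AF_NOTSUPPORTED,
+ * and values beyond the end of a table fall out as AF_INVAL via the
+ * LTOS_FAMILY()/STOL_FAMILY() macros below.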
+ */ +static const int ltos_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* LX_AF_UNSPEC */ + AF_UNIX, /* LX_AF_UNIX */ + AF_INET, /* LX_AF_INET */ + AF_NOTSUPPORTED, /* LX_AF_AX25 */ + AF_NOTSUPPORTED, /* LX_AF_IPX */ + AF_NOTSUPPORTED, /* LX_AF_APPLETALK */ + AF_NOTSUPPORTED, /* LX_AF_NETROM */ + AF_NOTSUPPORTED, /* LX_AF_BRIDGE */ + AF_NOTSUPPORTED, /* LX_AF_ATMPVC */ + AF_NOTSUPPORTED, /* LX_AF_X25 */ + AF_INET6, /* LX_AF_INET6 */ + AF_NOTSUPPORTED, /* LX_AF_ROSE */ + AF_NOTSUPPORTED, /* LX_AF_DECNET */ + AF_NOTSUPPORTED, /* LX_AF_NETBEUI */ + AF_NOTSUPPORTED, /* LX_AF_SECURITY */ + AF_NOTSUPPORTED, /* LX_AF_KEY */ + AF_LX_NETLINK, /* LX_AF_NETLINK */ + AF_PACKET, /* LX_AF_PACKET */ + AF_NOTSUPPORTED, /* LX_AF_ASH */ + AF_NOTSUPPORTED, /* LX_AF_ECONET */ + AF_NOTSUPPORTED, /* LX_AF_ATMSVC */ + AF_NOTSUPPORTED, /* LX_AF_RDS */ + AF_NOTSUPPORTED, /* LX_AF_SNA */ + AF_NOTSUPPORTED, /* LX_AF_IRDA */ + AF_NOTSUPPORTED, /* LX_AF_PPOX */ + AF_NOTSUPPORTED, /* LX_AF_WANPIPE */ + AF_NOTSUPPORTED, /* LX_AF_LLC */ + AF_NOTSUPPORTED, /* NONE */ + AF_NOTSUPPORTED, /* NONE */ + AF_NOTSUPPORTED, /* LX_AF_CAN */ + AF_NOTSUPPORTED, /* LX_AF_TIPC */ + AF_NOTSUPPORTED, /* LX_AF_BLUETOOTH */ + AF_NOTSUPPORTED, /* LX_AF_IUCV */ + AF_NOTSUPPORTED /* LX_AF_RXRPC */ + /* LX_AF_ISDN */ + /* LX_AF_PHONET */ + /* LX_AF_IEEE802154 */ + /* LX_AF_CAIF */ + /* LX_AF_ALG */ + /* LX_AF_NFC */ + /* LX_AF_VSOCK */ +}; + +static const int stol_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* AF_UNSPEC */ + AF_UNIX, /* AF_UNIX */ + AF_INET, /* AF_INET */ + AF_NOTSUPPORTED, /* AF_IMPLINK */ + AF_NOTSUPPORTED, /* AF_PUP */ + AF_NOTSUPPORTED, /* AF_CHAOS */ + AF_NOTSUPPORTED, /* AF_NS */ + AF_NOTSUPPORTED, /* AF_NBS */ + AF_NOTSUPPORTED, /* AF_ECMA */ + AF_NOTSUPPORTED, /* AF_DATAKIT */ + AF_NOTSUPPORTED, /* AF_CCITT */ + AF_NOTSUPPORTED, /* AF_SNA */ + AF_NOTSUPPORTED, /* AF_DECNET */ + AF_NOTSUPPORTED, /* AF_DLI */ + AF_NOTSUPPORTED, /* AF_LAT */ + AF_NOTSUPPORTED, /* AF_HYLINK */ + AF_NOTSUPPORTED, /* AF_APPLETALK */ + AF_NOTSUPPORTED, /* AF_NIT */ + AF_NOTSUPPORTED, /* AF_802 */ + AF_NOTSUPPORTED, /* AF_OSI */ + AF_NOTSUPPORTED, /* AF_X25 */ + AF_NOTSUPPORTED, /* AF_OSINET */ + AF_NOTSUPPORTED, /* AF_GOSIP */ + AF_NOTSUPPORTED, /* AF_IPX */ + AF_NOTSUPPORTED, /* AF_ROUTE */ + AF_NOTSUPPORTED, /* AF_LINK */ + LX_AF_INET6, /* AF_INET6 */ + AF_NOTSUPPORTED, /* AF_KEY */ + AF_NOTSUPPORTED, /* AF_NCA */ + AF_NOTSUPPORTED, /* AF_POLICY */ + AF_NOTSUPPORTED, /* AF_INET_OFFLOAD */ + AF_NOTSUPPORTED, /* AF_TRILL */ + LX_AF_PACKET, /* AF_PACKET */ + LX_AF_NETLINK /* AF_LX_NETLINK */ +}; + +#define LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL) +#define STOL_FAMILY(d) ((d) <= LX_AF_MAX ? stol_family[(d)] : AF_INVAL) + + +static const int ltos_socktype[LX_SOCK_PACKET + 1] = { + SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, + SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, + SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED +}; + +static const int stol_socktype[SOCK_SEQPACKET + 1] = { + SOCK_NOTSUPPORTED, LX_SOCK_DGRAM, LX_SOCK_STREAM, SOCK_NOTSUPPORTED, + LX_SOCK_RAW, LX_SOCK_RDM, LX_SOCK_SEQPACKET +}; + +#define LTOS_SOCKTYPE(t) \ + ((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL) +#define STOL_SOCKTYPE(t) \ + ((t) <= SOCK_SEQPACKET ? stol_socktype[(t)] : SOCK_INVAL) + + +/* + * This string is used to prefix all abstract namespace Unix sockets, ie all + * abstract namespace sockets are converted to regular sockets in the /tmp + * directory with .ABSK_ prefixed to their names. 
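+ *
+ * For example, the abstract address "\0mysock" supplied by a Linux
+ * application is emulated as the path "/tmp/.ABSK_mysock"; any embedded
+ * NUL or '/' characters in the remainder of the name are rewritten as '_'.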
+ */ +#define ABST_PRFX "/tmp/.ABSK_" +#define ABST_PRFX_LEN (sizeof (ABST_PRFX) - 1) + +#define DATAFILT "datafilt" + +typedef enum { + lxa_none, + lxa_abstract, + lxa_devlog +} lx_addr_type_t; + +static int +ltos_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + + +typedef struct lx_flag_map { + enum { + LXFM_MAP, + LXFM_IGNORE, + LXFM_UNSUP + } lxfm_action; + int lxfm_sunos_flag; + int lxfm_linux_flag; + char *lxfm_name; +} lx_flag_map_t; + +static lx_flag_map_t lx_flag_map_tbl[] = { + { LXFM_MAP, MSG_OOB, LX_MSG_OOB, NULL }, + { LXFM_MAP, MSG_PEEK, LX_MSG_PEEK, NULL }, + { LXFM_MAP, MSG_DONTROUTE, LX_MSG_DONTROUTE, NULL }, + { LXFM_MAP, MSG_CTRUNC, LX_MSG_CTRUNC, NULL }, + { LXFM_MAP, MSG_TRUNC, LX_MSG_TRUNC, NULL }, + { LXFM_MAP, MSG_DONTWAIT, LX_MSG_DONTWAIT, NULL }, + { LXFM_MAP, MSG_EOR, LX_MSG_EOR, NULL }, + { LXFM_MAP, MSG_WAITALL, LX_MSG_WAITALL, NULL }, + /* MSG_CONFIRM is safe to ignore */ + { LXFM_IGNORE, 0, LX_MSG_CONFIRM, NULL }, + /* + * The NOSIGNAL and CMSG_CLOEXEC flags are handled by the emulation + * outside of the flag-conversion routine. + */ + { LXFM_IGNORE, 0, LX_MSG_NOSIGNAL, NULL }, + { LXFM_IGNORE, 0, LX_MSG_CMSG_CLOEXEC, NULL }, + { LXFM_UNSUP, LX_MSG_PROXY, 0, "MSG_PROXY" }, + { LXFM_UNSUP, LX_MSG_FIN, 0, "MSG_FIN" }, + { LXFM_UNSUP, LX_MSG_SYN, 0, "MSG_SYN" }, + { LXFM_UNSUP, LX_MSG_RST, 0, "MSG_RST" }, + { LXFM_UNSUP, LX_MSG_ERRQUEUE, 0, "MSG_ERRQUEUE" }, + { LXFM_UNSUP, LX_MSG_MORE, 0, "MSG_MORE" }, + { LXFM_UNSUP, LX_MSG_WAITFORONE, 0, "MSG_WAITFORONE" }, + { LXFM_UNSUP, LX_MSG_FASTOPEN, 0, "MSG_FASTOPEN" }, +}; + +#define LX_FLAG_MAP_MAX \ + (sizeof (lx_flag_map_tbl) / sizeof (lx_flag_map_tbl[0])) + +#define LX_UNSUP_BUFSZ 64 + +static int +lx_xlate_sock_flags(int inflags, lx_xlate_dir_t dir) +{ + int i, outflags = 0; + char buf[LX_UNSUP_BUFSZ]; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_FLAG_MAP_MAX; i++) { + lx_flag_map_t *map = &lx_flag_map_tbl[i]; + int match, out; + + if (dir == SUNOS_TO_LX) { + match = inflags & map->lxfm_sunos_flag; + out = map->lxfm_linux_flag; + } else { + match = inflags & map->lxfm_linux_flag; + out = map->lxfm_sunos_flag; + } + switch (map->lxfm_action) { + case LXFM_MAP: + if (match != 0) { + inflags &= ~(match); + outflags |= out; + } + break; + case LXFM_IGNORE: + if (match != 0) { + inflags &= ~(match); + } + break; + case LXFM_UNSUP: + if (match != 0) { + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "unsupported sock flag %s", map->lxfm_name); + lx_unsupported(buf); + } + } + } + if (inflags != 0) { + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "unsupported sock flags 0x%08x", inflags); + lx_unsupported(buf); + } + + return (outflags); +} + +typedef enum lx_sun_type { + LX_SUN_NORMAL, + LX_SUN_ABSTRACT, +} lx_sun_type_t; + +static void +ltos_sockaddr_ux(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + struct sockaddr_un buf; + /* Calculate size of (sun_family + any padding) in sockaddr */ + int sizediff = (sizeof (buf) - sizeof (buf.sun_path)); + int len = inlen - sizediff; + + VERIFY(len > 0); + VERIFY(len <= sizeof (buf.sun_path)); + bzero(&buf, sizeof (buf)); + + if (inaddr->sa_data[0] == '\0') { + /* + * Linux supports abstract Unix 
sockets, which are simply + * sockets that do not exist on the file system. These sockets + * are denoted by beginning the path with a NULL character. To + * support these, we strip out the leading NULL character and + * change the path to point to a real place in /tmp directory, + * by prepending ABST_PRFX and replacing all illegal characters + * with * '_'. + * + * Since these sockets are supposed to exist outside the + * filesystem, they must be cleaned up after use. This removal + * is performed during bind(). + */ + int idx, odx; + + /* Add our abstract prefix */ + (void) strcpy(buf.sun_path, ABST_PRFX); + for (idx = 1, odx = ABST_PRFX_LEN; + idx < len && odx < sizeof (buf.sun_path); + idx++, odx++) { + char c = inaddr->sa_data[idx]; + if (c == '\0' || c == '/') { + buf.sun_path[odx] = '_'; + } else { + buf.sun_path[odx] = c; + } + } + + /* + * Since abstract socket addresses might not be NUL terminated, + * we must explicitly NUL terminate the translated path. + * Care is taken not to overflow the buffer. + */ + if (odx == sizeof (buf.sun_path)) { + buf.sun_path[odx - 1] = '\0'; + } else { + buf.sun_path[odx] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_ABSTRACT; + } + } else { + /* Copy the address directly, minding termination */ + (void) strncpy(buf.sun_path, inaddr->sa_data, len); + len = strnlen(buf.sun_path, len); + if (len == sizeof (buf.sun_path)) { + buf.sun_path[len - 1] = '\0'; + } else { + VERIFY(len < sizeof (buf.sun_path)); + buf.sun_path[len] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_NORMAL; + } + } + buf.sun_family = AF_UNIX; + *outlen = strlen(buf.sun_path) + 1 + sizediff; + VERIFY(*outlen <= sizeof (struct sockaddr_un)); + + *outaddr = kmem_alloc(*outlen, KM_SLEEP); + bcopy(&buf, *outaddr, *outlen); +} + +/* + * Copy in a Linux-native socket address from userspace and convert it into + * illumos format. When successful, it will allocate an appropriately sized + * struct to be freed by the caller. 
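+ * Returns 0 on success or an errno value on failure; no memory is left
+ * allocated when an error is returned.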
+ */ +static long +ltos_sockaddr_copyin(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + sa_family_t family; + struct sockaddr *laddr; + struct sockaddr_ll *sal; + int proto, error = 0; + + VERIFY(inaddr != NULL); + + if (inlen < sizeof (sa_family_t) || + inlen > sizeof (struct sockaddr_storage)) { + return (EINVAL); + } + laddr = kmem_alloc(inlen, KM_SLEEP); + if (copyin(inaddr, laddr, inlen) != 0) { + kmem_free(laddr, inlen); + return (EFAULT); + } + + family = LTOS_FAMILY(laddr->sa_family); + switch (family) { + case (sa_family_t)AF_NOTSUPPORTED: + error = EPROTONOSUPPORT; + break; + + case (sa_family_t)AF_INVAL: + error = EAFNOSUPPORT; + break; + + case AF_UNIX: + if (inlen < sizeof (sa_family_t) + 2 || + inlen > sizeof (struct sockaddr_un)) { + error = EINVAL; + break; + } + ltos_sockaddr_ux(laddr, inlen, outaddr, outlen, + sun_type); + + /* AF_UNIX bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + case AF_PACKET: + if (inlen < sizeof (struct sockaddr_ll)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_ll); + + /* sll_protocol must be translated */ + /* LINTED: alignment */ + sal = (struct sockaddr_ll *)laddr; + proto = ltos_pkt_proto(sal->sll_protocol); + if (proto < 0) { + error = EINVAL; + } + sal->sll_protocol = proto; + break; + + case AF_INET: + if (inlen < sizeof (struct sockaddr)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr); + break; + + case AF_INET6: + /* + * The illumos sockaddr_in6 has one more 32-bit field + * than the Linux version. We simply zero that field + * via kmem_zalloc. + */ + if (inlen < sizeof (lx_sockaddr_in6_t)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_in6); + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, + KM_SLEEP); + bcopy(laddr, *outaddr, sizeof (lx_sockaddr_in6_t)); + (*outaddr)->sa_family = AF_INET6; + /* AF_INET6 bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + default: + *outlen = inlen; + } + + if (error == 0) { + /* + * For most address families, just copying into a sockaddr of + * the correct size and updating sa_family is adequate. + */ + VERIFY(inlen >= *outlen); + + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, KM_SLEEP); + bcopy(laddr, *outaddr, *outlen); + (*outaddr)->sa_family = family; + } + kmem_free(laddr, inlen); + return (error); +} + +/* + * Convert an illumos-native socket address into Linux format and copy it out + * to userspace. + */ +static long +stol_sockaddr_copyout(struct sockaddr *inaddr, socklen_t inlen, + struct sockaddr *outaddr, void *outlenp, socklen_t orig) +{ + socklen_t size = inlen; + struct sockaddr_storage buf; + struct sockaddr *bufaddr; + + /* + * Either we were passed a valid sockaddr (with length) or the length + * is set to 0. + */ + VERIFY(inaddr != NULL || inlen == 0); + + if (inlen == 0) { + goto finish; + } + + + switch (inaddr->sa_family) { + case AF_INET: + if (inlen != sizeof (struct sockaddr)) { + return (EINVAL); + } + break; + + case AF_INET6: + if (inlen != sizeof (struct sockaddr_in6)) { + return (EINVAL); + } + /* + * The linux sockaddr_in6 is shorter than illumos. + * Truncate the extra field on the way out. 
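+ * (A Linux sockaddr_in6 is 28 bytes; the illumos structure carries an
+ * additional trailing 32-bit __sin6_src_id field, for 32 bytes total.)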
+ */ + size = (sizeof (lx_sockaddr_in6_t)); + inlen = (sizeof (lx_sockaddr_in6_t)); + break; + + case AF_UNIX: + if (inlen > sizeof (struct sockaddr_un)) { + return (EINVAL); + } + + /* + * On Linux an empty AF_UNIX address is returned as NULL, which + * means setting the returned length to only encompass the + * address family part of the buffer. However, some code also + * references the address portion of the buffer and uses it, + * even though the returned length has been shortened. Thus, we + * clear the buffer to ensure that the address portion is NULL. + */ + if (inaddr->sa_data[0] == '\0') { + bzero(&buf, sizeof (buf)); + inlen = sizeof (inaddr->sa_family); + } + break; + + case (sa_family_t)AF_NOTSUPPORTED: + return (EPROTONOSUPPORT); + + case (sa_family_t)AF_INVAL: + return (EAFNOSUPPORT); + + default: + break; + } + + /* + * The input should be smaller than sockaddr_storage, the largest + * sockaddr we support. + */ + VERIFY(inlen <= sizeof (buf)); + + bufaddr = (struct sockaddr *)&buf; + bcopy(inaddr, bufaddr, inlen); + bufaddr->sa_family = STOL_FAMILY(bufaddr->sa_family); + + /* + * It is possible that userspace passed us a smaller buffer than we + * hope to output. When this is the case, we will truncate our output + * to the max size of their buffer but report the true size of the + * sockaddr when outputting the outlen value. + */ + size = (orig < size) ? orig : size; + + if (copyout(bufaddr, outaddr, size) != 0) { + return (EFAULT); + } + +finish: +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)inlen; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&inlen, outlenp, sizeof (inlen)) != 0) { + return (EFAULT); + } + } + + return (0); +} + +typedef struct lx_cmsg_xlate { + int lcx_sunos_level; + int lcx_sunos_type; + int (*lcx_stol_conv)(struct cmsghdr *, struct cmsghdr *); + int lcx_linux_level; + int lcx_linux_type; + int (*lcx_ltos_conv)(struct cmsghdr *, struct cmsghdr *); +} lx_cmsg_xlate_t; + +static int cmsg_conv_generic(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int ltos_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_recvttl(struct cmsghdr *, struct cmsghdr *); + +/* + * Table describing SunOS <-> Linux cmsg translation mappings. + * Certain types (IP_RECVTTL) are only converted in one direction and are + * indicated by one of the translation functions being set to NULL. 
+ */ +static lx_cmsg_xlate_t lx_cmsg_xlate_tbl[] = { + { SOL_SOCKET, SCM_RIGHTS, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_RIGHTS, cmsg_conv_generic }, + { SOL_SOCKET, SCM_UCRED, stol_conv_ucred, + LX_SOL_SOCKET, LX_SCM_CRED, ltos_conv_ucred }, + { SOL_SOCKET, SCM_TIMESTAMP, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_TIMESTAMP, cmsg_conv_generic }, + { IPPROTO_IP, IP_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_PKTINFO, cmsg_conv_generic }, + { IPPROTO_IP, IP_RECVTTL, stol_conv_recvttl, + LX_IPPROTO_IP, LX_IP_TTL, NULL }, + { IPPROTO_IP, IP_TTL, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_TTL, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_HOPLIMIT, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_HOPLIMIT, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_PKTINFO, cmsg_conv_generic } +}; + +#define LX_MAX_CMSG_XLATE \ + (sizeof (lx_cmsg_xlate_tbl) / sizeof (lx_cmsg_xlate_tbl[0])) + +#if defined(_LP64) + +typedef struct { + int64_t cmsg_len; + int32_t cmsg_level; + int32_t cmsg_type; +} lx_cmsghdr64_t; + +/* The alignment/padding for 64bit Linux cmsghdr is not the same. */ +#define LX_CMSG64_ALIGNMENT 8 +#define ISALIGNED_LX_CMSG64(addr) \ + (((uintptr_t)(addr) & (LX_CMSG64_ALIGNMENT - 1)) == 0) +#define ROUNDUP_LX_CMSG64_LEN(len) \ + (((len) + LX_CMSG64_ALIGNMENT - 1) & ~(LX_CMSG64_ALIGNMENT - 1)) + +#define LX_CMSG64_IS_ALIGNED(m) \ + (((uintptr_t)(m) & (_CMSG_DATA_ALIGNMENT - 1)) == 0) +#define LX_CMSG64_DATA(c) ((unsigned char *)(((lx_cmsghdr64_t *)(c)) + 1)) +/* + * LX_CMSG64_VALID is closely derived from CMSG_VALID with one particularly + * important addition. Since cmsg_len is 64bit, (cmsg + cmsg_len) is checked + * against the start address as well. This prevents bogus inputs from wrapping + * around the address space. + */ +#define LX_CMSG64_VALID(cmsg, start, end) \ + (ISALIGNED_LX_CMSG64(cmsg) && \ + ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ + ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ + ((cmsg)->cmsg_len >= sizeof (lx_cmsghdr64_t)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len >= (uintptr_t)(start))) +#define LX_CMSG64_NEXT(cmsg) \ + (lx_cmsghdr64_t *)((uintptr_t)(cmsg) + \ + ROUNDUP_LX_CMSG64_LEN((cmsg)->cmsg_len)) +#define LX_CMSG64_DIFF sizeof (uint32_t) + +#endif /* defined(_LP64) */ + +/* + * convert ucred_s to lx_ucred. + */ +static int +stol_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * Format the data correctly in the omsg buffer. 
+ */ + if (omsg != NULL) { + struct ucred_s *scred; + prcred_t *cr; + lx_ucred_t lcred; + + scred = (struct ucred_s *)CMSG_CONTENT(inmsg); + lcred.lxu_pid = scred->uc_pid; + /* LINTED: alignment */ + cr = UCCRED(scred); + if (cr != NULL) { + lcred.lxu_uid = cr->pr_euid; + lcred.lxu_gid = cr->pr_egid; + } else { + lcred.lxu_uid = lcred.lxu_gid = 0; + } + + bcopy(&lcred, CMSG_CONTENT(omsg), sizeof (lx_ucred_t)); + } + + return (sizeof (struct cmsghdr) + sizeof (lx_ucred_t)); +} + +static int +ltos_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + struct ucred_s *uc; + prcred_t *pc; + lx_ucred_t *lcred; + + uc = (struct ucred_s *)CMSG_CONTENT(omsg); + /* LINTED: alignment */ + pc = (prcred_t *)((char *)uc + sizeof (struct ucred_s)); + + uc->uc_credoff = sizeof (struct ucred_s); + + lcred = (lx_ucred_t *)CMSG_CONTENT(inmsg); + + uc->uc_pid = lcred->lxu_pid; + pc->pr_euid = lcred->lxu_uid; + pc->pr_egid = lcred->lxu_gid; + } + + return (sizeof (struct cmsghdr) + sizeof (struct ucred_s) + + sizeof (prcred_t)); + +} + +static int +stol_conv_recvttl(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * SunOS communicates the TTL of incoming packets via IP_RECVTTL using + * a uint8_t value instead of IP_TTL using an int. This conversion is + * only needed in the one direction since Linux does not handle + * IP_RECVTTL in the sendmsg path. + */ + if (omsg != NULL) { + uint8_t *inttl = (uint8_t *)CMSG_CONTENT(inmsg); + int *ottl = (int *)CMSG_CONTENT(omsg); + + *ottl = (int)*inttl; + } + + return (sizeof (struct cmsghdr) + sizeof (int)); +} + +static int +cmsg_conv_generic(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + size_t data_len; + + data_len = inmsg->cmsg_len - sizeof (struct cmsghdr); + bcopy(CMSG_CONTENT(inmsg), CMSG_CONTENT(omsg), data_len); + } + + return (inmsg->cmsg_len); +} + +static int +lx_xlate_cmsg(struct cmsghdr *inmsg, struct cmsghdr *omsg, lx_xlate_dir_t dir) +{ + int i; + int len; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_MAX_CMSG_XLATE; i++) { + lx_cmsg_xlate_t *xlate = &lx_cmsg_xlate_tbl[i]; + if (dir == LX_TO_SUNOS && + inmsg->cmsg_level == xlate->lcx_linux_level && + inmsg->cmsg_type == xlate->lcx_linux_type && + xlate->lcx_ltos_conv != NULL) { + len = xlate->lcx_ltos_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_sunos_level; + omsg->cmsg_type = xlate->lcx_sunos_type; + } + return (len); + } else if (dir == SUNOS_TO_LX && + inmsg->cmsg_level == xlate->lcx_sunos_level && + inmsg->cmsg_type == xlate->lcx_sunos_type && + xlate->lcx_stol_conv != NULL) { + len = xlate->lcx_stol_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_linux_level; + omsg->cmsg_type = xlate->lcx_linux_type; + } + return (len); + } + } + /* + * The Linux man page for sendmsg does not define a specific error for + * unsupported cmsgs. While it is meant to indicated bad values for + * passed flags, EOPNOTSUPP appears to be the next closest choice. + */ + return (-EOPNOTSUPP); +} + +static long +ltos_cmsgs_copyin(void *addr, socklen_t inlen, void **outmsg, + socklen_t *outlenp) +{ + void *inbuf, *obuf; + struct cmsghdr *inmsg, *omsg; + int slen = 0; + + if (inlen < sizeof (struct cmsghdr) || inlen > SO_MAXARGSIZE) { + return (EINVAL); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE && + inlen < sizeof (lx_cmsghdr64_t)) { + /* The size requirements are more strict for 64bit. 
*/ + return (EINVAL); + } +#endif /* defined(_LP64) */ + + inbuf = kmem_alloc(inlen, KM_SLEEP); + if (copyin(addr, inbuf, inlen) != 0) { + kmem_free(inbuf, inlen); + return (EFAULT); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Linux cmsg headers are longer than illumos under x86_64. + * Convert to regular cmsgs first. + */ + lx_cmsghdr64_t *lmsg; + struct cmsghdr *smsg; + void *newbuf; + int len = 0; + + /* Inventory the new cmsg size */ + for (lmsg = (lx_cmsghdr64_t *)inbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg)) { + len += ROUNDUP_cmsglen(lmsg->cmsg_len - LX_CMSG64_DIFF); + } + + VERIFY(len < inlen); + if (len == 0) { + /* Input was bogus, so we can give up early. */ + kmem_free(inbuf, inlen); + *outmsg = NULL; + *outlenp = 0; + return (EINVAL); + } + + newbuf = kmem_alloc(len, KM_SLEEP); + + for (lmsg = (lx_cmsghdr64_t *)inbuf, + smsg = (struct cmsghdr *)newbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg), smsg = CMSG_NEXT(smsg)) { + smsg->cmsg_level = lmsg->cmsg_level; + smsg->cmsg_type = lmsg->cmsg_type; + smsg->cmsg_len = lmsg->cmsg_len - LX_CMSG64_DIFF; + + /* The above length measurement should ensure this */ + ASSERT(CMSG_VALID(smsg, newbuf, + (uintptr_t)newbuf + len)); + + bcopy(LX_CMSG64_DATA(lmsg), CMSG_CONTENT(smsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(inbuf, inlen); + inbuf = newbuf; + inlen = len; + } +#endif /* defined(_LP64) */ + + /* + * Now determine how much space we need for the conversion. + */ + for (inmsg = (struct cmsghdr *)inbuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, LX_TO_SUNOS)) < 0) { + /* unsupported msg */ + kmem_free(inbuf, inlen); + return (-sz); + } + + slen += ROUNDUP_cmsglen(sz); + } + + obuf = kmem_zalloc(slen, KM_SLEEP); + + /* + * Now do the conversion. + */ + for (inmsg = (struct cmsghdr *)inbuf, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, LX_TO_SUNOS) >= 0); + } + + kmem_free(inbuf, inlen); + *outmsg = obuf; + *outlenp = slen; + return (0); +} + +static long +stol_cmsgs_copyout(void *input, socklen_t inlen, void *addr, + void *outlenp, socklen_t orig_outlen) +{ + void *obuf; + struct cmsghdr *inmsg, *omsg; + int error = 0; + socklen_t lx_len = 0; +#if defined(_LP64) + model_t model = get_udatamodel(); +#endif + + if (inlen == 0) { + /* Simply output the zero controllen */ + goto finish; + } + + VERIFY(inlen >= sizeof (struct cmsghdr)); + + /* + * First determine how much space we need for the conversion and + * make sure the caller has provided at least that much space to return + * results. + */ + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, SUNOS_TO_LX)) < 0) { + /* unsupported msg */ + return (-sz); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* + * The converted 64-bit cmsgs require an additional 4 + * bytes of header space and must be aligned to 8 bytes + * (instead of the typical 4 for x86) + */ + sz = ROUNDUP_LX_CMSG64_LEN(sz + LX_CMSG64_DIFF); + } else +#endif /* defined(_LP64) */ + { + /* + * The converted 32-bit cmsgs do not require additional + * header space or padding for Linux conversion. 
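+ * For example, an SCM_RIGHTS message carrying a single descriptor stays
+ * at 16 bytes in this path (a 12-byte header plus one int, already
+ * 4-byte aligned), while the 64-bit conversion above grows the same
+ * message to 24 bytes (a 16-byte header plus one int rounded up to
+ * 8-byte alignment).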
+ */ + sz = ROUNDUP_cmsglen(sz); + } + + /* + * Unlike SunOS, Linux requires that the last cmsg be + * adequately padded for alignment. + */ + lx_len += sz; + } + + if (lx_len > orig_outlen || addr == NULL) { + /* This will be interpreted by the caller */ + error = EMSGSIZE; + lx_len = 0; + goto finish; + } + + /* + * Since cmsgs are often padded to an aligned size, kmem_zalloc is + * necessary to prevent leaking the contents of uninitialized memory. + */ + obuf = kmem_zalloc(lx_len, KM_SLEEP); + + /* + * Convert the msgs. + */ + for (inmsg = (struct cmsghdr *)input, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, SUNOS_TO_LX) >= 0); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* Linux cmsg headers are longer than illumos under x86_64. */ + struct cmsghdr *smsg; + lx_cmsghdr64_t *lmsg; + void *newbuf; + + /* + * Once again, kmem_zalloc is needed to avoid leaking the + * contents of uninitialized memory + */ + newbuf = kmem_zalloc(lx_len, KM_SLEEP); + for (smsg = (struct cmsghdr *)obuf, + lmsg = (lx_cmsghdr64_t *)newbuf; + CMSG_VALID(smsg, obuf, (uintptr_t)obuf + inlen) != 0; + smsg = CMSG_NEXT(smsg), lmsg = LX_CMSG64_NEXT(lmsg)) { + lmsg->cmsg_level = smsg->cmsg_level; + lmsg->cmsg_type = smsg->cmsg_type; + lmsg->cmsg_len = smsg->cmsg_len + LX_CMSG64_DIFF; + + ASSERT(LX_CMSG64_VALID(lmsg, newbuf, + (uintptr_t)newbuf + lx_len) != 0); + + bcopy(CMSG_CONTENT(smsg), LX_CMSG64_DATA(lmsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(obuf, lx_len); + obuf = newbuf; + } +#endif /* defined(_LP64) */ + + if (copyout(obuf, addr, lx_len) != 0) { + kmem_free(obuf, lx_len); + return (EFAULT); + } + kmem_free(obuf, lx_len); + +finish: + if (outlenp != NULL) { +#if defined(_LP64) + if (model != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)lx_len; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&lx_len, outlenp, sizeof (lx_len)) != 0) { + return (EFAULT); + } + } + } + return (error); +} + +static void +lx_cmsg_set_cloexec(void *input, socklen_t inlen) +{ + struct cmsghdr *inmsg; + + if (inlen == 0) { + return; + } + + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + if (inmsg->cmsg_level == SOL_SOCKET && + inmsg->cmsg_type == SCM_RIGHTS) { + int *fds = (int *)CMSG_CONTENT(inmsg); + int i, num = (int)CMSG_CONTENTLEN(inmsg) / sizeof (int); + + for (i = 0; i < num; i++) { + char flags; + file_t *fp; + + fp = getf(fds[i]); + if (fp == NULL) { + /* + * It is possible that a received fd + * will already have been closed if a + * thread in the local process is + * indiscriminately issuing close(2) + * calls while the message is being + * received. If that is the case, no + * further processing of the fd is + * needed. It will still be passed + * up in the cmsg even though the + * caller chose to close it already.
+ */ + continue; + } + + flags = f_getfd(fds[i]); + flags |= FD_CLOEXEC; + f_setfd(fds[i], flags); + releasef(fds[i]); + } + } + } +} + +static int +lx_cmsg_try_ucred(sonode_t *so, struct nmsghdr *msg, socklen_t origlen) +{ + lx_socket_aux_data_t *sad; + struct cmsghdr *cmsg = NULL; + int msgsize; + cred_t *cred; + + if (origlen == 0) { + return (0); + } + sad = lx_sad_acquire(SOTOV(so)); + if ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0) { + mutex_exit(&sad->lxsad_lock); + return (0); + } + mutex_exit(&sad->lxsad_lock); + + mutex_enter(&so->so_lock); + if (so->so_peercred == NULL) { + mutex_exit(&so->so_lock); + return (0); + } + crhold(cred = so->so_peercred); + mutex_exit(&so->so_lock); + + msgsize = ucredminsize(cred) + sizeof (struct cmsghdr); + if (msg->msg_control == NULL) { + msg->msg_controllen = msgsize; + msg->msg_control = cmsg = kmem_zalloc(msgsize, KM_SLEEP); + } else { + /* + * The so_recvmsg operation may have allocated a msg_control + * buffer which precisely fits all returned cmsgs. We must + * manually verify the length of that cmsg data and reallocate + * the buffer if it lacks the necessary space. + */ + uintptr_t start = (uintptr_t)msg->msg_control; + uintptr_t end = start + msg->msg_controllen; + + ASSERT(msg->msg_controllen > 0); + cmsg = (struct cmsghdr *)msg->msg_control; + while (CMSG_VALID(cmsg, start, end) != 0) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_UCRED) { + /* + * If some later code change results in a ucred + * being attached anyways, there is no need for + * us to do it manually + */ + crfree(cred); + return (0); + } + cmsg = CMSG_NEXT(cmsg); + } + if (((uintptr_t)cmsg + msgsize) > end) { + socklen_t offset = (uintptr_t)cmsg - start; + socklen_t newsize = offset + msgsize; + void *newbuf; + + if (newsize < msg->msg_controllen) { + /* size overflow, bail */ + crfree(cred); + return (-1); + } + newbuf = kmem_alloc(newsize, KM_SLEEP); + bcopy(msg->msg_control, newbuf, msg->msg_controllen); + kmem_free(msg->msg_control, msg->msg_controllen); + + msg->msg_control = newbuf; + msg->msg_controllen = newsize; + cmsg = (struct cmsghdr *)((uintptr_t)newbuf + offset); + } + } + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + cmsg->cmsg_len = msgsize; + (void) cred2ucred(cred, so->so_cpid, CMSG_CONTENT(cmsg), CRED()); + crfree(cred); + return (0); +} + +static lx_socket_aux_data_t * +lx_sad_acquire(vnode_t *vp) +{ + lx_socket_aux_data_t *cur, *created; + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + /* perform our allocation carefully */ + mutex_exit(&vp->v_vsd_lock); + + created = (lx_socket_aux_data_t *)kmem_zalloc( + sizeof (*created), KM_SLEEP); + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + mutex_init(&created->lxsad_lock, NULL, MUTEX_DEFAULT, + NULL); + (void) vsd_set(vp, lx_socket_vsd, created); + cur = created; + } else { + kmem_free(created, sizeof (*created)); + } + } + mutex_exit(&vp->v_vsd_lock); + mutex_enter(&cur->lxsad_lock); + return (cur); +} + +static int +lx_convert_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + +static int +lx_convert_sock_args(int in_dom, int in_type, int in_proto, int 
*out_dom, + int *out_type, int *out_options, int *out_proto) +{ + int domain, type, options; + + if (in_dom < 0 || in_type < 0 || in_proto < 0) + return (EINVAL); + + domain = LTOS_FAMILY(in_dom); + if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC) + return (EAFNOSUPPORT); + if (domain == AF_INVAL) + return (EINVAL); + + type = LTOS_SOCKTYPE(in_type & LX_SOCK_TYPE_MASK); + if (type == SOCK_INVAL) + return (EINVAL); + /* + * Linux does not allow the app to specify IP Protocol for raw sockets. + * SunOS does, so bail out here. + */ + if (type == SOCK_NOTSUPPORTED || + (domain == AF_INET && type == SOCK_RAW && in_proto == IPPROTO_IP)) { + if (lx_kern_release_cmp(curzone, "2.6.15") < 0) { + /* + * Use error appropriate for kernel version. + * See lx_socket_create for more detail. + */ + return (ESOCKTNOSUPPORT); + } + return (EPROTONOSUPPORT); + } + + options = 0; + in_type &= ~(LX_SOCK_TYPE_MASK); + if (in_type & LX_SOCK_NONBLOCK) { + in_type ^= LX_SOCK_NONBLOCK; + options |= SOCK_NONBLOCK; + } + if (in_type & LX_SOCK_CLOEXEC) { + in_type ^= LX_SOCK_CLOEXEC; + options |= SOCK_CLOEXEC; + } + if (in_type != 0) { + return (EINVAL); + } + + /* Protocol definitions for PF_PACKET differ between Linux and SunOS */ + if (domain == PF_PACKET && + (in_proto = lx_convert_pkt_proto(in_proto)) < 0) + return (EINVAL); + + *out_dom = domain; + *out_type = type; + *out_options = options; + *out_proto = in_proto; + return (0); +} + +/* + * For restartable socket syscall handling, the relevant syscalls are only + * restarted when a timeout is not set on the socket. + */ +static void +lx_sock_syscall_restart(sonode_t *so, boolean_t recv) +{ + if (recv) { + if (so->so_rcvtimeo != 0) + return; + } else { + if (so->so_sndtimeo != 0) + return; + } + + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; +} + +static int +lx_socket_create(int domain, int type, int protocol, int options, file_t **fpp, + int *fdp) +{ + sonode_t *so; + vnode_t *vp; + file_t *fp; + int err, fd; + + /* logic cloned from so_socket */ + so = socket_create(domain, type, protocol, NULL, NULL, SOCKET_SLEEP, + SOV_DEFAULT, CRED(), &err); + + if (so == NULL) { + switch (err) { + case EPROTOTYPE: + case EPROTONOSUPPORT: + if (lx_kern_release_cmp(curzone, "2.6.15") < 0) { + /* + * Linux changed its socket error behavior in + * versions 2.6.15 and later. See git commit + * 86c8f9d158f68538a971a47206a46a22c7479bac in + * the Linux repository. + * + * LTP presently checks for version 2.6.16. + */ + return (ESOCKTNOSUPPORT); + } + return (EPROTONOSUPPORT); + default: + return (err); + } + } + + /* Allocate a file descriptor for the socket */ + vp = SOTOV(so); + if ((err = falloc(vp, FWRITE|FREAD, &fp, &fd)) != 0) { + (void) socket_close(so, 0, CRED()); + socket_destroy(so); + return (err); + } + + /* + * Linux programs do not tolerate errors appearing from asynchronous + * events (such as ICMP messages arriving). Setting SM_DEFERERR will + * prevent checking/delivery of such errors. 
+ */ + so->so_mode |= SM_DEFERERR; + + /* Now fill in the entries that falloc reserved */ + if (options & SOCK_NONBLOCK) { + so->so_state |= SS_NONBLOCK; + fp->f_flag |= FNONBLOCK; + } + mutex_exit(&fp->f_tlock); + *fpp = fp; + *fdp = fd; + return (0); +} + +static void +lx_socket_destroy(file_t *fp, int fd) +{ + sonode_t *so = VTOSO(fp->f_vnode); + + setf(fd, NULL); + + mutex_enter(&fp->f_tlock); + unfalloc(fp); + + (void) socket_close(so, 0, CRED()); + socket_destroy(so); +} + +long +lx_socket(int domain, int type, int protocol) +{ + int error, options, fd = -1; + file_t *fp = NULL; + + if ((error = lx_convert_sock_args(domain, type, protocol, &domain, + &type, &options, &protocol)) != 0) { + return (set_errno(error)); + } + + error = lx_socket_create(domain, type, protocol, options, &fp, &fd); + if (error != 0) { + return (set_errno(error)); + } + + setf(fd, fp); + if ((options & SOCK_CLOEXEC) != 0) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); +} + +long +lx_bind(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + socklen_t len = 0; + file_t *fp; + int error; + lx_sun_type_t sun_type; + boolean_t not_sock = B_FALSE; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, &sun_type); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + if (addr != NULL && addr->sa_family == AF_UNIX) { + vnode_t *vp; + + error = so_ux_lookup(so, (struct sockaddr_un *)addr, B_TRUE, + &vp); + if (error == 0) { + /* A valid socket exists and is open at this address. */ + VN_RELE(vp); + } else { + /* Keep track of paths which are not valid sockets. */ + if (error == ENOTSOCK) { + not_sock = B_TRUE; + } + + /* + * When binding to an abstract namespace address or + * /dev/log, implicit clean-up must occur if there is + * not a valid socket at the specified address. See + * ltos_sockaddr_copyin for details about why these + * socket types act differently. + */ + if (sun_type == LX_SUN_ABSTRACT) { + (void) vn_removeat(NULL, addr->sa_data, + UIO_SYSSPACE, RMFILE); + } + } + } + + error = socket_bind(so, addr, len, _SOBIND_XPG4_2, CRED()); + + /* + * Linux returns EADDRINUSE for attempts to bind to Unix domain + * sockets that aren't sockets. + */ + if (error == EINVAL && addr != NULL && addr->sa_family == AF_UNIX && + not_sock == B_TRUE) { + error = EADDRINUSE; + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_connect(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + lx_socket_aux_data_t *sad = NULL; + socklen_t len = 0; + file_t *fp; + int error; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + /* + * Ensure the name is sized appropriately before we alloc memory and + * copy it in from userspace. We need at least the address family to + * make later sizing decisions. + */ + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, NULL); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + error = socket_connect(so, addr, len, fp->f_flag, + _SOCONNECT_XPG4_2, CRED()); + + if (error == EINTR) + lx_sock_syscall_restart(so, B_FALSE); + + /* + * Linux connect(2) behavior is rather strange when using the + * O_NONBLOCK flag.
The first call will return EINPROGRESS, as + * expected. Provided that is successful, a second call to connect + * will return 0 instead of EISCONN. Subsequent connect calls will + * return EISCONN. + */ + if ((fp->f_flag & FNONBLOCK) != 0 && error != 0) { + sad = lx_sad_acquire(SOTOV(so)); + if (error == EISCONN && + sad->lxsad_status == LXSS_CONNECTING) { + /* Report the one success */ + sad->lxsad_status = LXSS_CONNECTED; + error = 0; + } else if (error == EINPROGRESS) { + sad->lxsad_status = LXSS_CONNECTING; + } + mutex_exit(&sad->lxsad_lock); + } + + /* + * When connecting to a UDP socket, configure it so that future + * sendto/sendmsg operations are allowed to specify a destination + * address. See the Posix spec. for sendto(2). Linux allows this while + * illumos would return EISCONN if the option is not set. + */ + if (error == 0 && so->so_protocol == IPPROTO_UDP && + (so->so_family == AF_INET || so->so_family == AF_INET6)) { + int val = 1; + + DTRACE_PROBE(lx__connect__udp); + (void) socket_setsockopt(so, IPPROTO_UDP, UDP_SND_TO_CONNECTED, + &val, sizeof (val), CRED()); + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +/* + * Custom version of socket_recvmsg for error-handling overrides. + */ +static int +lx_socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr) +{ + int error; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache when reading data, as the application + * is likely to access the data shortly. + */ + uiop->uio_extflg |= UIO_COPY_CACHED; + + error = SOP_RECVMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial read */ + if (uiop->uio_resid != orig_resid) + error = 0; + break; + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + } + break; + default: + break; + } + return (error); +} + +static long +lx_recv_common(int sock, struct nmsghdr *msg, xuio_t *xuiop, int flags, + void *namelenp, void *controllenp, void *flagsp) +{ + struct sonode *so; + file_t *fp; + void *name; + socklen_t namelen; + void *control; + socklen_t controllen; + ssize_t len; + int error; + boolean_t fd_cloexec; + boolean_t is_peek_trunc; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + fd_cloexec = ((flags & LX_MSG_CMSG_CLOEXEC) != 0); + flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + is_peek_trunc = (flags & (MSG_PEEK|MSG_TRUNC)) == (MSG_PEEK|MSG_TRUNC); + len = xuiop->xu_uio.uio_resid; + xuiop->xu_uio.uio_fmode = fp->f_flag; + xuiop->xu_uio.uio_extflg = UIO_COPY_CACHED; + + /* + * Linux accepts MSG_TRUNC as an input flag, unlike SunOS and many + * other UNIX distributions. When combined with MSG_PEEK, it causes + * recvmsg to return the size of the waiting message, regardless of + * buffer size. This behavior is commonly used with a 0-length buffer + * to interrogate the size of a queued message prior to allocating a + * buffer for it. + * + * In order to support this functionality, a custom XUIO type is used + * to communicate the total message size out from the depths of sockfs. 
+ */ + if (is_peek_trunc) { + xuiop->xu_uio.uio_extflg |= UIO_XUIO; + xuiop->xu_type = UIOTYPE_PEEKSIZE; + xuiop->xu_ext.xu_ps.xu_ps_set = B_FALSE; + xuiop->xu_ext.xu_ps.xu_ps_size = 0; + } + + name = msg->msg_name; + namelen = msg->msg_namelen; + control = msg->msg_control; + controllen = msg->msg_controllen; + + /* + * socket_recvmsg will allocate these if needed. + * NULL them out to prevent any confusion. + */ + msg->msg_name = NULL; + msg->msg_control = NULL; + + msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + + error = lx_socket_recvmsg(so, msg, (struct uio *)xuiop, CRED()); + if (error) { + if (error == EINTR) + lx_sock_syscall_restart(so, B_TRUE); + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGRCV, 1); + releasef(sock); + + if (namelen != 0) { + error = stol_sockaddr_copyout(msg->msg_name, msg->msg_namelen, + name, namelenp, namelen); + + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + msg->msg_namelen = 0; + } + + /* + * Errors during copyout of the name are not a concern to Linux + * callers at this point in the syscall + */ + if (error != 0 && error != EFAULT) { + goto err; + } + } + + if (controllen != 0) { + if (fd_cloexec) { + /* + * If CLOEXEC needs to be set on file descriptors passed + * via SCM_RIGHTS, do so before formatting the cmsgs + * for Linux. + */ + lx_cmsg_set_cloexec(msg->msg_control, + msg->msg_controllen); + } + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + /* + * It may be necessary to append a SCM_UCRED cmsg to + * the controls if SO_PASSCRED is set on a + * connection-oriented AF_UNIX socket. + * + * See lx_setsockopt_socket for more details. + */ + if (lx_cmsg_try_ucred(so, msg, controllen) != 0) { + msg->msg_flags |= MSG_CTRUNC; + } + } + + error = stol_cmsgs_copyout(msg->msg_control, + msg->msg_controllen, control, controllenp, controllen); + + if (error != 0) { + /* + * If there was an error during cmsg translation or + * copyout, we need to clean up any FDs that are being + * passed back via SCM_RIGHTS. This prevents us from + * leaking those open files. + */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, + 0); + + /* + * An error during cmsg_copyout means we had + * _something_ to process. + */ + VERIFY(msg->msg_controllen != 0); + + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + + if (error == EMSGSIZE) { + /* Communicate that messages were truncated */ + msg->msg_flags |= MSG_CTRUNC; + error = 0; + } else { + goto err; + } + } else if (msg->msg_controllen != 0) { + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + } + } + + if (flagsp != NULL) { + int flags; + + /* Clear internal flag. */ + flags = msg->msg_flags & ~MSG_XPG4_2; + flags = lx_xlate_sock_flags(flags, SUNOS_TO_LX); + + if (copyout(&flags, flagsp, sizeof (flags)) != 0) { + error = EFAULT; + goto err; + } + } + + /* + * If both MSG_PEEK|MSG_TRUNC were set on the input flags and the + * socket layer was able to calculate the total message size for us, + * return that instead of the copied size.
+ */ + if (is_peek_trunc && xuiop->xu_ext.xu_ps.xu_ps_set == B_TRUE) { + return (xuiop->xu_ext.xu_ps.xu_ps_size); + } + + return (len - xuiop->xu_uio.uio_resid); + +err: + if (msg->msg_controllen != 0) { + /* Prevent FD leakage (see above) */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, 0); + kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + } + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + } + return (set_errno(error)); +} + +long +lx_recv(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* + * The input len is unsigned, so limit it to SSIZE_MAX since + * the return value is signed. + */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_namelen = 0; + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + return (lx_recv_common(sock, &smsg, &xuio, flags, NULL, NULL, NULL)); +} + +long +lx_recvfrom(int sock, void *buffer, size_t len, int flags, + struct sockaddr *srcaddr, socklen_t *addrlenp) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_name = (char *)srcaddr; + if (addrlenp != NULL && srcaddr != NULL) { + /* + * Despite addrlenp being defined as a socklen_t *, Linux + * treats it internally as an int *. Certain LTP tests depend + * upon this behavior, so we must emulate it as well. 
+ */ + int namelen; + + if (copyin(addrlenp, &namelen, sizeof (namelen)) != 0) { + return (set_errno(EFAULT)); + } + if (namelen < 0) { + return (set_errno(EINVAL)); + } + smsg.msg_namelen = namelen; + } else { + smsg.msg_namelen = 0; + } + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + + return (lx_recv_common(sock, &smsg, &xuio, flags, addrlenp, NULL, + NULL)); +} + +long +lx_recvmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec luiov[IOV_MAX_STACK], *uiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + void *namelenp, *controllenp, *flagsp; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + + namelenp = &((lx_msghdr32_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr32_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr32_t *)msg)->msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + + namelenp = &((lx_msghdr_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr_t *)msg)->msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + uiov = kmem_alloc(iovsize, KM_SLEEP); + } else if (iovcnt > 0) { + iovsize = 0; + uiov = luiov; + } else { + iovsize = 0; + uiov = NULL; + goto noiov; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + struct iovec32 luiov32[IOV_MAX_STACK], *uiov32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + uiov32 = kmem_alloc(iov32size, KM_SLEEP); + } else { + uiov32 = luiov32; + } + + if (copyin((struct iovec32 *)smsg.msg_iov, uiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = uiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + uiov[i].iov_len = iovlen32; + uiov[i].iov_base = + (caddr_t)(uintptr_t)uiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, uiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = uiov[i].iov_len; + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + 
kmem_free(uiov, iovsize); + } + return (set_errno(EINVAL)); + } + } + } + +noiov: + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = uiov; + xuio.xu_uio.uio_iovcnt = iovcnt; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + res = lx_recv_common(sock, &smsg, &xuio, flags, namelenp, controllenp, + flagsp); + + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + + return (res); +} + +long +lx_recvmmsg(int sock, void *msg, uint_t vlen, int flags, timespec_t *timeoutp) +{ + hrtime_t deadline = 0; + uint_t rcvd = 0; + long ret = 0; + boolean_t waitforone; + + waitforone = ((flags & LX_MSG_WAITFORONE) != 0); + flags &= ~LX_MSG_WAITFORONE; + + /* + * We want to limit the work that a thread calling recvmmsg() can + * perform in the kernel so that it cannot accrue too high a priority. + * Artificially capping vlen means that the thread will return to + * userspace after processing at most IOV_MAX messages, giving the + * system a chance to reset the thread priority. + * + * Linux does not cap vlen here and recvmmsg() is expected to return + * once vlen messages have been received, a timeout occurs, or if an + * error is encountered; the artificial cap adds another case. + * + * It is possible that returning "early" in this emulation will + * cause problems with some applications however a properly written + * recvmmsg() consumer should consume only the received datagrams + * and try again if it wants more. This may need revisiting in the + * future. + */ + if (vlen > IOV_MAX) + vlen = IOV_MAX; + + if (timeoutp != NULL) { + timespec_t timeout; + uhrtime_t utime = (uhrtime_t)gethrtime(); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &timeout, sizeof (timestruc_t))) + return (set_errno(EFAULT)); + } else { + timestruc32_t timeout32; + if (copyin(timeoutp, &timeout32, + sizeof (timestruc32_t))) + return (set_errno(EFAULT)); + timeout.tv_sec = (time_t)timeout32.tv_sec; + timeout.tv_nsec = timeout32.tv_nsec; + } + + if (itimerspecfix(&timeout)) + return (set_errno(EINVAL)); + + /* + * Make sure that deadline will not overflow. itimerspecfix() + * has already checked for negative values and too big a value + * in tv_nsec + */ + if (timeout.tv_sec >= HRTIME_MAX / NANOSEC) + return (set_errno(EINVAL)); + + utime += timeout.tv_sec * NANOSEC; + utime += timeout.tv_nsec; + + if (utime > HRTIME_MAX) + return (set_errno(EINVAL)); + + deadline = (hrtime_t)utime; + } + + for (rcvd = 0; rcvd < vlen; rcvd++) { + uint_t *ptr; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_mmsghdr_t *hdr = (lx_mmsghdr_t *)msg; + hdr += rcvd; + ret = lx_recvmsg(sock, (lx_msghdr_t *)hdr, flags); + ptr = &hdr->msg_len; + } else { + lx_mmsghdr32_t *hdr = (lx_mmsghdr32_t *)msg; + hdr += rcvd; + ret = lx_recvmsg(sock, (lx_msghdr32_t *)hdr, flags); + ptr = &hdr->msg_len; + } + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, ptr, sizeof (*ptr)); + /* + * If MSG_WAITFORONE is set, set MSG_DONTWAIT after the + * first packet has been received. + */ + if (waitforone) { + flags |= LX_MSG_DONTWAIT; + waitforone = B_FALSE; + } + /* + * The Linux man page documents the timeout option as + * only being checked after each datagram is received. + * The man page does not document ETIMEDOUT as a return + * code so we do not set an errno. 
+ */ + if (deadline > 0 && gethrtime() >= deadline) + break; + } + + if (rcvd > 0) { + /* + * Any error code is deliberately discarded if any message + * was successfully received. + */ + ttolwp(curthread)->lwp_errno = 0; + return (rcvd); + } + + return (ret); +} + +/* + * Custom version of socket_sendmsg for error-handling overrides. + */ +static int +lx_socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr, boolean_t nosig) +{ + int error = 0; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache if we are doing a local (AF_UNIX) write. + */ + if (so->so_family == AF_UNIX) { + uiop->uio_extflg |= UIO_COPY_CACHED; + } else { + uiop->uio_extflg &= ~UIO_COPY_CACHED; + } + + error = SOP_SENDMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + case ENOMEM: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial send */ + if (uiop->uio_resid != orig_resid) { + error = 0; + } + break; + + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + break; + } + + /* Appease LTP and match behavior detailed in the man page */ + error = EPIPE; + /* FALLTHROUGH */ + case EPIPE: + if (nosig == B_FALSE) { + tsignal(curthread, SIGPIPE); + } + break; + + default: + break; + } + + return (error); +} + +static long +lx_send_common(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) +{ + struct sonode *so; + file_t *fp; + struct sockaddr *name = NULL; + socklen_t namelen; + void *control = NULL; + socklen_t controllen; + ssize_t len = 0; + int error; + boolean_t nosig; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + uiop->uio_fmode = fp->f_flag; + + /* Allocate and copyin name and control */ + if (msg->msg_name != NULL && msg->msg_namelen != 0) { + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + + error = ltos_sockaddr_copyin((struct sockaddr *)msg->msg_name, + msg->msg_namelen, &name, &namelen, NULL); + if (error != 0) { + goto done; + } + /* copyin_name null terminates addresses for AF_UNIX */ + msg->msg_namelen = namelen; + msg->msg_name = name; + } else { + msg->msg_name = name = NULL; + msg->msg_namelen = namelen = 0; + } + + if (msg->msg_control != NULL && msg->msg_controllen != 0) { + /* + * Verify that the length is not excessive to prevent + * an application from consuming all of kernel memory. 
+ */ + if (msg->msg_controllen > SO_MAXARGSIZE) { + error = EINVAL; + goto done; + } + if ((error = ltos_cmsgs_copyin(msg->msg_control, + msg->msg_controllen, &control, &controllen)) != 0) { + goto done; + } + msg->msg_control = control; + msg->msg_controllen = controllen; + } else { + msg->msg_control = control = NULL; + msg->msg_controllen = controllen = 0; + } + + len = uiop->uio_resid; + msg->msg_flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + nosig = ((flags & LX_MSG_NOSIGNAL) != 0); + + error = lx_socket_sendmsg(so, msg, uiop, CRED(), nosig); + if (error == EINTR) + lx_sock_syscall_restart(so, B_FALSE); +done: + if (control != NULL) { + kmem_free(control, controllen); + } + if (name != NULL) { + kmem_free(name, namelen); + } + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGSND, 1); + releasef(sock); + return (len - uiop->uio_resid); +} + +/* + * For both send and sendto Linux evaluates errors in a different order than + * we do internally. Specifically it will check the buffer address before + * checking if the socket is connected. This can lead to a different errno on + * us vs. Linux (seen with LTP) but we don't bother to emulate this. + */ +long +lx_send(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = NULL; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendto(int sock, void *buffer, size_t len, int flags, + struct sockaddr *dstaddr, socklen_t addrlen) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = (char *)dstaddr; + smsg.msg_namelen = addrlen; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + 
smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt <= 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } else { + iovsize = 0; + aiov = buf; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + } + + if (copyin((struct iovec32 *)smsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = aiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + aiov[i].iov_len = iovlen32; + aiov[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, aiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = aiov[i].iov_len; + + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + } + } + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + res = lx_send_common(sock, &smsg, &auio, flags); + + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + + return (res); +} + +long +lx_sendmmsg(int sock, void *msg, uint_t vlen, int flags) +{ + long ret = 0; + uint_t sent = 0; + + /* + * Linux caps vlen to UIO_MAXIOV (1024). + */ + if (vlen > IOV_MAX) + vlen = IOV_MAX; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_mmsghdr_t *hdr = msg; + + for (sent = 0; sent < vlen; sent++, hdr++) { + ret = lx_sendmsg(sock, (lx_msghdr_t *)hdr, flags); + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, &hdr->msg_len, sizeof (hdr->msg_len)); + } + } else { + lx_mmsghdr32_t *hdr = msg; + + for (sent = 0; sent < vlen; sent++, hdr++) { + ret = lx_sendmsg(sock, (lx_msghdr32_t *)hdr, flags); + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, &hdr->msg_len, sizeof (hdr->msg_len)); + } + } + + if (sent > 0) { + /* + * Any error code is deliberately discarded if any message + * was successfully sent. + */ + ttolwp(curthread)->lwp_errno = 0; + return (sent); + } + + return (ret); +} + +/* + * Linux socket option type definitions + * + * The protocol `levels` are well defined (see in.h) The option values are + * not so well defined. Linux often uses different values vs. Illumos + * although they mean the same thing. For example, IP_TOS in Linux is + * defined as value 1 but in Illumos it is defined as value 3. This table + * maps all the Protocol levels to their options and maps them between + * Linux and Illumos and vice versa. 
Hence the reason for the complexity. + * + * For a certain subset of sockopts, Linux will implicitly truncate optval + * input, so long as optlen meets a minimum size. Because illumos is strict + * about optlen, we must cap optlen for those options. + */ + +typedef struct lx_sockopt_map { + const int lsm_opt; /* Illumos-native equivalent */ + const int lsm_lcap; /* Cap optlen to this size. (Ignored if 0) */ +} lx_sockopt_map_t; + +typedef struct lx_proto_opts { + const lx_sockopt_map_t *lpo_entries; /* Linux->SunOS map entries */ + unsigned int lpo_max; /* max entries in table */ +} lx_proto_opts_t; + +#define OPTNOTSUP -1 /* we don't support it */ + +#define PROTO_SOCKOPTS(opts) \ + { (opts), sizeof ((opts)) / sizeof ((opts)[0]) } + +/* Shorten name so the columns can line up */ +#define IP_MREQ_SZ sizeof (struct ip_mreq) + +static const lx_sockopt_map_t ltos_ip_sockopts[LX_IP_UNICAST_IF + 1] = { + { OPTNOTSUP, 0 }, + { IP_TOS, sizeof (int) }, /* IP_TOS */ + { IP_TTL, sizeof (int) }, /* IP_TTL */ + { IP_HDRINCL, sizeof (int) }, /* IP_HDRINCL */ + { IP_OPTIONS, 0 }, /* IP_OPTIONS */ + { OPTNOTSUP, 0 }, /* IP_ROUTER_ALERT */ + { IP_RECVOPTS, sizeof (int) }, /* IP_RECVOPTS */ + { IP_RETOPTS, sizeof (int) }, /* IP_RETOPTS */ + { IP_PKTINFO, sizeof (int) }, /* IP_PKTINFO */ + { OPTNOTSUP, 0 }, /* IP_PKTOPTIONS */ + { OPTNOTSUP, 0 }, /* IP_MTUDISCOVER */ + { OPTNOTSUP, 0 }, /* IP_RECVERR */ + { IP_RECVTTL, sizeof (int) }, /* IP_RECVTTL */ + { OPTNOTSUP, 0 }, /* IP_RECVTOS */ + { OPTNOTSUP, 0 }, /* IP_MTU */ + { OPTNOTSUP, 0 }, /* IP_FREEBIND */ + { OPTNOTSUP, 0 }, /* IP_IPSEC_POLICY */ + { OPTNOTSUP, 0 }, /* IP_XFRM_POLICY */ + { OPTNOTSUP, 0 }, /* IP_PASSSEC */ + { OPTNOTSUP, 0 }, /* IP_TRANSPARENT */ + { OPTNOTSUP, 0 }, /* IP_ORIGDSTADDR */ + { OPTNOTSUP, 0 }, /* IP_MINTTL */ + { OPTNOTSUP, 0 }, /* IP_NODEFRAG */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IP_MULTICAST_IF, sizeof (int) }, /* IP_MULTICAST_IF */ + { IP_MULTICAST_TTL, sizeof (int) }, /* IP_MULTICAST_TTL */ + { IP_MULTICAST_LOOP, sizeof (int) }, /* IP_MULTICAST_LOOP */ + { IP_ADD_MEMBERSHIP, IP_MREQ_SZ }, /* IP_ADD_MEMBERSHIP */ + { IP_DROP_MEMBERSHIP, IP_MREQ_SZ }, /* IP_DROP_MEMBERSHIP */ + { IP_UNBLOCK_SOURCE, 0 }, /* IP_UNBLOCK_SOURCE */ + { IP_BLOCK_SOURCE, 0 }, /* IP_BLOCK_SOURCE */ + { IP_ADD_SOURCE_MEMBERSHIP, 0 }, /* IP_ADD_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_DROP_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_MSFILTER */ + { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */ + { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */ + { OPTNOTSUP, 0 }, /* IP_MULTICAST_ALL */ + { OPTNOTSUP, 0 } /* IP_UNICAST_IF */ +}; + +/* Shorten name so the columns can line up */ +#define IP6_MREQ_SZ sizeof (struct ipv6_mreq) + +static const lx_sockopt_map_t ltos_ipv6_sockopts[LX_IPV6_TCLASS + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* IPV6_ADDRFORM */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTINFO */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292DSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTOPTIONS */ + { IPV6_CHECKSUM, sizeof (int) }, /* IPV6_CHECKSUM */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPLIMIT */ 
+ { OPTNOTSUP, 0 }, /* IPV6_NEXTHOP */ + { OPTNOTSUP, 0 }, /* IPV6_AUTHHDR */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IPV6_UNICAST_HOPS, sizeof (int) }, /* IPV6_UNICAST_HOPS */ + { IPV6_MULTICAST_IF, sizeof (int) }, /* IPV6_MULTICAST_IF */ + { IPV6_MULTICAST_HOPS, sizeof (int) }, /* IPV6_MULTICAST_HOPS */ + { IPV6_MULTICAST_LOOP, sizeof (int) }, /* IPV6_MULTICAST_LOOP */ + { IPV6_ADD_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_JOIN_GROUP */ + { IPV6_DROP_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* IPV6_ROUTER_ALERT */ + { OPTNOTSUP, 0 }, /* IPV6_MTU_DISCOVER */ + { OPTNOTSUP, 0 }, /* IPV6_MTU */ + { OPTNOTSUP, 0 }, /* IPV6_RECVERR */ + { IPV6_V6ONLY, sizeof (int) }, /* IPV6_V6ONLY */ + { OPTNOTSUP, 0 }, /* IPV6_JOIN_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_LEAVE_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_IPSEC_POLICY */ + { OPTNOTSUP, 0 }, /* IPV6_XFRM_POLICY */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */ + { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */ + { IPV6_RECVPKTINFO, sizeof (int) }, /* IPV6_RECVPKTINFO */ + { IPV6_PKTINFO, 0 }, /* IPV6_PKTINFO */ + { IPV6_RECVHOPLIMIT, sizeof (int) }, /* IPV6_RECVHOPLIMIT */ + { IPV6_HOPLIMIT, 0 }, /* IPV6_HOPLIMIT */ + { OPTNOTSUP, 0 }, /* IPV6_RECVHOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDRDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RECVRTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RECVDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_DSTOPTS */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* IPV6_RECVTCLASS */ + { IPV6_TCLASS, sizeof (int) } /* IPV6_TCLASS */ +}; + +static const lx_sockopt_map_t ltos_icmpv6_sockopts[LX_ICMP6_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { ICMP6_FILTER, 0 } /* ICMP6_FILTER */ +}; + +/* + * Options marked as "in code" in their comment are handled in the + * lx_setsockopt_tcp() and lx_getsockopt_tcp() functions. + * + * For the Linux TCP_SYNCNT option (the number of SYN retransmits) we emulate + * that by interpreting the two connection interval settings: + * TCP_CONN_NOTIFY_THRESHOLD + * tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval + * TCP_CONN_ABORT_THRESHOLD + * tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval + * The system (re)transmits a SYN and performs a doubling backoff from the + * first timer until it passes the second timer. We determine the SYN count + * from these two values. Normally it will be 5. Also see the TCPS_SYN_SENT + * case in tcp_timer(); a tcp_second_ctimer_threshold value of 0 means to + * retransmit SYN indefinitely. + * + * For the Linux TCP_USER_TIMEOUT option we use our TCP_ABORT_THRESHOLD since + * this seems to be the closest match. This value is the + * tcp_second_timer_threshold, which gets initialized to the + * tcp_ip_abort_interval value. 
The tunable guide describes this as: + * For a given TCP connection, if TCP has been retransmitting for + * tcp_ip_abort_interval period of time and it has not received any + * acknowledgment from the other endpoint during this period, TCP closes + * this connection. + * The value is in milliseconds, which matches TCP_USER_TIMEOUT. + */ +static const lx_sockopt_map_t ltos_tcp_sockopts[LX_TCP_NOTSENT_LOWAT + 1] = { + { OPTNOTSUP, 0 }, + { TCP_NODELAY, sizeof (int) }, /* TCP_NODELAY */ + { TCP_MAXSEG, sizeof (int) }, /* TCP_MAXSEG - in code */ + { TCP_CORK, sizeof (int) }, /* TCP_CORK */ + { TCP_KEEPIDLE, sizeof (int) }, /* TCP_KEEPIDLE */ + { TCP_KEEPINTVL, sizeof (int) }, /* TCP_KEEPINTVL */ + { TCP_KEEPCNT, sizeof (int) }, /* TCP_KEEPCNT */ + { OPTNOTSUP, 0 }, /* TCP_SYNCNT - in code */ + { TCP_LINGER2, sizeof (int) }, /* TCP_LINGER2 */ + { OPTNOTSUP, 0 }, /* TCP_DEFER_ACCEPT - in code */ + { OPTNOTSUP, 0 }, /* TCP_WINDOW_CLAMP - in code */ + { OPTNOTSUP, 0 }, /* TCP_INFO */ + { OPTNOTSUP, 0 }, /* TCP_QUICKACK - in code */ + { OPTNOTSUP, 0 }, /* TCP_CONGESTION */ + { OPTNOTSUP, 0 }, /* TCP_MD5SIG */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* TCP_THIN_LINEAR_TIMEOUTS */ + { OPTNOTSUP, 0 }, /* TCP_THIN_DUPACK */ + { TCP_ABORT_THRESHOLD, sizeof (int) }, /* TCP_USER_TIMEOUT */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_QUEUE */ + { OPTNOTSUP, 0 }, /* TCP_QUEUE_SEQ */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_OPTIONS */ + { OPTNOTSUP, 0 }, /* TCP_FASTOPEN */ + { OPTNOTSUP, 0 }, /* TCP_TIMESTAMP */ + { OPTNOTSUP, 0 } /* TCP_NOTSENT_LOWAT */ +}; + +static const lx_sockopt_map_t ltos_igmp_sockopts[IGMP_MTRACE + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MINLEN, 0 }, /* IGMP_MINLEN */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MEMBERSHIP_QUERY, 0 }, /* IGMP_HOST_MEMBERSHIP_QUERY */ + { IGMP_V1_MEMBERSHIP_REPORT, 0 }, /* IGMP_HOST_MEMBERSHIP_REPORT */ + { IGMP_DVMRP, 0 }, /* IGMP_DVMRP */ + { IGMP_PIM, 0 }, /* IGMP_PIM */ + { OPTNOTSUP, 0 }, /* IGMP_TRACE */ + { IGMP_V2_MEMBERSHIP_REPORT, 0 }, /* IGMPV2_HOST_MEMBERSHIP_REPORT */ + { IGMP_V2_LEAVE_GROUP, 0 }, /* IGMP_HOST_LEAVE_MESSAGE */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MTRACE_RESP, 0 }, /* IGMP_MTRACE_RESP */ + { IGMP_MTRACE, 0 } /* IGMP_MTRACE */ +}; + +static const lx_sockopt_map_t ltos_socket_sockopts[LX_SO_BPF_EXTENSIONS + 1] = { + { OPTNOTSUP, 0 }, + { SO_DEBUG, sizeof (int) }, /* SO_DEBUG */ + { SO_REUSEADDR, sizeof (int) }, /* SO_REUSEADDR */ + { SO_TYPE, 0 }, /* SO_TYPE */ + { SO_ERROR, 0 }, /* SO_ERROR */ + { SO_DONTROUTE, sizeof (int) }, /* SO_DONTROUTE */ + { SO_BROADCAST, sizeof (int) }, /* SO_BROADCAST */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUF */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUF */ + { SO_KEEPALIVE, sizeof (int) }, /* SO_KEEPALIVE */ + { SO_OOBINLINE, sizeof (int) }, /* SO_OOBINLINE */ + { OPTNOTSUP, 0 }, /* SO_NO_CHECK */ + { OPTNOTSUP, 0 }, /* SO_PRIORITY */ + { SO_LINGER, 0 }, /* SO_LINGER */ + { OPTNOTSUP, 0 }, /* SO_BSDCOMPAT */ + { SO_REUSEPORT, sizeof (int) }, /* SO_REUSEPORT */ + { SO_RECVUCRED, sizeof (int) }, /* SO_PASSCRED */ + { OPTNOTSUP, 0 }, /* SO_PEERCRED */ + { SO_RCVLOWAT, sizeof (int) }, /* SO_RCVLOWAT */ + { SO_SNDLOWAT, sizeof 
(int) }, /* SO_SNDLOWAT */ + { SO_RCVTIMEO, 0 }, /* SO_RCVTIMEO */ + { SO_SNDTIMEO, 0 }, /* SO_SNDTIMEO */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_AUTHENTICATION */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_TRANSPORT */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_NETWORK */ + { OPTNOTSUP, 0 }, /* SO_BINDTODEVICE */ + { SO_ATTACH_FILTER, 0 }, /* SO_ATTACH_FILTER */ + { SO_DETACH_FILTER, 0 }, /* SO_DETACH_FILTER */ + { OPTNOTSUP, 0 }, /* SO_PEERNAME */ + { SO_TIMESTAMP, sizeof (int) }, /* SO_TIMESTAMP */ + { SO_ACCEPTCONN, 0 }, /* SO_ACCEPTCONN */ + { OPTNOTSUP, 0 }, /* SO_PEERSEC */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUFFORCE */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUFFORCE */ + { OPTNOTSUP, 0 }, /* SO_PASSSEC */ + { OPTNOTSUP, 0 }, /* SO_TIMESTAMPNS */ + { OPTNOTSUP, 0 }, /* SO_MARK */ + { OPTNOTSUP, 0 }, /* SO_TIMESTAMPING */ + { SO_PROTOTYPE, 0 }, /* SO_PROTOCOL */ + { SO_DOMAIN, 0 }, /* SO_DOMAIN */ + { OPTNOTSUP, 0 }, /* SO_RXQ_OVFL */ + { OPTNOTSUP, 0 }, /* SO_WIFI_STATUS */ + { OPTNOTSUP, 0 }, /* SO_PEEK_OFF */ + { OPTNOTSUP, 0 }, /* SO_NOFCS */ + { OPTNOTSUP, 0 }, /* SO_LOCK_FILTER */ + { OPTNOTSUP, 0 }, /* SO_SELECT_ERR_QUEUE */ + { OPTNOTSUP, 0 }, /* SO_BUSY_POLL */ + { OPTNOTSUP, 0 }, /* SO_MAX_PACING_RATE */ + { OPTNOTSUP, 0 } /* SO_BPF_EXTENSIONS */ +}; + +static const lx_sockopt_map_t ltos_raw_sockopts[LX_ICMP_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 } /* ICMP_FILTER */ +}; + +static const lx_sockopt_map_t ltos_packet_sockopts[LX_PACKET_STATISTICS + 1] = { + { OPTNOTSUP, 0 }, + { PACKET_ADD_MEMBERSHIP, 0 }, /* PACKET_ADD_MEMBERSHIP */ + { PACKET_DROP_MEMBERSHIP, 0 }, /* PACKET_DROP_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* PACKET_RECV_OUTPUT */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* PACKET_RX_RING */ + { PACKET_STATISTICS, 0 } /* PACKET_STATISTICS */ +}; + +/* Needed for SO_ATTACH_FILTER */ +struct lx_bpf_program { + unsigned short bf_len; + caddr_t bf_insns; +}; + +/* Invert filter fields as Linux expects */ +#define LX_ICMP6_FILTER_INVERT(filterp) ( \ + ((filterp)->__icmp6_filt[0] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[1] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[2] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[3] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[4] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[5] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[6] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[7] ^= 0xFFFFFFFFU)) + +static boolean_t +lx_sockopt_lookup(lx_proto_opts_t tbl, int *optname, socklen_t *optlen) +{ + const lx_sockopt_map_t *entry; + + if (*optname > tbl.lpo_max) { + return (B_FALSE); + } + entry = &tbl.lpo_entries[*optname]; + if (entry->lsm_opt == OPTNOTSUP) { + return (B_FALSE); + } + *optname = entry->lsm_opt; + /* Truncate the optlen if needed/allowed */ + if (entry->lsm_lcap != 0 && *optlen > entry->lsm_lcap) { + *optlen = entry->lsm_lcap; + } + return (B_TRUE); +} + +static int +lx_mcast_common(sonode_t *so, int level, int optname, void *optval, + socklen_t optlen) +{ + int error; + struct group_req gr; + lx_sockaddr_storage_t *lxss; + + ASSERT(optname == LX_MCAST_JOIN_GROUP || + optname == LX_MCAST_LEAVE_GROUP); + + /* + * For MCAST_JOIN_GROUP and MCAST_LEAVE_GROUP, Linux uses a + * gr_group that has a different size from the native gr_group. + * We need to translate to the native gr_group taking special + * care to do the right thing when dealing with a 32-bit program + * making a call into a 64-bit kernel. 
+ */ + + bzero(&gr, sizeof (gr)); + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + if (optlen != sizeof (lx_group_req32_t)) { + return (EINVAL); + } + + lx_group_req32_t *lxgr = optval; + + /* use the 32-bit type */ + gr.gr_interface = lxgr->lxgr_interface; + lxss = &lxgr->lxgr_group; + } else +#endif /* defined(_SYSCALL32_IMPL) */ + { + if (optlen != sizeof (lx_group_req_t)) { + return (EINVAL); + } + + lx_group_req_t *lxgr = optval; + + gr.gr_interface = lxgr->lxgr_interface; + lxss = &lxgr->lxgr_group; + } + + bcopy(lxss, &gr.gr_group, sizeof (*lxss)); + gr.gr_group.ss_family = LTOS_FAMILY(lxss->lxss_family); + + optlen = sizeof (gr); + optname = (optname == LX_MCAST_JOIN_GROUP) ? + MCAST_JOIN_GROUP : MCAST_LEAVE_GROUP; + + error = socket_setsockopt(so, level, optname, &gr, + optlen, CRED()); + return (error); +} + +static int +lx_setsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts); + + switch (optname) { + case LX_IP_RECVERR: + /* + * Ping sets this option to receive errors on raw sockets. + * Currently we just ignore it to make ping happy. From the + * Linux ip.7 man page: + * + * For raw sockets, IP_RECVERR enables passing of all + * received ICMP errors to the application. + * + * Programs known to depend upon this: + * - ping + * - traceroute + * - mount.nfs + */ + return (0); + + case LX_IP_MTU_DISCOVER: { + int val; + + /* + * We translate Linux's IP_MTU_DISCOVER into our IP_DONTFRAG, + * allowing this be a byte or an integer and observing the + * inverted sense of the two relative to one another (and + * translating accordingly). + */ + if (optlen < sizeof (int)) { + val = *((uint8_t *)optval); + } else { + val = *((int *)optval); + } + + switch (val) { + case LX_IP_PMTUDISC_DONT: + val = 1; + break; + + case LX_IP_PMTUDISC_DO: + case LX_IP_PMTUDISC_WANT: + val = 0; + break; + + default: + return (EOPNOTSUPP); + } + + error = socket_setsockopt(so, IPPROTO_IP, IP_DONTFRAG, + &val, sizeof (val), CRED()); + return (error); + } + + case LX_IP_MULTICAST_TTL: + case LX_IP_MULTICAST_LOOP: + /* + * For IP_MULTICAST_TTL and IP_MULTICAST_LOOP, Linux defines + * the option value to be an integer while we define it to be + * an unsigned character. To prevent the kernel from spitting + * back an error on an illegal length, verify that the option + * value is less than UCHAR_MAX before truncating optlen. + */ + if (optlen <= 0 || optlen > sizeof (int) || + *intval > UINT8_MAX) { + return (EINVAL); + } + optlen = sizeof (uchar_t); + break; + + case LX_MCAST_JOIN_GROUP: + case LX_MCAST_LEAVE_GROUP: + error = lx_mcast_common(so, IPPROTO_IP, optname, optval, + optlen); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + switch (optname) { + case LX_IPV6_MTU: + /* + * There isn't a good translation for IPV6_MTU and certain apps + * such as bind9 will bail if it cannot be set. + * We just lie about the success for now. 
+ */ + return (0); + case LX_MCAST_JOIN_GROUP: + case LX_MCAST_LEAVE_GROUP: + error = lx_mcast_common(so, IPPROTO_IPV6, optname, optval, + optlen); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_IPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_icmpv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER && optval != NULL) { + /* + * Surprise! The input to ICMP6_FILTER on Linux is inverted + * when compared to illumos. + */ + if (optlen != sizeof (icmp6_filter_t)) { + return (EINVAL); + } + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + cred_t *cr = CRED(); + uint32_t rto_max, abrt_thresh; + boolean_t abrt_changed = B_FALSE, rto_max_changed = B_FALSE; + + if (optname == LX_TCP_WINDOW_CLAMP || optname == LX_TCP_QUICKACK) { + /* It appears safe to lie and say we did these. */ + return (0); + } + + if (optname == LX_TCP_MAXSEG) { + /* + * We can get, but not set, TCP_MAXSEG. However, it appears + * safe to lie and say we did this. A future extension might + * be to allow setting this before a connection is established. + */ + return (0); + } + + if (optname == LX_TCP_SYNCNT) { + int intval; + uint64_t syn_last_backoff; + uint_t syn_cnt, syn_backoff, len; + + /* + * See the comment above the ltos_tcp_sockopts table for an + * explanation of the TCP_SYNCNT emulation. + */ + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = *(int *)optval; + if (intval > 255) { + return (EINVAL); + } + + len = sizeof (syn_backoff); + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0, cr); + if (error != 0) + return (error); + + syn_last_backoff = syn_backoff; + for (syn_cnt = 0; syn_cnt < intval; syn_cnt++) { + syn_last_backoff *= 2; + /* + * Since the tcps_ip_abort_cinterval is milliseconds and + * stored as a uint_t, it's basically impossible to get + * up to the Linux limit of 255 SYN retries due to the + * doubling on the backoff. + */ + if (syn_last_backoff > UINT_MAX) { + return (EINVAL); + } + } + + syn_backoff = (uint_t)syn_last_backoff; + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_CONN_ABORT_THRESHOLD, &syn_backoff, len, cr); + return (error); + } + + if (optname == LX_TCP_DEFER_ACCEPT) { + int *intval; + char *dfp; + + /* + * Emulate TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we can't emulate the timeout aspect so treat any + * non-zero value as enabling and zero as disabling. + */ + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + + /* + * socket_setsockopt asserts that the optval is aligned, so + * we use kmem_alloc to ensure this. 
+ */ + dfp = (char *)kmem_alloc(sizeof (DATAFILT), KM_SLEEP); + (void) strcpy(dfp, DATAFILT); + + if (*intval > 0) { + error = socket_setsockopt(so, SOL_FILTER, FIL_ATTACH, + dfp, 9, cr); + if (error == EEXIST) { + error = 0; + } + } else { + error = socket_setsockopt(so, SOL_FILTER, FIL_DETACH, + dfp, 9, cr); + if (error == ENXIO) { + error = 0; + } + } + kmem_free(dfp, sizeof (DATAFILT)); + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + if (optname == TCP_KEEPINTVL) { + /* + * When setting TCP_KEEPINTVL there is an unfortunate set of + * dependencies. TCP_KEEPINTVL must be <= TCP_RTO_MAX and + * TCP_RTO_MAX must be <= TCP_ABORT_THRESHOLD. Thus, we may + * have to increase one or both of these in order to increase + * TCP_KEEPINTVL. Note that TCP_KEEPINTVL is passed in seconds + * but TCP_RTO_MAX and TCP_ABORT_THRESHOLD are in milliseconds. + * Also note that we currently make no attempt to handle + * concurrent application threads simultaneously changing + * TCP_KEEPINTVL, since that is unlikely. We could revisit + * locking if it ever becomes an issue. + */ + uint32_t new_val = *(uint_t *)optval * 1000; + uint32_t len; + + /* + * Linux limits this to 32k, so we do too. However, anything + * over 2 hours (7200000 ms) will fail anyway due to the + * system-wide default (see "_rexmit_interval_max" in + * tcp_tunables.c). Our 2 hour default seems reasonable as a + * practical limit for now. + */ + if (*(uint_t *)optval > SHRT_MAX) + return (EINVAL); + + len = sizeof (rto_max); + if ((error = socket_getsockopt(so, IPPROTO_TCP, TCP_RTO_MAX, + &rto_max, &len, 0, cr)) != 0) + return (error); + len = sizeof (abrt_thresh); + if ((error = socket_getsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &abrt_thresh, &len, 0, cr)) != 0) + return (error); + + if (new_val > abrt_thresh) { + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &new_val, sizeof (new_val), + cr); + if (error != 0) + goto fail; + abrt_changed = B_TRUE; + } + if (new_val > rto_max) { + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_RTO_MAX, &new_val, sizeof (new_val), cr); + if (error != 0) + goto fail; + rto_max_changed = B_TRUE; + } + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, cr); + +fail: + if (error != 0 && optname == TCP_KEEPINTVL) { + /* + * If changing TCP_KEEPINTVL failed then we may need to + * restore the previous values for TCP_ABORT_THRESHOLD and + * TCP_RTO_MAX. + */ + if (rto_max_changed) { + (void) socket_setsockopt(so, IPPROTO_TCP, + TCP_RTO_MAX, &rto_max, + sizeof (rto_max), cr); + } + if (abrt_changed) { + (void) socket_setsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &abrt_thresh, + sizeof (abrt_thresh), cr); + } + } + + return (error); +} + +static int +lx_setsockopt_socket(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + struct lx_bpf_program *lbp; + int *intval; + struct bpf_program bp; + + switch (optname) { + case LX_SO_BSDCOMPAT: + /* Linux ignores this option. */ + return (0); + + case LX_SO_TIMESTAMP: + /* + * SO_TIMESTAMP is not supported on AF_UNIX sockets but we have + * some of those which apps use for logging, etc., so pretend + * this worked. 
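+ *
+ * An illustrative caller is a syslog-style logger that does:
+ *
+ *     int on = 1;
+ *     (void) setsockopt(logfd, SOL_SOCKET, SO_TIMESTAMP, &on,
+ *         sizeof (on));
+ *
+ * on an AF_UNIX datagram socket; returning 0 keeps such callers
+ * working even though no SCM_TIMESTAMP cmsg will be delivered.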
+ */ + if (so->so_family == AF_UNIX) { + return (0); + } + break; + + case LX_SO_ATTACH_FILTER: + /* + * Convert bpf program struct + */ + if (optlen != sizeof (struct lx_bpf_program)) { + return (EINVAL); + } + lbp = (struct lx_bpf_program *)optval; + bp.bf_len = lbp->bf_len; + /* LINTED: alignment */ + bp.bf_insns = (struct bpf_insn *)lbp->bf_insns; + optval = &bp; + break; + + case LX_SO_PASSSEC: + /* + * SO_PASSSEC is very similar to SO_PASSCRED (emulated by + * SO_RECVUCRED) in that it requests that cmsgs containing + * identity information be attached to recieved messages. + * Instead of ucred information, security-module-specific + * information such as selinux label is expected + * + * Since LX does not at all support selinux today, the + * option is silently accepted. + */ + return (0); + + case LX_SO_PASSCRED: + /* + * In many cases, the Linux SO_PASSCRED is mapped to the SunOS + * SO_RECVUCRED to enable the passing of peer credential + * information via received cmsgs. One exception is for + * connection-oriented AF_UNIX sockets which do not yet support + * that option. Instead, we track the setting internally and, + * when there is appropriate cmsg space, emulate the credential + * passing by querying the STREAMS ioctl. + * + * Note: this approach is broken for the case when a process + * sets up a Unix-domain socket with SO_PASSCRED, then forks + * one or more children, and expects to use the cmsg cred to + * accurately know which child pid sent the message (currently + * a pid is recorded when the socket is connected, not for each + * msg sent). getpeerucred(3c) suffers from the same problem. + * We have a workaround in lx_socketpair (use DGRAM if + * SEQPACKET), but the general case requires enhancing our + * streams support to allow passing credential cmsgs on a + * connection-oriented Unix socket. + */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + lx_socket_aux_data_t *sad; + + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + sad = lx_sad_acquire(SOTOV(so)); + if (*intval == 0) { + sad->lxsad_flags &= ~LXSAD_FL_STRCRED; + } else { + sad->lxsad_flags |= LXSAD_FL_STRCRED; + } + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_SOCKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + switch (optname) { + case LX_ICMP_FILTER: + /* + * This option is currently ignored to appease ping. + */ + return (0); + + case LX_IPV6_CHECKSUM: + /* + * Ping6 tries to set the IPV6_CHECKSUM offset in a way that + * illumos won't allow. Quietly ignore this to prevent it from + * complaining. 
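+ *
+ * The call being papered over is roughly of the form (illustrative):
+ *
+ *     int off = 2;
+ *     (void) setsockopt(s, SOL_RAW, IPV6_CHECKSUM, &off,
+ *         sizeof (off));
+ *
+ * where 2 is the offset of the checksum field in the ICMPv6 header.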
+ */ + return (0); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_packet(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + struct packet_mreq *mr; + + switch (optname) { + case LX_PACKET_ADD_MEMBERSHIP: + case LX_PACKET_DROP_MEMBERSHIP: + /* Convert Linux mr_type to illumos */ + if (optlen != sizeof (struct packet_mreq)) { + return (EINVAL); + } + mr = (struct packet_mreq *)optval; + if (--mr->mr_type > PACKET_MR_ALLMULTI) + return (EINVAL); + optval = mr; + break; + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_PACKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IGMP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_getsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IPV6, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_icmpv6(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER) { + error = socket_getsockopt(so, IPPROTO_ICMPV6, ICMP6_FILTER, + optval, optlen, 0, CRED()); + + /* + * ICMP6_FILTER is inverted on Linux. Make it so before copying + * back to caller's buffer. + */ + if (error == 0) { + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + 0, CRED()); + return (error); +} + +static int +lx_getsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + cred_t *cr = CRED(); + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + + switch (optname) { + case LX_TCP_WINDOW_CLAMP: + case LX_TCP_QUICKACK: + /* + * We do not support these options but some apps rely on them. + * Rather than return an error we just return 0. This isn't + * exactly a lie, since the options really aren't set, but it's + * not the whole truth either. Fortunately, we aren't under + * oath. 
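+ *
+ * Illustrative caller behavior: an application that reads
+ * TCP_QUICKACK,
+ *
+ *     int qa;
+ *     socklen_t ql = sizeof (qa);
+ *     (void) getsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &qa, &ql);
+ *
+ * simply sees qa == 0 and, one hopes, carries on.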
+ */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_TCP_SYNCNT: + /* + * See the comment above the ltos_tcp_sockopts table for an + * explanation of the TCP_SYNCNT emulation. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + uint_t syn_cnt, syn_backoff, syn_abortconn, len; + + len = sizeof (syn_backoff); + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0, + cr); + if (error != 0) + return (error); + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_ABORT_THRESHOLD, &syn_abortconn, &len, 0, + cr); + if (error != 0) + return (error); + + syn_cnt = 0; + while (syn_backoff < syn_abortconn) { + syn_cnt++; + syn_backoff *= 2; + } + if (syn_cnt > 255) /* clamp to Linux limit */ + syn_cnt = 255; + + *intval = syn_cnt; + *optlen = sizeof (int); + } + + return (error); + + case LX_TCP_DEFER_ACCEPT: + /* + * We do support TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we don't emulate the timeout aspect so treat the + * existence as 1 and absence as 0. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + struct fil_info fi[10]; + int i; + socklen_t len = sizeof (fi); + + if ((error = socket_getsockopt(so, SOL_FILTER, + FIL_LIST, fi, &len, 0, cr)) != 0) { + *optlen = sizeof (int); + return (error); + } + + *intval = 0; + len = len / sizeof (struct fil_info); + for (i = 0; i < len; i++) { + if (fi[i].fi_flags == FILF_PROG && + strcmp(fi[i].fi_name, "datafilt") == 0) { + *intval = 1; + break; + } + } + } + *optlen = sizeof (int); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_TCP, optname, optval, optlen, 0, + cr); + return (error); +} + +static int +lx_getsockopt_socket(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + + switch (optname) { + case LX_SO_TYPE: + /* + * Special handling for connectionless AF_UNIX sockets. + * See lx_socketpair for more details. + */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) == 0) { + lx_socket_aux_data_t *sad; + + if (*optlen < sizeof (int)) + return (EINVAL); + sad = lx_sad_acquire(SOTOV(so)); + if ((sad->lxsad_flags & LXSAD_FL_EMULSEQPKT) != 0) { + *intval = LX_SOCK_SEQPACKET; + *optlen = sizeof (int); + mutex_exit(&sad->lxsad_lock); + return (0); + } + mutex_exit(&sad->lxsad_lock); + } + break; + + case LX_SO_PASSSEC: + /* + * Communicate value of 0 since selinux-related functionality + * is not supported. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_SO_PASSCRED: + /* + * Special handling for connection-oriented AF_UNIX sockets. + * See lx_setsockopt_socket for more details. + */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + lx_socket_aux_data_t *sad; + + if (*optlen < sizeof (int)) { + return (EINVAL); + } + sad = lx_sad_acquire(SOTOV(so)); + *intval = ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0 ? 
+ 0 : 1); + *optlen = sizeof (int); + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + + case LX_SO_PEERCRED: + if (*optlen < sizeof (struct lx_ucred)) { + error = EINVAL; + } else { + struct lx_ucred *lcred = (struct lx_ucred *)optval; + + mutex_enter(&so->so_lock); + if ((so->so_mode & SM_CONNREQUIRED) == 0) { + error = ENOTSUP; + } else if (so->so_peercred == NULL) { + error = EINVAL; + } else { + lcred->lxu_uid = crgetuid(so->so_peercred); + lcred->lxu_gid = crgetgid(so->so_peercred); + lcred->lxu_pid = so->so_cpid; + } + mutex_exit(&so->so_lock); + } + *optlen = sizeof (struct lx_ucred); + return (error); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, SOL_SOCKET, optname, optval, optlen, 0, + CRED()); + + if (error == 0) { + switch (optname) { + case SO_TYPE: + /* translate our type back to Linux */ + *intval = STOL_SOCKTYPE(*intval); + break; + + case SO_ERROR: + *intval = lx_errno(*intval, EINVAL); + break; + default: + break; + } + } + return (error); +} + +static int +lx_getsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_RAW, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_packet(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, SOL_PACKET, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IGMP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +long +lx_setsockopt(int sock, int level, int optname, void *optval, socklen_t optlen) +{ + struct sonode *so; + file_t *fp; + int buflen = 0; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + int error = 0; + + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_alloc(optlen, KM_SLEEP); + } else { + /* + * Zero the on-stack buffer to avoid poisoning smaller + * optvals with stack garbage. 
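+ *
+ * For example, a 4-byte optval copied into the 16-byte (on LP64)
+ * stack buffer would otherwise leave the trailing bytes holding
+ * whatever was previously on the stack; handlers that later widen or
+ * truncate optlen (such as the IP_MULTICAST_TTL case) must not see
+ * stale data.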
+ */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + if (copyin(optval, optbuf, optlen) != 0) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(EFAULT)); + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_setsockopt_ip(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_setsockopt_ipv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_setsockopt_icmpv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_TCP: + error = lx_setsockopt_tcp(so, optname, optbuf, optlen); + break; + case LX_SOL_SOCKET: + error = lx_setsockopt_socket(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_RAW: + error = lx_setsockopt_raw(so, optname, optbuf, optlen); + break; + case LX_SOL_PACKET: + error = lx_setsockopt_packet(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_IGMP: + error = lx_setsockopt_igmp(so, optname, optbuf, optlen); + break; + case LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. + */ + error = socket_setsockopt(so, LX_SOL_NETLINK, optname, optval, + optlen, CRED()); + break; + default: + error = ENOPROTOOPT; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + (void) snprintf(buf, LX_UNSUP_BUFSZ, "setsockopt(%d, %d)", + level, optname); + lx_unsupported(buf); + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getsockopt(int sock, int level, int optname, void *optval, + socklen_t *optlenp) +{ + struct sonode *so; + file_t *fp; + int error = 0, buflen = 0; + socklen_t optlen; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + + if (copyin(optlenp, &optlen, sizeof (optlen)) != 0) { + return (set_errno(EFAULT)); + } + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_zalloc(optlen, KM_SLEEP); + } else { + /* zero the on-stack buffer, just in case */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_getsockopt_ip(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_getsockopt_ipv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_getsockopt_icmpv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_TCP: + error = lx_getsockopt_tcp(so, optname, optbuf, &optlen); + break; + case LX_SOL_SOCKET: + error = lx_getsockopt_socket(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_RAW: + error = lx_getsockopt_raw(so, optname, optbuf, &optlen); + break; + case LX_SOL_PACKET: + error = lx_getsockopt_packet(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IGMP: + error = lx_getsockopt_igmp(so, optname, optbuf, &optlen); + break; + case LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. 
+ */ + error = socket_getsockopt(so, LX_SOL_NETLINK, optname, optval, + &optlen, 0, CRED()); + break; + default: + error = EOPNOTSUPP; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + (void) snprintf(buf, LX_UNSUP_BUFSZ, "getsockopt(%d, %d)", + level, optname); + lx_unsupported(buf); + } + if (copyout(&optlen, optlenp, sizeof (optlen)) != 0) { + error = EFAULT; + } + if (error == 0 && optlen > 0) { + VERIFY(optlen <= sizeof (stkbuf) || optlen <= buflen); + if (copyout(optbuf, optval, optlen) != 0) { + error = EFAULT; + } + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getname_common(lx_getname_type_t type, int sockfd, void *np, int *nlp) +{ + struct sockaddr_storage buf; + struct sockaddr *name = (struct sockaddr *)&buf; + socklen_t namelen, namelen_orig; + int err, tmp; + struct sonode *so; + + /* We need to validate the name address up front to pass LTP. */ + if (copyin(np, &tmp, sizeof (tmp)) != 0) + return (set_errno(EFAULT)); + + if (copyin(nlp, &namelen, sizeof (socklen_t)) != 0) + return (set_errno(EFAULT)); + namelen_orig = namelen; + + /* LTP can pass -1 */ + if ((int)namelen < 0) + return (set_errno(EINVAL)); + + if ((so = getsonode(sockfd, &err, NULL)) == NULL) + return (set_errno(err)); + + bzero(&buf, sizeof (buf)); + namelen = sizeof (struct sockaddr_storage); + if (type == LX_GETPEERNAME) { + err = socket_getpeername(so, name, &namelen, B_FALSE, CRED()); + } else { + err = socket_getsockname(so, name, &namelen, CRED()); + } + + if (err == 0) { + ASSERT(namelen <= so->so_max_addr_len); + err = stol_sockaddr_copyout(name, namelen, + (struct sockaddr *)np, (socklen_t *)nlp, namelen_orig); + } + + releasef(sockfd); + return (err != 0 ? set_errno(err) : 0); +} + +long +lx_getpeername(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETPEERNAME, sockfd, np, nlp)); +} + +long +lx_getsockname(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETSOCKNAME, sockfd, np, nlp)); +} + +static int +lx_accept_common(int sock, struct sockaddr *name, socklen_t *nlp, int flags) +{ + struct sonode *so; + file_t *fp; + int error; + socklen_t namelen; + struct sonode *nso; + struct vnode *nvp; + struct file *nfp; + int nfd; + int arg; + + if (flags & ~(LX_SOCK_CLOEXEC | LX_SOCK_NONBLOCK)) { + return (set_errno(EINVAL)); + } + + if ((so = getsonode(sock, &error, &fp)) == NULL) + return (set_errno(error)); + + if (name != NULL) { + /* + * The Linux man page says that -1 is returned and errno is set + * to EFAULT if the "name" address is bad, but it is silent on + * what to set errno to if the "namelen" address is bad. + * LTP expects EINVAL. + * + * Note that we must first check the name pointer, as the Linux + * docs state nothing is copied out if the "name" pointer is + * NULL. If it is NULL, we don't care about the namelen + * pointer's value or about dereferencing it. + */ + if (copyin(nlp, &namelen, sizeof (namelen))) { + releasef(sock); + return (set_errno(EINVAL)); + } + if (namelen == 0) { + name = NULL; + } + } else { + namelen = 0; + } + + /* + * Allocate the user fd before socket_accept() in order to + * catch EMFILE errors before calling socket_accept(). 
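+ *
+ * Doing the falloc() first means a process at its descriptor limit
+ * fails with EMFILE before a connection is pulled off the accept
+ * queue, rather than accepting a connection only to drop it.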
+ */ + if ((error = falloc(NULL, FWRITE|FREAD, &nfp, &nfd)) != 0) { + eprintsoline(so, EMFILE); + releasef(sock); + return (set_errno(error)); + } + if ((error = socket_accept(so, fp->f_flag, CRED(), &nso)) != 0) { + if (error == EINTR) + lx_sock_syscall_restart(so, B_TRUE); + setf(nfd, NULL); + unfalloc(nfp); + releasef(sock); + return (set_errno(error)); + } + + nvp = SOTOV(nso); + + if (namelen != 0) { + socklen_t addrlen = sizeof (struct sockaddr_storage); + struct sockaddr_storage buf; + struct sockaddr *addrp = (struct sockaddr *)&buf; + + if ((error = socket_getpeername(nso, addrp, &addrlen, B_TRUE, + CRED())) == 0) { + error = stol_sockaddr_copyout(addrp, addrlen, + name, nlp, namelen); + /* + * Logic might dictate that we should check if we can + * write to the namelen pointer earlier so we don't + * accept a pending connection only to fail the call + * because we can't write the namelen value back out. + * However, testing shows Linux does indeed fail the + * call after accepting the connection so we must + * behave in a compatible manner. + */ + } else { + ASSERT(error == EINVAL || error == ENOTCONN); + error = ECONNABORTED; + } + } + + if (error != 0) { + setf(nfd, NULL); + unfalloc(nfp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + releasef(sock); + return (set_errno(error)); + } + + /* Fill in the entries that falloc reserved */ + nfp->f_vnode = nvp; + mutex_exit(&nfp->f_tlock); + setf(nfd, nfp); + + /* Act on LX_SOCK_CLOEXEC from flags */ + if (flags & LX_SOCK_CLOEXEC) { + f_setfd(nfd, FD_CLOEXEC); + } + + /* + * In Linux, accept()ed sockets do not inherit anything set by fcntl(), + * so either explicitly set the flags or filter those out. + * + * The VOP_SETFL code is a simplification of the F_SETFL code in + * fcntl(). Ignore any errors from VOP_SETFL. + */ + arg = 0; + if (flags & LX_SOCK_NONBLOCK) + arg |= FNONBLOCK; + + error = VOP_SETFL(nvp, nfp->f_flag, arg, nfp->f_cred, NULL); + if (error != 0) { + eprintsoline(so, error); + error = 0; + } else { + mutex_enter(&nfp->f_tlock); + nfp->f_flag &= ~FMASK | (FREAD|FWRITE); + nfp->f_flag |= arg; + mutex_exit(&nfp->f_tlock); + } + + releasef(sock); + return (nfd); +} + +long +lx_accept(int sockfd, void *np, int *nlp) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, 0)); +} + +long +lx_accept4(int sockfd, void *np, int *nlp, int flags) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, flags)); +} + +long +lx_listen(int sockfd, int backlog) +{ + return (listen(sockfd, backlog, 0)); +} + +long +lx_shutdown(int sockfd, int how) +{ + return (shutdown(sockfd, how, 0)); +} + +/* + * Connect two sockets together for a socketpair. This is derived from + * so_socketpair, but forgoes the task of dealing with file descriptors. + */ +static int +lx_socketpair_connect(file_t *fp1, file_t *fp2) +{ + sonode_t *so1, *so2; + sotpi_info_t *sti1, *sti2; + struct sockaddr_ux name; + int error; + + so1 = VTOSO(fp1->f_vnode); + so2 = VTOSO(fp2->f_vnode); + sti1 = SOTOTPI(so1); + sti2 = SOTOTPI(so2); + + VERIFY(so1->so_ops == &sotpi_sonodeops && + so2->so_ops == &sotpi_sonodeops); + + if (so1->so_type == SOCK_DGRAM) { + /* + * Bind both sockets and connect them with each other. 
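+ *
+ * Illustrative Linux-side usage that exercises this path:
+ *
+ *     int sv[2];
+ *     if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) == 0)
+ *             ...
+ *
+ * SOCK_SEQPACKET pairs also land here, since lx_socketpair()
+ * converts them to SOCK_DGRAM and sets LXSAD_FL_EMULSEQPKT.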
+ */ + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + name.sou_family = AF_UNIX; + name.sou_addr = sti2->sti_ux_laddr; + error = socket_connect(so1, (struct sockaddr *)&name, + (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED()); + if (error) { + return (error); + } + name.sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, (struct sockaddr *)&name, + (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED()); + return (error); + } else { + sonode_t *nso; + + /* + * Bind both sockets, with 'so1' being a listener. Connect + * 'so2' to 'so1', doing so as nonblocking to avoid waiting for + * soaccept to complete. Accept the connection on 'so1', + * replacing the socket/vnode in 'fp1' with the new connection. + * + * We could simply call socket_listen() here (which would do the + * binding automatically) if the code didn't rely on passing + * _SOBIND_NOXLATE to the TPI implementation of socket_bind(). + */ + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC| + _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR, CRED()); + if (error) { + return (error); + } + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + + name.sou_family = AF_UNIX; + name.sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, + (struct sockaddr *)&name, + (socklen_t)sizeof (name), + FNONBLOCK, _SOCONNECT_NOXLATE, CRED()); + if (error != 0 && error != EINPROGRESS) { + return (error); + } + + error = socket_accept(so1, 0, CRED(), &nso); + if (error) { + return (error); + } + + /* wait for so2 being SS_CONNECTED */ + mutex_enter(&so2->so_lock); + error = sowaitconnected(so2, 0, 0); + mutex_exit(&so2->so_lock); + if (error != 0) { + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + return (error); + } + + (void) socket_close(so1, 0, CRED()); + socket_destroy(so1); + fp1->f_vnode = SOTOV(nso); + } + return (0); +} + +long +lx_socketpair(int domain, int type, int protocol, int *sv) +{ + int err, options, fds[2]; + file_t *fps[2]; + boolean_t emul_seqp = B_FALSE; + + /* + * For the special case of SOCK_SEQPACKET for AF_UNIX, we want to treat + * this as a SOCK_DGRAM. The semantics are similar, but our native code + * will not pass cmsg creds over a connection-oriented socket, unlike a + * connectionless one. Some Linux code depends on this for Unix-domain + * sockets. In particular, a sockopt of SO_PASSCRED, which we map into + * our native SO_RECVUCRED, must work across fork so that the correct + * pid of the sender is available in the cmsg. See the comment in + * lx_setsockopt_socket(). + */ + if (domain == LX_AF_UNIX && type == LX_SOCK_SEQPACKET) { + type = LX_SOCK_DGRAM; + emul_seqp = B_TRUE; + } + + if ((err = lx_convert_sock_args(domain, type, protocol, &domain, &type, + &options, &protocol)) != 0) { + return (set_errno(err)); + } + + if ((err = lx_socket_create(domain, type, protocol, options, &fps[0], + &fds[0])) != 0) { + return (set_errno(err)); + } + + /* + * While it seems silly to check the family after socket creation, this + * is done to appease LTP when it tries some outlandish combinations of + * domain/type/protocol. The socket_create function is relied upon to + * emit the expected errors. 
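+ *
+ * For instance (illustrative), LTP passes combinations along the
+ * lines of
+ *
+ *     socketpair(AF_INET, SOCK_STREAM, IPPROTO_TCP, sv);
+ *
+ * and expects EOPNOTSUPP rather than a connected pair of sockets.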
+ */ + if (VTOSO(fps[0]->f_vnode)->so_family != AF_UNIX) { + lx_socket_destroy(fps[0], fds[0]); + return (set_errno(EOPNOTSUPP)); + } + + if ((err = lx_socket_create(domain, type, protocol, options, &fps[1], + &fds[1])) != 0) { + lx_socket_destroy(fps[0], fds[0]); + return (set_errno(err)); + } + + err = lx_socketpair_connect(fps[0], fps[1]); + if (err != 0) { + lx_socket_destroy(fps[0], fds[0]); + lx_socket_destroy(fps[1], fds[1]); + return (set_errno(err)); + } + + if (emul_seqp) { + int i; + for (i = 0; i < 2; i++) { + sonode_t *so = VTOSO(fps[i]->f_vnode); + lx_socket_aux_data_t *sad = lx_sad_acquire(SOTOV(so)); + sad->lxsad_flags |= LXSAD_FL_EMULSEQPKT; + mutex_exit(&sad->lxsad_lock); + } + } + + setf(fds[0], fps[0]); + setf(fds[1], fps[1]); + + if ((options & SOCK_CLOEXEC) != 0) { + f_setfd(fds[0], FD_CLOEXEC); + f_setfd(fds[1], FD_CLOEXEC); + } + if (copyout(fds, sv, sizeof (fds)) != 0) { + (void) closeandsetf(fds[0], NULL); + (void) closeandsetf(fds[1], NULL); + return (set_errno(EFAULT)); + } + return (0); +} + + +#if defined(_SYSCALL32_IMPL) + +#define LX_SYS_SOCKETCALL 102 +#define LX_SOCKETCALL_MAX 20 + +typedef long (*lx_sockfn_t)(); + +static struct { + lx_sockfn_t s_fn; /* Function implementing the subcommand */ + int s_nargs; /* Number of arguments the function takes */ +} lx_socketcall_fns[] = { + lx_socket, 3, /* socket */ + lx_bind, 3, /* bind */ + lx_connect, 3, /* connect */ + lx_listen, 2, /* listen */ + lx_accept, 3, /* accept */ + lx_getsockname, 3, /* getsockname */ + lx_getpeername, 3, /* getpeername */ + lx_socketpair, 4, /* socketpair */ + lx_send, 4, /* send */ + lx_recv, 4, /* recv */ + lx_sendto, 6, /* sendto */ + lx_recvfrom, 6, /* recvfrom */ + lx_shutdown, 2, /* shutdown */ + lx_setsockopt, 5, /* setsockopt */ + lx_getsockopt, 5, /* getsockopt */ + lx_sendmsg, 3, /* sendmsg */ + lx_recvmsg, 3, /* recvmsg */ + lx_accept4, 4, /* accept4 */ + lx_recvmmsg, 5, /* recvmmsg */ + lx_sendmmsg, 4 /* sendmmsg */ +}; + +long +lx_socketcall(long p1, uint32_t *p2) +{ + int subcmd, i; + unsigned long args[6] = { 0, 0, 0, 0, 0, 0 }; + + /* incoming subcmds are 1-indexed */ + subcmd = (int)p1 - 1; + + if (subcmd < 0 || subcmd >= LX_SOCKETCALL_MAX || + lx_socketcall_fns[subcmd].s_fn == NULL) { + return (set_errno(EINVAL)); + } + + /* + * Copy the arguments to the subcommand in from the app's address + * space, returning EFAULT if we get a bogus pointer. + */ + for (i = 0; i < lx_socketcall_fns[subcmd].s_nargs; i++) { + uint32_t arg; + + if (copyin(&p2[i], &arg, sizeof (uint32_t)) != 0) { + return (set_errno(EFAULT)); + } + args[i] = (unsigned long)arg; + } + + return ((lx_socketcall_fns[subcmd].s_fn)(args[0], args[1], args[2], + args[3], args[4], args[5])); +} + +#endif /* defined(_SYSCALL32_IMPL) */ + +static void +lx_socket_vsd_free(void *data) +{ + lx_socket_aux_data_t *entry; + + entry = (lx_socket_aux_data_t *)data; + mutex_destroy(&entry->lxsad_lock); + kmem_free(entry, sizeof (*entry)); +} + +void +lx_socket_init() +{ + vsd_create(&lx_socket_vsd, lx_socket_vsd_free); +} + +void +lx_socket_fini() +{ + vsd_destroy(&lx_socket_vsd); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_splice.c b/usr/src/uts/common/brand/lx/syscall/lx_splice.c new file mode 100644 index 0000000000..64db538413 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_splice.c @@ -0,0 +1,491 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/fs/fifonode.h> +#include <sys/strsun.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_misc.h> +#include <sys/lx_signal.h> + +/* Splice flags */ +#define LX_SPLICE_F_MOVE 0x01 +#define LX_SPLICE_F_NONBLOCK 0x02 +#define LX_SPLICE_F_MORE 0x04 +#define LX_SPLICE_F_GIFT 0x08 + +/* + * Use a max buffer size of 32k. This is a good compromise between doing I/O in + * large chunks, the limit on how much data we can write into an lx pipe by + * default (LX_DEFAULT_PIPE_SIZE), and how much kernel memory we'll allocate. + */ +#define LX_SPL_BUF_SIZE (32 * 1024) + +/* + * We only want to read as much from the input fd as we can write into the + * output fd, up to our buffer size. Figure out what that quantity is. + * Note that len will continuously decrease to 0 which triggers the typical + * end of the splice loop. + */ +static size_t +lx_spl_wr_sz(file_t *fp_out, u_offset_t fileoff, size_t bsz, size_t len, + boolean_t first) +{ + size_t sz; + + sz = MIN(bsz, len); + if (fp_out->f_vnode->v_type == VFIFO) { + /* + * If no readers on pipe, or if it would go over high water + * mark then return 0. Note that the first write into a + * pipe is expected to block if we're over the high water mark. + */ + fifonode_t *fn_dest = VTOF(fp_out->f_vnode)->fn_dest; + fifolock_t *fn_lock = fn_dest->fn_lock; + + mutex_enter(&fn_lock->flk_lock); + if (fn_dest->fn_rcnt == 0) { + sz = 0; + } else if (!first && + (sz + fn_dest->fn_count) > fn_dest->fn_hiwat) { + sz = 0; + } + mutex_exit(&fn_lock->flk_lock); + } else if (fp_out->f_vnode->v_type == VREG) { + if (fileoff >= curproc->p_fsz_ctl || + fileoff >= OFFSET_MAX(fp_out)) { + sz = 0; + } else { + sz = MIN(sz, (size_t)curproc->p_fsz_ctl - fileoff); + sz = MIN(sz, (size_t)OFFSET_MAX(fp_out) - fileoff); + } + } + + /* + * if (fp_out->f_vnode->v_type == VSOCK) + * + * There is no good way to determine if a socket is "full". A write for + * the different protocol implementations can return EWOULDBLOCK under + * different conditions, none of which we can easily check for in + * advance. + */ + + return (sz); +} + +/* + * The splice read function handles "reading" from a pipe and passes everything + * else along to our normal VOP_READ code path. + * + * When we have a pipe as our input, we don't want to consume the data out + * of the pipe until the write has succeeded. This aligns more closely with + * the Linux behavior when a write error occurs. Thus, when a pipe is the input + * and we got some data, we return with the fifo flagged as FIFORDBLOCK. This + * ensures that the data we're writing cannot be consumed by another thread + * until we consume it ourself. + * + * The pipe "read" code here is derived from the fifo I_PEEK code. 
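+ *
+ * In effect, a successful lx_spl_read() from a pipe is a peek: the
+ * mblks stay queued on the fifo (guarded by FIFORDBLOCK) until
+ * lx_spl_consume() later discards exactly the amount that was
+ * written, so a failed or short write never loses pipe data.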
+ */ +static int +lx_spl_read(file_t *fp, uio_t *uiop, size_t *nread, boolean_t pipe_in, + boolean_t rd_pos) +{ + fifonode_t *fnp; + fifolock_t *fn_lock; + int count; + mblk_t *bp; + + if (!pipe_in) + return (lx_read_common(fp, uiop, nread, rd_pos)); + + ASSERT(fp->f_vnode->v_type == VFIFO); + fnp = VTOF(fp->f_vnode); + fn_lock = fnp->fn_lock; + *nread = 0; + + mutex_enter(&fn_lock->flk_lock); + + /* + * If the pipe has been switched to socket mode then this implies an + * internal programmatic error. Likewise, if it was switched to + * socket mode because we dropped the lock to set the stayfast flag. + */ + if ((fnp->fn_flag & FIFOFAST) == 0 || !fifo_stayfast_enter(fnp)) { + mutex_exit(&fn_lock->flk_lock); + return (EBADF); + } + + while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) { + fifonode_t *fn_dest = fnp->fn_dest; + + /* No writer, EOF */ + if (fn_dest->fn_wcnt == 0 || fn_dest->fn_rcnt == 0) { + fifo_stayfast_exit(fnp); + mutex_exit(&fn_lock->flk_lock); + return (0); + } + + /* If non-blocking, return EAGAIN otherwise 0. */ + if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) { + fifo_stayfast_exit(fnp); + mutex_exit(&fn_lock->flk_lock); + if (uiop->uio_fmode & FNONBLOCK) + return (EAGAIN); + return (0); + } + + /* Wait for data */ + fnp->fn_flag |= FIFOWANTR; + if (!cv_wait_sig_swap(&fnp->fn_wait_cv, &fn_lock->flk_lock)) { + fifo_stayfast_exit(fnp); + mutex_exit(&fn_lock->flk_lock); + return (EINTR); + } + } + + VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0); + VERIFY((fnp->fn_flag & FIFOSTAYFAST) != 0); + + /* Get up to our read size or whatever is currently available. */ + count = MIN(uiop->uio_resid, fnp->fn_count); + ASSERT(count > 0); + *nread = count; + bp = fnp->fn_mp; + while (count > 0) { + uint_t cnt = MIN(uiop->uio_resid, MBLKL(bp)); + + /* + * We have the input pipe locked and we know there is data + * available to consume. We're doing a UIO_SYSSPACE move into + * an internal buffer that we allocated in lx_splice() so + * this should never fail. + */ + VERIFY(uiomove((char *)bp->b_rptr, cnt, UIO_READ, uiop) == 0); + count -= cnt; + bp = bp->b_cont; + } + + fnp->fn_flag |= FIFORDBLOCK; + + mutex_exit(&fn_lock->flk_lock); + return (0); +} + +/* + * We've already "read" the data out of the pipe without actually consuming it. + * Here we update the pipe to consume the data and discard it. This is derived + * from the fifo_read code, except that we already know the amount of data + * in the pipe to consume and we don't have to actually move any data. + */ +static void +lx_spl_consume(file_t *fp, uint_t count) +{ + fifonode_t *fnp, *fn_dest; + fifolock_t *fn_lock; + + ASSERT(fp->f_vnode->v_type == VFIFO); + + fnp = VTOF(fp->f_vnode); + fn_lock = fnp->fn_lock; + + mutex_enter(&fn_lock->flk_lock); + VERIFY(fnp->fn_count >= count); + + while (count > 0) { + int bpsize = MBLKL(fnp->fn_mp); + int decr_size = MIN(bpsize, count); + + fnp->fn_count -= decr_size; + if (bpsize <= decr_size) { + mblk_t *bp = fnp->fn_mp; + fnp->fn_mp = fnp->fn_mp->b_cont; + freeb(bp); + } else { + fnp->fn_mp->b_rptr += decr_size; + } + + count -= decr_size; + } + + fnp->fn_flag &= ~FIFORDBLOCK; + fifo_stayfast_exit(fnp); + + fifo_wakereader(fnp, fn_lock); + + /* + * Wake up any blocked writers, processes sleeping on POLLWRNORM, or + * processes waiting for SIGPOLL. 
+ */ + fn_dest = fnp->fn_dest; + if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) && + fnp->fn_count < fn_dest->fn_hiwat) { + fifo_wakewriter(fn_dest, fn_lock); + } + + /* Update vnode update access time */ + fnp->fn_atime = fnp->fn_dest->fn_atime = gethrestime_sec(); + + mutex_exit(&fn_lock->flk_lock); +} + +/* + * Transfer data from the input file descriptor to the output file descriptor + * without leaving the kernel. For Linux this is limited by it's kernel + * implementation which forces at least one of the file descriptors to be a + * pipe. Our implementation is likely quite different from the Linux + * one, which appears to play some VM tricks with shared pages from the pipe + * code. Instead, our implementation uses our normal VOP_READ/VOP_WRITE + * operations to internally move the data while using a single uio buffer. We + * implement the additional Linux behavior around the various checks and + * limitations. + * + * One key point on the read side is how we handle an input pipe. We don't + * want to consume the data out of the pipe until the write has succeeded. + * This aligns more closely with the Linux behavior when a write error occurs. + * The lx_spl_read() and lx_spl_consume() functions are used to handle this + * case. + */ +long +lx_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out, size_t len, + uint_t flags) +{ + int error = 0; + file_t *fp_in = NULL, *fp_out = NULL; + boolean_t found_pipe = B_FALSE, rd_pos = B_FALSE, wr_pos = B_FALSE; + boolean_t first = B_TRUE, pipe_in = B_FALSE; + iovec_t iov; + uio_t uio; + void *buf = NULL; + off_t r_off = 0, w_off = 0; + ushort_t r_flag, w_flag; + size_t bsize = 0, wr_sz, nread, nwrite, total = 0; + + /* + * Start by validating the inputs. + * + * Linux doesn't bother to check for valid flags, so neither do we. + * Also, aside from SPLICE_F_NONBLOCK, we ignore the rest of the + * flags since they're just hints to the Linux kernel implementation + * and have no effect on the proper functioning of the syscall. 
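+ *
+ * For reference, a typical Linux caller that reaches this code looks
+ * something like (illustrative):
+ *
+ *     int pfd[2];
+ *     (void) pipe(pfd);
+ *     ...
+ *     ssize_t n = splice(pfd[0], NULL, sock_fd, NULL, 32768,
+ *         SPLICE_F_MORE);
+ *
+ * At least one end must be a pipe; of the flags, only
+ * SPLICE_F_NONBLOCK changes our behavior.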
+ */ + + if (len == 0) + return (0); + + if ((fp_in = getf(fd_in)) == NULL) { + error = EBADF; + goto done; + } + switch (fp_in->f_vnode->v_type) { + case VFIFO: + /* A fifo that is not in fast mode does not count as a pipe */ + if (((VTOF(fp_in->f_vnode))->fn_flag & FIFOFAST) != 0) { + found_pipe = B_TRUE; + pipe_in = B_TRUE; + } + /*FALLTHROUGH*/ + case VSOCK: + if (off_in != NULL) { + error = ESPIPE; + goto done; + } + break; + case VREG: + case VBLK: + case VCHR: + case VPROC: + if (off_in != NULL) { + if (copyin(off_in, &r_off, sizeof (r_off)) != 0) { + error = EFAULT; + goto done; + } + rd_pos = B_TRUE; + } + break; + default: + error = EBADF; + goto done; + } + r_flag = fp_in->f_flag; + if ((r_flag & FREAD) == 0) { + error = EBADF; + goto done; + } + + if ((fp_out = getf(fd_out)) == NULL) { + error = EBADF; + goto done; + } + switch (fp_out->f_vnode->v_type) { + case VFIFO: + found_pipe = B_TRUE; + /* Splicing to ourself returns EINVAL on Linux */ + if (pipe_in) { + fifonode_t *fnp = VTOF(fp_in->f_vnode); + if (VTOF(fp_out->f_vnode) == fnp->fn_dest) { + error = EINVAL; + goto done; + } + } + /*FALLTHROUGH*/ + case VSOCK: + if (off_out != NULL) { + error = ESPIPE; + goto done; + } + break; + case VREG: + case VBLK: + case VCHR: + case VPROC: + if (off_out != NULL) { + if (copyin(off_out, &w_off, sizeof (w_off)) != 0) { + error = EFAULT; + goto done; + } + wr_pos = B_TRUE; + } + break; + default: + error = EBADF; + goto done; + } + w_flag = fp_out->f_flag; + if ((w_flag & FWRITE) == 0) { + error = EBADF; + goto done; + } + /* Appending is invalid for output fd in splice */ + if ((w_flag & FAPPEND) != 0) { + error = EINVAL; + goto done; + } + + if (!found_pipe) { + error = EINVAL; + goto done; + } + + /* + * Check for non-blocking pipe operations. If no data in the input + * pipe, return EAGAIN. If the output pipe is full, return EAGAIN. + */ + if (flags & LX_SPLICE_F_NONBLOCK) { + fifonode_t *fn_dest; + + if (fp_in->f_vnode->v_type == VFIFO) { + fn_dest = VTOF(fp_in->f_vnode)->fn_dest; + if (fn_dest->fn_count == 0) { + error = EAGAIN; + goto done; + } + } + if (fp_out->f_vnode->v_type == VFIFO) { + fn_dest = VTOF(fp_out->f_vnode)->fn_dest; + fifolock_t *fn_lock = fn_dest->fn_lock; + mutex_enter(&fn_lock->flk_lock); + if (fn_dest->fn_count >= fn_dest->fn_hiwat) { + mutex_exit(&fn_lock->flk_lock); + error = EAGAIN; + goto done; + } + mutex_exit(&fn_lock->flk_lock); + } + } + + bsize = MIN(LX_SPL_BUF_SIZE, len); + + buf = kmem_alloc(bsize, KM_SLEEP); + bzero(&uio, sizeof (uio)); + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_llimit = curproc->p_fsz_ctl; + + /* + * Loop reading data from fd_in and writing to fd_out. This is + * controlled by how much of the requested data we can actually write, + * particularly when the destination is a pipe. This matches the Linux + * behavior, which may terminate earlier than the full 'len' if the + * pipe fills up. However, we need to block when writing into a full + * pipe on the first iteration of the loop. We already checked above + * for a full output pipe when non-blocking. + */ + while ((wr_sz = lx_spl_wr_sz(fp_out, w_off, bsize, len, first)) > 0) { + first = B_FALSE; + + /* (re)setup for a read */ + uio.uio_resid = iov.iov_len = wr_sz; /* only rd. 
max writable */ + iov.iov_base = buf; + uio.uio_offset = r_off; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_fmode = r_flag; + error = lx_spl_read(fp_in, &uio, &nread, pipe_in, rd_pos); + if (error != 0 || nread == 0) + break; + r_off = uio.uio_offset; + + /* Setup and perform a write from the same buffer */ + uio.uio_resid = iov.iov_len = nread; + iov.iov_base = buf; + uio.uio_offset = w_off; + uio.uio_extflg = UIO_COPY_DEFAULT; + uio.uio_fmode = w_flag; + error = lx_write_common(fp_out, &uio, &nwrite, wr_pos); + if (error != 0) { + if (pipe_in) { + /* Need to unblock reading from the fifo. */ + fifonode_t *fnp = VTOF(fp_in->f_vnode); + + mutex_enter(&fnp->fn_lock->flk_lock); + fnp->fn_flag &= ~FIFORDBLOCK; + fifo_stayfast_exit(fnp); + fifo_wakereader(fnp, fnp->fn_lock); + mutex_exit(&fnp->fn_lock->flk_lock); + } + break; + } + w_off = uio.uio_offset; + + /* + * If input is a pipe, then we can consume the amount of data + * out of the pipe that we successfully wrote. + */ + if (pipe_in) + lx_spl_consume(fp_in, nwrite); + + total += nwrite; + len -= nwrite; + } + +done: + if (buf != NULL) + kmem_free(buf, bsize); + if (fp_in != NULL) + releasef(fd_in); + if (fp_out != NULL) + releasef(fd_out); + if (error != 0) + return (set_errno(error)); + + return (total); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_stat.c b/usr/src/uts/common/brand/lx/syscall/lx_stat.c new file mode 100644 index 0000000000..9af0080138 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_stat.c @@ -0,0 +1,486 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/model.h> +#include <sys/mode.h> +#include <sys/stat.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> +#include <sys/brand.h> +#include <sys/ddi.h> + +/* From "uts/common/syscall/stat.c" */ +extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **); + +typedef struct lx_timespec32 { + int32_t ts_sec; + int32_t ts_nsec; +} lx_timespec32_t; + +typedef struct lx_timespec64 { + int64_t ts_sec; + int64_t ts_nsec; +}lx_timespec64_t; + +struct lx_stat32 { + uint16_t st_dev; + uint16_t st_pad1; + uint32_t st_ino; + uint16_t st_mode; + uint16_t st_nlink; + uint16_t st_uid; + uint16_t st_gid; + uint16_t st_rdev; + uint16_t st_pad2; + uint32_t st_size; + uint32_t st_blksize; + uint32_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint32_t st_pad3; + uint32_t st_pad4; +}; + +#pragma pack(4) +struct lx_stat64_32 { + uint64_t st_dev; + uint32_t st_pad1; + uint32_t st_small_ino; + uint32_t st_mode; + uint32_t st_nlink; + uint32_t st_uid; + uint32_t st_gid; + uint64_t st_rdev; + uint32_t st_pad2; + uint64_t st_size; + uint32_t st_blksize; + uint64_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint64_t st_ino; +}; +#pragma pack() + +#if defined(_LP64) +struct lx_stat64_64 { + uint64_t st_dev; + uint64_t st_ino; + uint64_t st_nlink; /* yes, the order really is */ + uint32_t st_mode; /* different for these two */ + uint32_t st_uid; + uint32_t st_gid; + uint32_t st_pad0; + uint64_t st_rdev; + int64_t st_size; + int64_t st_blksize; + int64_t st_blocks; + lx_timespec64_t st_atime; + lx_timespec64_t st_mtime; + lx_timespec64_t st_ctime; + int64_t st_unused[3]; +}; +#endif /* defined(_LP64) */ + +typedef enum lx_stat_fmt { + LXF_STAT32, + LXF_STAT64_32, + LXF_STAT64_64 +} lx_stat_fmt_t; + +static void +lx_stat_xlate_dev(vattr_t *vattr) +{ + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + dev_t dev; + lx_virt_disk_t *vd; + boolean_t is_dev; + + if (S_ISCHR(vattr->va_mode) || S_ISBLK(vattr->va_mode)) { + dev = vattr->va_rdev; + is_dev = B_TRUE; + } else { + dev = vattr->va_fsid; + is_dev = B_FALSE; + } + + /* + * See if this is the /dev/zfs device. If it is, the device number has + * already been converted to Linux format in the lx devfs so we have + * to check for that and not a native major/minor style. + */ + if (S_ISCHR(vattr->va_mode) && + LX_GETMAJOR(dev) == getmajor(lxzd->lxzd_zfs_dev) && + LX_GETMINOR(dev) == 0) { + /* + * We use the /dev/zfs device as a placeholder for our in-zone + * fabricated /dev/zfsds0 device that we're pretending / is + * mounted on. lx_zone_get_zfsds has pre-allocated this + * entry in the emulated device list. Reset dev so we can + * properly match in the following loop. + */ + dev = curproc->p_zone->zone_rootvp->v_vfsp->vfs_dev; + } + + /* Substitute emulated major/minor on zvols or mounted datasets. */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_real_dev == dev) { + dev = vd->lxvd_emul_dev; + /* + * We only update rdev for matching zfds/zvol devices + * so that the other devices are unchanged. 
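+ *
+ * The LX_MAKEDEVICE() conversion matters because Linux packs the
+ * major and minor numbers into dev_t differently than illumos does;
+ * a raw native va_fsid or va_rdev would decode to bogus major() and
+ * minor() values inside the zone.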
+ */ + if (is_dev) { + vattr->va_rdev = LX_MAKEDEVICE(getmajor(dev), + getminor(dev)); + } + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + /* Mangle st_dev into expected format */ + vattr->va_fsid = LX_MAKEDEVICE(getmajor(dev), getminor(dev)); +} + +static long +lx_stat_common(vnode_t *vp, cred_t *cr, void *outp, lx_stat_fmt_t fmt, + int follow) +{ + vattr_t vattr; + mode_t mode; + int error, flags; + + /* + * When symlink following is desired, the ATTR_REAL flag is necessary + * to circumvent some of the weird behavior present in filesystems like + * lx_proc. + */ + flags = (follow == FOLLOW) ? ATTR_REAL : 0; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, flags, cr, NULL)) != 0) { + return (error); + } + + mode = VTTOIF(vattr.va_type) | vattr.va_mode; + if ((mode & S_IFMT) == S_IFBLK) { + /* Linux seems to report a 0 st_size for all block devices */ + vattr.va_size = 0; + } + if (vattr.va_rdev == NODEV) { + /* Linux leaves st_rdev zeroed when it is absent */ + vattr.va_rdev = 0; + } + + lx_stat_xlate_dev(&vattr); + + if (fmt == LXF_STAT32) { + struct lx_stat32 sb; + + if (vattr.va_fsid > USHRT_MAX || vattr.va_rdev > USHRT_MAX || + vattr.va_nlink > USHRT_MAX || vattr.va_size > INT_MAX) { + return (EOVERFLOW); + } + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = LX_UID32_TO_UID16(vattr.va_uid); + sb.st_gid = LX_GID32_TO_GID16(vattr.va_gid); + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_32) { + struct lx_stat64_32 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_small_ino = (vattr.va_nodeid & UINT_MAX); + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_64) { +#if defined(_LP64) + struct lx_stat64_64 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if 
(copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); +#else + /* Invalid output format on 32-bit */ + VERIFY(0); +#endif + } + + /* Invalid output format */ + VERIFY(0); + return (0); +} + +long +lx_stat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, LXF_STAT32, FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat32(int fd, void *outp) +{ + file_t *fp; + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, LXF_STAT32, + FOLLOW); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, LXF_STAT32, NO_FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_stat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat64(int fd, void *outp) +{ + file_t *fp; + model_t model = get_udatamodel(); + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, FOLLOW); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +#define LX_FSTATAT_ALLOWED (LX_AT_SYMLINK_NOFOLLOW | LX_AT_EMPTY_PATH | \ + LX_AT_NO_AUTOMOUNT) + +long +lx_fstatat64(int fd, char *name, void *outp, int flag) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + enum symfollow follow = FOLLOW; + int error; + char c; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + if ((flag & ~LX_FSTATAT_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flag & LX_AT_NO_AUTOMOUNT) != 0) { + /* + * While AT_NO_AUTOMOUNT is a legal flag for fstatat64, it is + * not yet supported by lx_autofs. + */ + lx_unsupported("fstatat(AT_NO_AUTOMOUNT)"); + return (set_errno(EINVAL)); + } + if ((flag & LX_AT_SYMLINK_NOFOLLOW) != 0) { + follow = NO_FOLLOW; + } + + if (copyin(name, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + if ((flag & LX_AT_EMPTY_PATH) == 0) { + return (set_errno(ENOENT)); + } + + /* + * When AT_EMPTY_PATH is set and and empty string has been + * passed for the name parameter, direct the lookup against the + * vnode for that fd. 
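+ *
+ * This makes the common Linux idiom
+ *
+ *     fstatat(fd, "", &sb, AT_EMPTY_PATH);
+ *
+ * behave like fstat(fd, &sb), and with fd == AT_FDCWD it stats the
+ * current working directory.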
+ */ + if (fd == AT_FDCWD) { + mutex_enter(&curproc->p_lock); + vp = PTOU(curproc)->u_cdir; + VN_HOLD(vp); + mutex_exit(&curproc->p_lock); + cr = CRED(); + crhold(cr); + } else { + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + cr = fp->f_cred; + crhold(cr); + releasef(fd); + } + } else { + if ((error = cstatat_getvp(fd, name, follow, &vp, &cr)) != 0) { + return (set_errno(error)); + } + } + + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, follow); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, + NO_FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sync.c b/usr/src/uts/common/brand/lx/syscall/lx_sync.c new file mode 100644 index 0000000000..614afca0b0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sync.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +long +lx_syncfs(int fd) +{ + file_t *fp; + vfs_t *vfsp; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + vfsp = fp->f_vnode->v_vfsp; + releasef(fd); + + (void) (vfsp->vfs_op->vfs_sync)(vfsp, 0, CRED()); + + return (0); +} + +#define LX_SYNC_FILE_RANGE_WAIT_BEFORE 0x1 +#define LX_SYNC_FILE_RANGE_WRITE 0x2 +#define LX_SYNC_FILE_RANGE_WAIT_AFTER 0x4 + +#define LX_SYNC_FILE_RANGE_VALID (LX_SYNC_FILE_RANGE_WAIT_BEFORE | \ + LX_SYNC_FILE_RANGE_WRITE | LX_SYNC_FILE_RANGE_WAIT_AFTER) + + +long +lx_sync_file_range(int fd, off_t offset, off_t nbytes, int flags) +{ + file_t *fp; + int error, sflags = 0; + + if ((flags & ~LX_SYNC_FILE_RANGE_VALID) != 0) + return (set_errno(EINVAL)); + if (offset < 0 || nbytes < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + /* + * Since sync_file_range is implemented in terms of VOP_PUTPAGE, both + * SYNC_FILE_RANGE_WAIT flags are treated as forcing synchronous + * operation. While this differs from the Linux behavior where + * BEFORE/AFTER are distinct, it achieves an adequate level of safety + * since the requested data is synced out at the end of the call. 
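+ *
+ * So the common Linux write-back pattern (illustrative)
+ *
+ *     sync_file_range(fd, off, chunk,
+ *         SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
+ *         SYNC_FILE_RANGE_WAIT_AFTER);
+ *
+ * becomes a single synchronous VOP_PUTPAGE() of the range, while a
+ * bare SYNC_FILE_RANGE_WRITE is issued with B_ASYNC.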
+ */ + if ((flags & (LX_SYNC_FILE_RANGE_WAIT_BEFORE | + LX_SYNC_FILE_RANGE_WAIT_AFTER)) == 0) { + sflags |= B_ASYNC; + } + + error = VOP_PUTPAGE(fp->f_vnode, offset, nbytes, sflags, CRED(), NULL); + if (error == ENOSYS) { + error = ESPIPE; + } + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c new file mode 100644 index 0000000000..052ad322a7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -0,0 +1,207 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. + */ + +#include <vm/anon.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/zone.h> +#include <sys/time.h> + +typedef struct lx_sysinfo { + int64_t si_uptime; /* Seconds since boot */ + uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint64_t si_totalram; /* Total memory size */ + uint64_t si_freeram; /* Available memory */ + uint64_t si_sharedram; /* Shared memory */ + uint64_t si_bufferram; /* Buffer memory */ + uint64_t si_totalswap; /* Total swap space */ + uint64_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint64_t si_totalhigh; /* High memory size */ + uint64_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ +} lx_sysinfo_t; + +#if defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit usermode struct. 
+ */ +#pragma pack(4) +typedef struct lx_sysinfo32 { + int32_t si_uptime; /* Seconds since boot */ + uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint32_t si_totalram; /* Total memory size */ + uint32_t si_freeram; /* Available memory */ + uint32_t si_sharedram; /* Shared memory */ + uint32_t si_bufferram; /* Buffer memory */ + uint32_t si_totalswap; /* Total swap space */ + uint32_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint32_t si_totalhigh; /* High memory size */ + uint32_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ + char __si_pad[8]; +} lx_sysinfo32_t; +#pragma pack() +#endif + +extern pgcnt_t swapfs_minfree; + +static void +lx_sysinfo_common(lx_sysinfo_t *si) +{ + zone_t *zone = curzone; + pgcnt_t zphysmem, zfreemem; + ulong_t ztotswap, zfreeswap; + + si->si_uptime = gethrestime_sec() - zone->zone_boot_time; + + si->si_loads[0] = zone->zone_hp_avenrun[0]; + si->si_loads[1] = zone->zone_hp_avenrun[1]; + si->si_loads[2] = zone->zone_hp_avenrun[2]; + + /* + * In Linux each thread looks like a process, so we conflate the + * two in this stat as well. + */ + si->si_procs = (int32_t)zone->zone_nlwps; + + zone_get_physmem_data(zone->zone_id, &zphysmem, &zfreemem); + + if (zone->zone_max_swap_ctl == UINT64_MAX) { + ztotswap = k_anoninfo.ani_max; + zfreeswap = k_anoninfo.ani_free; + } else { + /* + * See the comment in swapctl for a description of how free is + * calculated within a zone. + */ + rctl_qty_t used; + spgcnt_t avail; + uint64_t max; + + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail; + + mutex_enter(&zone->zone_mem_lock); + ztotswap = btop(zone->zone_max_swap_ctl); + used = btop(zone->zone_max_swap); + mutex_exit(&zone->zone_mem_lock); + + zfreeswap = MIN(ztotswap, max) - used; + } + + /* + * If the maximum memory stat is less than 2^20 pages (i.e. 4GB), + * then we report the result in bytes. Otherwise we use pages. + * Once we start supporting >1TB systems/zones, we'll need a third + * option. + */ + if (MAX(zphysmem, ztotswap) < 1024 * 1024) { + si->si_totalram = ptob(zphysmem); + si->si_freeram = ptob(zfreemem); + si->si_totalswap = ptob(ztotswap); + si->si_freeswap = ptob(zfreeswap); + si->si_mem_unit = 1; + } else { + si->si_totalram = zphysmem; + si->si_freeram = zfreemem; + si->si_totalswap = ztotswap; + si->si_freeswap = zfreeswap; + si->si_mem_unit = PAGESIZE; + } + si->si_bufferram = 0; + si->si_sharedram = 0; + + /* + * These two stats refer to high physical memory. If an + * application running in a Linux zone cares about this, then + * either it or we are broken.
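Stepping back to the unit selection above: the switch between byte and page units is invisible to well-behaved Linux programs, because sysinfo(2) consumers are expected to scale by si_mem_unit. A sketch of the consuming side:

    #include <sys/sysinfo.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct sysinfo si;

        if (sysinfo(&si) == 0) {
            /* totalram is expressed in units of mem_unit bytes. */
            unsigned long long total =
                (unsigned long long)si.totalram * si.mem_unit;
            printf("total memory: %llu bytes\n", total);
        }
        return (0);
    }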
+ */ + si->si_totalhigh = 0; + si->si_freehigh = 0; +} + +long +lx_sysinfo64(caddr_t sip) +{ + lx_sysinfo_t si; + + bzero(&si, sizeof (si)); + lx_sysinfo_common(&si); + + if (copyout(&si, sip, sizeof (si)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +long +lx_sysinfo32(caddr_t sip) +{ + lx_sysinfo_t si; + lx_sysinfo32_t si32; + int i; + + lx_sysinfo_common(&si); + + /* + * Convert the lx_sysinfo_t into the legacy 32-bit view: + */ + bzero(&si32, sizeof (si32)); + si32.si_uptime = si.si_uptime; + + for (i = 0; i < 3; i++) { + if ((si.si_loads[i]) > 0x7fffffff) + si32.si_loads[i] = 0x7fffffff; + else + si32.si_loads[i] = si.si_loads[i]; + } + + si32.si_procs = si.si_procs; + si32.si_totalram = si.si_totalram; + si32.si_freeram = si.si_freeram; + si32.si_totalswap = si.si_totalswap; + si32.si_freeswap = si.si_freeswap; + si32.si_mem_unit = si.si_mem_unit; + + si32.si_bufferram = si.si_bufferram; + si32.si_sharedram = si.si_sharedram; + + si32.si_totalhigh = si.si_totalhigh; + si32.si_freehigh = si.si_freehigh; + + if (copyout(&si32, sip, sizeof (si32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c new file mode 100644 index 0000000000..a84c17e139 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cpuvar.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <lx_syscall.h> + +/* ARGSUSED */ +long +lx_arch_prctl(int code, ulong_t addr) +{ +#if defined(__amd64) + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *llwp = lwptolxlwp(lwp); + pcb_t *pcb = &lwp->lwp_pcb; + + switch (code) { + case LX_ARCH_GET_FS: + if (copyout(&llwp->br_lx_fsbase, (void *)addr, + sizeof (llwp->br_lx_fsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_FS: + llwp->br_lx_fsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_fsbase != llwp->br_lx_fsbase) { + pcb->pcb_fsbase = llwp->br_lx_fsbase; + + /* + * Ensure we go out via update_sregs. 
+ */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + break; + + case LX_ARCH_GET_GS: + if (copyout(&llwp->br_lx_gsbase, (void *)addr, + sizeof (llwp->br_lx_gsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_GS: + llwp->br_lx_gsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_gsbase != llwp->br_lx_gsbase) { + pcb->pcb_gsbase = llwp->br_lx_gsbase; + + /* + * Ensure we go out via update_sregs. + */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + break; + + default: + return (set_errno(EINVAL)); + } +#endif + + return (0); +} + +long +lx_get_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + + if (fuword32(&inf->entry_number, (uint32_t *)&entry)) + return (set_errno(EFAULT)); + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + /* + * convert the solaris ldt to the linux format expected by the + * caller + */ + DESC_TO_LDT_INFO(dscrp, &ldt_inf); + ldt_inf.entry_number = entry; + + if (copyout(&ldt_inf, inf, sizeof (struct ldt_info))) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_set_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + int i; + + if (copyin(inf, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + entry = ldt_inf.entry_number; + if (entry == -1) { + /* + * Find an empty entry in the tls for this thread. + * The casts assume each user_desc_t entry is 8 bytes. + */ + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) { + if (((uint_t *)dscrp)[0] == 0 && + ((uint_t *)dscrp)[1] == 0) + break; + } + + if (i < LX_TLSNUM) { + /* + * found one + */ + entry = i + GDT_TLSMIN; + if (suword32(&inf->entry_number, entry)) + return (set_errno(EFAULT)); + } else { + return (set_errno(ESRCH)); + } + } + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + /* + * convert the linux ldt info to standard intel descriptor + */ + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + if (LDT_INFO_EMPTY(&ldt_inf)) { + ((uint_t *)dscrp)[0] = 0; + ((uint_t *)dscrp)[1] = 0; + } else { + LDT_INFO_TO_DESC(&ldt_inf, dscrp); + } + + /* + * update the gdt with the new descriptor + */ + kpreempt_disable(); + + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) + lx_set_gdt(GDT_TLSMIN + i, dscrp); + + kpreempt_enable(); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_time.c b/usr/src/uts/common/brand/lx/syscall/lx_time.c new file mode 100644 index 0000000000..b9bc8e5ab4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_time.c @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/times.h> +#include <sys/msacct.h> +#include <sys/lx_userhz.h> + +/* See the comment on LX_USERHZ for more details. 
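A quick worked example of the scaling performed by the macros that follow, assuming LX_USERHZ is 100 (the conventional Linux USER_HZ; the actual definition lives in lx_userhz.h, outside this hunk):

    /*
     * LX_NSEC_PER_USERHZ = 1,000,000,000 / 100 = 10,000,000 ns per tick
     * 2.5 seconds of CPU time = 2,500,000,000 ns
     * NSEC_TO_LX_USERHZ(2,500,000,000) = 250 ticks reported to the caller
     */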
*/ +#define LX_NSEC_PER_USERHZ (NANOSEC / LX_USERHZ) +#define NSEC_TO_LX_USERHZ(nsec) ((nsec) / LX_NSEC_PER_USERHZ) + +/* + * Our times(2) implementation is based on the native times(2), but with + * the necessary scaling to adjust to USER_HZ. Also, Linux avoids writing + * to a NULL tp, whereas our native code returns EFAULT. + */ +long +lx_times(struct tms *tp) +{ + proc_t *p = curproc; + struct tms p_time; + clock_t ret_lbolt; + + mutex_enter(&p->p_lock); + p_time.tms_utime = + (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_USER)); + p_time.tms_stime = + (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_SYSTEM)); + p_time.tms_cutime = HZ_TO_LX_USERHZ(p->p_cutime); + p_time.tms_cstime = HZ_TO_LX_USERHZ(p->p_cstime); + mutex_exit(&p->p_lock); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct tms32 t32; + + t32.tms_utime = p_time.tms_utime; + t32.tms_stime = p_time.tms_stime; + t32.tms_cutime = p_time.tms_cutime; + t32.tms_cstime = p_time.tms_cstime; + + if (tp != NULL && copyout(&t32, tp, sizeof (t32)) != 0) + return (set_errno(EFAULT)); + + ret_lbolt = ddi_get_lbolt(); + return ((clock32_t)HZ_TO_LX_USERHZ(ret_lbolt)); + } else +#endif /* _SYSCALL32_IMPL */ + { + if (tp != NULL && copyout(&p_time, tp, sizeof (p_time)) != 0) + return (set_errno(EFAULT)); + + ret_lbolt = ddi_get_lbolt(); + return (HZ_TO_LX_USERHZ(ret_lbolt)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c new file mode 100644 index 0000000000..279bdbddc7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c @@ -0,0 +1,637 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The illumos kernel provides two clock backends: CLOCK_REALTIME, the + * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically + * increasing time source that is not subject to drift or adjustment. By + * contrast, the Linux kernel is furnished with an overabundance of narrowly + * differentiated clock types. + * + * Fortunately, most of the commonly used Linux clock types are either similar + * enough to the native clock backends that they can be directly mapped, or + * represent queries to the per-process and per-LWP microstate counters. + * + * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into + * account time that the system is suspended. Since that is uninteresting to + * us, we treat it the same. 
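For illustration, a Linux program asking for CLOCK_BOOTTIME is serviced by the native CLOCK_HIGHRES backend in the table that follows, so it simply sees an ordinary monotonic reading. A minimal sketch of the calling side:

    #define _GNU_SOURCE             /* CLOCK_BOOTTIME visibility varies by libc */
    #include <time.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct timespec ts;

        /* Under lx this is backed by the monotonic CLOCK_HIGHRES source. */
        if (clock_gettime(CLOCK_BOOTTIME, &ts) == 0)
            printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
        return (0);
    }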
+ */ + +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <lx_signum.h> + +/* + * From "uts/common/os/timer.c": + */ +extern int clock_settime(clockid_t, timespec_t *); +extern int clock_gettime(clockid_t, timespec_t *); +extern int clock_getres(clockid_t, timespec_t *); +extern int nanosleep(timespec_t *, timespec_t *); + + +static int lx_emul_clock_getres(clockid_t, timespec_t *); +static int lx_emul_clock_gettime(clockid_t, timespec_t *); +static int lx_emul_clock_settime(clockid_t, timespec_t *); + +typedef struct lx_clock_backend { + clockid_t lclk_ntv_id; + int (*lclk_clock_getres)(clockid_t, timespec_t *); + int (*lclk_clock_gettime)(clockid_t, timespec_t *); + int (*lclk_clock_settime)(clockid_t, timespec_t *); +} lx_clock_backend_t; + +/* + * NOTE: The Linux man pages state this structure is obsolete and is + * unsupported, so it is declared here for sizing purposes only. + */ +struct lx_timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +/* + * Use the native clock_* system call implementation, but with a translated + * clock identifier: + */ +#define NATIVE(ntv_id) \ + { ntv_id, clock_getres, clock_gettime, clock_settime } + +/* + * This backend is not supported, so we provide an emulation handler: + */ +#define EMUL(ntv_id) \ + { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime, \ + lx_emul_clock_settime } + +static lx_clock_backend_t lx_clock_backends[] = { + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC */ + EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */ + EMUL(CLOCK_THREAD_CPUTIME_ID), /* LX_CLOCK_THREAD_CPUTIME_ID */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_RAW */ + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME_COARSE */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_COARSE */ + NATIVE(CLOCK_HIGHRES) /* LX_CLOCK_BOOTTIME */ +}; + +#define LX_CLOCK_MAX \ + (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0])) +#define LX_CLOCK_BACKEND(clk) (((clk) < LX_CLOCK_MAX && (clk) >= 0) ? \ + &lx_clock_backends[(clk)] : NULL) + +/* + * Linux defines the size of the sigevent structure to be 64 bytes. In order + * to meet that definition, the trailing union includes a member which pads it + * out to the desired length for the given architecture. 
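To make the padding arithmetic concrete, on a 64-bit build the numbers work out as follows; a compile-time guard in the illumos style could assert the result (a sketch, not part of the patch):

    /*
     * sizeof (union sigval) == 8 and the two ints contribute 8 more, so
     * LX_SIGEV_PAD_SIZE == (64 - 16) / 4 == 12, and lx_pad[12] (48 bytes)
     * brings the structure defined below to exactly 8 + 4 + 4 + 48 = 64 bytes.
     *
     *	CTASSERT(sizeof (lx_sigevent_t) == 64);
     */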
+ */ +#define LX_SIGEV_PAD_SIZE ((64 - \ + (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int)) + +typedef struct { + union sigval lx_sigev_value; + int lx_sigev_signo; + int lx_sigev_notify; + union { + int lx_pad[LX_SIGEV_PAD_SIZE]; + int lx_tid; + struct { + void (*lx_notify_function)(union sigval); + void *lx_notify_attribute; + } lx_sigev_thread; + } lx_sigev_un; +} lx_sigevent_t; + + +#ifdef _SYSCALL32_IMPL + +#define LX_SIGEV32_PAD_SIZE ((64 - \ + (sizeof (int) * 2 + sizeof (union sigval32))) / sizeof (int)) + +typedef struct { + union sigval32 lx_sigev_value; + int lx_sigev_signo; + int lx_sigev_notify; + union { + int lx_pad[LX_SIGEV32_PAD_SIZE]; + int lx_tid; + struct { + caddr32_t lx_notify_function; + caddr32_t lx_notify_attribute; + } lx_sigev_thread; + } lx_sigev_un; +} lx_sigevent32_t; + +#endif /* _SYSCALL32_IMPL */ + +#define LX_SIGEV_SIGNAL 0 +#define LX_SIGEV_NONE 1 +#define LX_SIGEV_THREAD 2 +#define LX_SIGEV_THREAD_ID 4 + +/* + * Access private SIGEV_THREAD_ID callback state in itimer_t + */ +#define LX_SIGEV_THREAD_ID_LPID(it) ((it)->it_cb_data[0]) +#define LX_SIGEV_THREAD_ID_TID(it) ((it)->it_cb_data[1]) + + +/* ARGSUSED */ +static int +lx_emul_clock_settime(clockid_t clock, timespec_t *tp) +{ + return (set_errno(EINVAL)); +} + +static int +lx_emul_clock_gettime(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: { + proc_t *p = ttoproc(curthread); + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage() in "rusagesys.c": + */ + mutex_enter(&p->p_lock); + unsecs = mstate_aggr_state(p, LMS_USER); + snsecs = mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + case CLOCK_THREAD_CPUTIME_ID: { + klwp_t *lwp = ttolwp(curthread); + struct mstate *ms = &lwp->lwp_mstate; + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage_lwp() in "rusagesys.c": + */ + unsecs = ms->ms_acct[LMS_USER]; + snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP]; + + scalehrtime(&unsecs); + scalehrtime(&snsecs); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static int +lx_emul_clock_getres(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + if (tp == NULL) { + return (0); + } + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: + case CLOCK_THREAD_CPUTIME_ID: + /* + * These clock backends return microstate accounting values for + * the LWP or the entire process. The Linux kernel claims they + * have nanosecond resolution; so will we. 
+ */ + t.tv_sec = 0; + t.tv_nsec = 1; + break; + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static void +lx_clock_unsupported(int clock) +{ + char buf[100]; + + (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock); + lx_unsupported(buf); +} + +long +lx_clock_settime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_gettime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_getres(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + /* + * It is important this check is performed after the clock + * check. Both glibc and musl, in their clock_getcpuclockid(), + * use clock_getres() with a NULL tp to validate a clock + * value. Performing the tp check before the clock check could + * indicate a valid clock to libc when it shouldn't. + */ + if (tp == NULL) { + return (0); + } + + return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp)); +} + +static int +lx_ltos_sigev(lx_sigevent_t *lev, struct sigevent *sev) +{ + bzero(sev, sizeof (*sev)); + + switch (lev->lx_sigev_notify) { + case LX_SIGEV_NONE: + sev->sigev_notify = SIGEV_NONE; + break; + + case LX_SIGEV_SIGNAL: + case LX_SIGEV_THREAD_ID: + sev->sigev_notify = SIGEV_SIGNAL; + break; + + case LX_SIGEV_THREAD: + /* + * Just as in illumos, SIGEV_THREAD handling is performed in + * userspace with the help of SIGEV_SIGNAL/SIGEV_THREAD_ID. + * + * It's not expected to make an appearance in the syscall. + */ + default: + return (EINVAL); + } + + sev->sigev_signo = lx_ltos_signo(lev->lx_sigev_signo, 0); + sev->sigev_value = lev->lx_sigev_value; + + /* Ensure SIGEV_SIGNAL has a valid signo to work with. 
*/ + if (sev->sigev_notify == SIGEV_SIGNAL && sev->sigev_signo == 0) { + return (EINVAL); + } + return (0); +} + +static int +lx_sigev_copyin(lx_sigevent_t *userp, lx_sigevent_t *levp) +{ +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_sigevent32_t lev32; + + if (copyin(userp, &lev32, sizeof (lev32)) != 0) { + return (EFAULT); + } + levp->lx_sigev_value.sival_int = lev32.lx_sigev_value.sival_int; + levp->lx_sigev_signo = lev32.lx_sigev_signo; + levp->lx_sigev_notify = lev32.lx_sigev_notify; + levp->lx_sigev_un.lx_tid = lev32.lx_sigev_un.lx_tid; + } else +#endif /* _SYSCALL32_IMPL */ + { + if (copyin(userp, levp, sizeof (lx_sigevent_t)) != 0) { + return (EFAULT); + } + } + return (0); +} + +static void +lx_sigev_thread_fire(itimer_t *it) +{ + proc_t *p = it->it_proc; + pid_t lpid = (pid_t)LX_SIGEV_THREAD_ID_LPID(it); + id_t tid = (id_t)LX_SIGEV_THREAD_ID_TID(it); + lwpdir_t *ld; + + ASSERT(MUTEX_HELD(&it->it_mutex)); + ASSERT(it->it_pending == 0); + ASSERT(it->it_flags & IT_SIGNAL); + ASSERT(MUTEX_HELD(&p->p_lock)); + + ld = lwp_hash_lookup(p, tid); + if (ld != NULL) { + lx_lwp_data_t *lwpd; + kthread_t *t; + + t = ld->ld_entry->le_thread; + lwpd = ttolxlwp(t); + if (lwpd != NULL && lwpd->br_pid == lpid) { + /* + * A thread matching the LX pid is still present in the + * process. Send a targeted signal as requested. + */ + it->it_pending = 1; + mutex_exit(&it->it_mutex); + sigaddqa(p, t, it->it_sigq); + return; + } + } + + mutex_exit(&it->it_mutex); +} + +long +lx_timer_create(int clock, lx_sigevent_t *sevp, timer_t *tidp) +{ + int error; + lx_sigevent_t lev; + struct sigevent sev; + clock_backend_t *backend = NULL; + proc_t *p = curproc; + itimer_t *itp; + timer_t tid; + + if (clock == -2) { + /* + * A change was made to the old userspace timer emulation to + * handle this specific clock ID for MapR. It was wrongly + * mapped to CLOCK_REALTIME rather than CLOCK_THREAD_CPUTIME_ID + * which it maps to. Until the CLOCK_*_CPUTIME_ID timers can + * be emulated, the admittedly incorrect mapping will remain. + */ + backend = clock_get_backend(CLOCK_REALTIME); + } else { + lx_clock_backend_t *lback = LX_CLOCK_BACKEND(clock); + + if (lback != NULL) { + backend = clock_get_backend(lback->lclk_ntv_id); + } + } + if (backend == NULL) { + return (set_errno(EINVAL)); + } + + /* We have to convert the Linux sigevent layout to the illumos layout */ + if (sevp != NULL) { + if ((error = lx_sigev_copyin(sevp, &lev)) != 0) { + return (set_errno(error)); + } + if ((error = lx_ltos_sigev(&lev, &sev)) != 0) { + return (set_errno(error)); + } + } else { + bzero(&sev, sizeof (sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGALRM; + } + + if ((error = timer_setup(backend, &sev, NULL, &itp, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * The SIGEV_THREAD_ID notification method in Linux allows the caller + * to target a specific thread to receive the signal. The IT_CALLBACK + * timer functionality is used to fulfill this need. After translating + * the LX pid to a SunOS thread ID (ensuring it exists in the current + * process), those IDs are attached to the timer along with the custom + * lx_sigev_thread_fire callback. This targets the signal notification + * properly when the timer fires. 
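For reference, the common Linux calling pattern that ends up in lx_timer_create() looks roughly like this sketch (SIGEV_THREAD_ID itself is usually hidden behind glibc's SIGEV_THREAD implementation, as noted earlier):

    #include <signal.h>
    #include <time.h>

    /* Arm a one-second, signal-based POSIX timer (link with -lrt on older glibc). */
    static int
    arm_timer(timer_t *tidp)
    {
        struct sigevent sev = { 0 };
        struct itimerspec its = { 0 };

        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGALRM;
        if (timer_create(CLOCK_MONOTONIC, &sev, tidp) != 0)
            return (-1);
        its.it_value.tv_sec = 1;
        return (timer_settime(*tidp, 0, &its, NULL));
    }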
+ */ + if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) { + pid_t lpid, spid; + id_t stid; + + lpid = (pid_t)lev.lx_sigev_un.lx_tid; + if (lx_lpid_to_spair(lpid, &spid, &stid) != 0 || + spid != curproc->p_pid) { + error = EINVAL; + goto err; + } + + itp->it_flags |= IT_CALLBACK; + itp->it_cb_func = lx_sigev_thread_fire; + LX_SIGEV_THREAD_ID_LPID(itp) = lpid; + LX_SIGEV_THREAD_ID_TID(itp) = stid; + } + + /* + * When the sigevent is not specified, its sigev_value field is + * expected to be populated with the timer ID. + */ + if (sevp == NULL) { + itp->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + error = EFAULT; + goto err; + } + + timer_release(p, itp); + return (0); + +err: + timer_delete_grabbed(p, tid, itp); + return (set_errno(error)); +} + +long +lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp) +{ + struct lx_timezone tz; + + bzero(&tz, sizeof (tz)); + + /* + * We want to be similar to libc which just does a fasttrap to + * gethrestime and simply converts that result. We follow how uniqtime + * does the conversion but we can't use that code since it does some + * extra work which can cause the result to bounce around based on which + * CPU we run on. + */ + if (tvp != NULL) { + struct timeval tv; + timestruc_t ts; + int usec, nsec; + + gethrestime(&ts); + nsec = ts.tv_nsec; + usec = nsec + (nsec >> 2); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 4); + usec = nsec - (usec >> 3); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 3); + usec = nsec + (usec >> 4); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 6); + usec = usec >> 10; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = usec; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(&tv, tvp, sizeof (tv)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (copyout(&tv32, tvp, sizeof (tv32))) + return (set_errno(EFAULT)); + } +#endif + } + + /* + * The Linux man page states use of the second parameter is obsolete, + * but gettimeofday(2) should still return EFAULT if it is set + * to a bad non-NULL pointer (sigh...) + */ + if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure + * mode is documented as "undefined." 
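Returning to the microsecond conversion in lx_gettimeofday() above: the shift sequence follows the native uniqtime() trick for approximating a divide-by-1000 without an actual division. A small user-level harness with the same sequence pasted in is an easy way to sanity-check it against exact division (a sketch, not part of the patch):

    #include <stdio.h>

    int
    main(void)
    {
        int nsec, usec, exact;

        for (nsec = 0; nsec < 1000000000; nsec += 999983) {
            usec = nsec + (nsec >> 2);
            usec = nsec + (usec >> 1);
            usec = nsec + (usec >> 2);
            usec = nsec + (usec >> 4);
            usec = nsec - (usec >> 3);
            usec = nsec + (usec >> 2);
            usec = nsec + (usec >> 3);
            usec = nsec + (usec >> 4);
            usec = nsec + (usec >> 1);
            usec = nsec + (usec >> 6);
            usec = usec >> 10;

            exact = nsec / 1000;
            if (usec < exact - 1 || usec > exact + 1)
                printf("mismatch at %d: %d vs %d\n", nsec, usec, exact);
        }
        return (0);
    }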
+ */ +long +lx_time(time_t *tp) +{ + timestruc_t ts; + struct timeval tv; + + gethrestime(&ts); + tv.tv_sec = ts.tv_sec; + tv.tv_usec = 0; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (tp != NULL && + copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0) + return (set_errno(EFAULT)); + + return (tv.tv_sec); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (tp != NULL && + copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec))) + return (set_errno(EFAULT)); + + return (tv32.tv_sec); + } +#endif /* _SYSCALL32_IMPL */ + /* NOTREACHED */ +} + +long +lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp) +{ + return (nanosleep(rqtp, rmtp)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_umask.c b/usr/src/uts/common/brand/lx/syscall/lx_umask.c new file mode 100644 index 0000000000..cb5e4ed232 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_umask.c @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> + +/* From usr/src/uts/common/syscall/umask.c */ +extern int umask(int); + +/* + * Just do what umask() does, but for the given process. + */ +static int +lx_clone_umask_cb(proc_t *pp, void *arg) +{ + mode_t cmask = (mode_t)(intptr_t)arg; + mode_t orig; + + orig = PTOU(pp)->u_cmask; + PTOU(pp)->u_cmask = (mode_t)(cmask & PERMMASK); + return ((int)orig); +} + +long +lx_umask(mode_t cmask) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_grp_member(lproc, LX_CLONE_FS)) { + int omask; + + omask = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_umask_cb, + (void *)(intptr_t)cmask); + return (omask); + } + + return (umask(cmask)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_uname.c b/usr/src/uts/common/brand/lx/syscall/lx_uname.c new file mode 100644 index 0000000000..2d18408eaa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_uname.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> + +struct lx_utsname { + char lxu_sysname[LX_SYS_UTS_LN]; + char lxu_nodename[LX_SYS_UTS_LN]; + char lxu_release[LX_SYS_UTS_LN]; + char lxu_version[LX_SYS_UTS_LN]; + char lxu_machine[LX_SYS_UTS_LN]; + char lxu_domainname[LX_SYS_UTS_LN]; +}; + +long +lx_uname(void *uptr) +{ + proc_t *p = curproc; + lx_proc_data_t *lxpd = ptolxproc(p); + lx_zone_data_t *lxzd = ztolxzd(p->p_zone); + struct lx_utsname un; + + bzero(&un, sizeof (un)); + + (void) strlcpy(un.lxu_sysname, LX_UNAME_SYSNAME, LX_SYS_UTS_LN); + (void) strlcpy(un.lxu_nodename, p->p_zone->zone_nodename, + LX_SYS_UTS_LN); + + mutex_enter(&lxzd->lxzd_lock); + + if (lxpd->l_uname_release[0] != '\0') { + (void) strlcpy(un.lxu_release, lxpd->l_uname_release, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_release, lxzd->lxzd_kernel_release, + LX_SYS_UTS_LN); + } + if (lxpd->l_uname_version[0] != '\0') { + (void) strlcpy(un.lxu_version, lxpd->l_uname_version, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_version, lxzd->lxzd_kernel_version, + LX_SYS_UTS_LN); + } + + mutex_exit(&lxzd->lxzd_lock); + + if (get_udatamodel() == DATAMODEL_LP64) { + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE64, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE32, + LX_SYS_UTS_LN); + } + (void) strlcpy(un.lxu_domainname, p->p_zone->zone_domain, + LX_SYS_UTS_LN); + + if (copyout(&un, uptr, sizeof (un)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c new file mode 100644 index 0000000000..e8358f9f69 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c @@ -0,0 +1,377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +/* + * wait() family of functions. + * + * The first minor difference between the Linux and Solaris family of wait() + * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully, + * the exit status values are identical between the two implementations. + * + * Things get very different and very complicated when we introduce the Linux + * threading model. Under linux, both threads and child processes are + * represented as processes. 
However, the behavior of wait() with respect to + * each child varies according to the flags given to clone() + * + * SIGCHLD The SIGCHLD signal should be sent on termination + * CLONE_THREAD The child shares the same thread group as the parent + * CLONE_DETACHED The parent receives no notification when the child exits + * + * The following flags control the Linux behavior w.r.t. the above attributes: + * + * __WALL Wait on all children, regardless of type + * __WCLONE Wait only on non-SIGCHLD children + * __WNOTHREAD Don't wait on children of other threads in this group + * + * The following chart shows whether wait() returns when the child exits: + * + * default __WCLONE __WALL + * no SIGCHLD - X X + * SIGCHLD X - X + * + * The following chart shows whether wait() returns when the grandchild exits: + * + * default __WNOTHREAD + * no CLONE_THREAD - - + * CLONE_THREAD X - + * + * The CLONE_DETACHED flag is universal - when the child exits, no state is + * stored and wait() has no effect. + * + * XXX Support the above combination of options, or some reasonable subset that + * covers at least fork() and pthread_create(). + */ + +#include <sys/wait.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_errno.h> +#include <lx_syscall.h> + +/* + * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c": + */ +extern int waitid(idtype_t, id_t, k_siginfo_t *, int); +extern int rusagesys(int, void *, void *, void *, void *); + +/* + * Convert between Linux options and Solaris options, returning -1 if any + * invalid flags are found. + */ +#define LX_WNOHANG 0x00000001 +#define LX_WUNTRACED 0x00000002 +#define LX_WSTOPPED LX_WUNTRACED +#define LX_WEXITED 0x00000004 +#define LX_WCONTINUED 0x00000008 +#define LX_WNOWAIT 0x01000000 + +#define LX_WNOTHREAD 0x20000000 +#define LX_WALL 0x40000000 +#define LX_WCLONE 0x80000000 + +#define LX_P_ALL 0x0 +#define LX_P_PID 0x1 +#define LX_P_GID 0x2 + +/* + * Split the passed waitpid/waitid options into two separate variables: + * those for the native illumos waitid(2), and the extra Linux-specific + * options we will handle in our brand-specific code. + */ +static int +ltos_options(uintptr_t options, int *native_options, int *extra_options) +{ + int newoptions = 0; + + if (((options) & ~(LX_WNOHANG | LX_WUNTRACED | LX_WEXITED | + LX_WCONTINUED | LX_WNOWAIT | LX_WNOTHREAD | LX_WALL | + LX_WCLONE)) != 0) { + return (-1); + } + + *extra_options = options & (LX_WNOTHREAD | LX_WALL | LX_WCLONE); + + if (options & LX_WNOHANG) + newoptions |= WNOHANG; + if (options & LX_WUNTRACED) + newoptions |= WUNTRACED; + if (options & LX_WEXITED) + newoptions |= WEXITED; + if (options & LX_WCONTINUED) + newoptions |= WCONTINUED; + if (options & LX_WNOWAIT) + newoptions |= WNOWAIT; + + /* + * The trapped option is implicit on Linux. 
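As a concrete illustration of the translation performed here: a Linux waitpid(pid, &status, WNOHANG | WUNTRACED) arrives as LX_WNOHANG | LX_WUNTRACED and leaves as the native WNOHANG | WUNTRACED with WTRAPPED added implicitly, since Linux reports ptrace stops without being asked. The Linux-side call is the ordinary:

    #include <sys/types.h>
    #include <sys/wait.h>

    /* Reap a stopped or exited child without blocking. */
    static pid_t
    reap(pid_t pid, int *status)
    {
        return (waitpid(pid, status, WNOHANG | WUNTRACED));
    }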
+ */ + newoptions |= WTRAPPED; + + *native_options = newoptions; + return (0); +} + +static int +lx_wstat(int code, int status) +{ + int stat = 0; + + switch (code) { + case CLD_EXITED: + stat = status << 8; + break; + case CLD_DUMPED: + stat = lx_stol_signo(status, SIGKILL) | WCOREFLG; + break; + case CLD_KILLED: + stat = lx_stol_signo(status, SIGKILL); + break; + case CLD_TRAPPED: + case CLD_STOPPED: + stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG; + break; + case CLD_CONTINUED: + stat = WCONTFLG; + break; + } + + return (stat); +} + +static int +lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options, + int extra_options) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int error; + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. + */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = extra_options; + + if ((error = waitid(idtype, id, sip, native_options)) == EINTR) { + /* + * According to signal(7), the wait4(2), waitid(2), and + * waitpid(2) system calls are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (error); +} + +long +lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) +{ + k_siginfo_t info = { 0 }; + idtype_t idtype; + id_t id; + int status = 0; + pid_t pid = (pid_t)p1; + int error; + int native_options, extra_options; + int *statusp = (int *)p2; + void *rup = (void *)p4; + + if (ltos_options(p3, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (pid > maxpid) { + return (set_errno(ECHILD)); + } + + /* + * While not listed as a valid return code, Linux's wait4(2) does, + * in fact, get an EFAULT if either the status pointer or rusage + * pointer is invalid. Since a failed waitpid should leave child + * process in a state where a future wait4(2) will succeed, we + * check them by copying out the values their buffers originally + * contained. (We need to do this as a failed system call should + * never affect the contents of a passed buffer.) + * + * This will fail if the buffers in question are write-only. + */ + if (statusp != NULL) { + if (copyin(statusp, &status, sizeof (status)) != 0 || + copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + /* + * Do the same check for the "struct rusage" pointer, which differs + * in size for 32- and 64-bit processes. 
+ */ + if (rup != NULL) { + struct rusage ru; + void *krup = &ru; + size_t rusz = sizeof (ru); +#if defined(_SYSCALL32_IMPL) + struct rusage32 ru32; + + if (get_udatamodel() != DATAMODEL_NATIVE) { + krup = &ru32; + rusz = sizeof (ru32); + } +#endif + + if (copyin(rup, krup, rusz) != 0 || + copyout(krup, rup, rusz) != 0) { + return (set_errno(EFAULT)); + } + } + + if (pid < -1) { + idtype = P_PGID; + id = -pid; + } else if (pid == -1) { + idtype = P_ALL; + id = 0; + } else if (pid == 0) { + idtype = P_PGID; + mutex_enter(&pidlock); + id = curproc->p_pgrp; + mutex_exit(&pidlock); + } else { + idtype = P_PID; + id = pid; + } + + native_options |= (WEXITED | WTRAPPED); + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + + status = lx_wstat(info.si_code, info.si_status); + + /* + * Unfortunately if this attempt to copy out either the status or the + * rusage fails, the process will be in an inconsistent state as + * subsequent calls to wait for the same child will fail where they + * should succeed on a Linux system. This, however, is rather + * unlikely since we tested the validity of both above. + */ + if (statusp != NULL) { + if (copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + if (rup != NULL) { + if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL, + NULL, NULL)) != 0) { + return (set_errno(error)); + } + } + + return (info.si_pid); +} + +long +lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3) +{ + return (lx_wait4(p1, p2, p3, NULL)); +} + +long +lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) +{ + int error; + int native_options, extra_options; + k_siginfo_t info = { 0 }; + + if (ltos_options(opt, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) { + return (set_errno(EINVAL)); + } + + switch (idtype) { + case LX_P_ALL: + idtype = P_ALL; + break; + case LX_P_PID: + idtype = P_PID; + break; + case LX_P_GID: + idtype = P_PGID; + break; + default: + return (set_errno(EINVAL)); + } + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (stol_ksiginfo32_copyout(&info, (void *)infop)); + } else +#endif + { + return (stol_ksiginfo_copyout(&info, (void *)infop)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c new file mode 100644 index 0000000000..19bf9a4ebb --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c @@ -0,0 +1,519 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/lx_acl.h> + + +#define LX_XATTR_NAME_MAX 255 +#define LX_XATTR_SIZE_MAX 65536 +#define LX_XATTR_LIST_MAX 65536 + +#define LX_XATTR_FLAG_CREATE 0x1 +#define LX_XATTR_FLAG_REPLACE 0x2 +#define LX_XATTR_FLAGS_VALID (LX_XATTR_FLAG_CREATE | LX_XATTR_FLAG_REPLACE) + +enum lx_xattr_ns { + LX_XATTR_NS_SECURITY, + LX_XATTR_NS_SYSTEM, + LX_XATTR_NS_TRUSTED, + LX_XATTR_NS_USER, + LX_XATTR_NS_INVALID /* Catch-all for invalid namespaces */ +}; + +/* Present under the 'security.' namespace */ +#define LX_XATTR_CAPABILITY "capability" + +typedef struct lx_xattr_ns_list { + const char *lxnl_name; + unsigned lxnl_len; + enum lx_xattr_ns lxnl_ns; +} lx_xattr_ns_list_t; + +static lx_xattr_ns_list_t lx_xattr_namespaces[] = { + { "user.", 5, LX_XATTR_NS_USER }, + { "system.", 7, LX_XATTR_NS_SYSTEM }, + { "trusted.", 8, LX_XATTR_NS_TRUSTED }, + { "security.", 9, LX_XATTR_NS_SECURITY }, + { NULL, 0, LX_XATTR_NS_INVALID } +}; + +static int +lx_xattr_parse(const char *name, size_t nlen, const char **key) +{ + lx_xattr_ns_list_t *lxn = lx_xattr_namespaces; + + for (; lxn->lxnl_name != NULL; lxn++) { + if (nlen < lxn->lxnl_len) { + continue; + } + if (strncmp(lxn->lxnl_name, name, lxn->lxnl_len) == 0) { + *key = name + (lxn->lxnl_len); + return (lxn->lxnl_ns); + } + } + + *key = name; + return (LX_XATTR_NS_INVALID); +} + +/* + * *xattr() family of functions. + * + * These are largely unimplemented. In most cases we return EOPNOTSUPP, rather + * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1). + * + * Note that CRED() is used instead of f_cred in the f*xattr functions. This + * is intentional as Linux does not have the same notion of per-fd credentials. + */ + +/* ARGSUSED */ +static int +lx_setxattr_common(vnode_t *vp, char *name, void *value, size_t sz, int flags) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + void *buf = NULL; + + if ((flags & ~LX_XATTR_FLAGS_VALID) != 0) { + return (EINVAL); + } + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + type = lx_xattr_parse(name_buf, name_len, &key); + + if (sz != 0) { + if (sz > LX_XATTR_SIZE_MAX) { + return (E2BIG); + } + buf = kmem_alloc(sz, KM_SLEEP); + if (copyin(value, buf, sz) != 0) { + kmem_free(buf, sz); + return (EFAULT); + } + } + + error = EOPNOTSUPP; + switch (type) { + case LX_XATTR_NS_SECURITY: + /* + * In order to keep package management software happy, despite + * lacking support for file-based Linux capabilities via + * xattrs, we fake success when root attempts a setxattr on + * that attribute. 
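This is the call shape that packaging tools issue when recording file capabilities, and that the stub described here accepts from root so installs do not fail; a sketch of the Linux-side call (path and payload are placeholders):

    #include <sys/xattr.h>

    /* Record (or, under lx, pretend to record) file capabilities on a binary. */
    static int
    set_file_caps(const char *path, const void *caps, size_t len)
    {
        return (setxattr(path, "security.capability", caps, len, 0));
    }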
+ */ + if (crgetuid(CRED()) == 0 && + strcmp(key, LX_XATTR_CAPABILITY) == 0) { + error = 0; + } + break; + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_setxattr(vp, LX_ACL_ACCESS, buf, sz); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_setxattr(vp, LX_ACL_DEFAULT, buf, sz); + } + default: + break; + } + + if (buf != NULL) { + kmem_free(buf, sz); + } + return (error); +} + +/* ARGSUSED */ +static int +lx_getxattr_common(vnode_t *vp, char *name, char *value, size_t sz, + ssize_t *osz) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + void *buf = NULL; + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + if (sz != 0) { + if (sz > LX_XATTR_SIZE_MAX) { + sz = LX_XATTR_SIZE_MAX; + } + buf = kmem_alloc(sz, KM_SLEEP); + } + + type = lx_xattr_parse(name_buf, name_len, &key); + + error = EOPNOTSUPP; + switch (type) { + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_getxattr(vp, LX_ACL_ACCESS, buf, sz, + osz); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_getxattr(vp, LX_ACL_DEFAULT, buf, sz, + osz); + } + break; + default: + break; + } + + if (error == 0 && buf != NULL) { + VERIFY(*osz <= sz); + + if (copyout(buf, value, *osz) != 0) { + error = EFAULT; + } + } + if (buf != NULL) { + kmem_free(buf, sz); + } + return (error); +} + +/* ARGSUSED */ +static int +lx_listxattr_common(vnode_t *vp, void *value, size_t size, ssize_t *osize) +{ + struct uio auio; + struct iovec aiov; + int err = 0; + + aiov.iov_base = value; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = size; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + + /* + * Call into all the listxattr routines (which may be no-ops) which are + * currently implemented. 
+ */ + err = lx_acl_listxattr(vp, &auio); + + if (err == 0) { + *osize = size - auio.uio_resid; + } + + return (err); +} + +/* ARGSUSED */ +static int +lx_removexattr_common(vnode_t *vp, char *name) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + + type = lx_xattr_parse(name_buf, name_len, &key); + + error = EOPNOTSUPP; + switch (type) { + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_removexattr(vp, LX_ACL_ACCESS); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_removexattr(vp, LX_ACL_DEFAULT); + } + default: + break; + } + + return (error); +} + + +long +lx_setxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lsetxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fsetxattr(int fd, char *name, void *value, size_t size, int flags) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_setxattr_common(fp->f_vnode, name, value, size, flags); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +ssize_t +lx_getxattr(char *path, char *name, void *value, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_lgetxattr(char *path, char *name, void *value, size_t size) +{ + + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_fgetxattr(int fd, char *name, void *value, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + /* + * When a file is opened with O_PATH we clear read/write and fgetxattr + * is expected to return EBADF.
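The Linux behaviour being matched here is that xattr calls on an O_PATH descriptor fail with EBADF; a user-level sketch of the case in question (hypothetical path, error handling elided):

    #define _GNU_SOURCE             /* for O_PATH */
    #include <fcntl.h>
    #include <sys/xattr.h>

    /* Expected to fail with EBADF: O_PATH descriptors carry no read/write access. */
    static ssize_t
    probe(void)
    {
        int fd = open("/etc/hostname", O_PATH);
        char buf[64];

        return (fgetxattr(fd, "system.posix_acl_access", buf, sizeof (buf)));
    }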
+ */ + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + releasef(fd); + return (set_errno(EBADF)); + } + + error = lx_getxattr_common(fp->f_vnode, name, value, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_listxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_llistxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_flistxattr(int fd, char *list, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_listxattr_common(fp->f_vnode, list, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +int +lx_removexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +int +lx_lremovexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +int +lx_fremovexattr(int fd, char *name) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_removexattr_common(fp->f_vnode, name); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h new file mode 100644 index 0000000000..f34ed31dcb --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h @@ -0,0 +1,198 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _LXSYSFS_H +#define _LXSYSFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_sysfs.h: declarations, data structures and macros for lx_sysfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> + +/* + * Convert a vnode into an lxsys_mnt_t + */ +#define VTOLXSM(vp) ((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxsys_node + */ +#define VTOLXS(vp) ((lxsys_node_t *)(vp)->v_data) + +/* + * convert a lxsys_node into a vnode + */ +#define LXSTOV(lxsnp) ((lxsnp)->lxsys_vnode) + +/* + * convert a lxsys_node into zone for fs + */ +#define LXSTOZ(lxsnp) \ + (((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone) + +#define LXSNSIZ 256 /* max size of lx /sys file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXSYS_SDSIZE 16 + +/* Root sysfs lxsys_instance */ +#define LXSYS_INST_ROOT 0 + +/* + * Node/file types for lx /sys files + * (directories and files contained therein). + */ +typedef enum lxsys_nodetype { + LXSYS_NONE, /* None-type to keep inodes non-zero */ + LXSYS_STATIC, /* Statically defined entries */ + LXSYS_CLASS_NET, /* /sys/class/net/<iface> */ + LXSYS_DEV_NET, /* /sys/devices/virtual/net/<iface> */ + LXSYS_BLOCK, /* /sys/block/<dev> */ + LXSYS_DEV_ZFS, /* /sys/devices/zfs/<dev> */ + LXSYS_DEV_SYS_CPU, /* /sys/devices/system/cpu/<cpu> */ + LXSYS_DEV_SYS_CPUINFO, /* /sys/devices/system/cpu/cpuN/<info> */ + LXSYS_DEV_SYS_NODE, /* /sys/devices/system/node/node0/<info> */ + LXSYS_MAXTYPE, /* type limit */ +} lxsys_nodetype_t; + +/* + * external dirent characteristics + */ +typedef struct { + unsigned int d_idnum; + char *d_name; +} lxsys_dirent_t; + +typedef struct { + unsigned int dl_instance; + lxsys_dirent_t *dl_list; + int dl_length; +} lxsys_dirlookup_t; + +/* + * This is the lx sysfs private data object + * which is attached to v_data in the vnode structure + */ +struct lxsys_node; +typedef struct lxsys_node lxsys_node_t; +struct lxsys_node { + lxsys_nodetype_t lxsys_type; /* type ID of node */ + unsigned int lxsys_instance; /* instance ID node */ + unsigned int lxsys_endpoint; /* endpoint ID node */ + vnode_t *lxsys_vnode; /* vnode for the node */ + vnode_t *lxsys_parentvp; /* parent directory */ + lxsys_node_t *lxsys_next; /* next list entry */ + timestruc_t lxsys_time; /* creation time */ + mode_t lxsys_mode; /* file mode bits */ + uid_t lxsys_uid; /* file owner */ + gid_t lxsys_gid; /* file group owner */ + ino_t lxsys_ino; /* node id */ +}; + +/* + * This is the lxsysfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxsys_mnt { + kmutex_t lxsysm_lock; /* protects fields */ + lxsys_node_t *lxsysm_node; /* node at root of sys mount */ + zone_t *lxsysm_zone; /* zone for this mount */ +} lxsys_mnt_t; + +extern 
vnodeops_t *lxsys_vnodeops; + +typedef struct mounta mounta_t; + +extern void lxsys_initnodecache(); +extern void lxsys_fininodecache(); +extern ino_t lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int); +extern ino_t lxsys_parentinode(lxsys_node_t *); +extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int, + unsigned int); +extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int); +extern void lxsys_freenode(lxsys_node_t *); + +extern netstack_t *lxsys_netstack(lxsys_node_t *); +extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t); + +extern int lxsys_ino_get_type(ino_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t bufsize; + char *pos; + size_t beg; + int error; +} lxsys_uiobuf_t; + +extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *); +extern void lxsys_uiobuf_free(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int); +extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t); +extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LXSYSFS_H */ diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c new file mode 100644 index 0000000000..69234ddbaa --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c @@ -0,0 +1,443 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * lx_syssubr.c: Various functions for the /sys vnodeops. 
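+ *
+ * This file supplies the lxsys_node kmem cache (and its constructor and
+ * destructor), the inode numbering helpers, node allocation and teardown,
+ * the netstack/ill lookup helpers, and the lxsys_uiobuf_* buffered output
+ * routines used by the read handlers.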
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lx_sysfs.h" + +#define LXSYSCACHE_NAME "lxsys_cache" + +static int lxsys_node_constructor(void *, void *, int); +static void lxsys_node_destructor(void *, void *); + +static kmem_cache_t *lxsys_node_cache; + +void +lxsys_initnodecache() +{ + lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME, + sizeof (lxsys_node_t), 0, + lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxsys_fininodecache() +{ + kmem_cache_destroy(lxsys_node_cache); +} + +/* ARGSUSED */ +static int +lxsys_node_constructor(void *buf, void *un, int kmflags) +{ + lxsys_node_t *lxsnp = buf; + vnode_t *vp; + + vp = lxsnp->lxsys_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxsys_vnodeops); + vp->v_data = lxsnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxsys_node_destructor(void *buf, void *un) +{ + lxsys_node_t *lxsnp = buf; + + vn_free(LXSTOV(lxsnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxsys node + */ +ino_t +lxsys_inode(lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + /* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + */ + ASSERT(instance <= 0xffff); + ASSERT(endpoint <= 0xff); + + return ((ino_t)(type << 24)|(instance << 8)|endpoint); +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxsys_parentinode(lxsys_node_t *lxsnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxsnp->lxsys_type == LXSYS_STATIC && + lxsnp->lxsys_instance == LXSYS_INST_ROOT) { + return (lxsnp->lxsys_ino); + } else { + return (VTOLXS(lxsnp->lxsys_parentvp)->lxsys_ino); + } +} + +/* + * Allocate a new lxsys node + * + * This also allocates the vnode associated with it + */ +lxsys_node_t * +lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + lxsys_node_t *lxsnp; + vnode_t *vp; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxsnp->lxsys_type = type; + lxsnp->lxsys_instance = instance; + lxsnp->lxsys_endpoint = endpoint; + lxsnp->lxsys_next = NULL; + lxsnp->lxsys_parentvp = dp; + VN_HOLD(dp); + + lxsnp->lxsys_time = now; + lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0; + lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint); + + /* initialize the vnode data */ + vp = lxsnp->lxsys_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Default to a directory with open permissions. 
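+ * (v_type of VDIR and an lxsys_mode of 0555, set just below).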
+ * Specific components will override this + */ + if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) { + vp->v_flag |= VROOT; + } + vp->v_type = VDIR; + lxsnp->lxsys_mode = 0555; + + return (lxsnp); +} + +lxsys_node_t * +lxsys_getnode_static(vnode_t *dp, unsigned int instance) +{ + lxsys_mnt_t *lxsm = VTOLXSM(dp); + lxsys_node_t *lnp, *tail = NULL; + + mutex_enter(&lxsm->lxsysm_lock); + for (lnp = lxsm->lxsysm_node; lnp != NULL; lnp = lnp->lxsys_next) { + if (lnp->lxsys_instance == instance) { + VERIFY(lnp->lxsys_parentvp == dp); + + VN_HOLD(lnp->lxsys_vnode); + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); + } else if (lnp->lxsys_next == NULL) { + /* Found no match by the end of the list */ + tail = lnp; + break; + } + } + + tail->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0); + lnp = tail->lxsys_next; + /* Allow mounts on static entries */ + LXSTOV(lnp)->v_flag &= (~VNOMOUNT); + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); +} + +/* Clean up persistence for static lxsys_node */ +int +lxsys_freenode_static(lxsys_node_t *lnp) +{ + lxsys_node_t *plnp; + vnode_t *vp = LXSTOV(lnp); + lxsys_mnt_t *lxsm = VTOLXSM(vp); + + if (lnp->lxsys_instance == LXSYS_INST_ROOT) { + /* + * The root vnode does not need special cleanup since it + * anchors the list and is freed by lxsys_unmount. + */ + return (0); + } + + mutex_enter(&lxsm->lxsysm_lock); + + /* + * It is possible that a different process acquired a fresh reference + * to this vnode via lookup while we were waiting on the lxsysm_lock. + * To avoid freeing the vnode out from under them, we will double-check + * v_count and bail from the fop_inactive if it was grabbed. + */ + mutex_enter(&vp->v_lock); + if (vp->v_count != 1) { + VERIFY(vp->v_count > 0); + + /* Release our hold before bailing out of lxsys_inactive */ + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&lxsm->lxsysm_lock); + return (-1); + } + mutex_exit(&vp->v_lock); + + /* search for the record pointing to lnp */ + plnp = lxsm->lxsysm_node; + while (plnp != NULL && plnp->lxsys_next != lnp) { + plnp = plnp->lxsys_next; + } + /* entry should always be found */ + VERIFY(plnp != NULL); + plnp->lxsys_next = lnp->lxsys_next; + + mutex_exit(&lxsm->lxsysm_lock); + return (0); +} + +/* + * Free the storage obtained from lxsys_getnode(). + */ +void +lxsys_freenode(lxsys_node_t *lxsnp) +{ + vnode_t *vp = LXSTOV(lxsnp); + + VERIFY(vp != NULL); + + if (lxsnp->lxsys_type == LXSYS_STATIC) { + if (lxsys_freenode_static(lxsnp) != 0) { + return; + } + } + + /* + * delete any association with parent vp + */ + if (lxsnp->lxsys_parentvp != NULL) + VN_RELE(lxsnp->lxsys_parentvp); + + /* + * Release the lxsysnode. + */ + kmem_cache_free(lxsys_node_cache, lxsnp); +} + +/* + * Get the netstack associated with this lxsys mount + */ +netstack_t * +lxsys_netstack(lxsys_node_t *lnp) +{ + zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone; + + return (netstack_hold_if_active(zone->zone_netstack)); +} + +ill_t * +lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex) +{ + ill_t *ill; + phyint_t *phyi; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &ifindex, NULL); + if (phyi != NULL) { + /* + * Since interface information presented via /sys is not + * specific to IPv4 or IPv6, an ill reference from either + * protocol will be adequate. Check both, starting with IPv4 + * for a valid reference to use. 
+ */ + for (ill = phyi->phyint_illv4; ill != phyi->phyint_illv6; + ill = phyi->phyint_illv6) { + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ill); + } + mutex_exit(&ill->ill_lock); + } + } + } + rw_exit(&ipst->ips_ill_g_lock); + return (NULL); +} + + +#define LXSYSUIOBUFSZ 4096 + +lxsys_uiobuf_t * +lxsys_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxsys_uiobuf and output buffer */ + int bufsize = LXSYSUIOBUFSZ; + lxsys_uiobuf_t *uiobuf = + kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->bufsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize); +} + +void +lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->bufsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxsys_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxsys_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxsys_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c new file mode 100644 index 0000000000..fddc1e0234 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c @@ -0,0 +1,365 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxsysvfsops.c: vfs operations for lx sysfs. + * + * sysfs has a close relationship with the lx getdents(2) syscall. This is + * necessary so that the getdents code can populate the 'd_type' entries + * during a sysfs readdir operation. The glibc code which accesses sysfs + * (specifically the 'cpu' subtree) expects dirents to have the d_type field + * populated. One problematic consumer is java, which becomes unstable if it + * gets the incorrect data from glibc. When sysfs loads, it populates the + * lx_sysfs_vfs_type and lx_sysfs_vtype variables defined in lx_getdents.c. + * The getdents code can then call into sysfs to determine the d_type for any + * given inode directory entry. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> + +#include "lx_sysfs.h" + +/* Module level parameters */ +static int lxsysfstype; +static dev_t lxsysdev; +static kmutex_t lxsys_mount_lock; + +extern int lx_sysfs_vfs_type; +extern int (*lx_sysfs_vtype)(ino_t); + +static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxsys_unmount(vfs_t *, int, cred_t *); +static int lxsys_root(vfs_t *, vnode_t **); +static int lxsys_statvfs(vfs_t *, statvfs64_t *); +static int lxsys_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_sysfs", + lxsys_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand sysfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + lx_sysfs_vfs_type = 0; + lx_sysfs_vtype = NULL; + + /* + * destroy lxsys_node cache + */ + lxsys_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxsysfstype); + vn_freevnodeops(lxsys_vnodeops); + + mutex_destroy(&lxsys_mount_lock); +done: + return (retval); +} + +static int +lxsys_init(int fstype, char *name) +{ + static const fs_operation_def_t lxsys_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxsys_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxsys_unmount }, + VFSNAME_ROOT, { .vfs_root = lxsys_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxsys_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxsys_vnodeops_template[]; + int error; + major_t dev; + + lx_sysfs_vtype = lxsys_ino_get_type; + lx_sysfs_vfs_type = lxsysfstype = fstype; + ASSERT(lxsysfstype != 0); + + mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxsys_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxsys_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxsys_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxsysdev = makedevice(dev, 0); + + /* + * Initialise cache for lxsys_nodes + */ + lxsys_initnodecache(); + + return (0); +} + +static int +lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt; + zone_t *zone = curproc->p_zone; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxsys" doesn't make sense + */ + vfs_setresource(vfsp, "lxsys", 0); + + lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP); + + mutex_enter(&lxsys_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxsys_mount_lock); + kmem_free(lxsys_mnt, sizeof ((*lxsys_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + + mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL); + zone_hold(lxsys_mnt->lxsysm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC, + LXSYS_INST_ROOT, 0); + lxsys_mnt->lxsysm_node->lxsys_next = NULL; + + /* Correctly set the fs for the root node */ + lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxsysfstype; + vfsp->vfs_data = (caddr_t)lxsys_mnt; + vfsp->vfs_dev = lxsysdev; + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt = (lxsys_mnt_t *)vfsp->vfs_data; + lxsys_node_t *lnp; + vnode_t *vp; + int count; + + VERIFY(lxsys_mnt != NULL); + + mutex_enter(&lxsys_mount_lock); + + /* must be root to unmount */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxsys_mount_lock); + return (EPERM); + } + + /* forced unmount is not supported by this fs */ + if (flag & MS_FORCE) { + mutex_exit(&lxsys_mount_lock); + return (ENOTSUP); + } + + /* Ensure that no vnodes are in use on this mount point. 
*/ + lnp = lxsys_mnt->lxsysm_node; + vp = LXSTOV(lnp); + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxsys_mount_lock); + return (EBUSY); + } + + /* + * If there are no references to the root vnode the list of persistent + * static vnodes should be empty + */ + VERIFY(lnp->lxsys_next == NULL); + + (void) dnlc_purge_vfsp(vfsp, 0); + + lxsys_mnt->lxsysm_node = NULL; + lxsys_freenode(lnp); + zone_rele(lxsys_mnt->lxsysm_zone); + vfsp->vfs_data = NULL; + kmem_free(lxsys_mnt, sizeof (*lxsys_mnt)); + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxsys_mnt_t *lxsm = (lxsys_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + + VERIFY(lxsm != NULL); + VERIFY(lxsm->lxsysm_node != NULL); + + vp = LXSTOV(lxsm->lxsysm_node); + VN_HOLD(vp); + *vpp = vp; + + return (0); +} + +static int +lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + dev32_t d32; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)3; + sp->f_ffree = (fsfilcnt64_t)0; /* none */ + sp->f_favail = (fsfilcnt64_t)0; /* none */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/sys"); + (void) strcpy(&sp->f_fstr[6], "/sys"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c new file mode 100644 index 0000000000..10c99baa7b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c @@ -0,0 +1,2165 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lx_sysfs -- a Linux-compatible /sys for the LX brand + */ + +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/sunddi.h> +#include <sys/vnode.h> +#include <sys/netstack.h> +#include <sys/ethernet.h> +#include <inet/ip_arp.h> + +#include "lx_sysfs.h" + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxsys_init() in lx_sysvfsops.c + */ +vnodeops_t *lxsys_vnodeops; + +static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxsys_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxsys_sync(void); +static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_blockdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_zfsdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpu(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpuinfo(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_sysnode(lxsys_node_t *, char *); + +static int lxsys_read_static(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_zfs_block(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_syscpu(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_sysnode(lxsys_node_t *, lxsys_uiobuf_t *); + +static int lxsys_readdir_devices_syscpu(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_syscpuinfo(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_sysnode(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_blockdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_zfsdir(lxsys_node_t *, uio_t *, int *); + +static int lxsys_readlink_class_net(lxsys_node_t *, char *, size_t); +static int lxsys_readlink_block(lxsys_node_t *, char *, size_t); + +/* + * The lx /sys vnode operations vector + */ +const fs_operation_def_t lxsys_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxsys_open }, + VOPNAME_CLOSE, { .vop_close = lxsys_close }, + VOPNAME_READ, { .vop_read = lxsys_read }, + VOPNAME_GETATTR, { .vop_getattr = lxsys_getattr }, + VOPNAME_ACCESS, { .vop_access = lxsys_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxsys_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxsys_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxsys_readlink }, + VOPNAME_FSYNC, { .error = lxsys_sync }, + VOPNAME_SEEK, { .error = lxsys_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxsys_inactive }, + VOPNAME_CMP, { .vop_cmp = lxsys_cmp }, + NULL, NULL +}; + +typedef enum lxsys_cpu_state { + LXSYS_CPU_ON, /* online */ + LXSYS_CPU_OFF, /* offline */ + LXSYS_CPU_ANY, /* don't care */ +} lxsys_cpu_state_t; + +static void lxsys_format_cpu(char *, int, lxsys_cpu_state_t); + +/* + * 
Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + * + * Where TYPE is one of: + * 1 - SYS_STATIC + * 2 - SYS_CLASS_NET + * 3 - SYS_DEV_NET + * 4 - SYS_BLOCK + * 5 - SYS_DEV_ZFS + * 6 - SYS_DEV_SYS_CPU + * 7 - SYS_DEV_SYS_CPUINFO + * 8 - SYS_DEV_SYS_NODE + * + * Static entries will have assigned INSTANCE identifiers: + * - 0x00: /sys + * - 0x01: /sys/class + * - 0x02: /sys/devices + * - 0x03: /sys/fs + * - 0x04: /sys/class/net + * - 0x05: /sys/devices/virtual + * - 0x06: /sys/devices/system + * - 0x07: /sys/fs/cgroup + * - 0x08: /sys/devices/virtual/net + * - 0x09: /sys/block + * - 0x0a: /sys/devices/zfs + * - 0x0b: /sys/devices/system/cpu + * - 0x0c: /sys/devices/system/node + * - 0x0d: /sys/bus + * + * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived + * from the corresonding ifindex. + * + * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use + * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs + * for the contained entries. + * + * Dynamic /sys/block/<dev> symlinks will use an INSTANCE derived from the + * device major and instance from records listed in kstat or zvols. + * + * Dynamic /sys/devices/zfs/<dev> directories will use an INSTANCE derived from + * the emulated minor number. + * + * Semi-static/Dynamic /sys/devices/system/cpu contains the fixed 'kernel_max', + * 'offline', 'online', 'possible', and 'present' files, and a dynamic set of + * cpuN subdirectories. All of these are dynamic nodes. + * + * Static /sys/devices/system/node/node0 currently only contains a + * static cpulist file, but will likely need future dynamic entries for cpuN + * symlinks, and perhaps other static files. By only providing 'node0' we + * pretend that there is only a single NUMA node available to a zone (trying to + * be NUMA-aware inside a zone is generally not going to work anyway). + * If dynamic entries are added under node0, it must be converted to the + * semi-static/dynamic approach as used under /sys/devices/system/cpu. + * + * The dyn_ino_type table must be updated whenever a new static instance is + * defined. + */ + +#define LXSYS_INST_CLASSDIR 0x1 +#define LXSYS_INST_DEVICESDIR 0x2 +#define LXSYS_INST_FSDIR 0x3 +#define LXSYS_INST_CLASS_NETDIR 0x4 +#define LXSYS_INST_DEVICES_VIRTUALDIR 0x5 +#define LXSYS_INST_DEVICES_SYSTEMDIR 0x6 +#define LXSYS_INST_FS_CGROUPDIR 0x7 +#define LXSYS_INST_DEVICES_VIRTUAL_NETDIR 0x8 +#define LXSYS_INST_BLOCKDIR 0x9 +#define LXSYS_INST_DEVICES_ZFSDIR 0xa +#define LXSYS_INST_DEVICES_SYSCPU 0xb +#define LXSYS_INST_DEVICES_SYSNODE 0xc +#define LXSYS_INST_BUSDIR 0xd +#define LXSYS_INST_MAX LXSYS_INST_BUSDIR /* limit */ + +/* + * These are of dynamic type (LXSYS_DEV_SYS_CPU), but essentially fixed + * instances. Under /sys/devices/system/cpu we have: kernel_max, offline, + * online, possible and present. We also have a dynamic set of cpuN subdirs. + * The cpuN subdirs are actually of type LXSYS_DEV_SYS_CPUINFO, so we can use + * the following instance IDs for the fixed files. + */ +#define LXSYS_INST_DEV_SYSCPU_KMAX 0x1 +#define LXSYS_INST_DEV_SYSCPU_OFFLINE 0x2 +#define LXSYS_INST_DEV_SYSCPU_ONLINE 0x3 +#define LXSYS_INST_DEV_SYSCPU_POSSIBLE 0x4 +#define LXSYS_INST_DEV_SYSCPU_PRESENT 0x5 + +/* + * This array is used for directory inode correction in lxsys_readdir_common + * when a directory's static-type entry is actually a dynamic-type. 
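+ *
+ * As an illustrative sketch (the instance and endpoint values below are
+ * hypothetical), lxsys_inode() packs the fields as
+ * (TYPE << 24) | (INSTANCE << 8) | ENDPOINT, so for example:
+ *
+ *	lxsys_inode(LXSYS_DEV_NET, 2, 5)			== 0x03000205
+ *	lxsys_inode(LXSYS_STATIC, LXSYS_INST_BLOCKDIR, 0)	== 0x01000900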
+ */ +static int dyn_ino_type [] = { + 0, /* invalid */ + 0, /* LXSYS_INST_CLASSDIR */ + 0, /* LXSYS_INST_DEVICESDIR */ + 0, /* LXSYS_INST_FSDIR */ + LXSYS_CLASS_NET, /* LXSYS_INST_CLASS_NETDIR */ + 0, /* LXSYS_INST_DEVICES_VIRTUALDIR */ + 0, /* LXSYS_INST_DEVICES_SYSTEMDIR */ + 0, /* LXSYS_INST_FS_CGROUPDIR */ + LXSYS_DEV_NET, /* LXSYS_INST_DEV_VIRTUAL_NETDIR */ + LXSYS_BLOCK, /* LXSYS_INST_BLOCKDIR */ + LXSYS_DEV_ZFS, /* LXSYS_INST_DEVICES_ZFSDIR */ + LXSYS_DEV_SYS_CPU, /* LXSYS_INST_DEVICES_SYSCPU */ + LXSYS_DEV_SYS_NODE, /* LXSYS_INST_DEV_SYSNODE */ + 0, /* LXSYS_INST_BUSDIR */ +}; +#define DYN_INO_LEN \ + (sizeof (dyn_ino_type) / sizeof ((dyn_ino_type)[0])) + +/* + * file contents of an lx /sys directory. + */ +static lxsys_dirent_t dirlist_root[] = { + { LXSYS_INST_BLOCKDIR, "block" }, + { LXSYS_INST_BUSDIR, "bus" }, + { LXSYS_INST_CLASSDIR, "class" }, + { LXSYS_INST_DEVICESDIR, "devices" }, + { LXSYS_INST_FSDIR, "fs" } +}; +static lxsys_dirent_t dirlist_class[] = { + { LXSYS_INST_CLASS_NETDIR, "net" } +}; +static lxsys_dirent_t dirlist_fs[] = { + { LXSYS_INST_FS_CGROUPDIR, "cgroup" } +}; +static lxsys_dirent_t dirlist_devices[] = { + { LXSYS_INST_DEVICES_SYSTEMDIR, "system" }, + { LXSYS_INST_DEVICES_VIRTUALDIR, "virtual" }, + { LXSYS_INST_DEVICES_ZFSDIR, "zfs" } +}; +static lxsys_dirent_t dirlist_devices_virtual[] = { + { LXSYS_INST_DEVICES_VIRTUAL_NETDIR, "net" } +}; + +static lxsys_dirent_t dirlist_devices_system[] = { + { LXSYS_INST_DEVICES_SYSCPU, "cpu" }, + { LXSYS_INST_DEVICES_SYSNODE, "node" } +}; + +#define LXSYS_ENDP_NET_ADDRESS 1 +#define LXSYS_ENDP_NET_ADDRLEN 2 +#define LXSYS_ENDP_NET_FLAGS 3 +#define LXSYS_ENDP_NET_IFINDEX 4 +#define LXSYS_ENDP_NET_MTU 5 +#define LXSYS_ENDP_NET_TXQLEN 6 +#define LXSYS_ENDP_NET_TYPE 7 + +#define LXSYS_ENDP_BLOCK_DEVICE 1 + +#define LXSYS_ENDP_NODE_CPULIST 1 +#define LXSYS_ENDP_NODE_CPUMAP 2 + +static lxsys_dirent_t dirlist_devices_virtual_net[] = { + { LXSYS_ENDP_NET_ADDRESS, "address" }, + { LXSYS_ENDP_NET_ADDRLEN, "addr_len" }, + { LXSYS_ENDP_NET_FLAGS, "flags" }, + { LXSYS_ENDP_NET_IFINDEX, "ifindex" }, + { LXSYS_ENDP_NET_MTU, "mtu" }, + { LXSYS_ENDP_NET_TXQLEN, "tx_queue_len" }, + { LXSYS_ENDP_NET_TYPE, "type" } +}; + +static lxsys_dirent_t dirlist_devices_zfs_block[] = { + { LXSYS_ENDP_BLOCK_DEVICE, "device" } +}; + +static lxsys_dirent_t dirlist_devices_sysnode[] = { + { LXSYS_ENDP_NODE_CPULIST, "cpulist" }, + { LXSYS_ENDP_NODE_CPUMAP, "cpumap" } +}; + +#define SYSDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +#define SYSDLENT(i, l) { i, l, SYSDIRLISTSZ(l) } +static lxsys_dirlookup_t lxsys_dirlookup[] = { + SYSDLENT(LXSYS_INST_ROOT, dirlist_root), + SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class), + SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs), + { LXSYS_INST_FS_CGROUPDIR, NULL, 0 }, + SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices), + SYSDLENT(LXSYS_INST_DEVICES_SYSTEMDIR, dirlist_devices_system), + SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual), + SYSDLENT(LXSYS_INST_DEVICES_SYSNODE, dirlist_devices_sysnode), + { LXSYS_INST_BUSDIR, NULL, 0 }, +}; + + +/* + * Array of lookup functions, indexed by lx /sys file type. 
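+ *
+ * lxsys_lookup() dispatches through this table by node type; for example, a
+ * component lookup under /sys/class/net is handled by
+ * lxsys_lookup_class_netdir().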
+ */ +static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_lookup_static, /* LXSYS_STATIC */ + lxsys_lookup_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_lookup_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_lookup_blockdir, /* LXSYS_BLOCK */ + lxsys_lookup_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_lookup_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_lookup_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_lookup_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readdir functions, indexed by /sys file type. + */ +static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_readdir_static, /* LXSYS_STATIC */ + lxsys_readdir_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_readdir_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_readdir_blockdir, /* LXSYS_BLOCK */ + lxsys_readdir_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_readdir_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_readdir_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_readdir_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of read functions, indexed by /sys file type. + */ +static int (*lxsys_read_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_read_static, /* LXSYS_STATIC */ + NULL, /* LXSYS_CLASS_NET */ + lxsys_read_devices_virtual_net, /* LXSYS_DEV_NET */ + NULL, /* LXSYS_BLOCK */ + lxsys_read_devices_zfs_block, /* LXSYS_DEV_ZFS */ + lxsys_read_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_read_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readlink functions, indexed by /sys file type. + */ +static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + NULL, /* LXSYS_STATIC */ + lxsys_readlink_class_net, /* LXSYS_CLASS_NET */ + NULL, /* LXSYS_DEV_NET */ + lxsys_readlink_block, /* LXSYS_BLOCK */ + NULL, /* LXSYS_DEV_ZFS */ + NULL, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + NULL, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Given one of our inodes, return the vnode type. + * + * lxsys_getnode will always set the vnode type to VDIR. It expects the + * caller (normally the lookup functions) to fix the type. Those same rules are + * encoded here for our inode-to-type translation. + */ +int +lxsys_ino_get_type(ino_t ino) +{ + lxsys_nodetype_t type; + unsigned int instance; + unsigned int endpoint; + + type = (ino & 0xff000000) >> 24; + instance = (ino & 0xffff00) >> 8; + endpoint = (ino & 0xff); + + if (instance > LXSYS_INST_MAX) + return (VNON); + + /* Validate non-static node types */ + if (type != LXSYS_STATIC && + (type <= LXSYS_STATIC || type >= LXSYS_MAXTYPE)) { + return (VNON); + } + + if (type != LXSYS_STATIC) { + /* Non-static node types */ + switch (type) { + case LXSYS_CLASS_NET: + if (instance != 0) { + return (VLNK); + } + break; + case LXSYS_DEV_NET: + /* + * /sys/devices/virtual/net usually has the eth0 and + * lo directories. Each network device directory is an + * instances with a 0 endpoint. The files within + * that directory have a non-0 endpoint. + */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_BLOCK: + if (instance != 0) { + return (VLNK); + } + break; + case LXSYS_DEV_ZFS: + /* + * /sys/devices/zfs usually has the zfsds0 directory + * instance with a 0 endpoint. The device file within + * that directory has a non-0 endpoint. 
+ */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_CPU: + if (instance != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_CPUINFO: + /* + * There is an instance of /sys/devices/system/cpu/cpuN + * for each CPU. These have an instance per CPU and + * currently the endpoint is 0 since there is nothing + * underneath the cpuN subdirectories. Future + * regular file entries are likely to be added there. + */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_NODE: + /* + * /sys/devices/system/node has the node0 directory + * instance with a 0 endpoint. The cpulist file within + * that directory has a non-0 endpoint. + */ + if (endpoint != 0) { + return (VREG); + } + break; + default: + break; + } + } + return (VDIR); +} + +/* + * lxsys_open(): Vnode operation for VOP_OPEN() + */ +/* ARGSUSED */ +static int +lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + return (0); +} + + +/* + * lxsys_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + + +/* + * lxsys_read(): Vnode operation for VOP_READ() + * All we currently have in this fs are directories. + */ +/* ARGSUSED */ +static int +lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + lxsys_uiobuf_t *luio; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type == VDIR) { + return (EISDIR); + } + + rlfunc = lxsys_read_function[type]; + if (rlfunc != NULL) { + luio = lxsys_uiobuf_new(uiop); + if ((error = rlfunc(lnp, luio)) == 0) { + error = lxsys_uiobuf_flush(luio); + } + lxsys_uiobuf_free(luio); + } else { + error = EIO; + } + + return (error); +} + +/* + * lxsys_getattr(): Vnode operation for VOP_GETATTR() + */ +/* ARGSUSED */ +static int +lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxsys_node_t *lxsnp = VTOLXS(vp); + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxsnp->lxsys_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxsnp->lxsys_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxsnp->lxsys_uid; + vap->va_gid = lxsnp->lxsys_gid; + vap->va_nodeid = lxsnp->lxsys_ino; + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxsys_access(): Vnode operation for VOP_ACCESS() + */ +/* ARGSUSED */ +static int +lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxsys_node_t *lxsnp = VTOLXS(vp); + int shift = 0; + + /* + * Although our lx sysfs is basically a read only file system, Linux + * expects it to be writable so we can't just error if (mode & VWRITE). + */ + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
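+ *
+ * For example (hypothetical caller), with an lxsys_mode of 0444 a request
+ * for VREAD by a non-owner, non-member computes 0400 & ~(0444 << 6) == 0,
+ * so the access is granted.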
+ */ + if (crgetuid(cr) != lxsnp->lxsys_uid) { + shift += 3; + if (!groupmember((uid_t)lxsnp->lxsys_gid, cr)) + shift += 3; + } + + mode &= ~(lxsnp->lxsys_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* + * lxsys_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + int error; + + VERIFY(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxsys_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxsnp->lxsys_parentvp); + *vpp = lxsnp->lxsys_parentvp; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxsys_lookup_function[type](lxsnp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +static lxsys_node_t * +lxsys_lookup_disk(lxsys_node_t *ldp, char *comp, lxsys_nodetype_t type) +{ + lxsys_node_t *lnp = NULL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (NULL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int inst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (strcmp(vd->lxvd_name, comp) == 0 && inst != 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, type, inst, 0); + break; + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (lnp); +} + +static vnode_t * +lxsys_lookup_static(lxsys_node_t *ldp, char *comp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (ldp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + if (dirent == NULL) { + return (NULL); + } + + for (i = 0; i < len; i++) { + if (strncmp(comp, dirent[i].d_name, MAXPATHLEN) == 0) { + lxsys_nodetype_t node_type = ldp->lxsys_type; + unsigned int node_instance = 0; + lxsys_node_t *lnp; + + switch (dirent[i].d_idnum) { + case LXSYS_INST_BLOCKDIR: + node_type = LXSYS_BLOCK; + break; + case LXSYS_INST_CLASS_NETDIR: + node_type = LXSYS_CLASS_NET; + break; + case LXSYS_INST_DEVICES_VIRTUAL_NETDIR: + node_type = LXSYS_DEV_NET; + break; + case LXSYS_INST_DEVICES_ZFSDIR: + node_type = LXSYS_DEV_ZFS; + break; + case LXSYS_INST_DEVICES_SYSCPU: + node_type = LXSYS_DEV_SYS_CPU; + break; + case LXSYS_INST_DEVICES_SYSNODE: + node_type = LXSYS_DEV_SYS_NODE; + break; + default: + /* Another static node */ + node_instance = dirent[i].d_idnum; + } + if (node_type == LXSYS_STATIC) { + lnp = lxsys_getnode_static(ldp->lxsys_vnode, + node_instance); + } else { + lnp = lxsys_getnode(ldp->lxsys_vnode, + node_type, node_instance, 0); + } + return (lnp->lxsys_vnode); + } + } + return (NULL); +} + +static vnode_t * +lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp) +{ + vnode_t *result = NULL; + lxsys_node_t *lnp; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t 
*phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + if (ldp->lxsys_type != LXSYS_CLASS_NET || + ldp->lxsys_instance != 0) { + /* Lookups only allowed at directory level */ + return (NULL); + } + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + result->v_type = VLNK; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); +} + +static vnode_t * +lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level interface listing */ + vnode_t *result = NULL; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); + } else if (ldp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = SYSDIRLISTSZ(dirlist_devices_virtual_net); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_virtual_net[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_blockdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_BLOCK); + + if (lnp != NULL) { + lnp->lxsys_vnode->v_type = VLNK; + return (lnp->lxsys_vnode); + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_zfsdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_DEV_ZFS); + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + /* + * All of these entries currently look like regular files + * but on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. 
+ */ + size = SYSDIRLISTSZ(dirlist_devices_zfs_block); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_zfs_block[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_syscpu(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* top-level cpu listing */ + + /* If fixed entry */ + if (strcmp(comp, "kernel_max") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_KMAX, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "offline") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_OFFLINE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "online") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_ONLINE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "possible") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_POSSIBLE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "present") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_PRESENT, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else { + /* Else dynamic cpuN entry */ + cpuset_t *avail; /* all installed CPUs */ + uint_t i, avlo, avhi; + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + for (i = avlo; i <= avhi; i++) { + char cpunm[16]; + + if (!cpu_in_set(avail, i)) + continue; + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%u", + i); + + if (strcmp(comp, cpunm) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + LXSYS_DEV_SYS_CPUINFO, i + 1, 0); + break; + } + } + cpuset_free(avail); + } + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing, currently empty */ + /* EMPTY */ + } + + return (NULL); +} + +/* ARGSUSED */ +static vnode_t * +lxsys_lookup_devices_syscpuinfo(lxsys_node_t *ldp, char *comp) +{ + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_sysnode(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* + * The system is presently represented as a single node, + * regardless of any NUMA topology which exists. + * The instances are offset by 1 to account for the top level + * directory occupying instance 0. 
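+ * The single "node0" entry is therefore instance 1, i.e. inode
+ * lxsys_inode(LXSYS_DEV_SYS_NODE, 1, 0).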
+ */ + if (strcmp(comp, "node0") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + 1, 0); + return (lnp->lxsys_vnode); + } + } else { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = SYSDIRLISTSZ(dirlist_devices_sysnode); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_sysnode[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static int +lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + netstack_t *ns; + ill_t *ill; + uint_t ifindex = lnp->lxsys_instance; + uint8_t *addr; + uint64_t flags; + int error = 0; + + if (ifindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (EIO); + } + + ill = lxsys_find_ill(ns->netstack_ip, ifindex); + if (ill == NULL) { + netstack_rele(ns); + return (EIO); + } + + switch (lnp->lxsys_endpoint) { + case LXSYS_ENDP_NET_ADDRESS: + if (ill->ill_phys_addr_length != ETHERADDRL) { + lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n"); + break; + } + addr = ill->ill_phys_addr; + lxsys_uiobuf_printf(luio, + "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + break; + case LXSYS_ENDP_NET_ADDRLEN: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? ETHERADDRL : ill->ill_phys_addr_length); + break; + case LXSYS_ENDP_NET_FLAGS: + flags = (ill->ill_flags | ill->ill_ipif->ipif_flags | + ill->ill_phyint->phyint_flags) & 0xffff; + lx_ifflags_convert(&flags, LX_IF_FROMNATIVE); + lxsys_uiobuf_printf(luio, "0x%x\n", flags); + break; + case LXSYS_ENDP_NET_IFINDEX: + lxsys_uiobuf_printf(luio, "%u\n", ifindex); + break; + case LXSYS_ENDP_NET_MTU: + lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu); + break; + case LXSYS_ENDP_NET_TXQLEN: + /* perpetuate the txqlen lie */ + if (IS_LOOPBACK(ill)) { + lxsys_uiobuf_printf(luio, "0\n"); + } else { + lxsys_uiobuf_printf(luio, "1\n"); + } + break; + case LXSYS_ENDP_NET_TYPE: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? LX_ARPHRD_LOOPBACK : + arp_hw_type(ill->ill_mactype)); + break; + default: + error = EIO; + } + + ill_refrele(ill); + netstack_rele(ns); + return (error); +} + +/* ARGSUSED1 */ +static int +lxsys_read_devices_zfs_block(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t dskindex = lnp->lxsys_instance; + + if (dskindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + return (EIO); +} + +/* + * In the Linux src tree, see ABI/stable/sysfs-devices-node. + * + * For the 'cpumap' file, each CPU is treated as a bit, then those are + * accumulated and printed as a hex digit, with CPU0 as the rightmost bit. + * Each set of 8 digits (i.e. 32 CPUs) is then delimited with a comma. + * Since we are emulating a single NUMA group, all of our CPUs will be listed + * in this file. For example, a 48 CPU system would look like: + * 00000000,00000000,00000000,00000000,00000000,00000000,0000ffff,ffffffff + * It comes out this way because 'kernel_max' is NCPU, which is currently + * defined to be 256. 
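+ * (256 CPU bits at 4 bits per hex digit gives 64 digits, emitted as eight
+ * comma-separated groups of 8, as in the example above.)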
+ */ +static int +lxsys_read_devices_sysnode(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + if (lnp->lxsys_instance == 1) { + char outbuf[256]; + + if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPULIST) { + /* Show the range of CPUs */ + lxsys_format_cpu(outbuf, sizeof (outbuf), + LXSYS_CPU_ANY); + } else if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPUMAP) { + int i; + uint_t j, ndigits; + cpuset_t *avail; /* all installed CPUs */ + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + outbuf[0] = '\0'; + ndigits = 0; + for (i = NCPU - 1; i >= 0; i -= 4) { + char buf[8]; + int cnt = 3; + uint_t digit = 0; + + for (j = i; cnt >= 0; j--, cnt--) { + if (cpu_in_set(avail, j)) + digit |= 1 << cnt; + } + (void) snprintf(buf, sizeof (buf), "%x", digit); + if (ndigits == 8) { + (void) strlcat(outbuf, ",", + sizeof (outbuf)); + ndigits = 0; + } + (void) strlcat(outbuf, buf, sizeof (outbuf)); + ndigits++; + } + + cpuset_free(avail); + } else { + return (EISDIR); + } + + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + return (EISDIR); +} + +static void +lxsys_format_range(char *buf, int blen, boolean_t *first, uint_t start, + uint_t cnt) +{ + char tmp[256]; + char *delim; + + if (cnt == 0) + return; + + if (*first) { + *first = B_FALSE; + delim = ""; + } else { + delim = ","; + } + if (cnt > 1) { + (void) snprintf(tmp, sizeof (tmp), "%s%u-%u", delim, start, + start + cnt - 1); + } else { + (void) snprintf(tmp, sizeof (tmp), "%s%u", delim, start); + } + (void) strlcat(buf, tmp, blen); +} + +/* + * Format a string of which CPUs are online, offline, or don't care (depending + * on chk_state), and which would be formatted like this: + * 0-31 + * or + * 0-12,14,20-31 + */ +static void +lxsys_format_cpu(char *buf, int blen, lxsys_cpu_state_t chk_state) +{ + uint_t start, cnt, avlo, avhi; + boolean_t first = B_TRUE; + cpuset_t *active; /* CPUs online */ + cpuset_t *avail; /* all installed CPUs */ + + active = cpuset_alloc(KM_SLEEP); + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(active); + cpuset_all(avail); + + /* Take a snapshot of the available and active sets */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + cpuset_and(active, &cpu_active_set); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + buf[0] = '\0'; + if (chk_state == LXSYS_CPU_ANY) { + start = avlo; + cnt = avhi + 1; + } else { + uint_t i; + boolean_t incl_cpu = B_TRUE; + + start = 0; + cnt = 0; + for (i = avlo; i <= avhi; i++) { + if (chk_state == LXSYS_CPU_ON) { + if (!cpu_in_set(active, i)) + incl_cpu = B_FALSE; + } else { + if (cpu_in_set(active, i)) + incl_cpu = B_FALSE; + } + + if (incl_cpu && cpu_in_set(avail, i)) { + cnt++; + } else { + /* + * Note: this may print nothing if our 'cnt' + * is 0, but we advance 'start' properly so we + * handle the next range of elements we're + * looking for. + */ + lxsys_format_range(buf, blen, &first, start, + cnt); + start += cnt + 1; + cnt = 0; + incl_cpu = B_TRUE; + } + } + } + + cpuset_free(avail); + cpuset_free(active); + + lxsys_format_range(buf, blen, &first, start, cnt); +} + +static int +lxsys_read_devices_syscpu(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t inst = lnp->lxsys_instance; + char outbuf[256]; + + /* + * For 'kernel_max', 'offline', 'online', 'possible', and 'present', + * see the Documentaion/cputopology.txt file in the Linux src tree. 
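+ *
+ * As a rough sketch for a hypothetical zone with CPUs 0-3 installed and
+ * online: kernel_max reads back as "255" (NCPU - 1), online/possible/present
+ * as "0-3", and offline as an empty line.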
+ */ + if (inst == LXSYS_INST_DEV_SYSCPU_KMAX) { + lxsys_uiobuf_printf(luio, "%d\n", NCPU - 1); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_OFFLINE) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_OFF); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_ONLINE) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ON); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_POSSIBLE || + inst == LXSYS_INST_DEV_SYSCPU_PRESENT) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ANY); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + /* All other nodes are directories */ + return (EISDIR); +} + +/* ARGSUSED */ +static int +lxsys_read_static(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + /* All static nodes are directories */ + return (EISDIR); +} + +/* + * lxsys_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + ssize_t uresid; + off_t uoffset; + int error, leof; + + ASSERT(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxsys_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXSYS_SDSIZE) + return (ENOENT); + + /* Free lower functions from having to check eofp == NULL */ + if (eofp == NULL) { + eofp = &leof; + } + + return (lxsys_readdir_function[lxsnp->lxsys_type](lxsnp, uiop, eofp)); +} + +static int +lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio) +{ + int error; + off_t offset = uio->uio_offset; + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset by the + * same amount. But we want uiop->uio_offset to change in increments + * of LXSYS_SDSIZE, which is different from the number of bytes being + * returned to the user. To accomplish this, we set uiop->uio_offset + * separately on success, overriding what uiomove() does. 
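+ * For instance, a dirent whose d_reclen exceeds LXSYS_SDSIZE still only
+ * advances uio_offset by 16 bytes.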
+ */ + d->d_off = (off64_t)(offset + LXSYS_SDSIZE); + d->d_reclen = n; + if ((error = uiomove(d, n, UIO_READ, uio)) != 0) { + return (error); + } + uio->uio_offset = offset + LXSYS_SDSIZE; + return (0); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + int idnum, ino_type = 0; + + idnum = dirtab[dirindex].d_idnum; + if (idnum > 0 && idnum < DYN_INO_LEN) + ino_type = dyn_ino_type[idnum]; + + if (ino_type != 0) { + /* + * Correct the inode for static directories + * which contain non-static lxsys_nodetype_t's. + */ + dirent->d_ino = lxsys_inode(ino_type, 0, 0); + DTRACE_PROBE3(lxsys__fix__inode, + char *, dirtab[dirindex].d_name, + int, ino_type, int, dirent->d_ino); + } else { + dirent->d_ino = lxsys_inode(LXSYS_STATIC, + idnum, 0); + } + + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 
1 : 0; + return (0); +} + +static int +lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + VERIFY(dirtab != NULL || dirtablen == 0); + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(lxsnp->lxsys_type, + lxsnp->lxsys_instance, dirtab[dirindex].d_idnum); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0; + return (0); +} + +static int +lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + int error, i; + + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + if ((ns = lxsys_netstack(ldp)) == NULL) { + *eofp = 1; + return (0); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_first(phytree); + if (phyi == NULL) { + *eofp = 1; + } + bzero(bp, sizeof (bp)); + + /* + * Skip records we have already passed with the offset. + * This accounts for the two "." and ".." records already seen. 
+ */ + for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2; i > 0; i--) { + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + while ((uresid = uiop->uio_resid) > 0 && phyi != NULL) { + uint_t ifindex; + int reclen; + + ifindex = phyi->phyint_ifindex; + (void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE); + dirent->d_ino = lxsys_inode(type, ifindex, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readdir_disks(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + uint_t instance; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (EINVAL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (skip > 0) { + skip--; + goto next; + } + + if (strnlen(vd->lxvd_name, sizeof (vd->lxvd_name)) > LXSNSIZ) + goto next; + + (void) strncpy(dirent->d_name, vd->lxvd_name, LXSNSIZ); + + instance = getminor(vd->lxvd_emul_dev) & 0xffff; + if (instance == 0) + goto next; + + dirent->d_ino = lxsys_inode(type, instance, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + +next: + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Indicate EOF if we reached the end of the virtual disks. */ + if (vd == NULL) { + *eofp = 1; + } + + return (error); +} + + +static int +lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + boolean_t found = B_FALSE; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (lnp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + found = B_TRUE; + break; + } + } + + if (!found) { + return (ENOTDIR); + } + + return (lxsys_readdir_common(lnp, uiop, eofp, dirent, len)); +} + +static int +lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_CLASS_NET || + lnp->lxsys_instance != 0) { + /* + * Since /sys/class/net contains only symlinks, readdir + * operations should not be performed anywhere except the top + * level (instance == 0). 
+ */ + return (ENOTDIR); + } + + return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_CLASS_NET)); +} + +static int +lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level interface listing */ + error = lxsys_readdir_ifaces(lnp, uiop, eofp, + LXSYS_DEV_NET); + } else if (lnp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_virtual_net, + SYSDIRLISTSZ(dirlist_devices_virtual_net)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_blockdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_BLOCK || + lnp->lxsys_instance != 0) { + /* + * Since /sys/block contains only symlinks, readdir operations + * should not be performed anywhere except the top level + * (instance == 0). + */ + return (ENOTDIR); + } + + return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_BLOCK)); +} + +static int +lxsys_readdir_devices_zfsdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level dev listing */ + error = lxsys_readdir_disks(lnp, uiop, eofp, + LXSYS_DEV_ZFS); + } else if (lnp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_zfs_block, + SYSDIRLISTSZ(dirlist_devices_zfs_block)); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. + */ + error = ENOTDIR; + } + + return (error); +} + +/* Handle fixed entries within the cpu directory. */ +static int +lxsys_do_sub_cpu(struct uio *uiop, ssize_t oresid, dirent64_t *dirent, + char *nm, int inst, int *errp) +{ + int reclen; + ssize_t uresid; + + (void) strncpy(dirent->d_name, nm, LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPU, inst, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + *errp = EINVAL; + } + return (-1); + } + if ((*errp = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (-1); + } + + return (0); +} + +static int +lxsys_readdir_cpu(lxsys_node_t *ldp, struct uio *uiop, int *eofp) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + cpuset_t *avail; + uint_t i, avlo, avhi; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "kernel_max", + LXSYS_INST_DEV_SYSCPU_KMAX, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "offline", + LXSYS_INST_DEV_SYSCPU_OFFLINE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "online", + LXSYS_INST_DEV_SYSCPU_ONLINE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "possible", + LXSYS_INST_DEV_SYSCPU_POSSIBLE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "present", + LXSYS_INST_DEV_SYSCPU_PRESENT, &error) != 0) + goto done; + } + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + /* Output dynamic CPU info */ + for (i = avlo; i <= avhi; i++) { + char cpunm[16]; + + if (skip > 0) { + skip--; + continue; + } + + if (!cpu_in_set(avail, i)) + continue; + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%u", i); + (void) strncpy(dirent->d_name, cpunm, LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPUINFO, i + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + } + cpuset_free(avail); + + /* Indicate EOF if we reached the end of the CPU list. */ + if (i == avhi) { + *eofp = 1; + } + +done: + return (error); +} + +static int +lxsys_readdir_devices_syscpu(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level cpu listing */ + error = lxsys_readdir_cpu(lnp, uiop, eofp); + } else if (lnp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, NULL, 0); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. + */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_devices_syscpuinfo(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_type != LXSYS_DEV_SYS_CPUINFO) { + /* + * Since /sys/devices/system/cpu/cpuN is empty, readdir + * operations should not be performed anywhere except the top + * level. + */ + return (ENOTDIR); + } + + /* + * Emit "." and ".." entries + * All cpuN directories are currently empty. + */ + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + /* Indicate EOF */ + *eofp = 1; + + return (error); +} + +static int +lxsys_readdir_devices_sysnode(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level node listing */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int reclen, skip; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + (void) strncpy(dirent->d_name, "node0", LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_NODE, + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + return (EINVAL); + } + return (0); + } + error = lxsys_dirent_out(dirent, reclen, uiop); + } + /* Indicate EOF */ + if (error == 0) { + *eofp = 1; + } + } else if (lnp->lxsys_endpoint == 0) { + /* node-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_sysnode, + SYSDIRLISTSZ(dirlist_devices_sysnode)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +/* + * lxsys_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char buf[MAXPATHLEN + 1]; + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type != VLNK) { + return (EINVAL); + } + + rlfunc = lxsys_readlink_function[lnp->lxsys_type]; + if (rlfunc != NULL) { + if ((error = rlfunc(lnp, buf, sizeof (buf))) == 0) { + error = uiomove(buf, strlen(buf), UIO_READ, uiop); + } + } else { + error = EINVAL; + } + + return (error); +} + + +static int +lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len) +{ + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + uint_t ifindex; + char ifname[LIFNAMSIZ]; + int error = EINVAL; + + if ((ifindex = lnp->lxsys_instance) == 0) { + return (error); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (error); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_find(phytree, &ifindex, NULL); + if (phyi != NULL) { + (void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + (void) snprintf(buf, len, "/sys/devices/virtual/net/%s", + ifname); + error = 0; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readlink_block(lxsys_node_t *lnp, char *buf, size_t len) +{ + int inst, error = EINVAL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + if ((inst = lnp->lxsys_instance) == 0) { + return (error); + } + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (error); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int vinst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (vinst == inst) { + (void) snprintf(buf, len, + "../devices/zfs/%s", vd->lxvd_name); + error = 0; + break; + } + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (error); +} + +/* + * lxsys_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
+ */ +/* ARGSUSED */ +static void +lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxsys_freenode(VTOLXS(vp)); +} + +/* + * lxsys_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxsys_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxsys_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + if (vn_matchops(vp1, lxsys_vnodeops) || + vn_matchops(vp2, lxsys_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index d61928d578..ebdabce2b5 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/errno.h> @@ -42,43 +43,69 @@ char *sn1_emulation_table = NULL; -void sn1_init_brand_data(zone_t *); +void sn1_init_brand_data(zone_t *, kmutex_t *); void sn1_free_brand_data(zone_t *); void sn1_setbrand(proc_t *); int sn1_getattr(zone_t *, int, void *, size_t *); int sn1_setattr(zone_t *, int, void *, size_t); int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void sn1_copy_procdata(proc_t *, proc_t *); -void sn1_proc_exit(struct proc *, klwp_t *); +void sn1_proc_exit(struct proc *); void sn1_exec(); -int sn1_initlwp(klwp_t *); +void sn1_initlwp(klwp_t *, void *); void sn1_forklwp(klwp_t *, klwp_t *); void sn1_freelwp(klwp_t *); void sn1_lwpexit(klwp_t *); int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); /* sn1 brand */ struct brand_ops sn1_brops = { - sn1_init_brand_data, - sn1_free_brand_data, - sn1_brandsys, - sn1_setbrand, - sn1_getattr, - sn1_setattr, - sn1_copy_procdata, - sn1_proc_exit, - sn1_exec, - lwp_setrval, - sn1_initlwp, - sn1_forklwp, - sn1_freelwp, - sn1_lwpexit, - sn1_elfexec, - NULL, - NULL, - NSIG, + sn1_init_brand_data, /* b_init_brand_data */ + sn1_free_brand_data, /* b_free_brand_data */ + sn1_brandsys, /* b_brandsys */ + sn1_setbrand, /* b_setbrand */ + sn1_getattr, /* b_getattr */ + sn1_setattr, /* b_setattr */ + sn1_copy_procdata, /* b_copy_procdata */ + sn1_proc_exit, /* b_proc_exit */ + sn1_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + sn1_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + sn1_forklwp, /* b_forklwp */ + sn1_freelwp, /* b_freelwp */ + sn1_lwpexit, /* b_lwpexit */ + sn1_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE, /* b_intp_parse_arg */ + NULL, /* b_clearbrand */ + NULL, /* b_rpc_statd */ + NULL /* b_acct_out */ }; #ifdef sparc @@ -94,9 +121,12 @@ struct brand_mach_ops sn1_mops 
= { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, + NULL, sn1_brand_int91_callback, sn1_brand_syscall_callback, - sn1_brand_syscall32_callback + sn1_brand_syscall32_callback, + NULL, + NULL }; #else /* ! __amd64 */ @@ -104,7 +134,10 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, NULL, + NULL, sn1_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -115,7 +148,8 @@ struct brand sn1_brand = { BRAND_VER_1, "sn1", &sn1_brops, - &sn1_mops + &sn1_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -148,10 +182,10 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) return (EINVAL); } -/*ARGSUSED*/ +/* ARGSUSED5 */ int sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4) { int res; @@ -171,9 +205,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent) } void -sn1_proc_exit(struct proc *p, klwp_t *l) +sn1_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &sn1_brand); + brand_solaris_proc_exit(p, &sn1_brand); } void @@ -182,10 +216,10 @@ sn1_exec() brand_solaris_exec(&sn1_brand); } -int -sn1_initlwp(klwp_t *l) +void +sn1_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &sn1_brand)); + brand_solaris_initlwp(l, &sn1_brand); } void @@ -214,18 +248,18 @@ sn1_free_brand_data(zone_t *zone) /*ARGSUSED*/ void -sn1_init_brand_data(zone_t *zone) +sn1_init_brand_data(zone_t *zone, kmutex_t *zsl) { } int sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, - int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME, - SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32)); + SN1_LIB, SN1_LIB32)); } int diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h index b487745e21..fef9dc128b 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.h +++ b/usr/src/uts/common/brand/sn1/sn1_brand.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SN1_BRAND_H @@ -37,20 +38,14 @@ extern "C" { #define SN1_VERSION SN1_VERSION_1 #define SN1_LIB_NAME "sn1_brand.so.1" -#define SN1_LINKER_NAME "ld.so.1" #define SN1_LIB32 BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME -#define SN1_LINKER32 "/lib/" SN1_LINKER_NAME - #define SN1_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME -#define SN1_LINKER64 "/lib/64/" SN1_LINKER_NAME #if defined(_LP64) #define SN1_LIB SN1_LIB64 -#define SN1_LINKER SN1_LINKER64 #else /* !_LP64 */ #define SN1_LIB SN1_LIB32 -#define SN1_LINKER SN1_LINKER32 #endif /* !_LP64 */ #if defined(_KERNEL) diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index 0841f02e51..4de7cbcc05 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #include <sys/errno.h> @@ -46,45 +46,71 @@ char *s10_emulation_table = NULL; -void s10_init_brand_data(zone_t *); +void s10_init_brand_data(zone_t *, kmutex_t *); void s10_free_brand_data(zone_t *); void s10_setbrand(proc_t *); int s10_getattr(zone_t *, int, void *, size_t *); int s10_setattr(zone_t *, int, void *, size_t); int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void s10_copy_procdata(proc_t *, proc_t *); -void s10_proc_exit(struct proc *, klwp_t *); +void s10_proc_exit(struct proc *); void s10_exec(); -int s10_initlwp(klwp_t *); +void s10_initlwp(klwp_t *, void *); void s10_forklwp(klwp_t *, klwp_t *); void s10_freelwp(klwp_t *); void s10_lwpexit(klwp_t *); int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); void s10_sigset_native_to_s10(sigset_t *); void s10_sigset_s10_to_native(sigset_t *); /* s10 brand */ struct brand_ops s10_brops = { - s10_init_brand_data, - s10_free_brand_data, - s10_brandsys, - s10_setbrand, - s10_getattr, - s10_setattr, - s10_copy_procdata, - s10_proc_exit, - s10_exec, - lwp_setrval, - s10_initlwp, - s10_forklwp, - s10_freelwp, - s10_lwpexit, - s10_elfexec, - s10_sigset_native_to_s10, - s10_sigset_s10_to_native, - S10_NSIG, + s10_init_brand_data, /* b_init_brand_data */ + s10_free_brand_data, /* b_free_brand_data */ + s10_brandsys, /* b_brandsys */ + s10_setbrand, /* b_setbrand */ + s10_getattr, /* b_getattr */ + s10_setattr, /* b_setattr */ + s10_copy_procdata, /* b_copy_procdata */ + s10_proc_exit, /* b_proc_exit */ + s10_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + s10_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + s10_forklwp, /* b_forklwp */ + s10_freelwp, /* b_freelwp */ + s10_lwpexit, /* b_lwpexit */ + s10_elfexec, /* b_elfexec */ + s10_sigset_native_to_s10, /* b_sigset_native_to_brand */ + s10_sigset_s10_to_native, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + S10_NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE, /* b_intp_parse_arg */ + NULL, /* b_clearbrand */ + NULL, /* b_rpc_statd */ + NULL /* b_acct_out */ }; #ifdef sparc @@ -100,9 +126,12 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, + NULL, s10_brand_int91_callback, s10_brand_syscall_callback, - s10_brand_syscall32_callback + s10_brand_syscall32_callback, + NULL, + NULL }; #else /* ! 
__amd64 */ @@ -110,7 +139,10 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, NULL, + NULL, s10_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -121,7 +153,8 @@ struct brand s10_brand = { BRAND_VER_1, "solaris10", &s10_brops, - &s10_mops + &s10_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -250,10 +283,10 @@ s10_native(void *cmd, void *args) return (0); } -/*ARGSUSED*/ +/* ARGSUSED5 */ int s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4) { proc_t *p = curproc; int res; @@ -327,9 +360,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent) } void -s10_proc_exit(struct proc *p, klwp_t *l) +s10_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &s10_brand); + brand_solaris_proc_exit(p, &s10_brand); } void @@ -338,10 +371,10 @@ s10_exec() brand_solaris_exec(&s10_brand); } -int -s10_initlwp(klwp_t *l) +void +s10_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &s10_brand)); + brand_solaris_initlwp(l, &s10_brand); } void @@ -381,7 +414,7 @@ s10_free_brand_data(zone_t *zone) } void -s10_init_brand_data(zone_t *zone) +s10_init_brand_data(zone_t *zone, kmutex_t *zsl) { ASSERT(zone->zone_brand == &s10_brand); ASSERT(zone->zone_brand_data == NULL); @@ -390,12 +423,12 @@ s10_init_brand_data(zone_t *zone) int s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, - int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, - S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); + S10_LIB, S10_LIB32)); } void diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h index 11f9853f48..ffef485e12 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.h +++ b/usr/src/uts/common/brand/solaris10/s10_brand.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _S10_BRAND_H @@ -42,17 +43,12 @@ extern "C" { #define S10_LINKER_NAME "ld.so.1" #define S10_LIB32 BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME -#define S10_LINKER32 "/lib/" S10_LINKER_NAME - #define S10_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME -#define S10_LINKER64 "/lib/64/" S10_LINKER_NAME #if defined(_LP64) #define S10_LIB S10_LIB64 -#define S10_LINKER S10_LINKER64 #else /* !_LP64 */ #define S10_LIB S10_LIB32 -#define S10_LINKER S10_LINKER32 #endif /* !_LP64 */ /* diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index 64227a3998..1120748b98 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. * Copyright 2012 Milan Jurik. All rights reserved. 
*/ @@ -559,8 +560,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -572,7 +573,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. @@ -586,8 +587,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; @@ -604,15 +606,26 @@ param_calc(int platform_max_nprocs) /* * We need to dynamically change any variables now so that - * the setting of maxusers and pidmax propagate to the other + * the setting of maxusers and maxpid propagate to the other * variables that are dependent on them. */ if (reserved_procs == 0) reserved_procs = 5; - if (pidmax < reserved_procs || pidmax > MAX_MAXPID) + if (pidmax < reserved_procs || pidmax > MAX_MAXPID) { maxpid = MAX_MAXPID; - else + } else { + /* + * If pidmax has not been explicitly set in /etc/system, then + * increase it to the maximum on larger machines. We choose a + * 128GB memory size as the threshold to increase pidmax. + */ + if (pidmax == DEFAULT_MAXPID) { + if (physmem > (btop(128ULL * 0x40000000ULL))) { + pidmax = MAX_MAXPID; + } + } maxpid = pidmax; + } /* * This allows platform-dependent code to constrain the maximum diff --git a/usr/src/uts/common/contract/process.c b/usr/src/uts/common/contract/process.c index 9fd23fdb61..e46cbd3abf 100644 --- a/usr/src/uts/common/contract/process.c +++ b/usr/src/uts/common/contract/process.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/mutex.h> @@ -955,6 +956,18 @@ contract_process_exit(cont_process_t *ctp, proc_t *p, int exitstatus) (void) cte_publish_all(ct, event, nvl, NULL); mutex_enter(&ct->ct_lock); } + + /* + * CT_PR_EV_EXIT is not part of the CT_PR_ALLFATAL definition since + * we never allow including this in the fatal set via a user-land + * application, but we do allow CT_PR_EV_EXIT in the contract's fatal + * set for a process setup for zone init. See zone_start_init(). + */ + if (EVFATALP(ctp, CT_PR_EV_EXIT)) { + ASSERT(MUTEX_HELD(&ct->ct_lock)); + contract_process_kill(ct, p, B_TRUE); + } + if (empty) { /* * Send EMPTY message. @@ -1057,6 +1070,17 @@ contract_process_fork(ctmpl_process_t *rtmpl, proc_t *cp, proc_t *pp, event->cte_type = CT_PR_EV_FORK; (void) cte_publish_all(ct, event, nvl, NULL); } + + /* + * Because the CT_PR_KEEP_EXEC flag is meant to be used by applications + * which are not contract aware, we can assume that these applications + * will never explicitly abandon the child's new contract. Thus, we + * abandon it now.
+ */ + if (ctp->conp_params & CT_PR_KEEP_EXEC) { + (void) contract_abandon(ct, pp, 1); + } + return (ctp); } diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index 64f9e4e68d..2a51830e6e 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -70,6 +70,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 @@ -933,7 +934,8 @@ rnd_handler(void *arg) int len = 0; if (!rng_prov_found && rng_ok_to_log) { - cmn_err(CE_WARN, "No randomness provider enabled for " + (void) strlog(0, 0, 0, SL_NOTE, + "No randomness provider enabled for " "/dev/random. Use cryptoadm(1M) to enable a provider."); rng_ok_to_log = B_FALSE; } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index 9e079a079e..ec9df915c5 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. */ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/crypto/io/dprov.c b/usr/src/uts/common/crypto/io/dprov.c index 5b4e23dca9..806bbef280 100644 --- a/usr/src/uts/common/crypto/io/dprov.c +++ b/usr/src/uts/common/crypto/io/dprov.c @@ -221,6 +221,8 @@ typedef enum dprov_mech_type { SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE, /* SUN_CKM_SHA512_256 */ DES_CBC_MECH_INFO_TYPE, /* SUN_CKM_DES_CBC */ DES3_CBC_MECH_INFO_TYPE, /* SUN_CKM_DES3_CBC */ @@ -430,6 +432,14 @@ static crypto_mech_info_t dprov_mech_info_tab[] = { CRYPTO_FG_ENCRYPT_MAC_ATOMIC | CRYPTO_FG_MAC_DECRYPT_ATOMIC, SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA512_224 */ + {SUN_CKM_SHA512_224, SHA512_224_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0, + CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA512_256 */ + {SUN_CKM_SHA512_256, SHA512_256_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0, + CRYPTO_KEYSIZE_UNIT_IN_BITS}, /* DES-CBC */ {SUN_CKM_DES_CBC, DES_CBC_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT | CRYPTO_FG_ENCRYPT_MAC | @@ -1948,7 +1958,9 @@ dprov_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, mechanism->cm_type != SHA1_MECH_INFO_TYPE && mechanism->cm_type != SHA256_MECH_INFO_TYPE && mechanism->cm_type != SHA384_MECH_INFO_TYPE && - mechanism->cm_type != SHA512_MECH_INFO_TYPE) { + mechanism->cm_type != SHA512_MECH_INFO_TYPE && + mechanism->cm_type != SHA512_224_MECH_INFO_TYPE && + mechanism->cm_type != SHA512_256_MECH_INFO_TYPE) { cmn_err(CE_WARN, "dprov_digest_init: unexpected mech type " "0x%llx\n", (unsigned long long)mechanism->cm_type); return (CRYPTO_MECHANISM_INVALID); diff --git a/usr/src/uts/common/crypto/io/sha2_mod.c b/usr/src/uts/common/crypto/io/sha2_mod.c index 23c73d1909..186c0c3240 100644 --- a/usr/src/uts/common/crypto/io/sha2_mod.c +++ b/usr/src/uts/common/crypto/io/sha2_mod.c @@ -128,7 +128,15 
@@ static crypto_mech_info_t sha2_mech_info_tab[] = { {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, - CRYPTO_KEYSIZE_UNIT_IN_BYTES} + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA512_224 */ + {SUN_CKM_SHA512_224, SHA512_224_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA512_256 */ + {SUN_CKM_SHA512_256, SHA512_256_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS} }; static void sha2_provider_status(crypto_provider_handle_t, uint_t *); @@ -593,6 +601,12 @@ sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, case SHA512_MECH_INFO_TYPE: sha_digest_len = SHA512_DIGEST_LENGTH; break; + case SHA512_224_MECH_INFO_TYPE: + sha_digest_len = SHA512_224_DIGEST_LENGTH; + break; + case SHA512_256_MECH_INFO_TYPE: + sha_digest_len = SHA512_256_DIGEST_LENGTH; + break; default: return (CRYPTO_MECHANISM_INVALID); } @@ -722,6 +736,12 @@ sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest, case SHA512_MECH_INFO_TYPE: sha_digest_len = SHA512_DIGEST_LENGTH; break; + case SHA512_224_MECH_INFO_TYPE: + sha_digest_len = SHA512_224_DIGEST_LENGTH; + break; + case SHA512_256_MECH_INFO_TYPE: + sha_digest_len = SHA512_256_DIGEST_LENGTH; + break; default: return (CRYPTO_MECHANISM_INVALID); } @@ -909,6 +929,19 @@ sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes) } +static boolean_t +sha2_is_general_hmech(const crypto_mechanism_t *mechanism) +{ + switch (mechanism->cm_type) { + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + return (B_TRUE); + default: + return (B_FALSE); + } +} + /* */ static int @@ -979,7 +1012,7 @@ sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, /* * Get the mechanism parameters, if applicable. */ - if (mechanism->cm_type % 3 == 2) { + if (sha2_is_general_hmech(mechanism)) { if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (ulong_t)) ret = CRYPTO_MECHANISM_PARAM_INVALID; @@ -1214,7 +1247,7 @@ sha2_mac_atomic(crypto_provider_handle_t provider, } /* get the mechanism parameters, if applicable */ - if ((mechanism->cm_type % 3) == 2) { + if (sha2_is_general_hmech(mechanism)) { if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (ulong_t)) { ret = CRYPTO_MECHANISM_PARAM_INVALID; @@ -1356,7 +1389,7 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider, } /* get the mechanism parameters, if applicable */ - if (mechanism->cm_type % 3 == 2) { + if (sha2_is_general_hmech(mechanism)) { if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (ulong_t)) { ret = CRYPTO_MECHANISM_PARAM_INVALID; @@ -1592,17 +1625,32 @@ sha2_free_context(crypto_ctx_t *ctx) if (ctx->cc_provider_private == NULL) return (CRYPTO_SUCCESS); - /* - * We have to free either SHA2 or SHA2-HMAC contexts, which - * have different lengths. - * - * Note: Below is dependent on the mechanism ordering. 
- */ - - if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0) + switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { + case SHA256_MECH_INFO_TYPE: + case SHA384_MECH_INFO_TYPE: + case SHA512_MECH_INFO_TYPE: + case SHA512_224_MECH_INFO_TYPE: + case SHA512_256_MECH_INFO_TYPE: ctx_len = sizeof (sha2_ctx_t); - else + break; + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: ctx_len = sizeof (sha2_hmac_ctx_t); + break; + default: + /* + * If we get here, someone forgot to update the above list + * when adding a new mechanism. Without the correct ctx_len + * we will corrupt the heap when calling kmem_free, so panic + * now and make it easier to identify the problem. + */ + panic("Unknown SHA2 mechanism %d", + PROV_SHA2_CTX(ctx)->sc_mech_type); + } bzero(ctx->cc_provider_private, ctx_len); kmem_free(ctx->cc_provider_private, ctx_len); diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index 0196b15dae..80b5340543 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp) /* * Return non-zero if thread can migrate between "from" and "to" - * without a performance penalty + * without a performance penalty. This is true only if we share a core on + * virtually any CPU; sharing the last-level cache is insufficient to make + * migration possible without penalty. */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { - if (from->cpu_physid->cpu_cacheid == - to->cpu_physid->cpu_cacheid) + if (from->cpu_physid->cpu_coreid == + to->cpu_physid->cpu_coreid) return (1); return (0); } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..2a4365ff73 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. 
When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. + * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap) ASSERT(CAP_ENABLED(cap)); waitq_block(&cap->cap_waitq); + + /* do this first to avoid race with cap_kstat_update */ + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + list_remove(l, cap); if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; - if (cap->cap_kstat != NULL) { - kstat_delete(cap->cap_kstat); - cap->cap_kstat = NULL; - } - } /* @@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, its + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. 
+ */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. + */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there is unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. 
A burst time of 0 means that + * the zone can run over its baseline indefinitely. + */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t) /* * Convert internal cap statistics into values exported by cap kstat. + * Note that the kstat is held throughout this function but caps_lock is not. 
*/ static int cap_kstat_update(kstat_t *ksp, int rw) @@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/cpupart.c b/usr/src/uts/common/disp/cpupart.c index 8de1f5cc37..123776a123 100644 --- a/usr/src/uts/common/disp/cpupart.c +++ b/usr/src/uts/common/disp/cpupart.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2018 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. */ @@ -325,7 +327,7 @@ cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced) kthread_t *t; int move_threads = 1; lgrp_id_t lgrpid; - proc_t *p; + proc_t *p; int lgrp_diff_lpl; lpl_t *cpu_lpl; int ret; @@ -570,8 +572,8 @@ again: /* Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_cpupart == oldpp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); } t = t->t_forw; } while (t != p->p_tlist); @@ -623,8 +625,8 @@ again: /* Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_cpupart == oldpp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl, - t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); } t = t->t_next; @@ -879,7 +881,7 @@ cpupart_create(psetid_t *psid) static int cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all) { - void *projbuf, *zonebuf; + void *projbuf, *zonebuf; kthread_t *t; proc_t *p; int err = 0; diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index 0c2c0b4993..4898a18bf2 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2018, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -56,6 +60,7 @@ #include <sys/dtrace.h> #include <sys/sdt.h> #include <sys/archsystm.h> +#include <sys/ht.h> #include <vm/as.h> @@ -105,7 +110,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. */ int only_intr_kpreempt; @@ -252,7 +257,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. 
However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitive). + */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; @@ -1115,15 +1136,13 @@ swtch_to(kthread_t *next) */ } -#define CPU_IDLING(pri) ((pri) == -1) - static void cpu_resched(cpu_t *cp, pri_t tpri) { int call_poke_cpu = 0; pri_t cpupri = cp->cpu_dispatch_pri; - if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { + if (cpupri != CPU_IDLE_PRI && cpupri < tpri) { TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); if (tpri >= upreemptpri && cp->cpu_runrun == 0) { @@ -1219,17 +1238,17 @@ setbackdq(kthread_t *tp) /* * We'll generally let this thread continue to run where * it last ran...but will consider migration if: - * - We thread probably doesn't have much cache warmth. + * - The thread probably doesn't have much cache warmth. + * - HT exclusion would prefer us to run elsewhere * - The CPU where it last ran is the target of an offline * request. - * - The thread last ran outside it's home lgroup. + * - The thread last ran outside its home lgroup. */ if ((!THREAD_HAS_CACHE_WARMTH(tp)) || - (tp->t_cpu == cpu_inmotion)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL); - } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - self ? tp->t_cpu : NULL); + !ht_should_run(tp, tp->t_cpu) || + (tp->t_cpu == cpu_inmotion) || + !LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) { + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); } else { cp = tp->t_cpu; } @@ -1258,7 +1277,8 @@ setbackdq(kthread_t *tp) newcp = cp->cpu_next_part; } - if (RUNQ_LEN(newcp, tpri) < qlen) { + if (ht_should_run(tp, newcp) && + RUNQ_LEN(newcp, tpri) < qlen) { DTRACE_PROBE3(runq__balance, kthread_t *, tp, cpu_t *, cp, cpu_t *, newcp); @@ -1269,8 +1289,8 @@ setbackdq(kthread_t *tp) /* * Migrate to a cpu in the new partition. */ - cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, - tp->t_lpl, tp->t_pri, NULL); + cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, tp, + tp->t_pri); } ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); } else { @@ -1407,7 +1427,7 @@ setfrontdq(kthread_t *tp) /* * We'll generally let this thread continue to run * where it last ran, but will consider migration if: - * - The thread last ran outside it's home lgroup. + * - The thread last ran outside its home lgroup. * - The CPU where it last ran is the target of an * offline request (a thread_nomigrate() on the in * motion CPU relies on this when forcing a preempt). @@ -1415,21 +1435,18 @@ setfrontdq(kthread_t *tp) * it last ran, and it is considered not likely to * have significant cache warmth. */ - if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) || - (cp == cpu_inmotion)) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - (tp == curthread) ?
cp : NULL); - } else if ((tpri < cp->cpu_disp->disp_maxrunpri) && - (!THREAD_HAS_CACHE_WARMTH(tp))) { - cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, - NULL); + if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp) || + cp == cpu_inmotion || + (tpri < cp->cpu_disp->disp_maxrunpri && + !THREAD_HAS_CACHE_WARMTH(tp))) { + cp = disp_lowpri_cpu(tp->t_cpu, tp, tpri); } } else { /* * Migrate to a cpu in the new partition. */ cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist, - tp->t_lpl, tp->t_pri, NULL); + tp, tp->t_pri); } ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); } else { @@ -1580,7 +1597,7 @@ setkpdq(kthread_t *tp, int borf) /* migrate to a cpu in the new partition */ cp = tp->t_cpupart->cp_cpulist; } - cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL); + cp = disp_lowpri_cpu(cp, tp, tp->t_pri); disp_lock_enter_high(&cp->cpu_disp->disp_lock); ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0); @@ -2258,7 +2275,7 @@ disp_getbest(disp_t *dp) * placed earlier. */ if (tcp == NULL || - pri >= minclsyspri || + (pri >= minclsyspri && tp->t_procp == &p0) || tp->t_cpu != tcp) break; @@ -2553,80 +2570,85 @@ disp_cpu_inactive(cpu_t *cp) } /* - * disp_lowpri_cpu - find CPU running the lowest priority thread. - * The hint passed in is used as a starting point so we don't favor - * CPU 0 or any other CPU. The caller should pass in the most recently - * used CPU for the thread. + * Return a score rating this CPU for running this thread: lower is better. + * + * If curthread is looking for a new CPU, then we ignore cpu_dispatch_pri for + * curcpu (as that's our own priority). + * + * If a cpu is the target of an offline request, then try to avoid it. * - * The lgroup and priority are used to determine the best CPU to run on - * in a NUMA machine. The lgroup specifies which CPUs are closest while - * the thread priority will indicate whether the thread will actually run - * there. To pick the best CPU, the CPUs inside and outside of the given - * lgroup which are running the lowest priority threads are found. The - * remote CPU is chosen only if the thread will not run locally on a CPU - * within the lgroup, but will run on the remote CPU. If the thread - * cannot immediately run on any CPU, the best local CPU will be chosen. + * Otherwise we'll use double the effective dispatcher priority for the CPU. + * + * We do this so ht_adjust_cpu_score() can increment the score if needed, + * without ending up over-riding a dispatcher priority. + */ +static pri_t +cpu_score(cpu_t *cp, kthread_t *tp) +{ + pri_t score; + + if (tp == curthread && cp == curthread->t_cpu) + score = 2 * CPU_IDLE_PRI; + else if (cp == cpu_inmotion) + score = SHRT_MAX; + else + score = 2 * cp->cpu_dispatch_pri; + + if (2 * cp->cpu_disp->disp_maxrunpri > score) + score = 2 * cp->cpu_disp->disp_maxrunpri; + if (2 * cp->cpu_chosen_level > score) + score = 2 * cp->cpu_chosen_level; + + return (ht_adjust_cpu_score(tp, cp, score)); +} + +/* + * disp_lowpri_cpu - find a suitable CPU to run the given thread. * - * The lpl specified also identifies the cpu partition from which - * disp_lowpri_cpu should select a CPU. + * We are looking for a CPU with an effective dispatch priority lower than the + * thread's, so that the thread will run immediately rather than be enqueued. + * For NUMA locality, we prefer "home" CPUs within the thread's ->t_lpl group. + * If we don't find an available CPU there, we will expand our search to include + * wider locality levels. (Note these groups are already divided by CPU + * partition.) 
* - * curcpu is used to indicate that disp_lowpri_cpu is being called on - * behalf of the current thread. (curthread is looking for a new cpu) - * In this case, cpu_dispatch_pri for this thread's cpu should be - * ignored. + * If the thread cannot immediately run on *any* CPU, we'll enqueue ourselves on + * the best home CPU we found. * - * If a cpu is the target of an offline request then try to avoid it. + * The hint passed in is used as a starting point so we don't favor CPU 0 or any + * other CPU. The caller should pass in the most recently used CPU for the + * thread; it's of course possible that this CPU isn't in the home lgroup. * - * This function must be called at either high SPL, or with preemption - * disabled, so that the "hint" CPU cannot be removed from the online - * CPU list while we are traversing it. + * This function must be called at either high SPL, or with preemption disabled, + * so that the "hint" CPU cannot be removed from the online CPU list while we + * are traversing it. */ cpu_t * -disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) +disp_lowpri_cpu(cpu_t *hint, kthread_t *tp, pri_t tpri) { cpu_t *bestcpu; cpu_t *besthomecpu; cpu_t *cp, *cpstart; - pri_t bestpri; - pri_t cpupri; - klgrpset_t done; - klgrpset_t cur_set; lpl_t *lpl_iter, *lpl_leaf; - int i; - /* - * Scan for a CPU currently running the lowest priority thread. - * Cannot get cpu_lock here because it is adaptive. - * We do not require lock on CPU list. - */ ASSERT(hint != NULL); - ASSERT(lpl != NULL); - ASSERT(lpl->lpl_ncpu > 0); + ASSERT(tp->t_lpl->lpl_ncpu > 0); - /* - * First examine local CPUs. Note that it's possible the hint CPU - * passed in in remote to the specified home lgroup. If our priority - * isn't sufficient enough such that we can run immediately at home, - * then examine CPUs remote to our home lgroup. - * We would like to give preference to CPUs closest to "home". - * If we can't find a CPU where we'll run at a given level - * of locality, we expand our search to include the next level. - */ bestcpu = besthomecpu = NULL; klgrpset_clear(done); - /* start with lpl we were passed */ - lpl_iter = lpl; + lpl_iter = tp->t_lpl; do { + pri_t best = SHRT_MAX; + klgrpset_t cur_set; - bestpri = SHRT_MAX; klgrpset_clear(cur_set); - for (i = 0; i < lpl_iter->lpl_nrset; i++) { + for (int i = 0; i < lpl_iter->lpl_nrset; i++) { lpl_leaf = lpl_iter->lpl_rset[i]; if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid)) continue; @@ -2639,34 +2661,25 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) cp = cpstart = lpl_leaf->lpl_cpus; do { - if (cp == curcpu) - cpupri = -1; - else if (cp == cpu_inmotion) - cpupri = SHRT_MAX; - else - cpupri = cp->cpu_dispatch_pri; - if (cp->cpu_disp->disp_maxrunpri > cpupri) - cpupri = cp->cpu_disp->disp_maxrunpri; - if (cp->cpu_chosen_level > cpupri) - cpupri = cp->cpu_chosen_level; - if (cpupri < bestpri) { - if (CPU_IDLING(cpupri)) { - ASSERT((cp->cpu_flags & - CPU_QUIESCED) == 0); - return (cp); - } + pri_t score = cpu_score(cp, tp); + + if (score < best) { + best = score; bestcpu = cp; - bestpri = cpupri; + + /* An idle CPU: we're done. 
*/ + if (score / 2 == CPU_IDLE_PRI) + goto out; } } while ((cp = cp->cpu_next_lpl) != cpstart); } - if (bestcpu && (tpri > bestpri)) { - ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0); - return (bestcpu); - } + if (bestcpu != NULL && tpri > (best / 2)) + goto out; + if (besthomecpu == NULL) besthomecpu = bestcpu; + /* * Add the lgrps we just considered to the "done" set */ @@ -2678,8 +2691,11 @@ disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu) * The specified priority isn't high enough to run immediately * anywhere, so just return the best CPU from the home lgroup. */ - ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0); - return (besthomecpu); + bestcpu = besthomecpu; + +out: + ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0); + return (bestcpu); } /* @@ -2699,3 +2715,19 @@ static void generic_enq_thread(cpu_t *cpu, int bound) { } + +cpu_t * +disp_choose_best_cpu(void) +{ + kthread_t *t = curthread; + cpu_t *curcpu = CPU; + + ASSERT(t->t_preempt > 0); + ASSERT(t->t_state == TS_ONPROC); + ASSERT(t->t_schedflag & TS_VCPU); + + if (ht_should_run(t, curcpu)) + return (curcpu); + + return (disp_lowpri_cpu(curcpu, t, t->t_pri)); +} diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c index adb70871e2..5b190242e6 100644 --- a/usr/src/uts/common/disp/fx.c +++ b/usr/src/uts/common/disp/fx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -71,16 +71,6 @@ static struct modlinkage modlinkage = { }; -/* - * control flags (kparms->fx_cflags). - */ -#define FX_DOUPRILIM 0x01 /* change user priority limit */ -#define FX_DOUPRI 0x02 /* change user priority */ -#define FX_DOTQ 0x04 /* change FX time quantum */ - - -#define FXMAXUPRI 60 /* maximum user priority setting */ - #define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */ /* diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 5412df83f5..60e870ba28 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg) #endif -static int donice(procset_t *, pcnice_t *); +int donice(procset_t *, pcnice_t *); static int doprio(procset_t *, pcprio_t *); static int proccmp(proc_t *, struct pcmpargs *); static int setparms(proc_t *, struct stprmargs *); @@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice) /* * Update the nice value of the specified LWP or set of processes. */ -static int +int donice(procset_t *procset, pcnice_t *pcnice) { int err_proc = 0; diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c index f87f8c56ce..115e42ccb8 100644 --- a/usr/src/uts/common/disp/rt.c +++ b/usr/src/uts/common/disp/rt.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -103,13 +103,6 @@ _info(struct modinfo *modinfop) pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */ rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */ -/* - * control flags (kparms->rt_cflags). 
- */ -#define RT_DOPRI 0x01 /* change priority */ -#define RT_DOTQ 0x02 /* change RT time quantum */ -#define RT_DOSIG 0x04 /* change RT time quantum signal */ - static int rt_admin(caddr_t, cred_t *); static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *); static int rt_fork(kthread_t *, kthread_t *, void *); diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c index 768b499ef2..cc88ed72fc 100644 --- a/usr/src/uts/common/disp/rt_dptbl.c +++ b/usr/src/uts/common/disp/rt_dptbl.c @@ -28,8 +28,6 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #include <sys/priocntl.h> #include <sys/class.h> @@ -70,8 +68,6 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ - rtdpent_t config_rt_dptbl[] = { /* prilevel Time quantum */ diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 854b33798d..d576738e75 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -75,6 +75,11 @@ #include <sys/cpucaps.h> #include <sys/kiconv.h> #include <sys/ctype.h> +#include <sys/ht.h> + +#ifndef STACK_GROWTH_DOWN +#error Stacks do not grow downward; 3b2 zombie attack detected! +#endif struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ @@ -373,7 +378,7 @@ thread_create( if (stksize <= sizeof (kthread_t) + PTR24_ALIGN) cmn_err(CE_PANIC, "thread_create: proposed stack size" " too small to hold thread."); -#ifdef STACK_GROWTH_DOWN + stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1); stksize &= -PTR24_ALIGN; /* make thread aligned */ t = (kthread_t *)(stk + stksize); @@ -382,13 +387,6 @@ thread_create( audit_thread_create(t); t->t_stk = stk + stksize; t->t_stkbase = stk; -#else /* stack grows to larger addresses */ - stksize -= SA(sizeof (kthread_t)); - t = (kthread_t *)(stk); - bzero(t, sizeof (kthread_t)); - t->t_stk = stk + sizeof (kthread_t); - t->t_stkbase = stk + stksize + sizeof (kthread_t); -#endif /* STACK_GROWTH_DOWN */ t->t_flag |= T_TALLOCSTK; t->t_swap = stk; } else { @@ -401,13 +399,8 @@ thread_create( * Initialize t_stk to the kernel stack pointer to use * upon entry to the kernel */ -#ifdef STACK_GROWTH_DOWN t->t_stk = stk + stksize; t->t_stkbase = stk; -#else - t->t_stk = stk; /* 3b2-like */ - t->t_stkbase = stk + stksize; -#endif /* STACK_GROWTH_DOWN */ } if (kmem_stackinfo != 0) { @@ -487,15 +480,9 @@ thread_create( curthread->t_prev = t; /* - * Threads should never have a NULL t_cpu pointer so assign it - * here. If the thread is being created with state TS_RUN a - * better CPU may be chosen when it is placed on the run queue. - * - * We need to keep kernel preemption disabled when setting all - * three fields to keep them in sync. Also, always create in - * the default partition since that's where kernel threads go - * (if this isn't a kernel thread, t_cpupart will be changed - * in lwp_create before setting the thread runnable). + * We'll always create in the default partition since that's where + * kernel threads go (we'll change this later if needed, in + * lwp_create()). */ t->t_cpupart = &cp_default; @@ -504,20 +491,23 @@ thread_create( * Since the kernel does not (presently) allocate its memory * in a locality aware fashion, the root is an appropriate home. 
* If this thread is later associated with an lwp, it will have - * it's lgroup re-assigned at that time. + * its lgroup re-assigned at that time. */ lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1); /* - * Inherit the current cpu. If this cpu isn't part of the chosen - * lgroup, a new cpu will be chosen by cpu_choose when the thread - * is ready to run. + * If the current CPU is in the default cpupart, use it. Otherwise, + * pick one that is; before entering the dispatcher code, we'll + * make sure to keep the invariant that ->t_cpu is set. (In fact, we + * rely on this, in ht_should_run(), in the call tree of + * disp_lowpri_cpu().) */ - if (CPU->cpu_part == &cp_default) + if (CPU->cpu_part == &cp_default) { t->t_cpu = CPU; - else - t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl, - t->t_pri, NULL); + } else { + t->t_cpu = cp_default.cp_cpulist; + t->t_cpu = disp_lowpri_cpu(t->t_cpu, t, t->t_pri); + } t->t_disp_queue = t->t_cpu->cpu_disp; kpreempt_enable(); @@ -590,6 +580,9 @@ thread_exit(void) if ((t->t_proc_flag & TP_ZTHREAD) != 0) cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called"); + if ((t->t_flag & T_SPLITSTK) != 0) + cmn_err(CE_PANIC, "thread_exit: called when stack is split"); + tsd_exit(); /* Clean up this thread's TSD */ kcpc_passivate(); /* clean up performance counter state */ @@ -870,12 +863,12 @@ thread_zone_destroy(zoneid_t zoneid, void *unused) /* * Guard against race condition in mutex_owner_running: - * thread=owner(mutex) - * <interrupt> - * thread exits mutex - * thread exits - * thread reaped - * thread struct freed + * thread=owner(mutex) + * <interrupt> + * thread exits mutex + * thread exits + * thread reaped + * thread struct freed * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE. * A cross call to all cpus will cause the interrupt handler * to reset the PC if it is in mutex_owner_running, refreshing @@ -932,12 +925,12 @@ thread_reaper() /* * Guard against race condition in mutex_owner_running: - * thread=owner(mutex) - * <interrupt> - * thread exits mutex - * thread exits - * thread reaped - * thread struct freed + * thread=owner(mutex) + * <interrupt> + * thread exits mutex + * thread exits + * thread reaped + * thread struct freed * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE. * A cross call to all cpus will cause the interrupt handler * to reset the PC if it is in mutex_owner_running, refreshing @@ -1055,8 +1048,44 @@ installctx( ctx->exit_op = exit; ctx->free_op = free; ctx->arg = arg; - ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; + + /* + * Keep ctxops in a doubly-linked list to allow traversal in both + * directions. Using only the newest-to-oldest ordering was adequate + * previously, but reversing the order for restore_op actions is + * necessary if later-added ctxops depends on earlier ones. + * + * One example of such a dependency: Hypervisor software handling the + * guest FPU expects that it save FPU state prior to host FPU handling + * and consequently handle the guest logic _after_ the host FPU has + * been restored. + * + * The t_ctx member points to the most recently added ctxop or is NULL + * if no ctxops are associated with the thread. The 'next' pointers + * form a loop of the ctxops in newest-to-oldest order. The 'prev' + * pointers form a loop in the reverse direction, where t_ctx->prev is + * the oldest entry associated with the thread. 
+ * + * The protection of kpreempt_disable is required to safely perform the + * list insertion, since there are inconsistent states between some of + * the pointer assignments. + */ + kpreempt_disable(); + if (t->t_ctx == NULL) { + ctx->next = ctx; + ctx->prev = ctx; + } else { + struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev; + + ctx->next = head; + ctx->prev = tail; + head->prev = ctx; + tail->next = ctx; + } t->t_ctx = ctx; + kpreempt_enable(); } /* @@ -1073,7 +1102,7 @@ removectx( void (*exit)(void *), void (*free)(void *, int)) { - struct ctxop *ctx, *prev_ctx; + struct ctxop *ctx, *head; /* * The incoming kthread_t (which is the thread for which the @@ -1098,17 +1127,31 @@ removectx( * and the target thread from racing with each other during lwp exit. */ mutex_enter(&t->t_ctx_lock); - prev_ctx = NULL; kpreempt_disable(); - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) { + + if (t->t_ctx == NULL) { + mutex_exit(&t->t_ctx_lock); + kpreempt_enable(); + return (0); + } + + ctx = head = t->t_ctx; + do { if (ctx->save_op == save && ctx->restore_op == restore && ctx->fork_op == fork && ctx->lwp_create_op == lwp_create && ctx->exit_op == exit && ctx->free_op == free && ctx->arg == arg) { - if (prev_ctx) - prev_ctx->next = ctx->next; - else + ctx->prev->next = ctx->next; + ctx->next->prev = ctx->prev; + if (ctx->next == ctx) { + /* last remaining item */ + t->t_ctx = NULL; + } else if (ctx == t->t_ctx) { + /* fix up head of list */ t->t_ctx = ctx->next; + } + ctx->next = ctx->prev = NULL; + mutex_exit(&t->t_ctx_lock); if (ctx->free_op != NULL) (ctx->free_op)(ctx->arg, 0); @@ -1116,44 +1159,70 @@ removectx( kpreempt_enable(); return (1); } - prev_ctx = ctx; - } + + ctx = ctx->next; + } while (ctx != head); + mutex_exit(&t->t_ctx_lock); kpreempt_enable(); - return (0); } void savectx(kthread_t *t) { - struct ctxop *ctx; - ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) - (ctx->save_op)(ctx->arg); + + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); + (ctx->save_op)(ctx->arg); + } + ctx = ctx->next; + } while (ctx != head); + } } void restorectx(kthread_t *t) { - struct ctxop *ctx; - ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) - (ctx->restore_op)(ctx->arg); + + if (t->t_ctx != NULL) { + struct ctxop *ctx, *tail; + + /* Backward traversal (starting at the tail) */ + ctx = tail = t->t_ctx->prev; + do { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); + (ctx->restore_op)(ctx->arg); + } + ctx = ctx->prev; + } while (ctx != tail); + } } void forkctx(kthread_t *t, kthread_t *ct) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->fork_op != NULL) - (ctx->fork_op)(t, ct); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->fork_op != NULL) { + (ctx->fork_op)(t, ct); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1164,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct) void lwp_createctx(kthread_t *t, kthread_t *ct) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->lwp_create_op != NULL) - (ctx->lwp_create_op)(t, ct); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->lwp_create_op != NULL) 
{ + (ctx->lwp_create_op)(t, ct); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1181,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct) void exitctx(kthread_t *t) { - struct ctxop *ctx; - - for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) - if (ctx->exit_op != NULL) - (ctx->exit_op)(t); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + /* Forward traversal */ + ctx = head = t->t_ctx; + do { + if (ctx->exit_op != NULL) { + (ctx->exit_op)(t); + } + ctx = ctx->next; + } while (ctx != head); + } } /* @@ -1195,14 +1278,21 @@ exitctx(kthread_t *t) void freectx(kthread_t *t, int isexec) { - struct ctxop *ctx; - kpreempt_disable(); - while ((ctx = t->t_ctx) != NULL) { - t->t_ctx = ctx->next; - if (ctx->free_op != NULL) - (ctx->free_op)(ctx->arg, isexec); - kmem_free(ctx, sizeof (struct ctxop)); + if (t->t_ctx != NULL) { + struct ctxop *ctx, *head; + + ctx = head = t->t_ctx; + t->t_ctx = NULL; + do { + struct ctxop *next = ctx->next; + + if (ctx->free_op != NULL) { + (ctx->free_op)(ctx->arg, isexec); + } + kmem_free(ctx, sizeof (struct ctxop)); + ctx = next; + } while (ctx != head); } kpreempt_enable(); } @@ -1217,17 +1307,22 @@ freectx(kthread_t *t, int isexec) void freectx_ctx(struct ctxop *ctx) { - struct ctxop *nctx; + struct ctxop *head = ctx; ASSERT(ctx != NULL); kpreempt_disable(); + + head = ctx; do { - nctx = ctx->next; - if (ctx->free_op != NULL) + struct ctxop *next = ctx->next; + + if (ctx->free_op != NULL) { (ctx->free_op)(ctx->arg, 0); + } kmem_free(ctx, sizeof (struct ctxop)); - } while ((ctx = nctx) != NULL); + ctx = next; + } while (ctx != head); kpreempt_enable(); } @@ -1326,6 +1421,8 @@ thread_unpin() itp = t->t_intr; /* interrupted thread */ t->t_intr = NULL; /* clear interrupt ptr */ + ht_end_intr(); + /* * Get state from interrupt thread for the one * it interrupted. @@ -1422,7 +1519,7 @@ thread_create_intr(struct cpu *cp) static kmutex_t tsd_mutex; /* linked list spin lock */ static uint_t tsd_nkeys; /* size of destructor array */ /* per-key destructor funcs */ -static void (**tsd_destructor)(void *); +static void (**tsd_destructor)(void *); /* list of tsd_thread's */ static struct tsd_thread *tsd_list; @@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) return (on_rq); } + +/* + * There are occasions in the kernel when we need much more stack than we + * allocate by default, but we do not wish to have that work done + * asynchronously by another thread. To accommodate these scenarios, we allow + * for a split stack (also known as a "segmented stack") whereby a new stack + * is dynamically allocated and the current thread jumps onto it for purposes + * of executing the specified function. After the specified function returns, + * the stack is deallocated and control is returned to the caller. This + * functionality is implemented by thread_splitstack(), below; there are a few + * constraints on its use: + * + * - The caller must be in a context where it is safe to block for memory. + * - The caller cannot be in a t_onfault context + * - The called function must not call thread_exit() while on the split stack + * + * The code will explicitly panic if these constraints are violated. Notably, + * however, thread_splitstack() _can_ be called on a split stack -- there + * is no limit to the level that split stacks can nest. 
+ * + * When the stack is split, it is constructed such that stack backtraces + * from kernel debuggers continue to function -- though note that DTrace's + * stack() action and stackdepth function will only show the stack up to and + * including thread_splitstack_run(); DTrace explicitly bounds itself to + * pointers that exist within the current declared stack as a safety + * mechanism. + */ +void +thread_splitstack(void (*func)(void *), void *arg, size_t stksize) +{ + kthread_t *t = curthread; + caddr_t ostk, ostkbase, stk; + ushort_t otflag; + + if (t->t_onfault != NULL) + panic("thread_splitstack: called with non-NULL t_onfault"); + + ostk = t->t_stk; + ostkbase = t->t_stkbase; + otflag = t->t_flag; + + stksize = roundup(stksize, PAGESIZE); + + if (stksize < default_stksize) + stksize = default_stksize; + + if (stksize == default_stksize) { + stk = (caddr_t)segkp_cache_get(segkp_thread); + } else { + stksize = roundup(stksize, PAGESIZE); + stk = (caddr_t)segkp_get(segkp, stksize, + (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED)); + } + + /* + * We're going to lock ourselves before we set T_SPLITSTK to assure + * that we're not swapped out in the meantime. (Note that we don't + * bother to set t_swap, as we're not going to be swapped out.) + */ + thread_lock(t); + + if (!(otflag & T_SPLITSTK)) + t->t_flag |= T_SPLITSTK; + + t->t_stk = stk + stksize; + t->t_stkbase = stk; + + thread_unlock(t); + + /* + * Now actually run on the new (split) stack... + */ + thread_splitstack_run(t->t_stk, func, arg); + + /* + * We're back onto our own stack; lock ourselves and restore our + * pre-split state. + */ + thread_lock(t); + + t->t_stk = ostk; + t->t_stkbase = ostkbase; + + if (!(otflag & T_SPLITSTK)) + t->t_flag &= ~T_SPLITSTK; + + thread_unlock(t); + + /* + * Now that we are entirely back on our own stack, call back into + * the platform layer to perform any platform-specific cleanup. + */ + thread_splitstack_cleanup(); + + segkp_release(segkp, stk); +} + /* * Tunable kmem_stackinfo is set, fill the kernel thread stack with a * specific pattern. diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c index 67ccc6922f..c840bdf31a 100644 --- a/usr/src/uts/common/disp/thread_intr.c +++ b/usr/src/uts/common/disp/thread_intr.c @@ -23,19 +23,10 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - /* - * FILE NOTICE BEGIN - * - * This file should not be modified. If you wish to modify it or have it - * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com-> - * (without anti-spam dashes) - * - * FILE NOTICE END + * Copyright 2015, Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/cpuvar.h> #include <sys/stack.h> #include <vm/seg_kp.h> @@ -44,6 +35,17 @@ #include <sys/sysmacros.h> /* + * Use a slightly larger thread stack size for interrupt threads rather than the + * default. This is useful for cases where the networking stack may do an rx and + * a tx in the context of a single interrupt and when combined with various + * promisc hooks that need memory, can cause us to get dangerously close to the + * edge of the traditional stack sizes. This is only a few pages more than a + * traditional stack and given that we don't have that many interrupt threads, + * the memory costs end up being more than worthwhile. + */ +#define LL_INTR_STKSZ (32 * 1024) + +/* * Create and initialize an interrupt thread. 
*/ static void @@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp) { kthread_t *tp; - tp = thread_create(NULL, 0, + tp = thread_create(NULL, LL_INTR_STKSZ, (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0); /* @@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp) } /* - * Allocate a given number of interrupt threads for a given CPU. - * These threads will get freed by cpu_destroy_bound_threads() - * when CPU gets unconfigured. + * Allocate a given number of interrupt threads for a given CPU. These threads + * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured. + * + * Note, high level interrupts are always serviced using cpu_intr_stack and are + * not allowed to block. Low level interrupts or soft-interrupts use the + * kthread_t's that we create through the calls to thread_create_intr(). */ void cpu_intr_alloc(cpu_t *cp, int n) @@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n) thread_create_intr(cp); cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE, - KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + - INTR_STACK_SIZE - SA(MINFRAME); + KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + + INTR_STACK_SIZE - SA(MINFRAME); } diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index 61cfc43693..8d5ccdc64b 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -7770,7 +7770,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -8266,7 +8266,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ -8877,6 +8877,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -8891,8 +8892,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - &priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) 
+ */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 157acc25fc..3d350ff278 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, }; +/* + * When adding a new provider you must add it before sdt as sdt is a catch all + * for remaining probes. + */ sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr }, { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, @@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = { { "fc", "__fc_", &fc_attr }, { "srp", "__srp_", &fc_attr }, { "sysevent", "__sysevent_", &stab_attr }, + { "vnd", "__vnd_", &stab_attr }, { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = { { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", "fc_port_info_t *" }, + { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-in", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-out", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" }, + { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "send", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "send", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" }, + { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "recv", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "recv", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" }, { NULL } }; diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c index fc45bd9544..5dbb2ed28c 100644 --- a/usr/src/uts/common/exec/aout/aout.c +++ b/usr/src/uts/common/exec/aout/aout.c @@ -22,6 +22,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -54,7 +55,7 @@ static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action); + caddr_t exec_file, cred_t *cred, int *brand_action); static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz, int *isdyn); static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp, @@ -130,7 +131,7 @@ _info(struct modinfo *modinfop) static int aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action) + caddr_t exec_file, cred_t *cred, int *brand_action) { auxv32_t auxflags_auxv32; int error; diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 9e6b6bf69e..a4078bb351 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -80,15 +80,32 @@ extern volatile size_t aslr_max_brk_skew; #define ORIGIN_STR "ORIGIN" #define ORIGIN_STR_SIZE 6 -static int getelfhead(vnode_t *, cred_t *, Ehdr *, int *, int *, int *); -static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, int, caddr_t *, - ssize_t *); -static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *, - ssize_t *, caddr_t *, ssize_t *); -static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *); -static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t, - Phdr **, Phdr **, Phdr **, Phdr **, Phdr *, - caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *); +static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *, + uint_t *); +static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *, + size_t *); +static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t, + caddr_t *, size_t *, caddr_t *, size_t *); +static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *); +static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **, + Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *, + size_t, size_t *, size_t *); + +#ifdef _ELF32_COMPAT +/* Link against the non-compat instances when compiling the 32-bit version. 
*/ +extern size_t elf_datasz_max; +extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t); +extern uint_t elf_nphdr_max; +extern uint_t elf_nshdr_max; +extern size_t elf_shstrtab_max; +#else +size_t elf_datasz_max = 1 * 1024 * 1024; +uint_t elf_nphdr_max = 1000; +uint_t elf_nshdr_max = 10000; +size_t elf_shstrtab_max = 100 * 1024; +#endif + + typedef enum { STR_CTF, @@ -110,8 +127,8 @@ static const char *shstrtab_data[] = { }; typedef struct shstrtab { - int sst_ndx[STR_NUM]; - int sst_cur; + uint_t sst_ndx[STR_NUM]; + uint_t sst_cur; } shstrtab_t; static void @@ -121,10 +138,10 @@ shstrtab_init(shstrtab_t *s) s->sst_cur = 1; } -static int +static uint_t shstrtab_ndx(shstrtab_t *s, shstrtype_t type) { - int ret; + uint_t ret; if ((ret = s->sst_ndx[type]) != 0) return (ret); @@ -144,7 +161,7 @@ shstrtab_size(const shstrtab_t *s) static void shstrtab_dump(const shstrtab_t *s, char *buf) { - int i, ndx; + uint_t i, ndx; *buf = '\0'; for (i = 0; i < STR_NUM; i++) { @@ -206,31 +223,54 @@ handle_secflag_dt(proc_t *p, uint_t dt, uint_t val) return (0); } + +#ifndef _ELF32_COMPAT +void +elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz) +{ + size_t target = MIN(sz, elf_datasz_max); + + if (target > ctx->ecc_bufsz) { + if (ctx->ecc_buf != NULL) { + kmem_free(ctx->ecc_buf, ctx->ecc_bufsz); + } + ctx->ecc_buf = kmem_alloc(target, KM_SLEEP); + ctx->ecc_bufsz = target; + } +} +#endif /* _ELF32_COMPAT */ + /* - * Map in the executable pointed to by vp. Returns 0 on success. + * Map in the executable pointed to by vp. Returns 0 on success. Note that + * this function currently has the maximum number of arguments allowed by + * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without + * adding to MAXNARG. (Better yet, do not add to this monster of a function + * signature!) */ int mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, - intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, - caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap) + intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp) { - size_t len; + size_t len, phdrsize; struct vattr vat; caddr_t phdrbase = NULL; - ssize_t phdrsize; - int nshdrs, shstrndx, nphdrs; + uint_t nshdrs, shstrndx, nphdrs; int error = 0; Phdr *uphdr = NULL; Phdr *junk = NULL; Phdr *dynphdr = NULL; Phdr *dtrphdr = NULL; - uintptr_t lddata; - long execsz; - intptr_t minaddr; + char *interp = NULL; + uintptr_t lddata, minaddr; + size_t execsz; if (lddatap != NULL) *lddatap = 0; + if (minaddrp != NULL) + *minaddrp = NULL; + if (error = execpermissions(vp, &vat, args)) { uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); return (error); @@ -256,25 +296,91 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr, len, &execsz, brksize)) { uprintf("%s: Cannot map %s\n", exec_file, args->pathname); + if (uphdr != NULL && uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); kmem_free(phdrbase, phdrsize); return (error); } + if (minaddrp != NULL) + *minaddrp = minaddr; + /* - * Inform our caller if the executable needs an interpreter. + * If the executable requires an interpreter, determine its name. */ - *interp = (dynphdr == NULL) ? 
0 : 1; + if (dynphdr != NULL) { + ssize_t resid; + + if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) { + uprintf("%s: Invalid interpreter\n", exec_file); + kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + interp = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((error = vn_rdwr(UIO_READ, vp, interp, + (ssize_t)dynphdr->p_filesz, + (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0, + (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 || + interp[dynphdr->p_filesz - 1] != '\0') { + uprintf("%s: Cannot obtain interpreter pathname\n", + exec_file); + kmem_free(interp, MAXPATHLEN); + kmem_free(phdrbase, phdrsize); + return (error != 0 ? error : ENOEXEC); + } + } /* * If this is a statically linked executable, voffset should indicate * the address of the executable itself (it normally holds the address * of the interpreter). */ - if (ehdr->e_type == ET_EXEC && *interp == 0) + if (ehdr->e_type == ET_EXEC && interp == NULL) *voffset = minaddr; + /* + * If the caller has asked for the interpreter name, return it (it's + * up to the caller to free it); if the caller hasn't asked for it, + * free it ourselves. + */ + if (interpp != NULL) { + *interpp = interp; + } else if (interp != NULL) { + kmem_free(interp, MAXPATHLEN); + } + if (uphdr != NULL) { *uphdr_vaddr = uphdr->p_vaddr; + + if (uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); + } else if (ehdr->e_type == ET_DYN) { + /* + * If we don't have a uphdr, we'll apply the logic found + * in mapelfexec() and use the p_vaddr of the first PT_LOAD + * section as the base address of the object. + */ + const Phdr *phdr = (Phdr *)phdrbase; + const uint_t hsize = ehdr->e_phentsize; + uint_t i; + + for (i = nphdrs; i > 0; i--) { + if (phdr->p_type == PT_LOAD) { + *uphdr_vaddr = (uintptr_t)phdr->p_vaddr + + ehdr->e_phoff; + break; + } + + phdr = (Phdr *)((caddr_t)phdr + hsize); + } + + /* + * If we don't have a PT_LOAD segment, we should have returned + * ENOEXEC when elfsize() returned 0, above. 
+ */ + VERIFY(i > 0); } else { *uphdr_vaddr = (Addr)-1; } @@ -286,14 +392,14 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, /*ARGSUSED*/ int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, - int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, + int *brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; caddr_t brkbase = 0; size_t brksize = 0; - ssize_t dlnsize; + size_t dlnsize, nsize = 0; aux_entry_t *aux; int error; ssize_t resid; @@ -305,20 +411,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, Phdr *uphdr = NULL; Phdr *junk = NULL; size_t len; + size_t postfixsize = 0; size_t i; - ssize_t phdrsize; - int postfixsize = 0; - int hsize; Phdr *phdrp; Phdr *dataphdrp = NULL; Phdr *dtrphdr; Phdr *capphdr = NULL; Cap *cap = NULL; - ssize_t capsize; + size_t capsize; int hasu = 0; int hasauxv = 0; int hasintp = 0; int branded = 0; + int dynuphdr = 0; struct proc *p = ttoproc(curthread); struct user *up = PTOU(p); @@ -331,7 +436,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, struct execenv exenv; } *bigwad; /* kmem_alloc this behemoth so we don't blow stack */ Ehdr *ehdrp; - int nshdrs, shstrndx, nphdrs; + uint_t nshdrs, shstrndx, nphdrs; + size_t phdrsize; char *dlnp; char *pathbufp; rlim64_t limit; @@ -373,7 +479,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1); } else { args->to_model = DATAMODEL_LP64; - args->stk_prot &= ~PROT_EXEC; + if (!args->stk_prot_override) { + args->stk_prot &= ~PROT_EXEC; + } #if defined(__i386) || defined(__amd64) args->dat_prot &= ~PROT_EXEC; #endif @@ -385,11 +493,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* _LP64 */ /* - * We delay invoking the brand callback until we've figured out - * what kind of elf binary we're trying to run, 32-bit or 64-bit. - * We do this because now the brand library can just check - * args->to_model to see if the target is 32-bit or 64-bit without - * having do duplicate all the code above. + * We delay invoking the brand callback until we've figured out what + * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this + * because now the brand library can just check args->to_model to see if + * the target is 32-bit or 64-bit without having do duplicate all the + * code above. + * + * We also give the brand a chance to indicate that based on the ELF + * OSABI of the target binary it should become unbranded and optionally + * indicate that it should be treated as existing in a specific prefix. + * + * Note that if a brand opts to go down this route it does not actually + * end up being debranded. In other words, future programs that exec + * will still be considered for branding unless this escape hatch is + * used. Consider the case of lx brand for example. If a user runs + * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable + * of DTrace that's in /native will take this escape hatch and be run + * and interpreted using the normal system call table; however, the + * execution of a non-illumos binary in the form of /bin/ls will still + * be branded and be subject to all of the normal actions of the brand. 
* * The level checks associated with brand handling below are used to * prevent a loop since the brand elfexec function typically comes back @@ -397,8 +519,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * handling in the #! interpreter code will increment the level before * calling gexec to run the final elfexec interpreter. */ + if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) && + (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) { + if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI], + &args->brand_nroot) == B_TRUE) { + ASSERT(ehdrp->e_ident[EI_OSABI]); + *brand_action = EBA_NATIVE; + /* Add one for the trailing '/' in the path */ + if (args->brand_nroot != NULL) + nsize = strlen(args->brand_nroot) + 1; + } + } + if ((level <= INTP_MAXDEPTH) && - (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { error = BROP(p)->b_elfexec(vp, uap, args, idatap, level + 1, execsz, setid, exec_file, cred, brand_action); @@ -411,7 +545,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * determine any non-default stack protections, * and still have this code be machine independent. */ - hsize = ehdrp->e_phentsize; + const uint_t hsize = ehdrp->e_phentsize; phdrp = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { switch (phdrp->p_type) { @@ -472,14 +606,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_BASE * AT_FLAGS * AT_PAGESZ + * AT_RANDOM (added in stk_copyout) * AT_SUN_AUXFLAGS * AT_SUN_HWCAP * AT_SUN_HWCAP2 - * AT_SUN_PLATFORM (added in stk_copyout) - * AT_SUN_EXECNAME (added in stk_copyout) + * AT_SUN_PLATFORM (added in stk_copyout) + * AT_SUN_EXECNAME (added in stk_copyout) * AT_NULL * - * total == 9 + * total == 10 */ if (hasintp && hasu) { /* @@ -494,7 +629,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 5 */ - args->auxsize = (9 + 5) * sizeof (aux_entry_t); + args->auxsize = (10 + 5) * sizeof (aux_entry_t); } else if (hasintp) { /* * Has PT_INTERP but no PT_PHDR @@ -504,9 +639,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 2 */ - args->auxsize = (9 + 2) * sizeof (aux_entry_t); + args->auxsize = (10 + 2) * sizeof (aux_entry_t); } else { - args->auxsize = 9 * sizeof (aux_entry_t); + args->auxsize = 10 * sizeof (aux_entry_t); } } else { args->auxsize = 0; @@ -520,6 +655,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += sizeof (aux_entry_t); /* + * If this is a native binary that's been given a modified interpreter + * root, inform it that the native system exists at that root. + */ + if (args->brand_nroot != NULL) { + args->auxsize += sizeof (aux_entry_t); + } + + + /* * On supported kernels (x86_64) make room in the auxv for the * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems * which do not provide such functionality. @@ -531,13 +675,24 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += 3 * sizeof (aux_entry_t); #endif /* defined(__amd64) */ - if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + /* + * If we have user credentials, we'll supply the following entries: + * AT_SUN_UID + * AT_SUN_RUID + * AT_SUN_GID + * AT_SUN_RGID + */ + if (cred != NULL) { + args->auxsize += 4 * sizeof (aux_entry_t); + } + + if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { branded = 1; /* - * We will be adding 4 entries to the aux vectors. 
One for - * the the brandname and 3 for the brand specific aux vectors. + * We will be adding 5 entries to the aux vectors. One for + * the the brandname and 4 for the brand specific aux vectors. */ - args->auxsize += 4 * sizeof (aux_entry_t); + args->auxsize += 5 * sizeof (aux_entry_t); } /* If the binary has an explicit ASLR flag, it must be honoured */ @@ -566,7 +721,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn, (ssize_t)dynsize, (offset_t)(dynoffset + i), UIO_SYSSPACE, 0, (rlim64_t)0, - CRED(), &resid)) != 0) { + CRED(), NULL)) != 0) { uprintf("%s: cannot read .dynamic section\n", exec_file); goto out; @@ -594,13 +749,13 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (capphdr != NULL && (capsize = capphdr->p_filesz) > 0 && capsize <= 16 * sizeof (*cap)) { - int ncaps = capsize / sizeof (*cap); + const uint_t ncaps = capsize / sizeof (*cap); Cap *cp; cap = kmem_alloc(capsize, KM_SLEEP); if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap, - capsize, (offset_t)capphdr->p_offset, - UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) { + (ssize_t)capsize, (offset_t)capphdr->p_offset, + UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) { uprintf("%s: Cannot read capabilities section\n", exec_file); goto out; @@ -618,7 +773,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, aux = bigwad->elfargs; /* * Move args to the user's stack. - * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries. + * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM + * aux entries. */ if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) { if (error == -1) { @@ -640,10 +796,19 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; - if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr, + error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr, &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL, - len, execsz, &brksize)) != 0) + len, execsz, &brksize); + /* + * Our uphdr has been dynamically allocated if (and only if) its + * program header flags are clear. To avoid leaks, this must be + * checked regardless of whether mapelfexec() emitted an error. + */ + dynuphdr = (uphdr != NULL && uphdr->p_flags == 0); + + if (error != 0) { goto bad; + } if (uphdr != NULL && intphdr == NULL) goto bad; @@ -659,17 +824,28 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, char *p; struct vnode *nvp; - dlnsize = intphdr->p_filesz; + dlnsize = intphdr->p_filesz + nsize; - if (dlnsize > MAXPATHLEN || dlnsize <= 0) + /* + * Make sure none of the component pieces of dlnsize result in + * an oversized or zeroed result. + */ + if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN || + dlnsize == 0 || dlnsize < intphdr->p_filesz) { goto bad; + } + + if (nsize != 0) { + bcopy(args->brand_nroot, dlnp, nsize - 1); + dlnp[nsize - 1] = '/'; + } /* * Read in "interpreter" pathname. 
*/ - if ((error = vn_rdwr(UIO_READ, vp, dlnp, intphdr->p_filesz, - (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0, - CRED(), &resid)) != 0) { + if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize, + (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset, + UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) { uprintf("%s: Cannot obtain interpreter pathname\n", exec_file); goto bad; @@ -814,9 +990,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; - error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk, + error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk, &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len, execsz, NULL); + if (error || junk != NULL) { VN_RELE(nvp); uprintf("%s: Cannot map %s\n", exec_file, dlnp); @@ -849,8 +1026,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* defined(__amd64) */ /* - * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via - * exec_args() + * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were + * filled in via exec_args() */ ADDAUX(aux, AT_BASE, voffset) ADDAUX(aux, AT_FLAGS, at_flags) @@ -878,7 +1055,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * malicious user within the zone from crafting a wrapper to * run native suid commands with unsecure libraries interposed. */ - if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && + if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && (setid &= ~EXECSETID_SETID) != 0)) auxf &= ~AF_SUN_SETUGID; @@ -893,6 +1070,17 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_AUXFLAGS, auxf); /* + * Record information about the real and effective user and + * group IDs. + */ + if (cred != NULL) { + ADDAUX(aux, AT_SUN_UID, crgetuid(cred)); + ADDAUX(aux, AT_SUN_RUID, crgetruid(cred)); + ADDAUX(aux, AT_SUN_GID, crgetgid(cred)); + ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred)); + } + + /* * Hardware capability flag word (performance hints) * Used for choosing faster library routines. * (Potentially different between 32-bit and 64-bit ABIs) @@ -921,6 +1109,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX1, 0) ADDAUX(aux, AT_SUN_BRAND_AUX2, 0) ADDAUX(aux, AT_SUN_BRAND_AUX3, 0) + ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } /* @@ -952,7 +1141,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* defined(__amd64) */ ADDAUX(aux, AT_NULL, 0) - postfixsize = (char *)aux - (char *)bigwad->elfargs; + postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs; /* * We make assumptions above when we determine how many aux @@ -963,8 +1152,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * We detect that now and error out. */ if (postfixsize != args->auxsize) { - DTRACE_PROBE2(elfexec_badaux, int, postfixsize, - int, args->auxsize); + DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize, + size_t, args->auxsize); goto bad; } ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t)); @@ -992,7 +1181,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, bzero(up->u_auxv, sizeof (up->u_auxv)); up->u_commpagep = args->commpage; if (postfixsize) { - int num_auxv; + size_t num_auxv; /* * Copy the aux vector to the user stack. 
@@ -1057,6 +1246,8 @@ bad: if (error == 0) error = ENOEXEC; out: + if (dynuphdr) + kmem_free(uphdr, sizeof (Phdr)); if (phdrbase != NULL) kmem_free(phdrbase, phdrsize); if (cap != NULL) @@ -1069,32 +1260,23 @@ out: * Compute the memory size requirement for the ELF file. */ static size_t -elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata) +elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase, + uintptr_t *lddata) { - size_t len; - Phdr *phdrp = (Phdr *)phdrbase; - int hsize = ehdrp->e_phentsize; - int first = 1; - int dfirst = 1; /* first data segment */ - uintptr_t loaddr = 0; + const Phdr *phdrp = (Phdr *)phdrbase; + const uint_t hsize = ehdrp->e_phentsize; + boolean_t dfirst = B_TRUE; + uintptr_t loaddr = UINTPTR_MAX; uintptr_t hiaddr = 0; - uintptr_t lo, hi; - int i; + uint_t i; for (i = nphdrs; i > 0; i--) { if (phdrp->p_type == PT_LOAD) { - lo = phdrp->p_vaddr; - hi = lo + phdrp->p_memsz; - if (first) { - loaddr = lo; - hiaddr = hi; - first = 0; - } else { - if (loaddr > lo) - loaddr = lo; - if (hiaddr < hi) - hiaddr = hi; - } + const uintptr_t lo = phdrp->p_vaddr; + const uintptr_t hi = lo + phdrp->p_memsz; + + loaddr = MIN(lo, loaddr); + hiaddr = MAX(hi, hiaddr); /* * save the address of the first data segment @@ -1104,16 +1286,18 @@ elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata) if ((lddata != NULL) && dfirst && (phdrp->p_flags & PF_W)) { *lddata = lo; - dfirst = 0; + dfirst = B_FALSE; } } phdrp = (Phdr *)((caddr_t)phdrp + hsize); } - len = hiaddr - (loaddr & PAGEMASK); - len = roundup(len, PAGESIZE); + if (hiaddr <= loaddr) { + /* No non-zero PT_LOAD segment found */ + return (0); + } - return (len); + return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE)); } /* @@ -1123,8 +1307,8 @@ elfsize(Ehdr *ehdrp, int nphdrs, caddr_t phdrbase, uintptr_t *lddata) * EINVAL Format recognized but execution not supported */ static int -getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, - int *nphdrs) +getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, + uint_t *shstrndx, uint_t *nphdrs) { int error; ssize_t resid; @@ -1133,10 +1317,10 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, * We got here by the first two bytes in ident, * now read the entire ELF header. 
*/ - if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, - sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, &resid)) != 0) + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr), + (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) { return (error); + } /* * Since a separate version is compiled for handling 32-bit and @@ -1145,8 +1329,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, */ if (resid != 0 || ehdr->e_ident[EI_MAG2] != ELFMAG2 || - ehdr->e_ident[EI_MAG3] != ELFMAG3) + ehdr->e_ident[EI_MAG3] != ELFMAG3) { return (ENOEXEC); + } if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) || #if defined(_ILP32) || defined(_ELF32_COMPAT) @@ -1155,8 +1340,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, ehdr->e_ident[EI_CLASS] != ELFCLASS64 || #endif !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine, - ehdr->e_flags)) + ehdr->e_flags)) { return (EINVAL); + } *nshdrs = ehdr->e_shnum; *shstrndx = ehdr->e_shstrndx; @@ -1164,7 +1350,7 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, /* * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need - * to read in the section header at index zero to acces the true + * to read in the section header at index zero to access the true * values for those fields. */ if ((*nshdrs == 0 && ehdr->e_shoff != 0) || @@ -1176,7 +1362,7 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr, sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, &resid)) != 0) + (rlim64_t)0, credp, NULL)) != 0) return (error); if (*nshdrs == 0) @@ -1190,33 +1376,29 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, int *nshdrs, int *shstrndx, return (0); } -#ifdef _ELF32_COMPAT -extern size_t elf_nphdr_max; +/* + * We use members through p_flags on 32-bit files and p_memsz on 64-bit files, + * so e_phentsize must be at least large enough to include those members. + */ +#if !defined(_LP64) || defined(_ELF32_COMPAT) +#define MINPHENTSZ (offsetof(Phdr, p_flags) + \ + sizeof (((Phdr *)NULL)->p_flags)) #else -size_t elf_nphdr_max = 1000; +#define MINPHENTSZ (offsetof(Phdr, p_memsz) + \ + sizeof (((Phdr *)NULL)->p_memsz)) #endif static int -getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs, - caddr_t *phbasep, ssize_t *phsizep) +getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs, + caddr_t *phbasep, size_t *phsizep) { - ssize_t resid, minsize; int err; /* - * Since we're going to be using e_phentsize to iterate down the - * array of program headers, it must be 8-byte aligned or else - * a we might cause a misaligned access. We use all members through - * p_flags on 32-bit ELF files and p_memsz on 64-bit ELF files so - * e_phentsize must be at least large enough to include those - * members. + * Ensure that e_phentsize is large enough for required fields to be + * accessible and will maintain 8-byte alignment. 
*/ -#if !defined(_LP64) || defined(_ELF32_COMPAT) - minsize = offsetof(Phdr, p_flags) + sizeof (((Phdr *)NULL)->p_flags); -#else - minsize = offsetof(Phdr, p_memsz) + sizeof (((Phdr *)NULL)->p_memsz); -#endif - if (ehdr->e_phentsize < minsize || (ehdr->e_phentsize & 3)) + if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3)) return (EINVAL); *phsizep = nphdrs * ehdr->e_phentsize; @@ -1228,9 +1410,9 @@ getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs, *phbasep = kmem_alloc(*phsizep, KM_SLEEP); } - if ((err = vn_rdwr(UIO_READ, vp, *phbasep, *phsizep, + if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep, (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0, - credp, &resid)) != 0) { + credp, NULL)) != 0) { kmem_free(*phbasep, *phsizep); *phbasep = NULL; return (err); @@ -1239,21 +1421,14 @@ getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, int nphdrs, return (0); } -#ifdef _ELF32_COMPAT -extern size_t elf_nshdr_max; -extern size_t elf_shstrtab_max; -#else -size_t elf_nshdr_max = 10000; -size_t elf_shstrtab_max = 100 * 1024; -#endif - +#define MINSHDRSZ (offsetof(Shdr, sh_entsize) + \ + sizeof (((Shdr *)NULL)->sh_entsize)) static int -getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, - int nshdrs, int shstrndx, caddr_t *shbasep, ssize_t *shsizep, - char **shstrbasep, ssize_t *shstrsizep) +getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs, + uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep, + size_t *shstrsizep) { - ssize_t resid, minsize; int err; Shdr *shdr; @@ -1265,9 +1440,8 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, * must be at least large enough to include that member. The index * of the string table section must also be valid. */ - minsize = offsetof(Shdr, sh_entsize) + sizeof (shdr->sh_entsize); - if (ehdr->e_shentsize < minsize || (ehdr->e_shentsize & 3) || - shstrndx >= nshdrs) + if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) || + nshdrs == 0 || shstrndx >= nshdrs) return (EINVAL); *shsizep = nshdrs * ehdr->e_shentsize; @@ -1279,16 +1453,16 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, *shbasep = kmem_alloc(*shsizep, KM_SLEEP); } - if ((err = vn_rdwr(UIO_READ, vp, *shbasep, *shsizep, + if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep, (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0, - credp, &resid)) != 0) { + credp, NULL)) != 0) { kmem_free(*shbasep, *shsizep); return (err); } /* - * Pull the section string table out of the vnode; fail if the size - * is zero. + * Grab the section string table. Walking through the shdrs is + * pointless if their names cannot be interrogated. 
*/ shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize); if ((*shstrsizep = shdr->sh_size) == 0) { @@ -1306,9 +1480,9 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP); } - if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, *shstrsizep, + if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep, (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0, - credp, &resid)) != 0) { + credp, NULL)) != 0) { kmem_free(*shbasep, *shsizep); kmem_free(*shstrbasep, *shstrsizep); return (err); @@ -1323,11 +1497,29 @@ getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, return (0); } + +int +elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs, + caddr_t *phbasep, size_t *phsizep) +{ + int error; + uint_t nshdrs, shstrndx; + + if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx, + nphdrs)) != 0 || + (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep, + phsizep)) != 0) { + return (error); + } + return (0); +} + + static int mapelfexec( vnode_t *vp, Ehdr *ehdr, - int nphdrs, + uint_t nphdrs, caddr_t phdrbase, Phdr **uphdr, Phdr **intphdr, @@ -1337,23 +1529,25 @@ mapelfexec( caddr_t *bssbase, caddr_t *brkbase, intptr_t *voffset, - intptr_t *minaddr, + uintptr_t *minaddrp, size_t len, - long *execsz, + size_t *execsz, size_t *brksize) { Phdr *phdr; - int i, prot, error; + int error, page, prot, lastprot = 0; caddr_t addr = NULL; - size_t zfodsz; - int ptload = 0; - int page; + caddr_t minaddr = (caddr_t)UINTPTR_MAX; + uint_t i; + size_t zfodsz, memsz; + boolean_t ptload = B_FALSE; off_t offset; - int hsize = ehdr->e_phentsize; - caddr_t mintmp = (caddr_t)-1; + const uint_t hsize = ehdr->e_phentsize; + uintptr_t lastaddr = 0; extern int use_brk_lpg; if (ehdr->e_type == ET_DYN) { + caddr_t vaddr; secflagset_t flags = 0; /* * Obtain the virtual address of a hole in the @@ -1365,34 +1559,74 @@ mapelfexec( map_addr(&addr, len, (offset_t)0, 1, flags); if (addr == NULL) return (ENOMEM); - *voffset = (intptr_t)addr; /* - * Calculate the minimum vaddr so it can be subtracted out. - * According to the ELF specification, since PT_LOAD sections - * must be sorted by increasing p_vaddr values, this is - * guaranteed to be the first PT_LOAD section. + * Despite the fact that mmapobj(2) refuses to load them, we + * need to support executing ET_DYN objects that have a + * non-NULL p_vaddr. When found in the wild, these objects + * are likely to be due to an old (and largely obviated) Linux + * facility, prelink(8), that rewrites shared objects to + * prefer specific (disjoint) virtual address ranges. (Yes, + * this is putatively for performance -- and yes, it has + * limited applicability, many edge conditions and grisly + * failure modes; even for Linux, it's insane.) As ELF + * mandates that the PT_LOAD segments be in p_vaddr order, we + * find the lowest p_vaddr by finding the first PT_LOAD + * segment. */ phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { if (phdr->p_type == PT_LOAD) { - *voffset -= (uintptr_t)phdr->p_vaddr; + addr = (caddr_t)(uintptr_t)phdr->p_vaddr; break; } phdr = (Phdr *)((caddr_t)phdr + hsize); } + /* + * We have a non-zero p_vaddr in the first PT_LOAD segment -- + * presumably because we're directly executing a prelink(8)'d + * ld-linux.so. 
While we could correctly execute such an + * object without locating it at its desired p_vaddr (it is, + * after all, still relocatable), our inner antiquarian + * derives a perverse pleasure in accommodating the steampunk + * prelink(8) contraption -- goggles on! + */ + if ((vaddr = addr) != NULL) { + if (as_gap(curproc->p_as, len, &addr, &len, + AH_LO, NULL) == -1 || addr != vaddr) { + addr = NULL; + } + } + + if (addr == NULL) { + /* + * We either have a NULL p_vaddr (the common case, by + * many orders of magnitude) or we have a non-NULL + * p_vaddr and we were unable to obtain the specified + * VA range (presumably because it's an illegal + * address). Either way, obtain an address in which + * to map the interpreter. + */ + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL) + return (ENOMEM); + } + + /* + * Our voffset is the difference between where we landed and + * where we wanted to be. + */ + *voffset = (uintptr_t)addr - (uintptr_t)vaddr; } else { *voffset = 0; } + phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { switch (phdr->p_type) { case PT_LOAD: - if ((*intphdr != NULL) && (*uphdr == NULL)) - return (0); - - ptload = 1; + ptload = B_TRUE; prot = PROT_USER; if (phdr->p_flags & PF_R) prot |= PROT_READ; @@ -1403,12 +1637,84 @@ mapelfexec( addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset); + if ((*intphdr != NULL) && uphdr != NULL && + (*uphdr == NULL)) { + /* + * The PT_PHDR program header is, strictly + * speaking, optional. If we find that this + * is missing, we will determine the location + * of the program headers based on the address + * of the lowest PT_LOAD segment (namely, this + * one): we subtract the p_offset to get to + * the ELF header and then add back the program + * header offset to get to the program headers. + * We then cons up a Phdr that corresponds to + * the (missing) PT_PHDR, setting the flags + * to 0 to denote that this is artificial and + * should (must) be freed by the caller. + */ + Phdr *cons; + + cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP); + + cons->p_flags = 0; + cons->p_type = PT_PHDR; + cons->p_vaddr = ((uintptr_t)addr - + phdr->p_offset) + ehdr->e_phoff; + + *uphdr = cons; + } + + /* + * The ELF spec dictates that p_filesz may not be + * larger than p_memsz in PT_LOAD segments. + */ + if (phdr->p_filesz > phdr->p_memsz) { + error = EINVAL; + goto bad; + } + /* * Keep track of the segment with the lowest starting * address. */ - if (addr < mintmp) - mintmp = addr; + if (addr < minaddr) + minaddr = addr; + + /* + * Segments need not correspond to page boundaries: + * they are permitted to share a page. If two PT_LOAD + * segments share the same page, and the permissions + * of the segments differ, the behavior is historically + * that the permissions of the latter segment are used + * for the page that the two segments share. This is + * also historically a non-issue: binaries generated + * by most anything will make sure that two PT_LOAD + * segments with differing permissions don't actually + * share any pages. However, there exist some crazy + * things out there (including at least an obscure + * Portuguese teaching language called G-Portugol) that + * actually do the wrong thing and expect it to work: + * they have a segment with execute permission share + * a page with a subsequent segment that does not + * have execute permissions and expect the resulting + * shared page to in fact be executable. 
To accommodate + * such broken link editors, we take advantage of a + * latitude explicitly granted to the loader: it is + * permitted to make _any_ PT_LOAD segment executable + * (provided that it is readable or writable). If we + * see that we're sharing a page and that the previous + * page was executable, we will add execute permissions + * to our segment. + */ + if (btop(lastaddr) == btop((uintptr_t)addr) && + (phdr->p_flags & (PF_R | PF_W)) && + (lastprot & PROT_EXEC)) { + prot |= PROT_EXEC; + } + + lastaddr = (uintptr_t)addr + phdr->p_filesz; + lastprot = prot; zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; @@ -1428,14 +1734,22 @@ mapelfexec( if (brksize != NULL && use_brk_lpg && zfodsz != 0 && phdr == dataphdrp && (prot & PROT_WRITE)) { - size_t tlen = P2NPHASE((uintptr_t)addr + + const size_t tlen = P2NPHASE((uintptr_t)addr + phdr->p_filesz, PAGESIZE); if (zfodsz > tlen) { + const caddr_t taddr = addr + + phdr->p_filesz + tlen; + + /* + * Since a hole in the AS large enough + * for this object as calculated by + * elfsize() is available, we do not + * need to fear overflow for 'taddr'. + */ curproc->p_brkpageszc = page_szc(map_pgsz(MAPPGSZ_HEAP, - curproc, addr + phdr->p_filesz + - tlen, zfodsz - tlen, 0)); + curproc, taddr, zfodsz - tlen, 0)); } } @@ -1477,12 +1791,31 @@ mapelfexec( *brkbase = addr + phdr->p_memsz; } - *execsz += btopr(phdr->p_memsz); + memsz = btopr(phdr->p_memsz); + if ((*execsz + memsz) < *execsz) { + error = ENOMEM; + goto bad; + } + *execsz += memsz; break; case PT_INTERP: - if (ptload) - goto bad; + /* + * The ELF specification is unequivocal about the + * PT_INTERP program header with respect to any PT_LOAD + * program header: "If it is present, it must precede + * any loadable segment entry." Linux, however, makes + * no attempt to enforce this -- which has allowed some + * binary editing tools to get away with generating + * invalid ELF binaries in the respect that PT_INTERP + * occurs after the first PT_LOAD program header. This + * is unfortunate (and of course, disappointing) but + * it's no worse than that: there is no reason that we + * can't process the PT_INTERP entry (if present) after + * one or more PT_LOAD entries. We therefore + * deliberately do not check ptload here and always + * store dyphdr to be the PT_INTERP program header. + */ *intphdr = phdr; break; @@ -1491,9 +1824,12 @@ mapelfexec( break; case PT_PHDR: - if (ptload) + if (ptload || phdr->p_flags == 0) goto bad; - *uphdr = phdr; + + if (uphdr != NULL) + *uphdr = phdr; + break; case PT_NULL: @@ -1512,9 +1848,9 @@ mapelfexec( phdr = (Phdr *)((caddr_t)phdr + hsize); } - if (minaddr != NULL) { - ASSERT(mintmp != (caddr_t)-1); - *minaddr = (intptr_t)mintmp; + if (minaddrp != NULL) { + ASSERT(minaddr != (caddr_t)UINTPTR_MAX); + *minaddrp = (uintptr_t)minaddr; } if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) { @@ -1586,24 +1922,39 @@ elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc, return (0); } + /* * Copy the section data from one vnode to the section of another vnode. 
*/ static void -copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset, - void *buf, size_t size, cred_t *credp, rlim64_t rlimit) +elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst) { - ssize_t resid; - size_t len, n = src->sh_size; - offset_t off = 0; + size_t n = src->sh_size; + u_offset_t off = 0; + const u_offset_t soff = src->sh_offset; + const u_offset_t doff = ctx->ecc_doffset; + void *buf = ctx->ecc_buf; + vnode_t *dst_vp = ctx->ecc_vp; + cred_t *credp = ctx->ecc_credp; + + /* Protect the copy loop below from overflow on the offsets */ + if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX || + (n + soff) < n || (n + doff) < n) { + dst->sh_size = 0; + dst->sh_offset = 0; + return; + } while (n != 0) { - len = MIN(size, n); - if (vn_rdwr(UIO_READ, src_vp, buf, len, src->sh_offset + off, + const size_t len = MIN(ctx->ecc_bufsz, n); + ssize_t resid; + + if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len, + (offset_t)(soff + off), UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 || - resid >= len || - core_write(dst_vp, UIO_SYSSPACE, *doffset + off, - buf, len - resid, rlimit, credp) != 0) { + resid >= len || resid < 0 || + core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off), + buf, len - resid, ctx->ecc_rlimit, credp) != 0) { dst->sh_size = 0; dst->sh_offset = 0; return; @@ -1615,62 +1966,222 @@ copy_scn(Shdr *src, vnode_t *src_vp, Shdr *dst, vnode_t *dst_vp, Off *doffset, off += len - resid; } - *doffset += src->sh_size; + ctx->ecc_doffset += src->sh_size; } -#ifdef _ELF32_COMPAT -extern size_t elf_datasz_max; -#else -size_t elf_datasz_max = 1 * 1024 * 1024; -#endif +/* + * Walk sections for a given ELF object, counting (or copying) those of + * interest (CTF, symtab, strtab). + */ +static uint_t +elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, + Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab) +{ + Ehdr ehdr; + const core_content_t content = ctx->ecc_content; + cred_t *credp = ctx->ecc_credp; + Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL; + uintptr_t off = 0; + uint_t nshdrs, shstrndx, nphdrs, count = 0; + u_offset_t *doffp = &ctx->ecc_doffset; + boolean_t ctf_link = B_FALSE; + caddr_t shbase; + size_t shsize, shstrsize; + char *shstrbase; + + if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) == 0) { + return (0); + } + + if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 || + getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize, + &shstrbase, &shstrsize) != 0) { + return (0); + } + + /* Starting at index 1 skips SHT_NULL which is expected at index 0 */ + off = ehdr.e_shentsize; + for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) { + Shdr *shdr, *symchk = NULL, *strchk; + const char *name; + + shdr = (Shdr *)(shbase + off); + if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL) + continue; + + name = shstrbase + shdr->sh_name; + + if (ctf == NULL && + (content & CC_CONTENT_CTF) != 0 && + strcmp(name, shstrtab_data[STR_CTF]) == 0) { + ctf = shdr; + if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) { + /* check linked symtab below */ + symchk = (Shdr *)(shbase + + shdr->sh_link * ehdr.e_shentsize); + ctf_link = B_TRUE; + } else { + continue; + } + } else if (symtab == NULL && + (content & CC_CONTENT_SYMTAB) != 0 && + strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) { + symchk = shdr; + } else { + continue; + } + + ASSERT(symchk != NULL); + if ((symchk->sh_type != SHT_DYNSYM && + symchk->sh_type != SHT_SYMTAB) || + symchk->sh_link == 0 || 
symchk->sh_link >= nshdrs) { + ctf_link = B_FALSE; + continue; + } + strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize); + if (strchk->sh_type != SHT_STRTAB) { + ctf_link = B_FALSE; + continue; + } + symtab = symchk; + strtab = strchk; + + if (symtab != NULL && ctf != NULL) { + /* No other shdrs are of interest at this point */ + break; + } + } + + if (ctf != NULL) + count += 1; + if (symtab != NULL) + count += 2; + if (v == NULL || count == 0 || count > remain) { + count = MIN(count, remain); + goto done; + } + + /* output CTF section */ + if (ctf != NULL) { + elf_ctx_resize_scratch(ctx, ctf->sh_size); + + v[idx].sh_name = shstrtab_ndx(shstrtab, STR_CTF); + v[idx].sh_addr = (Addr)(uintptr_t)saddr; + v[idx].sh_type = SHT_PROGBITS; + v[idx].sh_addralign = 4; + *doffp = roundup(*doffp, v[idx].sh_addralign); + v[idx].sh_offset = *doffp; + v[idx].sh_size = ctf->sh_size; + + if (ctf_link) { + /* + * The linked symtab (and strtab) will be output + * immediately after this CTF section. Its shdr index + * directly follows this one. + */ + v[idx].sh_link = idx + 1; + ASSERT(symtab != NULL); + } else { + v[idx].sh_link = 0; + } + elf_copy_scn(ctx, ctf, mvp, &v[idx]); + idx++; + } + + /* output SYMTAB/STRTAB sections */ + if (symtab != NULL) { + uint_t symtab_name, strtab_name; + + elf_ctx_resize_scratch(ctx, + MAX(symtab->sh_size, strtab->sh_size)); + + if (symtab->sh_type == SHT_DYNSYM) { + symtab_name = shstrtab_ndx(shstrtab, STR_DYNSYM); + strtab_name = shstrtab_ndx(shstrtab, STR_DYNSTR); + } else { + symtab_name = shstrtab_ndx(shstrtab, STR_SYMTAB); + strtab_name = shstrtab_ndx(shstrtab, STR_STRTAB); + } + + v[idx].sh_name = symtab_name; + v[idx].sh_type = symtab->sh_type; + v[idx].sh_addr = symtab->sh_addr; + if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0) + v[idx].sh_addr += (Addr)(uintptr_t)saddr; + v[idx].sh_addralign = symtab->sh_addralign; + *doffp = roundup(*doffp, v[idx].sh_addralign); + v[idx].sh_offset = *doffp; + v[idx].sh_size = symtab->sh_size; + v[idx].sh_link = idx + 1; + v[idx].sh_entsize = symtab->sh_entsize; + v[idx].sh_info = symtab->sh_info; + + elf_copy_scn(ctx, symtab, mvp, &v[idx]); + idx++; + + v[idx].sh_name = strtab_name; + v[idx].sh_type = SHT_STRTAB; + v[idx].sh_flags = SHF_STRINGS; + v[idx].sh_addr = strtab->sh_addr; + if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0) + v[idx].sh_addr += (Addr)(uintptr_t)saddr; + v[idx].sh_addralign = strtab->sh_addralign; + *doffp = roundup(*doffp, v[idx].sh_addralign); + v[idx].sh_offset = *doffp; + v[idx].sh_size = strtab->sh_size; + + elf_copy_scn(ctx, strtab, mvp, &v[idx]); + idx++; + } + +done: + kmem_free(shstrbase, shstrsize); + kmem_free(shbase, shsize); + return (count); +} /* - * This function processes mappings that correspond to load objects to - * examine their respective sections for elfcore(). It's called once with - * v set to NULL to count the number of sections that we're going to need - * and then again with v set to some allocated buffer that we fill in with - * all the section data. + * Walk mappings in process address space, examining those which correspond to + * loaded objects. It is called twice from elfcore: Once to simply count + * relevant sections, and again later to copy those sections once an adequate + * buffer has been allocated for the shdr details. 
*/ static int -process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp, - Shdr *v, int nv, rlim64_t rlimit, Off *doffsetp, int *nshdrsp) +elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp) { vnode_t *lastvp = NULL; struct seg *seg; - int i, j; - void *data = NULL; - size_t datasz = 0; + uint_t idx = 0, remain; shstrtab_t shstrtab; - struct as *as = p->p_as; + struct as *as = ctx->ecc_p->p_as; int error = 0; - if (v != NULL) + ASSERT(AS_WRITE_HELD(as)); + + if (v != NULL) { + ASSERT(nv != 0); + shstrtab_init(&shstrtab); + remain = nv; + } else { + ASSERT(nv == 0); - i = 1; + /* + * The shdrs are being counted, rather than outputting them + * into a buffer. Leave room for two entries: the SHT_NULL at + * index 0 and the shstrtab at the end. + */ + remain = UINT_MAX - 2; + } + + /* Per the ELF spec, shdr index 0 is reserved. */ + idx = 1; for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { - uint_t prot; vnode_t *mvp; void *tmp = NULL; - caddr_t saddr = seg->s_base; - caddr_t naddr; - caddr_t eaddr; + caddr_t saddr = seg->s_base, naddr, eaddr; size_t segsize; - - Ehdr ehdr; - int nshdrs, shstrndx, nphdrs; - caddr_t shbase; - ssize_t shsize; - char *shstrbase; - ssize_t shstrsize; - - Shdr *shdr; - const char *name; - size_t sz; - uintptr_t off; - - int ctf_ndx = 0; - int symtab_ndx = 0; + uint_t count, prot; /* * Since we're just looking for text segments of load @@ -1696,222 +2207,51 @@ process_scns(core_content_t content, proc_t *p, cred_t *credp, vnode_t *vp, if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC) continue; - if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, - &nphdrs) != 0 || - getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, - &shbase, &shsize, &shstrbase, &shstrsize) != 0) - continue; - - off = ehdr.e_shentsize; - for (j = 1; j < nshdrs; j++, off += ehdr.e_shentsize) { - Shdr *symtab = NULL, *strtab; - - shdr = (Shdr *)(shbase + off); - - if (shdr->sh_name >= shstrsize) - continue; - - name = shstrbase + shdr->sh_name; - - if (strcmp(name, shstrtab_data[STR_CTF]) == 0) { - if ((content & CC_CONTENT_CTF) == 0 || - ctf_ndx != 0) - continue; - - if (shdr->sh_link > 0 && - shdr->sh_link < nshdrs) { - symtab = (Shdr *)(shbase + - shdr->sh_link * ehdr.e_shentsize); - } - - if (v != NULL && i < nv - 1) { - if (shdr->sh_size > datasz && - shdr->sh_size <= elf_datasz_max) { - if (data != NULL) - kmem_free(data, datasz); - - datasz = shdr->sh_size; - data = kmem_alloc(datasz, - KM_SLEEP); - } - - v[i].sh_name = shstrtab_ndx(&shstrtab, - STR_CTF); - v[i].sh_addr = (Addr)(uintptr_t)saddr; - v[i].sh_type = SHT_PROGBITS; - v[i].sh_addralign = 4; - *doffsetp = roundup(*doffsetp, - v[i].sh_addralign); - v[i].sh_offset = *doffsetp; - v[i].sh_size = shdr->sh_size; - if (symtab == NULL) { - v[i].sh_link = 0; - } else if (symtab->sh_type == - SHT_SYMTAB && - symtab_ndx != 0) { - v[i].sh_link = - symtab_ndx; - } else { - v[i].sh_link = i + 1; - } - - copy_scn(shdr, mvp, &v[i], vp, - doffsetp, data, datasz, credp, - rlimit); - } - - ctf_ndx = i++; - - /* - * We've already dumped the symtab. 
- */ - if (symtab != NULL && - symtab->sh_type == SHT_SYMTAB && - symtab_ndx != 0) - continue; - - } else if (strcmp(name, - shstrtab_data[STR_SYMTAB]) == 0) { - if ((content & CC_CONTENT_SYMTAB) == 0 || - symtab != 0) - continue; - - symtab = shdr; - } - - if (symtab != NULL) { - if ((symtab->sh_type != SHT_DYNSYM && - symtab->sh_type != SHT_SYMTAB) || - symtab->sh_link == 0 || - symtab->sh_link >= nshdrs) - continue; - - strtab = (Shdr *)(shbase + - symtab->sh_link * ehdr.e_shentsize); - - if (strtab->sh_type != SHT_STRTAB) - continue; - - if (v != NULL && i < nv - 2) { - sz = MAX(symtab->sh_size, - strtab->sh_size); - if (sz > datasz && - sz <= elf_datasz_max) { - if (data != NULL) - kmem_free(data, datasz); - - datasz = sz; - data = kmem_alloc(datasz, - KM_SLEEP); - } - - if (symtab->sh_type == SHT_DYNSYM) { - v[i].sh_name = shstrtab_ndx( - &shstrtab, STR_DYNSYM); - v[i + 1].sh_name = shstrtab_ndx( - &shstrtab, STR_DYNSTR); - } else { - v[i].sh_name = shstrtab_ndx( - &shstrtab, STR_SYMTAB); - v[i + 1].sh_name = shstrtab_ndx( - &shstrtab, STR_STRTAB); - } - - v[i].sh_type = symtab->sh_type; - v[i].sh_addr = symtab->sh_addr; - if (ehdr.e_type == ET_DYN || - v[i].sh_addr == 0) - v[i].sh_addr += - (Addr)(uintptr_t)saddr; - v[i].sh_addralign = - symtab->sh_addralign; - *doffsetp = roundup(*doffsetp, - v[i].sh_addralign); - v[i].sh_offset = *doffsetp; - v[i].sh_size = symtab->sh_size; - v[i].sh_link = i + 1; - v[i].sh_entsize = symtab->sh_entsize; - v[i].sh_info = symtab->sh_info; - - copy_scn(symtab, mvp, &v[i], vp, - doffsetp, data, datasz, credp, - rlimit); - - v[i + 1].sh_type = SHT_STRTAB; - v[i + 1].sh_flags = SHF_STRINGS; - v[i + 1].sh_addr = symtab->sh_addr; - if (ehdr.e_type == ET_DYN || - v[i + 1].sh_addr == 0) - v[i + 1].sh_addr += - (Addr)(uintptr_t)saddr; - v[i + 1].sh_addralign = - strtab->sh_addralign; - *doffsetp = roundup(*doffsetp, - v[i + 1].sh_addralign); - v[i + 1].sh_offset = *doffsetp; - v[i + 1].sh_size = strtab->sh_size; - - copy_scn(strtab, mvp, &v[i + 1], vp, - doffsetp, data, datasz, credp, - rlimit); - } + count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain, + &shstrtab); - if (symtab->sh_type == SHT_SYMTAB) - symtab_ndx = i; - i += 2; - } - } - - kmem_free(shstrbase, shstrsize); - kmem_free(shbase, shsize); + ASSERT(count <= remain); + ASSERT(v == NULL || (idx + count) < nv); + remain -= count; + idx += count; lastvp = mvp; } if (v == NULL) { - if (i == 1) + if (idx == 1) { *nshdrsp = 0; - else - *nshdrsp = i + 1; - goto done; + } else { + /* Include room for the shrstrtab at the end */ + *nshdrsp = idx + 1; + } + return (0); } - if (i != nv - 1) { + if (idx != nv - 1) { cmn_err(CE_WARN, "elfcore: core dump failed for " - "process %d; address space is changing", p->p_pid); - error = EIO; - goto done; + "process %d; address space is changing", + ctx->ecc_p->p_pid); + return (EIO); } - v[i].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB); - v[i].sh_size = shstrtab_size(&shstrtab); - v[i].sh_addralign = 1; - *doffsetp = roundup(*doffsetp, v[i].sh_addralign); - v[i].sh_offset = *doffsetp; - v[i].sh_flags = SHF_STRINGS; - v[i].sh_type = SHT_STRTAB; - - if (v[i].sh_size > datasz) { - if (data != NULL) - kmem_free(data, datasz); - - datasz = v[i].sh_size; - data = kmem_alloc(datasz, - KM_SLEEP); + v[idx].sh_name = shstrtab_ndx(&shstrtab, STR_SHSTRTAB); + v[idx].sh_size = shstrtab_size(&shstrtab); + v[idx].sh_addralign = 1; + v[idx].sh_offset = ctx->ecc_doffset; + v[idx].sh_flags = SHF_STRINGS; + v[idx].sh_type = SHT_STRTAB; + + elf_ctx_resize_scratch(ctx, 
v[idx].sh_size); + VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size); + shstrtab_dump(&shstrtab, ctx->ecc_buf); + + error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset, + ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp); + if (error == 0) { + ctx->ecc_doffset += v[idx].sh_size; } - shstrtab_dump(&shstrtab, data); - - if ((error = core_write(vp, UIO_SYSSPACE, *doffsetp, - data, v[i].sh_size, rlimit, credp)) != 0) - goto done; - - *doffsetp += v[i].sh_size; - -done: - if (data != NULL) - kmem_free(data, datasz); - return (error); } @@ -1919,27 +2259,30 @@ int elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content) { - offset_t poffset, soffset; - Off doffset; - int error, i, nphdrs, nshdrs; - int overflow = 0; + u_offset_t poffset, soffset, doffset; + int error; + uint_t i, nphdrs, nshdrs; struct seg *seg; struct as *as = p->p_as; - union { - Ehdr ehdr; - Phdr phdr[1]; - Shdr shdr[1]; - } *bigwad; - size_t bigsize; - size_t phdrsz, shdrsz; + void *bigwad; + size_t bigsize, phdrsz, shdrsz; Ehdr *ehdr; - Phdr *v; - caddr_t brkbase; - size_t brksize; - caddr_t stkbase; - size_t stksize; - int ntries = 0; + Phdr *phdr; + Shdr shdr0; + caddr_t brkbase, stkbase; + size_t brksize, stksize; + boolean_t overflowed = B_FALSE, retried = B_FALSE; klwp_t *lwp = ttolwp(curthread); + elf_core_ctx_t ctx = { + .ecc_vp = vp, + .ecc_p = p, + .ecc_credp = credp, + .ecc_rlimit = rlimit, + .ecc_content = content, + .ecc_doffset = 0, + .ecc_buf = NULL, + .ecc_bufsz = 0 + }; top: /* @@ -1957,28 +2300,32 @@ top: */ nshdrs = 0; if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB)) { - (void) process_scns(content, p, credp, NULL, NULL, 0, 0, - NULL, &nshdrs); + VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs)); } AS_LOCK_EXIT(as); - ASSERT(nshdrs == 0 || nshdrs > 1); - /* - * The core file contents may required zero section headers, but if + * The core file contents may require zero section headers, but if * we overflow the 16 bits allotted to the program header count in * the ELF header, we'll need that program header at index zero. */ - if (nshdrs == 0 && nphdrs >= PN_XNUM) + if (nshdrs == 0 && nphdrs >= PN_XNUM) { nshdrs = 1; + } + /* + * Allocate a buffer which is sized adequately to hold the ehdr, phdrs + * or shdrs needed to produce the core file. It is used for the three + * tasks sequentially, not simultaneously, so it does not need space + * for all three data at once, only the largest one. + */ + VERIFY(nphdrs >= 2); phdrsz = nphdrs * sizeof (Phdr); shdrsz = nshdrs * sizeof (Shdr); - - bigsize = MAX(sizeof (*bigwad), MAX(phdrsz, shdrsz)); + bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz)); bigwad = kmem_alloc(bigsize, KM_SLEEP); - ehdr = &bigwad->ehdr; + ehdr = (Ehdr *)bigwad; bzero(ehdr, sizeof (*ehdr)); ehdr->e_ident[EI_MAG0] = ELFMAG0; @@ -2014,6 +2361,11 @@ top: #endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */ + poffset = sizeof (Ehdr); + soffset = sizeof (Ehdr) + phdrsz; + doffset = sizeof (Ehdr) + phdrsz + shdrsz; + bzero(&shdr0, sizeof (shdr0)); + /* * If the count of program headers or section headers or the index * of the section string table can't fit in the mere 16 bits @@ -2021,50 +2373,52 @@ top: * extended formats and put the real values in the section header * as index 0. 
*/ - ehdr->e_version = EV_CURRENT; - ehdr->e_ehsize = sizeof (Ehdr); - - if (nphdrs >= PN_XNUM) + if (nphdrs >= PN_XNUM) { ehdr->e_phnum = PN_XNUM; - else + shdr0.sh_info = nphdrs; + } else { ehdr->e_phnum = (unsigned short)nphdrs; - - ehdr->e_phoff = sizeof (Ehdr); - ehdr->e_phentsize = sizeof (Phdr); + } if (nshdrs > 0) { - if (nshdrs >= SHN_LORESERVE) + if (nshdrs >= SHN_LORESERVE) { ehdr->e_shnum = 0; - else + shdr0.sh_size = nshdrs; + } else { ehdr->e_shnum = (unsigned short)nshdrs; + } - if (nshdrs - 1 >= SHN_LORESERVE) + if (nshdrs - 1 >= SHN_LORESERVE) { ehdr->e_shstrndx = SHN_XINDEX; - else + shdr0.sh_link = nshdrs - 1; + } else { ehdr->e_shstrndx = (unsigned short)(nshdrs - 1); + } - ehdr->e_shoff = ehdr->e_phoff + ehdr->e_phentsize * nphdrs; + ehdr->e_shoff = soffset; ehdr->e_shentsize = sizeof (Shdr); } + ehdr->e_version = EV_CURRENT; + ehdr->e_ehsize = sizeof (Ehdr); + ehdr->e_phoff = poffset; + ehdr->e_phentsize = sizeof (Phdr); + if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr, - sizeof (Ehdr), rlimit, credp)) + sizeof (Ehdr), rlimit, credp)) { goto done; + } - poffset = sizeof (Ehdr); - soffset = sizeof (Ehdr) + phdrsz; - doffset = sizeof (Ehdr) + phdrsz + shdrsz; - - v = &bigwad->phdr[0]; - bzero(v, phdrsz); + phdr = (Phdr *)bigwad; + bzero(phdr, phdrsz); - setup_old_note_header(&v[0], p); - v[0].p_offset = doffset = roundup(doffset, sizeof (Word)); - doffset += v[0].p_filesz; + setup_old_note_header(&phdr[0], p); + phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word)); + doffset += phdr[0].p_filesz; - setup_note_header(&v[1], p); - v[1].p_offset = doffset = roundup(doffset, sizeof (Word)); - doffset += v[1].p_filesz; + setup_note_header(&phdr[1], p); + phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word)); + doffset += phdr[1].p_filesz; mutex_enter(&p->p_lock); @@ -2096,21 +2450,23 @@ top: prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr); prot &= PROT_READ | PROT_WRITE | PROT_EXEC; - if ((size = (size_t)(naddr - saddr)) == 0) - continue; - if (i == nphdrs) { - overflow++; + if ((size = (size_t)(naddr - saddr)) == 0) { + ASSERT(tmp == NULL); continue; + } else if (i == nphdrs) { + pr_getprot_done(&tmp); + overflowed = B_TRUE; + break; } - v[i].p_type = PT_LOAD; - v[i].p_vaddr = (Addr)(uintptr_t)saddr; - v[i].p_memsz = size; + phdr[i].p_type = PT_LOAD; + phdr[i].p_vaddr = (Addr)(uintptr_t)saddr; + phdr[i].p_memsz = size; if (prot & PROT_READ) - v[i].p_flags |= PF_R; + phdr[i].p_flags |= PF_R; if (prot & PROT_WRITE) - v[i].p_flags |= PF_W; + phdr[i].p_flags |= PF_W; if (prot & PROT_EXEC) - v[i].p_flags |= PF_X; + phdr[i].p_flags |= PF_X; /* * Figure out which mappings to include in the core. 
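[Editorial sketch, not part of the patch] The header setup above writes PN_XNUM into e_phnum when the mapping count overflows the 16-bit field and stashes the true counts in the zeroth section header (shdr0.sh_info for program headers, sh_size and sh_link for section headers), which is later copied into the emitted shdr table. A minimal reader-side illustration of how the real program header count of such a core is recovered, per the ELF gABI extended-numbering rules; it assumes a 64-bit core and omits class and byte-order checks:

#include <sys/types.h>
#include <sys/elf.h>
#include <unistd.h>

/*
 * Return the true program header count of a core file, honoring extended
 * ELF numbering.  Returns 0 on a short or failed read.
 */
static size_t
core_phnum(int fd)
{
	Elf64_Ehdr ehdr;
	Elf64_Shdr shdr0;

	if (pread(fd, &ehdr, sizeof (ehdr), 0) != sizeof (ehdr))
		return (0);

	if (ehdr.e_phnum != PN_XNUM)
		return (ehdr.e_phnum);

	/* Extended numbering: the real count lives in shdr[0].sh_info. */
	if (ehdr.e_shoff == 0 ||
	    pread(fd, &shdr0, sizeof (shdr0), (off_t)ehdr.e_shoff) !=
	    sizeof (shdr0))
		return (0);

	return (shdr0.sh_info);
}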
@@ -2172,20 +2528,23 @@ top: } doffset = roundup(doffset, sizeof (Word)); - v[i].p_offset = doffset; - v[i].p_filesz = size; + phdr[i].p_offset = doffset; + phdr[i].p_filesz = size; doffset += size; exclude: i++; } - ASSERT(tmp == NULL); + VERIFY(tmp == NULL); + if (overflowed) + break; } AS_LOCK_EXIT(as); - if (overflow || i != nphdrs) { - if (ntries++ == 0) { + if (overflowed || i != nphdrs) { + if (!retried) { + retried = B_TRUE; + overflowed = B_FALSE; kmem_free(bigwad, bigsize); - overflow = 0; goto top; } cmn_err(CE_WARN, "elfcore: core dump failed for " @@ -2195,23 +2554,25 @@ exclude: } if ((error = core_write(vp, UIO_SYSSPACE, poffset, - v, phdrsz, rlimit, credp)) != 0) + phdr, phdrsz, rlimit, credp)) != 0) { goto done; + } - if ((error = write_old_elfnotes(p, sig, vp, v[0].p_offset, rlimit, - credp)) != 0) + if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit, + credp)) != 0) { goto done; - - if ((error = write_elfnotes(p, sig, vp, v[1].p_offset, rlimit, - credp, content)) != 0) + } + if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit, + credp, content)) != 0) { goto done; + } for (i = 2; i < nphdrs; i++) { prkillinfo_t killinfo; sigqueue_t *sq; int sig, j; - if (v[i].p_filesz == 0) + if (phdr[i].p_filesz == 0) continue; /* @@ -2222,8 +2583,8 @@ exclude: * this from mappings that were excluded due to the core file * content settings. */ - if ((error = core_seg(p, vp, v[i].p_offset, - (caddr_t)(uintptr_t)v[i].p_vaddr, v[i].p_filesz, + if ((error = core_seg(p, vp, phdr[i].p_offset, + (caddr_t)(uintptr_t)phdr[i].p_vaddr, phdr[i].p_filesz, rlimit, credp)) == 0) { continue; } @@ -2236,14 +2597,14 @@ exclude: * bytes. This undocumented interface will let us * understand the nature of the failure. */ - (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset, + (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset, &error, sizeof (error), rlimit, credp); - v[i].p_filesz = 0; - v[i].p_flags |= PF_SUNW_FAILURE; + phdr[i].p_filesz = 0; + phdr[i].p_flags |= PF_SUNW_FAILURE; if ((error = core_write(vp, UIO_SYSSPACE, - poffset + sizeof (v[i]) * i, &v[i], sizeof (v[i]), - rlimit, credp)) != 0) + poffset + sizeof (Phdr) * i, &phdr[i], + sizeof (Phdr), rlimit, credp)) != 0) goto done; continue; @@ -2285,15 +2646,15 @@ exclude: } #endif - (void) core_write(vp, UIO_SYSSPACE, v[i].p_offset, + (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset, &killinfo, sizeof (killinfo), rlimit, credp); /* * For the segment on which we took the signal, indicate that * its data now refers to a siginfo. */ - v[i].p_filesz = 0; - v[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED | + phdr[i].p_filesz = 0; + phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED | PF_SUNW_SIGINFO; /* @@ -2301,50 +2662,46 @@ exclude: * is due to a signal. */ for (j = i + 1; j < nphdrs; j++) { - v[j].p_filesz = 0; - v[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED; + phdr[j].p_filesz = 0; + phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED; } /* * Finally, write out our modified program headers. 
*/ if ((error = core_write(vp, UIO_SYSSPACE, - poffset + sizeof (v[i]) * i, &v[i], - sizeof (v[i]) * (nphdrs - i), rlimit, credp)) != 0) + poffset + sizeof (Phdr) * i, &phdr[i], + sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) { goto done; + } break; } if (nshdrs > 0) { - bzero(&bigwad->shdr[0], shdrsz); - - if (nshdrs >= SHN_LORESERVE) - bigwad->shdr[0].sh_size = nshdrs; - - if (nshdrs - 1 >= SHN_LORESERVE) - bigwad->shdr[0].sh_link = nshdrs - 1; - - if (nphdrs >= PN_XNUM) - bigwad->shdr[0].sh_info = nphdrs; + Shdr *shdr = (Shdr *)bigwad; + bzero(shdr, shdrsz); if (nshdrs > 1) { + ctx.ecc_doffset = doffset; AS_LOCK_ENTER(as, RW_WRITER); - if ((error = process_scns(content, p, credp, vp, - &bigwad->shdr[0], nshdrs, rlimit, &doffset, - NULL)) != 0) { - AS_LOCK_EXIT(as); + error = elf_process_scns(&ctx, shdr, nshdrs, NULL); + AS_LOCK_EXIT(as); + if (error != 0) { goto done; } - AS_LOCK_EXIT(as); } + /* Copy any extended format data destined for the first shdr */ + bcopy(&shdr0, shdr, sizeof (shdr0)); - if ((error = core_write(vp, UIO_SYSSPACE, soffset, - &bigwad->shdr[0], shdrsz, rlimit, credp)) != 0) - goto done; + error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz, + rlimit, credp); } done: + if (ctx.ecc_bufsz != 0) { + kmem_free(ctx.ecc_buf, ctx.ecc_bufsz); + } kmem_free(bigwad, bigsize); return (error); } @@ -2369,9 +2726,9 @@ static struct modlexec modlexec = { #ifdef _LP64 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, - intpdata_t *idatap, int level, long *execsz, + intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action); + int *brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/elf/elf_impl.h b/usr/src/uts/common/exec/elf/elf_impl.h index 010d5e6256..504cf84dd2 100644 --- a/usr/src/uts/common/exec/elf/elf_impl.h +++ b/usr/src/uts/common/exec/elf/elf_impl.h @@ -22,12 +22,13 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2019 Joyent, Inc. 
+ */ #ifndef _ELF_ELF_IMPL_H #define _ELF_ELF_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -71,6 +72,17 @@ typedef struct { char name[8]; } Note; +typedef struct { + vnode_t *ecc_vp; + proc_t *ecc_p; + cred_t *ecc_credp; + rlim64_t ecc_rlimit; + core_content_t ecc_content; + u_offset_t ecc_doffset; + void *ecc_buf; + size_t ecc_bufsz; +} elf_core_ctx_t; + #ifdef _ELF32_COMPAT /* * These are defined only for the 32-bit compatibility @@ -79,6 +91,7 @@ typedef struct { #define elfexec elf32exec #define elfnote elf32note #define elfcore elf32core +#define elfreadhdr elf32readhdr #define mapexec_brand mapexec32_brand #define setup_note_header setup_note_header32 #define write_elfnotes write_elfnotes32 diff --git a/usr/src/uts/common/exec/elf/elf_notes.c b/usr/src/uts/common/exec/elf/elf_notes.c index fbc87fea66..6a024d0d1f 100644 --- a/usr/src/uts/common/exec/elf/elf_notes.c +++ b/usr/src/uts/common/exec/elf/elf_notes.c @@ -337,11 +337,13 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, /* open file table */ + mutex_enter(&p->p_lock); vroot = PTOU(p)->u_rdir; if (vroot == NULL) vroot = rootdir; VN_HOLD(vroot); + mutex_exit(&p->p_lock); fip = P_FINFO(p); diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c index 269ba86b1b..388d913ea0 100644 --- a/usr/src/uts/common/exec/intp/intp.c +++ b/usr/src/uts/common/exec/intp/intp.c @@ -22,6 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -47,6 +48,7 @@ #include <sys/kmem.h> #include <sys/note.h> #include <sys/sdt.h> +#include <sys/brand.h> /* * This is the loadable module wrapper. @@ -54,7 +56,7 @@ #include <sys/modctl.h> extern int intpexec(struct vnode *, struct execa *, struct uarg *, - struct intpdata *, int, long *, int, caddr_t, struct cred *, int); + struct intpdata *, int, size_t *, int, caddr_t, struct cred *, int *); static struct execsw esw = { intpmagicstr, @@ -126,13 +128,20 @@ getintphead(struct vnode *vp, struct intpdata *idatap) *cp = '\0'; /* - * Locate the beginning and end of the interpreter name. - * In addition to the name, one additional argument may - * optionally be included here, to be prepended to the - * arguments provided on the command line. Thus, for - * example, you can say + * Locate the beginning and end of the interpreter name. Historically, + * for illumos and its predecessors, in addition to the name, one + * additional argument may optionally be included here, to be prepended + * to the arguments provided on the command line. Thus, for example, + * you can say * * #! /usr/bin/awk -f + * + * However, handling of interpreter arguments varies across operating + * systems and other systems allow more than one argument. In + * particular, Linux allows more than one and delivers all arguments + * as a single string (argv[1] is "-arg1 -arg2 ..."). We support this + * style of argument handling as a brand-specific option (setting + * b_intp_parse_arg to B_FALSE). 
*/ for (cp = &linep[2]; *cp == ' '; cp++) ; @@ -151,9 +160,12 @@ getintphead(struct vnode *vp, struct intpdata *idatap) idatap->intp_arg[0] = NULL; else { idatap->intp_arg[0] = cp; - while (*cp && *cp != ' ') - cp++; - *cp = '\0'; + if (!PROC_IS_BRANDED(curproc) || + BROP(curproc)->b_intp_parse_arg) { + while (*cp && *cp != ' ') + cp++; + *cp = '\0'; + } } } return (0); @@ -184,13 +196,12 @@ intpexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { - _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; int error = 0; struct intpdata idata; @@ -281,7 +292,7 @@ intpexec( } error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, - EBA_NONE); + brand_action); if (!error) { /* diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c index fdc327dcbb..a61a6f105f 100644 --- a/usr/src/uts/common/exec/java/java.c +++ b/usr/src/uts/common/exec/java/java.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ /* @@ -84,8 +85,8 @@ char *jexec_arg = "-jar"; /*ARGSUSED3*/ static int javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, - struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t execfile, cred_t *cred, int brand_action) + struct intpdata *idatap, int level, size_t *execsz, int setid, + caddr_t execfile, cred_t *cred, int *brand_action) { struct intpdata idata; int error; diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c index ee5060a07e..7b653a4c98 100644 --- a/usr/src/uts/common/exec/shbin/shbin.c +++ b/usr/src/uts/common/exec/shbin/shbin.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -54,11 +55,11 @@ shbinexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred, - int brand_action); + int *brand_action); #define SHBIN_CNTL(x) ((x)&037) #define SHBINMAGIC_LEN 4 @@ -158,11 +159,11 @@ shbinexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index a426eeaf10..ce08e3697b 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targetting. 
+ */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. 
*/ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. + */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... 
+ */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == 0) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. + */ + if (SDEVTOV(dv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c new file mode 100644 index 0000000000..6e1618dc3c --- /dev/null +++ b/usr/src/uts/common/fs/dev/sdev_plugin.c @@ -0,0 +1,948 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +/* + * Dynamic directory plugin interface for sdev. + * + * The sdev plugin interfaces provides a means for a dynamic directory based on + * in-kernel state to be simply created. Traditionally, dynamic directories were + * built into sdev itself. While these legacy plugins are useful, it makes more + * sense for these pieces of functionality to live with the individual drivers. 
+ * + * The plugin interface requires folks to implement three interfaces and + * provides a series of callbacks that can be made in the context of those + * interfaces to interrogate the sdev_node_t without having to leak + * implementation details of the sdev_node_t. These interfaces are: + * + * o spo_validate + * + * Given a particular node, answer the question as to whether or not this + * entry is still valid. Here, plugins should use the name and the dev_t + * associated with the node to verify that it matches something that still + * exists. + * + * o spo_filldir + * + * Fill all the entries inside of a directory. Note that some of these entries + * may already exist. + * + * o spo_inactive + * + * The given node is no longer being used. This allows the consumer to + * potentially tear down anything that was being held open related to this. + * Note that this only fires when the given sdev_node_t becomes a zombie. + * + * During these callbacks a consumer is not allowed to register or unregister a + * plugin, especially their own. They may call the sdev_ctx style functions. All + * callbacks fire in a context where blocking is allowed (eg. the spl is below + * LOCK_LEVEL). + * + * When a plugin is added, we create its directory in the global zone. By doing + * that, we ensure that something isn't already there and that nothing else can + * come along and try and create something without our knowledge. We only have + * to create it in the GZ and not for all other instances of sdev because an + * instance of sdev that isn't at /dev does not have dynamic directories, and + * second, any instance of sdev present in a non-global zone cannot create + * anything, therefore we know that by it not being in the global zone's + * instance of sdev that we're good to go. + * + * Lock Ordering + * ------------- + * + * The global sdev_plugin_lock must be held before any of the individual + * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held, + * it is not legal to take any holds on any sdev_node_t or to grab the + * sdev_node_t`contents_lock in any way. 
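+ *
+ * Example
+ * -------
+ *
+ * As an illustrative sketch only (the "foo" driver, foo_major, and the
+ * foo_validate/foo_inactive callbacks referenced below are hypothetical and
+ * not part of this change), a consumer supplies the three operations and
+ * registers once, typically at module load or attach time:
+ *
+ *	static int
+ *	foo_filldir(sdev_ctx_t ctx)
+ *	{
+ *		return (sdev_plugin_mknod(ctx, "foo0", S_IFCHR | 0600,
+ *		    makedevice(foo_major, 0)));
+ *	}
+ *
+ *	static sdev_plugin_ops_t foo_ops = {
+ *		.spo_version = 1,
+ *		.spo_flags = 0,
+ *		.spo_validate = foo_validate,
+ *		.spo_filldir = foo_filldir,
+ *		.spo_inactive = foo_inactive
+ *	};
+ *
+ *	int err;
+ *	sdev_plugin_hdl_t hdl;
+ *
+ *	if ((hdl = sdev_plugin_register("foo", &foo_ops, &err)) == NULL)
+ *		return (err);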
+ */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/fs/sdev_impl.h> +#include <sys/fs/sdev_plugin.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/list.h> +#include <sys/ctype.h> + +kmutex_t sdev_plugin_lock; +list_t sdev_plugin_list; +kmem_cache_t *sdev_plugin_cache; +struct vnodeops *sdev_plugin_vnops; + +#define SDEV_PLUGIN_NAMELEN 64 + +typedef struct sdev_plugin { + list_node_t sp_link; + char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */ + int sp_nflags; /* E */ + struct vnodeops *sp_vnops; /* E */ + sdev_plugin_ops_t *sp_pops; /* E */ + boolean_t sp_islegacy; /* E */ + int (*sp_lvtor)(sdev_node_t *); /* E */ + kmutex_t sp_lock; /* Protects everything below */ + kcondvar_t sp_nodecv; + size_t sp_nnodes; +} sdev_plugin_t; + +/* ARGSUSED */ +static int +sdev_plugin_cache_constructor(void *buf, void *arg, int tags) +{ + sdev_plugin_t *spp = buf; + mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0); + cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +sdev_plugin_cache_destructor(void *buf, void *arg) +{ + sdev_plugin_t *spp = buf; + cv_destroy(&spp->sp_nodecv); + mutex_destroy(&spp->sp_lock); +} + +enum vtype +sdev_ctx_vtype(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_vnode->v_type); +} + +const char * +sdev_ctx_path(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_path); +} + +const char * +sdev_ctx_name(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_name); +} + +int +sdev_ctx_minor(sdev_ctx_t ctx, minor_t *minorp) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + ASSERT(minorp != NULL); + if (sdp->sdev_vnode->v_type == VCHR || + sdp->sdev_vnode->v_type == VBLK) { + *minorp = getminor(sdp->sdev_vnode->v_rdev); + return (0); + } + + return (ENODEV); +} + +/* + * Currently we only support psasing through a single flag -- SDEV_IS_GLOBAL. + */ +sdev_ctx_flags_t +sdev_ctx_flags(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_flags & SDEV_GLOBAL); +} + +/* + * Use the same rules as zones for a name. isalphanum + '-', '_', and '.'. 
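+ * For instance, a name such as "foo" or "net.0" is accepted, while one
+ * containing '/', whitespace, or lacking a NUL terminator within the
+ * supplied buffer length is rejected.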
+ */ +static int +sdev_plugin_name_isvalid(const char *c, int buflen) +{ + int i; + + for (i = 0; i < buflen; i++, c++) { + if (*c == '\0') + return (1); + + if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.') + return (0); + } + /* Never found a null terminator */ + return (0); +} + +static int +sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name, + vattr_t *vap) +{ + int ret; + sdev_node_t *svp; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + ASSERT(spp != NULL); + svp = sdev_cache_lookup(sdvp, name); + if (svp != NULL) { + SDEV_SIMPLE_RELE(svp); + return (EEXIST); + } + + ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred, + SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(svp); + + return (0); +} + +/* + * Plugin node creation callbacks + */ +int +sdev_plugin_mkdir(sdev_ctx_t ctx, char *name) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(sdvp->sdev_private != NULL); + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +int +sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + mode_t type = mode & S_IFMT; + mode_t access = mode & S_IAMB; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + /* + * Ensure only type and user/group/other permission bits are present. + * Do not allow setuid, setgid, etc. + */ + if ((mode & ~(S_IFMT | S_IAMB)) != 0) + return (EINVAL); + + /* Disallow types other than character and block devices */ + if (type != S_IFCHR && type != S_IFBLK) + return (EINVAL); + + /* Disallow execute bits */ + if ((access & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0) + return (EINVAL); + + /* No bits other than 0666 in access */ + ASSERT((access & + ~(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) == 0); + + /* Default to relatively safe access bits if none specified. */ + if (access == 0) + access = 0600; + + ASSERT(sdvp->sdev_private != NULL); + + vap = *sdev_getdefault_attr(type == S_IFCHR ? 
VCHR : VBLK); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + vap.va_rdev = dev; + vap.va_mode = type | access; + + /* Despite the similar name, this is in fact a different function */ + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +static int +sdev_plugin_validate(sdev_node_t *sdp) +{ + int ret; + sdev_plugin_t *spp; + + ASSERT(sdp->sdev_private != NULL); + spp = sdp->sdev_private; + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + rw_enter(&sdp->sdev_contents, RW_READER); + ret = spp->sp_pops->spo_validate((uintptr_t)sdp); + rw_exit(&sdp->sdev_contents); + return (ret); +} + +static void +sdev_plugin_validate_dir(sdev_node_t *sdvp) +{ + int ret; + sdev_node_t *svp, *next; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) { + + next = SDEV_NEXT_ENTRY(sdvp, svp); + ASSERT(svp->sdev_state != SDEV_ZOMBIE); + /* skip nodes that aren't ready */ + if (svp->sdev_state == SDEV_INIT) + continue; + + switch (sdev_plugin_validate(svp)) { + case SDEV_VTOR_VALID: + case SDEV_VTOR_SKIP: + continue; + case SDEV_VTOR_INVALID: + case SDEV_VTOR_STALE: + break; + } + + SDEV_HOLD(svp); + + /* + * Clean out everything underneath this node before we + * remove it. + */ + if (svp->sdev_vnode->v_type == VDIR) { + ret = sdev_cleandir(svp, NULL, 0); + ASSERT(ret == 0); + } + /* remove the cache node */ + (void) sdev_cache_update(sdvp, &svp, svp->sdev_name, + SDEV_CACHE_DELETE); + SDEV_RELE(svp); + } +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, + int *eofp, caller_context_t *ct_unused, int flags_unused) +{ + int ret; + sdev_node_t *sdvp = VTOSDEV(dvp); + sdev_plugin_t *spp; + + ASSERT(RW_READ_HELD(&sdvp->sdev_contents)); + + /* Sanity check we're not a zombie before we do anyting else */ + if (sdvp->sdev_state == SDEV_ZOMBIE) + return (ENOENT); + + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + if (uiop->uio_offset == 0) { + /* + * We upgrade to a write lock and grab the plugin's lock along + * the way. We're almost certainly going to get creation + * callbacks, so this is the only safe way to go. + */ + if (rw_tryupgrade(&sdvp->sdev_contents) == 0) { + rw_exit(&sdvp->sdev_contents); + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_downgrade(&sdvp->sdev_contents); + return (ENOENT); + } + } + + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_downgrade(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + } + + return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); +} + +/* + * If we don't have a callback function that returns a failure, then sdev will + * try to create a node for us which violates all of our basic assertions. To + * work around that we create our own callback for devname_lookup_func which + * always returns ENOENT as at this point either it was created with the filldir + * callback or it was not. 
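+ * (By the time devname_lookup_func() falls back to this callback,
+ * sdev_plugin_vop_lookup() has already run spo_filldir, so a miss here
+ * really is ENOENT.)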
+ */ +/*ARGSUSED*/ +static int +sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred, + void *unused, char *unused2) +{ + return (ENOENT); +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + int ret; + sdev_node_t *sdvp; + sdev_plugin_t *spp; + + /* execute access is required to search the directory */ + if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (ret); + + sdvp = VTOSDEV(dvp); + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + /* + * Go straight for the write lock. + */ + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_exit(&sdvp->sdev_contents); + return (ENOENT); + } + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_exit(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + + return (devname_lookup_func(sdvp, nm, vpp, cred, + sdev_plugin_vop_lookup_cb, SDEV_VATTR)); +} + +/* + * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes + * to zero, but isn't necessairily a zombie yet. As such, to make things easier + * for users, we only fire the inactive callback when the node becomes a zombie + * and thus will be torn down here. + */ +static void +sdev_plugin_vop_inactive_cb(struct vnode *dvp) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + + rw_enter(&sdp->sdev_contents, RW_READER); + if (sdp->sdev_state != SDEV_ZOMBIE) { + rw_exit(&sdp->sdev_contents); + return; + } + spp->sp_pops->spo_inactive((uintptr_t)sdp); + mutex_enter(&spp->sp_lock); + VERIFY(spp->sp_nnodes > 0); + spp->sp_nnodes--; + cv_signal(&spp->sp_nodecv); + mutex_exit(&spp->sp_lock); + rw_exit(&sdp->sdev_contents); +} + +/*ARGSUSED*/ +static void +sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred, + caller_context_t *ct) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + ASSERT(sdp->sdev_private != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb); +} + +const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = { + VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir }, + VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup }, + VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive }, + VOPNAME_CREATE, { .error = fs_nosys }, + VOPNAME_REMOVE, { .error = fs_nosys }, + VOPNAME_MKDIR, { .error = fs_nosys }, + VOPNAME_RMDIR, { .error = fs_nosys }, + VOPNAME_SYMLINK, { .error = fs_nosys }, + VOPNAME_SETSECATTR, { .error = fs_nosys }, + NULL, NULL +}; + +/* + * construct a new template with overrides from vtab + */ +static fs_operation_def_t * +sdev_merge_vtab(const fs_operation_def_t tab[]) +{ + fs_operation_def_t *new; + const fs_operation_def_t *tab_entry; + + /* make a copy of standard vnode ops table */ + new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); + bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); + + /* replace the overrides from tab */ + for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { + fs_operation_def_t *std_entry = new; + while (std_entry->name) { + if (strcmp(tab_entry->name, std_entry->name) == 0) { + std_entry->func = tab_entry->func; 
+ break; + } + std_entry++; + } + } + + return (new); +} + +/* free memory allocated by sdev_merge_vtab */ +static void +sdev_free_vtab(fs_operation_def_t *new) +{ + kmem_free(new, sdev_vnodeops_tbl_size); +} + +/* + * Register a new plugin. + */ +sdev_plugin_hdl_t +sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp) +{ + char buf[sizeof ("dev")] = ""; + struct pathname pn = { 0 }; + sdev_plugin_t *spp, *iter; + vnode_t *vp, *nvp; + sdev_node_t *sdp, *slp; + timestruc_t now; + struct vattr vap; + int ret, err; + + /* + * Some consumers don't care about why they failed. To keep the code + * simple, we'll just pretend they gave us something. + */ + if (errp == NULL) + errp = &err; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_version != 1) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_validate == NULL || ops->spo_filldir == NULL || + ops->spo_inactive == NULL) { + *errp = EINVAL; + return (NULL); + } + + if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) { + *errp = EINVAL; + return (NULL); + } + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN); + + spp->sp_pops = ops; + spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR; + if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE) + spp->sp_nflags |= SDEV_NO_NCACHE; + if (ops->spo_flags & SDEV_PLUGIN_SUBDIR) + spp->sp_nflags |= SDEV_SUBDIR; + spp->sp_vnops = sdev_plugin_vnops; + spp->sp_islegacy = B_FALSE; + spp->sp_lvtor = NULL; + spp->sp_nnodes = 0; + + /* + * Make sure our /dev entry is unique and install it. We also need to + * go through and grab the sdev root node as we cannot grab any sdev + * node locks once we've grabbed the sdev_plugin_lock. We effectively + * assert that if a directory is not present in the GZ's /dev, then it + * doesn't exist in any of the local zones. + * + * Note that we may be in NGZ context: during a prof_filldir(".../dev/") + * enumeration, for example. So we have to dig as deep as lookuppnvp() + * to make sure we really get to the global /dev (i.e. escape both + * CRED() and ->u_rdir). + */ + pn_get_buf("dev", UIO_SYSSPACE, &pn, buf, sizeof (buf)); + VN_HOLD(rootdir); + ret = lookuppnvp(&pn, NULL, NO_FOLLOW, NULLVPP, + &vp, rootdir, rootdir, kcred); + + if (ret != 0) { + *errp = ret; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + /* Make sure we have the real vnode */ + if (VOP_REALVP(vp, &nvp, NULL) == 0) { + VN_HOLD(nvp); + VN_RELE(vp); + vp = nvp; + nvp = NULL; + } + VERIFY(vp->v_op == sdev_vnodeops); + sdp = VTOSDEV(vp); + rw_enter(&sdp->sdev_contents, RW_WRITER); + slp = sdev_cache_lookup(sdp, spp->sp_name); + if (slp != NULL) { + SDEV_RELE(slp); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + + mutex_enter(&sdev_plugin_lock); + for (iter = list_head(&sdev_plugin_list); iter != NULL; + iter = list_next(&sdev_plugin_list, iter)) { + if (strcmp(spp->sp_name, iter->sp_name) == 0) { + mutex_exit(&sdev_plugin_lock); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + } + + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + /* + * Now go ahead and create the top level directory for the global zone. 
+ */ + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap); + + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + + *errp = 0; + + return ((sdev_plugin_hdl_t)spp); +} + +static void +sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg) +{ + sdev_plugin_t *spp = arg; + sdev_node_t *sdp; + + rw_enter(&rdp->sdev_contents, RW_WRITER); + sdp = sdev_cache_lookup(rdp, spp->sp_name); + /* If it doesn't exist, we're done here */ + if (sdp == NULL) { + rw_exit(&rdp->sdev_contents); + return; + } + + /* + * We first delete the directory before recursively marking everything + * else stale. This ordering should ensure that we don't accidentally + * miss anything. + */ + sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE); + sdev_stale(sdp); + SDEV_RELE(sdp); + rw_exit(&rdp->sdev_contents); +} + +int sdev_plugin_unregister_allowed; + +/* + * Remove a plugin. This will block until everything has become a zombie, thus + * guaranteeing the caller that nothing will call into them again once this call + * returns. While the call is ongoing, it could be called into. Note that while + * this is ongoing, it will block other mounts. + * + * NB: this is not safe when used from detach() context - we will be DEVI_BUSY, + * and other sdev threads may be waiting for this. Only use the over-ride if + * willing to risk it. + */ +int +sdev_plugin_unregister(sdev_plugin_hdl_t hdl) +{ + sdev_plugin_t *spp = (sdev_plugin_t *)hdl; + if (spp->sp_islegacy) + return (EINVAL); + + if (!sdev_plugin_unregister_allowed) + return (EBUSY); + + mutex_enter(&sdev_plugin_lock); + list_remove(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + sdev_mnt_walk(sdev_plugin_unregister_cb, spp); + mutex_enter(&spp->sp_lock); + while (spp->sp_nnodes > 0) + cv_wait(&spp->sp_nodecv, &spp->sp_lock); + mutex_exit(&spp->sp_lock); + kmem_cache_free(sdev_plugin_cache, spp); + return (0); +} + +/* + * Register an old sdev style plugin to deal with what used to be in the vtab. + */ +static int +sdev_plugin_register_legacy(struct sdev_vop_table *vtp) +{ + sdev_plugin_t *spp; + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN); + spp->sp_islegacy = B_TRUE; + spp->sp_pops = NULL; + spp->sp_nflags = vtp->vt_flags; + spp->sp_lvtor = vtp->vt_vtor; + spp->sp_nnodes = 0; + + if (vtp->vt_service != NULL) { + fs_operation_def_t *templ; + templ = sdev_merge_vtab(vtp->vt_service); + if (vn_make_ops(vtp->vt_name, + (const fs_operation_def_t *)templ, + &spp->sp_vnops) != 0) { + cmn_err(CE_WARN, "%s: malformed vnode ops\n", + vtp->vt_name); + sdev_free_vtab(templ); + kmem_cache_free(sdev_plugin_cache, spp); + return (1); + } + + if (vtp->vt_global_vops) { + *(vtp->vt_global_vops) = spp->sp_vnops; + } + + sdev_free_vtab(templ); + } else { + spp->sp_vnops = sdev_vnodeops; + } + + /* + * No need to check for EEXIST here. These are loaded as a part of the + * sdev's initialization function. Further, we don't have to create them + * as that's taken care of in sdev's mount for the GZ. + */ + mutex_enter(&sdev_plugin_lock); + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + return (0); +} + +/* + * We need to match off of the sdev_path, not the sdev_name. We are only allowed + * to exist directly under /dev. 
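+ * That is, a plugin registered as "foo" matches the node whose sdev_path is
+ * "/dev/foo" and, if it registered with SDEV_PLUGIN_SUBDIR, nodes such as
+ * "/dev/foo/bar" as well; a "foo" appearing deeper in the /dev tree does
+ * not match.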
+ */ +static sdev_plugin_t * +sdev_match(sdev_node_t *dv) +{ + int vlen; + const char *path; + sdev_plugin_t *spp; + + if (strlen(dv->sdev_path) <= 5) + return (NULL); + + if (strncmp(dv->sdev_path, "/dev/", 5) != 0) + return (NULL); + path = dv->sdev_path + 5; + + mutex_enter(&sdev_plugin_lock); + + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + if (spp->sp_nflags & SDEV_SUBDIR) { + vlen = strlen(spp->sp_name); + if ((strncmp(spp->sp_name, path, + vlen - 1) == 0) && path[vlen] == '/') { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + } + } + + mutex_exit(&sdev_plugin_lock); + return (NULL); +} + +void +sdev_set_no_negcache(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + ASSERT(dv->sdev_path); + path = dv->sdev_path + strlen("/dev/"); + + mutex_enter(&sdev_plugin_lock); + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + if (spp->sp_nflags & SDEV_NO_NCACHE) + dv->sdev_flags |= SDEV_NO_NCACHE; + break; + } + } + mutex_exit(&sdev_plugin_lock); +} + +struct vnodeops * +sdev_get_vop(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + path = dv->sdev_path; + ASSERT(path); + + /* gets the relative path to /dev/ */ + path += 5; + + if ((spp = sdev_match(dv)) != NULL) { + dv->sdev_flags |= spp->sp_nflags; + if (SDEV_IS_PERSIST(dv->sdev_dotdot) && + (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) + dv->sdev_flags |= SDEV_PERSIST; + return (spp->sp_vnops); + } + + /* child inherits the persistence of the parent */ + if (SDEV_IS_PERSIST(dv->sdev_dotdot)) + dv->sdev_flags |= SDEV_PERSIST; + return (sdev_vnodeops); +} + +void * +sdev_get_vtor(sdev_node_t *dv) +{ + sdev_plugin_t *spp; + + if (dv->sdev_private == NULL) { + spp = sdev_match(dv); + if (spp == NULL) + return (NULL); + } else { + spp = dv->sdev_private; + } + + if (spp->sp_islegacy) + return ((void *)spp->sp_lvtor); + else + return ((void *)sdev_plugin_validate); +} + +void +sdev_plugin_nodeready(sdev_node_t *sdp) +{ + sdev_plugin_t *spp; + + ASSERT(RW_WRITE_HELD(&sdp->sdev_contents)); + ASSERT(sdp->sdev_private == NULL); + + spp = sdev_match(sdp); + if (spp == NULL) + return; + if (spp->sp_islegacy) + return; + sdp->sdev_private = spp; + mutex_enter(&spp->sp_lock); + spp->sp_nnodes++; + mutex_exit(&spp->sp_lock); +} + +int +sdev_plugin_init(void) +{ + sdev_vop_table_t *vtp; + fs_operation_def_t *templ; + + sdev_plugin_cache = kmem_cache_create("sdev_plugin", + sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor, + sdev_plugin_cache_destructor, NULL, NULL, NULL, 0); + if (sdev_plugin_cache == NULL) + return (1); + mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&sdev_plugin_list, sizeof (sdev_plugin_t), + offsetof(sdev_plugin_t, sp_link)); + + /* + * Register all of the legacy vnops + */ + for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++) + if (sdev_plugin_register_legacy(vtp) != 0) + return (1); + + templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl); + if (vn_make_ops("sdev_plugin", + (const fs_operation_def_t *)templ, + &sdev_plugin_vnops) != 0) { + sdev_free_vtab(templ); + return (1); + } + + sdev_free_vtab(templ); + return (0); +} diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index d810dd9a31..42a3874b95 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ 
-151,12 +151,6 @@ vattr_t sdev_vattr_chr = { kmem_cache_t *sdev_node_cache; /* sdev_node cache */ int devtype; /* fstype */ -/* static */ -static struct vnodeops *sdev_get_vop(struct sdev_node *); -static void sdev_set_no_negcache(struct sdev_node *); -static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []); -static void sdev_free_vtab(fs_operation_def_t *); - static void sdev_prof_free(struct sdev_node *dv) { @@ -314,6 +308,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv, (void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm); /* overwritten for VLNK nodes */ dv->sdev_symlink = NULL; + list_link_init(&dv->sdev_plist); vp = SDEVTOV(dv); vn_reinit(vp); @@ -402,6 +397,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp, } else { dv->sdev_nlink = 1; } + sdev_plugin_nodeready(dv); if (!(SDEV_IS_GLOBAL(dv))) { dv->sdev_origin = (struct sdev_node *)args; @@ -498,37 +494,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp, return (dv); } -/* directory dependent vop table */ -struct sdev_vop_table { - char *vt_name; /* subdirectory name */ - const fs_operation_def_t *vt_service; /* vnodeops table */ - struct vnodeops *vt_vops; /* constructed vop */ - struct vnodeops **vt_global_vops; /* global container for vop */ - int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ - int vt_flags; -}; - -/* - * A nice improvement would be to provide a plug-in mechanism - * for this table instead of a const table. - */ -static struct sdev_vop_table vtab[] = -{ - { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate, +struct sdev_vop_table vtab[] = { + { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate, + { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, + { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops, devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, + { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE }, - { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate, - SDEV_DYNAMIC | SDEV_VTOR }, + { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate, + SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops, + { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops, devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE }, /* @@ -543,132 +524,14 @@ static struct sdev_vop_table vtab[] = * preventing a mkdir. */ - { "lofi", NULL, NULL, NULL, NULL, + { "lofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { "rlofi", NULL, NULL, NULL, NULL, + { "rlofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { NULL, NULL, NULL, NULL, NULL, 0} + { NULL, NULL, NULL, NULL, 0} }; -/* - * We need to match off of the sdev_path, not the sdev_name. We are only allowed - * to exist directly under /dev. 
- */ -struct sdev_vop_table * -sdev_match(struct sdev_node *dv) -{ - int vlen; - int i; - const char *path; - - if (strlen(dv->sdev_path) <= 5) - return (NULL); - - if (strncmp(dv->sdev_path, "/dev/", 5) != 0) - return (NULL); - path = dv->sdev_path + 5; - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) - return (&vtab[i]); - if (vtab[i].vt_flags & SDEV_SUBDIR) { - vlen = strlen(vtab[i].vt_name); - if ((strncmp(vtab[i].vt_name, path, - vlen - 1) == 0) && path[vlen] == '/') - return (&vtab[i]); - } - - } - return (NULL); -} - -/* - * sets a directory's vnodeops if the directory is in the vtab; - */ -static struct vnodeops * -sdev_get_vop(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - char *path; - - path = dv->sdev_path; - ASSERT(path); - - /* gets the relative path to /dev/ */ - path += 5; - - /* gets the vtab entry it matches */ - if ((vtp = sdev_match(dv)) != NULL) { - dv->sdev_flags |= vtp->vt_flags; - if (SDEV_IS_PERSIST(dv->sdev_dotdot) && - (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) - dv->sdev_flags |= SDEV_PERSIST; - - if (vtp->vt_vops) { - if (vtp->vt_global_vops) - *(vtp->vt_global_vops) = vtp->vt_vops; - - return (vtp->vt_vops); - } - - if (vtp->vt_service) { - fs_operation_def_t *templ; - templ = sdev_merge_vtab(vtp->vt_service); - if (vn_make_ops(vtp->vt_name, - (const fs_operation_def_t *)templ, - &vtp->vt_vops) != 0) { - cmn_err(CE_PANIC, "%s: malformed vnode ops\n", - vtp->vt_name); - /*NOTREACHED*/ - } - if (vtp->vt_global_vops) { - *(vtp->vt_global_vops) = vtp->vt_vops; - } - sdev_free_vtab(templ); - - return (vtp->vt_vops); - } - - return (sdev_vnodeops); - } - - /* child inherits the persistence of the parent */ - if (SDEV_IS_PERSIST(dv->sdev_dotdot)) - dv->sdev_flags |= SDEV_PERSIST; - - return (sdev_vnodeops); -} - -static void -sdev_set_no_negcache(struct sdev_node *dv) -{ - int i; - char *path; - - ASSERT(dv->sdev_path); - path = dv->sdev_path + strlen("/dev/"); - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) { - if (vtab[i].vt_flags & SDEV_NO_NCACHE) - dv->sdev_flags |= SDEV_NO_NCACHE; - break; - } - } -} - -void * -sdev_get_vtor(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - - vtp = sdev_match(dv); - if (vtp) - return ((void *)vtp->vt_vtor); - else - return (NULL); -} /* * Build the base root inode @@ -948,8 +811,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) dv->sdev_path = NULL; } - if (!SDEV_IS_GLOBAL(dv)) + if (!SDEV_IS_GLOBAL(dv)) { sdev_prof_free(dv); + if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL) + SDEV_RELE(dv->sdev_origin); + } if (SDEVTOV(dv)->v_type == VDIR) { ASSERT(SDEV_FIRST_ENTRY(dv) == NULL); @@ -963,6 +829,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) (void) memset((void *)&dv->sdev_instance_data, 0, sizeof (dv->sdev_instance_data)); vn_invalid(SDEVTOV(dv)); + dv->sdev_private = NULL; kmem_cache_free(sdev_node_cache, dv); } @@ -2945,46 +2812,6 @@ sdev_modctl_devexists(const char *path) return (error); } -extern int sdev_vnodeops_tbl_size; - -/* - * construct a new template with overrides from vtab - */ -static fs_operation_def_t * -sdev_merge_vtab(const fs_operation_def_t tab[]) -{ - fs_operation_def_t *new; - const fs_operation_def_t *tab_entry; - - /* make a copy of standard vnode ops table */ - new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); - bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); - - /* replace the overrides from tab */ - for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { - 
fs_operation_def_t *std_entry = new; - while (std_entry->name) { - if (strcmp(tab_entry->name, std_entry->name) == 0) { - std_entry->func = tab_entry->func; - break; - } - std_entry++; - } - if (std_entry->name == NULL) - cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.", - tab_entry->name); - } - - return (new); -} - -/* free memory allocated by sdev_merge_vtab */ -static void -sdev_free_vtab(fs_operation_def_t *new) -{ - kmem_free(new, sdev_vnodeops_tbl_size); -} - /* * a generic setattr() function * diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c index d81702185e..55b388c2d4 100644 --- a/usr/src/uts/common/fs/dev/sdev_vfsops.c +++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c @@ -173,7 +173,13 @@ devinit(int fstype, char *name) if ((devmajor = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name); - return (1); + return (ENXIO); + } + + if (sdev_plugin_init() != 0) { + cmn_err(CE_WARN, "%s: failed to set init plugin subsystem", + sdev_vfssw.name); + return (EIO); } /* initialize negative cache */ @@ -350,6 +356,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, ASSERT(sdev_origins); dv->sdev_flags &= ~SDEV_GLOBAL; dv->sdev_origin = sdev_origins->sdev_root; + SDEV_HOLD(dv->sdev_origin); } else { sdev_ncache_setup(); rw_enter(&dv->sdev_contents, RW_WRITER); @@ -527,3 +534,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo) mutex_exit(&vp->v_lock); mutex_exit(&sdev_lock); } + +void +sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg) +{ + struct sdev_data *mntinfo; + + mutex_enter(&sdev_lock); + mntinfo = sdev_mntinfo; + while (mntinfo != NULL) { + func(mntinfo->sdev_root, arg); + mntinfo = mntinfo->sdev_next; + } + mutex_exit(&sdev_lock); +} diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index 79ebd8b2e5..5a00242482 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -22,7 +22,7 @@ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2018, Joyent, Inc. 
*/ /* @@ -372,7 +372,7 @@ sdev_close(struct vnode *vp, int flag, int count, /*ARGSUSED*/ static int sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred, - struct caller_context *ct) + struct caller_context *ct) { struct sdev_node *dv = (struct sdev_node *)VTOSDEV(vp); int error; @@ -399,7 +399,7 @@ sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred, /*ARGSUSED*/ static int sdev_write(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred, - struct caller_context *ct) + struct caller_context *ct) { struct sdev_node *dv = VTOSDEV(vp); int error = 0; @@ -582,7 +582,9 @@ sdev_self_access(sdev_node_t *dv, int mode, int flags, struct cred *cr, { int ret; + ASSERT(RW_READ_HELD(&dv->sdev_contents)); ASSERT(dv->sdev_attr || dv->sdev_attrvp); + if (dv->sdev_attrvp) { ret = VOP_ACCESS(dv->sdev_attrvp, mode, flags, cr, ct); } else if (dv->sdev_attr) { @@ -892,6 +894,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred, } } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } @@ -1216,6 +1221,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1288,6 +1294,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1403,6 +1410,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } @@ -1438,32 +1448,24 @@ sdev_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred, /*ARGSUSED4*/ static int -sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp, +sdev_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, caller_context_t *ct, int flags) { - struct sdev_node *parent = VTOSDEV(dvp); + struct sdev_node *dv = VTOSDEV(vp); int error; + VERIFY(RW_READ_HELD(&dv->sdev_contents)); + /* - * We must check that we have execute access to search the directory -- - * but because our sdev_contents lock is already held as a reader (the - * caller must have done a VOP_RWLOCK()), we call directly into the - * underlying access routine if sdev_attr is non-NULL. + * We can't recursively take ->sdev_contents via an indirect + * VOP_ACCESS(), but we don't need to use that anyway. 
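+ * sdev_self_access() consults sdev_attr/sdev_attrvp directly, so the reader
+ * hold on sdev_contents that our caller already has is sufficient.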
*/ - if (parent->sdev_attr != NULL) { - VERIFY(RW_READ_HELD(&parent->sdev_contents)); - - if (sdev_unlocked_access(parent, VEXEC, cred) != 0) - return (EACCES); - } else { - if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) - return (error); - } + if ((error = sdev_self_access(dv, VEXEC, 0, cred, ct)) != 0) + return (error); - ASSERT(parent); - if (!SDEV_IS_GLOBAL(parent)) - prof_filldir(parent); - return (devname_readdir_func(dvp, uiop, cred, eofp, SDEV_BROWSE)); + if (!SDEV_IS_GLOBAL(dv)) + prof_filldir(dv); + return (devname_readdir_func(vp, uiop, cred, eofp, SDEV_BROWSE)); } /*ARGSUSED1*/ diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 8f22ef32f0..e236eb3f72 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -472,8 +472,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index ca0952642a..50633859ce 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. 
Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. +#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ 
+ t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, 
cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ 
+fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ 
+ baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2040,10 +2575,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. * * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. */ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2055,7 +2640,7 @@ vnext_open(femarg_t *vf, int mode, cred_t 
*cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2070,7 +2655,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2085,7 +2670,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2100,7 +2685,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2115,7 +2700,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2130,7 +2715,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2145,7 +2730,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2160,7 +2745,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2175,7 +2760,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2191,8 +2776,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2208,7 +2793,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2223,7 +2809,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != 
NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2238,7 +2824,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2253,7 +2839,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2268,7 +2854,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2283,7 +2869,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2298,7 +2884,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2313,7 +2899,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2327,7 +2913,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2341,7 +2927,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2369,7 +2955,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2383,7 +2969,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2411,7 +2997,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2425,7 +3011,7 @@ vnext_cmp(femarg_t 
*vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } int @@ -2441,7 +3027,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int @@ -2456,7 +3042,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2470,7 +3056,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2486,8 +3072,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2502,7 +3088,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2518,8 +3104,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2535,8 +3121,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2552,8 +3138,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2568,7 +3154,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2583,7 +3169,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); 
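Every vnext_* routine converted in this change follows the same shape: vsop_find() locates the next operation and its object on the monitor stack, and the FEM_VNEXTn macro either tail-calls that operation or, when little kernel stack remains, copies the arguments into a heap-allocated fem_baton_t and re-issues the call from a fresh stack via fem_handoff() (the tail of that macro is visible at the top of this section). The user-space sketch below models only that control flow; stack_is_low() and run_on_fresh_stack() are hypothetical stand-ins for the kernel's stack check and fem_handoff(), not real interfaces.

#include <stdlib.h>

/* A two-argument operation, standing in for one vnode op signature. */
typedef int (*op2_t)(void *, int);

typedef struct baton {
	op2_t	b_func;
	void	*b_arg0;
	int	b_arg1;
} baton_t;

/* Hypothetical stand-in for the kernel's remaining-stack check. */
static int
stack_is_low(void)
{
	return (0);
}

/* Hypothetical stand-in for fem_handoff(): run the call on a fresh stack. */
static int
run_on_fresh_stack(baton_t *bp)
{
	return (bp->b_func(bp->b_arg0, bp->b_arg1));
}

/* The shape of a generated FEM_VNEXT2-style wrapper. */
static int
vnext_call2(op2_t func, void *arg0, int arg1)
{
	if (stack_is_low()) {
		baton_t *bp = malloc(sizeof (baton_t));
		int rval;

		if (bp == NULL)
			return (-1);
		bp->b_func = func;
		bp->b_arg0 = arg0;
		bp->b_arg1 = arg1;
		rval = run_on_fresh_stack(bp);
		free(bp);
		return (rval);
	}
	return (func(arg0, arg1));
}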
} int @@ -2598,7 +3184,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2613,7 +3199,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2627,7 +3213,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2657,7 +3243,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2672,7 +3258,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2687,7 +3273,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2702,7 +3288,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2717,7 +3303,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2731,7 +3317,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..a908f91267 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* @@ -61,7 +62,6 @@ #if FIFODEBUG int Fifo_fastmode = 1; /* pipes/fifos will be opened in fast mode */ int Fifo_verbose = 0; /* msg when switching out of fast mode */ -int Fifohiwat = FIFOHIWAT; /* Modifiable FIFO high water mark */ #endif /* @@ -196,6 +196,7 @@ fnode_constructor(void *buf, void *cdrarg, int kmflags) fnp->fn_dest = fnp; fnp->fn_mp = NULL; fnp->fn_count = 0; + fnp->fn_hiwat = FIFOHIWAT; fnp->fn_rsynccnt = 0; fnp->fn_wsynccnt = 0; fnp->fn_wwaitcnt = 0; @@ -388,11 +389,7 @@ fifoinit(int fstype, char *name) pipe_constructor, pipe_destructor, NULL, (void *)(sizeof (fifodata_t)), NULL, 0); -#if FIFODEBUG - if (Fifohiwat < FIFOHIWAT) - Fifohiwat = FIFOHIWAT; -#endif /* FIFODEBUG */ - fifo_strdata.qi_minfo->mi_hiwat = Fifohiwat; + fifo_strdata.qi_minfo->mi_hiwat = FIFOHIWAT; return (0); } @@ -614,9 +611,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. */ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) @@ -1161,7 +1161,8 @@ fifo_wakewriter(fifonode_t *fn_dest, fifolock_t *fn_lock) int fn_dflag = fn_dest->fn_flag; ASSERT(MUTEX_HELD(&fn_lock->flk_lock)); - ASSERT(fn_dest->fn_dest->fn_count < Fifohiwat); + ASSERT(fn_dest->fn_dest->fn_count < fn_dest->fn_dest->fn_hiwat); + if ((fn_dflag & FIFOWANTW)) { cv_broadcast(&fn_dest->fn_wait_cv); } diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ef8d76e8e8..c288a2eb61 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -28,7 +28,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017, Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. */ @@ -104,10 +104,6 @@ static int fifo_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *, static int fifo_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *, caller_context_t *); -/* functions local to this file */ -static boolean_t fifo_stayfast_enter(fifonode_t *); -static void fifo_stayfast_exit(fifonode_t *); - /* * Define the data structures external to this file. */ @@ -645,7 +641,7 @@ fifo_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *crp, * (3) write-only FIFO with no data * (4) no data and FNDELAY flag is set. * Otherwise return - * EAGAIN if FNONBLOCK is set and no data to read + * EAGAIN if FNONBLOCK is set and no data to read or FIFORDBLOCK is set * EINTR if signal received while waiting for data * * While there is no data to read.... 
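The fifofs changes above retire the FIFODEBUG-only global Fifohiwat and instead give every fifonode its own fn_hiwat field, initialized to FIFOHIWAT in the node constructor; the read, write, poll, and ioctl hunks that follow compare queue counts against fn_dest->fn_hiwat rather than a single system-wide tunable. A simplified sketch of the resulting flow-control predicates, using plain types rather than the kernel's fifonode_t:

#include <stddef.h>

#define	SKETCH_HIWAT	16384	/* illustrative default, in the spirit of FIFOHIWAT */

/* Simplified stand-in for one end of a pipe/FIFO. */
typedef struct fnode {
	size_t		fn_count;	/* bytes currently queued on this end */
	size_t		fn_hiwat;	/* per-node high-water mark */
	struct fnode	*fn_dest;	/* the peer end */
} fnode_t;

static void
fnode_init(fnode_t *fnp, fnode_t *peer)
{
	fnp->fn_count = 0;
	fnp->fn_hiwat = SKETCH_HIWAT;	/* per node; no global tunable */
	fnp->fn_dest = peer;
}

/* A writer must wait once the destination end is at or above its mark. */
static int
writer_must_block(const fnode_t *writer)
{
	return (writer->fn_dest->fn_count >= writer->fn_dest->fn_hiwat);
}

/*
 * After consuming data, a reader wakes blocked writers once the count has
 * dropped back under the mark (emulating a low-water mark of zero).
 */
static int
reader_should_wake_writer(const fnode_t *reader)
{
	return (reader->fn_count < reader->fn_dest->fn_hiwat);
}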
@@ -681,7 +677,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, * Check for data on our input queue */ - while (fnp->fn_count == 0) { + while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) { /* * No data on first attempt and no writer, then EOF */ @@ -731,6 +727,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, } ASSERT(fnp->fn_mp != NULL); + VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0); /* For pipes copy should not bypass cache */ uiop->uio_extflg |= UIO_COPY_CACHED; @@ -772,6 +769,18 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, &fn_lock->flk_lock)) goto trywake; + /* + * If another thread snuck in and started to + * consume data using read-blocking out of + * the pipe while we were blocked in the + * cv_wait, then since we have already consumed + * some of the data out of the pipe we need + * to return with a short read. + */ + if ((fnp->fn_flag & FIFORDBLOCK) != 0) { + goto trywake; + } + if (!(fnp->fn_flag & FIFOFAST)) goto stream_mode; } @@ -787,11 +796,11 @@ trywake: /* * wake up any blocked writers, processes * sleeping on POLLWRNORM, or processes waiting for SIGPOLL - * Note: checking for fn_count < Fifohiwat emulates + * Note: checking for fn_count < fn_hiwat emulates * STREAMS functionality when low water mark is 0 */ if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) && - fnp->fn_count < Fifohiwat) { + fnp->fn_count < fn_dest->fn_hiwat) { fifo_wakewriter(fn_dest, fn_lock); } goto done; @@ -904,7 +913,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, /* * check to make sure we are not over high water mark */ - while (fn_dest->fn_count >= Fifohiwat) { + while (fn_dest->fn_count >= fn_dest->fn_hiwat) { /* * Indicate that we have gone over high * water mark @@ -962,7 +971,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, * then we must break the message up into PIPE_BUF * chunks to stay compliant with STREAMS */ - if (uiop->uio_resid + fn_dest->fn_count > Fifohiwat) + if (uiop->uio_resid + fn_dest->fn_count > fn_dest->fn_hiwat) size = MIN(uiop->uio_resid, PIPE_BUF); else size = uiop->uio_resid; @@ -1198,7 +1207,8 @@ fifo_fastioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, if (arg != 0) { goto turn_fastoff; } - *rvalp = (fnp->fn_dest->fn_count < Fifohiwat) ? 1 : 0; + *rvalp = (fnp->fn_dest->fn_count < fnp->fn_dest->fn_hiwat) ? + 1 : 0; mutex_exit(&fn_lock->flk_lock); return (0); @@ -1817,7 +1827,7 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, retevents = POLLHUP; } else if (events & (POLLWRNORM | POLLWRBAND)) { if (events & POLLWRNORM) { - if (fn_dest->fn_count < Fifohiwat) + if (fn_dest->fn_count < fn_dest->fn_hiwat) retevents = POLLWRNORM; else fnp->fn_flag |= FIFOHIWATW; @@ -1986,7 +1996,7 @@ fifo_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *crp, * the lock. * If the fifo switches into stream mode while we are waiting, return failure. */ -static boolean_t +boolean_t fifo_stayfast_enter(fifonode_t *fnp) { ASSERT(MUTEX_HELD(&fnp->fn_lock->flk_lock)); @@ -2008,7 +2018,7 @@ fifo_stayfast_enter(fifonode_t *fnp) * - threads wanting to turn into stream mode waiting in fifo_fastoff(), * - other writers threads waiting in fifo_stayfast_enter(). 
*/ -static void +void fifo_stayfast_exit(fifonode_t *fnp) { fifonode_t *fn_dest = fnp->fn_dest; diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c index 3249a574f7..e3d07b595d 100644 --- a/usr/src/uts/common/fs/fs_subr.c +++ b/usr/src/uts/common/fs/fs_subr.c @@ -60,6 +60,9 @@ #include <acl/acl_common.h> #include <sys/pathname.h> +/* required for fs_reject_epoll */ +#include <sys/poll_impl.h> + static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *); /* @@ -406,10 +409,20 @@ fs_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct) } /* - * Return the answer requested to poll() for non-device files. - * Only POLLIN, POLLRDNORM, and POLLOUT are recognized. + * Unlike poll(2), epoll should reject attempts to add normal files or + * directories to a given handle. Most non-pseudo filesystems rely on + * fs_poll() as their implementation of polling behavior. Exceptions to that + * rule (ufs) can use fs_reject_epoll(), so they don't require access to the + * inner details of poll. Potential race conditions related to the poll module + * being loaded are avoided by implementing the check here in genunix. */ -struct pollhead fs_pollhd; +boolean_t +fs_reject_epoll() +{ + /* Check if the currently-active pollcache is epoll-enabled. */ + return (curthread->t_pollcache != NULL && + (curthread->t_pollcache->pc_flag & PC_EPOLL) != 0); +} /* ARGSUSED */ int @@ -417,13 +430,12 @@ fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp, struct pollhead **phpp, caller_context_t *ct) { /* - * Reject all attempts for edge-triggered polling. These should only - * occur when regular files are added to a /dev/poll handle which is in - * epoll mode. The Linux epoll does not allow epoll-ing on regular - * files at all, so rejecting EPOLLET requests is congruent with those - * expectations. + * Regular filesystems should reject epollers. On the off chance that + * a non-epoll consumer expresses the desire for edge-triggered + * polling, we reject them too. Yes, the expected error for this + * really is EPERM. */ - if (events & POLLET) { + if (fs_reject_epoll() || (events & POLLET) != 0) { return (EPERM); } @@ -438,15 +450,7 @@ fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp, *reventsp |= POLLOUT; if (events & POLLWRBAND) *reventsp |= POLLWRBAND; - /* - * Emitting a pollhead without the intention of issuing pollwakeup() - * calls against it is a recipe for trouble. It's only acceptable in - * this case since the above logic matches practically all useful - * events. - */ - if (*reventsp == 0 && !anyyet) { - *phpp = &fs_pollhd; - } + return (0); } diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h index 27c9e3d830..877dc36f9c 100644 --- a/usr/src/uts/common/fs/fs_subr.h +++ b/usr/src/uts/common/fs/fs_subr.h @@ -24,6 +24,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _SYS_FS_SUBR_H @@ -95,6 +96,9 @@ extern int fs_need_estale_retry(int); extern void fs_vscan_register(int (*av_scan)(vnode_t *, cred_t *, int)); extern int fs_vscan(vnode_t *, cred_t *, int); +/* Helper function to detect when epoll checks VOP_POLL handlers */ +extern boolean_t fs_reject_epoll(); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..05ee2c6e09 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. 
*/ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. 
*/ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' 
entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. 
*/ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. + */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. 
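The slot-allocation logic in hldiraddentry() above gives each entry a phony but stable offset: the insertion walk stops at the first gap of more than one between consecutive offsets, and when the largest offset grows past twice the number of live entries (the list is then at least half holes) the roving slot pointer is reset toward the front so those holes get reused. A compact sketch of those two decisions over a singly linked list, with simplified illustrative types rather than hldirent_t:

#include <stddef.h>

typedef struct dent {
	struct dent	*d_next;
	unsigned long	d_offset;	/* phony, stable readdir offset */
} dent_t;

/*
 * Walk from 'start' to the first hole between consecutive offsets and
 * return the entry to insert after; *newoffp receives the new offset.
 */
static dent_t *
find_slot(dent_t *start, unsigned long *newoffp)
{
	dent_t *dp = start;

	while (dp->d_next != NULL &&
	    (dp->d_next->d_offset - dp->d_offset) <= 1)
		dp = dp->d_next;
	*newoffp = dp->d_offset + 1;
	return (dp);
}

/*
 * After an insertion, pick where the next search should begin: if the
 * largest offset exceeds twice the number of live entries, the list is
 * mostly holes, so restart near the front; otherwise continue from the
 * entry just added.
 */
static dent_t *
next_roving_slot(dent_t *head, dent_t *added, unsigned long nentries)
{
	if (added->d_next == NULL && added->d_offset > 2 * nentries)
		return (head);
	return (added);
}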
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..c582a8cac2 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. 
+ * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. + */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. 
+ */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. + */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), + KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. 
+ */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. + */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. 
VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. + */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
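hyprlofs_statvfs() above reports swap-backed capacity in two steps: free blocks are whatever available swap exceeds the hyprlofs_minfree floor, and when the consumer is a non-global zone with a swap cap, both the free and total figures are further clamped to the remaining cap. A small sketch of that arithmetic with plain types; the function and field names here are illustrative, not the kernel's:

#include <stddef.h>

#define	SKETCH_MIN(a, b)	((a) < (b) ? (a) : (b))

struct blk_report {
	unsigned long	br_total;	/* reported as f_blocks */
	unsigned long	br_free;	/* reported as f_bfree / f_bavail */
};

/*
 * avail:    pages of swap currently available system-wide
 * minfree:  pages hyprlofs must always leave for the rest of the system
 * cap/used: the zone's swap cap and usage in pages; cap == 0 means uncapped
 */
static struct blk_report
report_blocks(unsigned long avail, unsigned long minfree,
    unsigned long cap, unsigned long used)
{
	struct blk_report br;

	br.br_free = (avail > minfree) ? avail - minfree : 0;
	br.br_total = br.br_free;

	if (cap != 0) {
		unsigned long left = (used < cap) ? cap - used : 0;

		/* A capped zone never sees more than its remaining cap. */
		br.br_free = SKETCH_MIN(left, br.br_free);
		br.br_total = SKETCH_MIN(cap, br.br_total);
	}
	return (br);
}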
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..52dba31761 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + uint_t len, cnt; + int i, error; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen >= sizeof (nm)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen >= sizeof (path)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen >= sizeof (nm)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen >= sizeof (path)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = 
(u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, uint_t *pcnt, uint_t n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + uint_t cnt; + uint_t len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + uint_t limit, cnt; + int error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + ulong_t outcount = 0; + ulong_t bufsize; + size_t reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap 
*/ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + 
VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 69c9efff97..f6910c07cf 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. */ @@ -58,6 +59,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..24c010a463 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,526 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017, Joyent, Inc. 
+ */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. 
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+	proc_t *p;
+	kmutex_t *mp;
+
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	for (;;) {
+		mutex_enter(&pidlock);
+
+		/*
+		 * If the pid is 1, we really want the zone's init process
+		 */
+		p = prfind((pid == 1) ?
+		    curproc->p_zone->zone_proc_initpid : pid);
+
+		if (p == NULL || p->p_stat == SIDL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait(). Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+
+		mutex_exit(&pidlock);
+
+		if (p->p_flag & SEXITING) {
+			/*
+			 * This process is exiting -- let it go.
+			 */
+			mutex_exit(mp);
+			return (NULL);
+		}
+
+		if (!(p->p_proc_flag & P_PR_LOCK))
+			break;
+
+		cv_wait(&pr_pid_cv[p->p_slot], mp);
+		mutex_exit(mp);
+	}
+
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+	return (p);
+}
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+	ASSERT(p->p_proc_flag & P_PR_LOCK);
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	cv_signal(&pr_pid_cv[p->p_slot]);
+	p->p_proc_flag &= ~P_PR_LOCK;
+	mutex_exit(&p->p_lock);
+	THREAD_KPRI_RELEASE();
+}
+
+void
+lxpr_initnodecache()
+{
+	lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+	    sizeof (lxpr_node_t), 0,
+	    lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+	kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+	lxpr_node_t *lxpnp = buf;
+	vnode_t *vp;
+
+	vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+	if (vp == NULL)
+		return (-1);
+
+	(void) vn_setops(vp, lxpr_vnodeops);
+	vp->v_data = lxpnp;
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+	lxpr_node_t *lxpnp = buf;
+
+	vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd)
+{
+	if (pid == 1)
+		pid = curproc->p_zone->zone_proc_initpid;
+
+	switch (type) {
+	case LXPR_PIDDIR:
+		return (pid + 1);
+	case LXPR_PROCDIR:
+		return (maxpid + 2);
+	case LXPR_PID_FD_FD:
+		return (maxpid + 2 +
+		    (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+		    LXPR_NFILES + fd);
+	default:
+		return (maxpid + 2 +
+		    (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+		    type);
+	}
+}
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+	/*
+	 * If the input node is the root then the parent inode
+	 * is the mounted on inode so just return our inode number
+	 */
+	if (lxpnp->lxpr_type != LXPR_PROCDIR)
+		return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+	else
+		return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
+{
+	lxpr_node_t *lxpnp;
+	vnode_t *vp;
+	user_t *up;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below)
+	 */
+	gethrestime(&now);
+	lxpnp->lxpr_type = type;
+	lxpnp->lxpr_realvp = NULL;
+	lxpnp->lxpr_parent = dp;
+	VN_HOLD(dp);
+	if (p != NULL) {
+		lxpnp->lxpr_pid = ((p->p_pid ==
+		    curproc->p_zone->zone_proc_initpid) ?
+		    1 : p->p_pid);
+
+		lxpnp->lxpr_time = PTOU(p)->u_start;
+		lxpnp->lxpr_uid = crgetruid(p->p_cred);
+		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
+	} else {
+		/* Pretend files without a proc belong to sched */
+		lxpnp->lxpr_pid = 0;
+		lxpnp->lxpr_time = now;
+		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+	}
+
+	/* initialize the vnode data */
+	vp = lxpnp->lxpr_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Do node specific stuff
+	 */
+	switch (type) {
+	case LXPR_PROCDIR:
+		vp->v_flag |= VROOT;
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
+		break;
+
+	case LXPR_PID_CURDIR:
+		ASSERT(p != NULL);
+
+		/*
+		 * Zombie check. p_stat is officially protected by pidlock,
+		 * but we can't grab pidlock here because we already hold
+		 * p_lock. Luckily if we look at the process exit code
+		 * we see that p_stat only transitions from SRUN to SZOMB
+		 * while p_lock is held. Aside from this, the only other
+		 * p_stat transition that we need to be aware of is
+		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
+		 * ignores nodes in the SIDL state so we'll never get a node
+		 * that isn't already in the SRUN state.
+		 */
+		if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			ASSERT(MUTEX_HELD(&p->p_lock));
+			up = PTOU(p);
+			lxpnp->lxpr_realvp = up->u_cdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_ROOTDIR:
+		ASSERT(p != NULL);
+		/* Zombie check. see locking comment above */
+		if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			ASSERT(MUTEX_HELD(&p->p_lock));
+			up = PTOU(p);
+			lxpnp->lxpr_realvp =
+			    up->u_rdir != NULL ? up->u_rdir : rootdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_EXE:
+		ASSERT(p != NULL);
+		lxpnp->lxpr_realvp = p->p_exec;
+		if (lxpnp->lxpr_realvp != NULL) {
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;
+		break;
+
+	case LXPR_SELF:
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
+		break;
+
+	case LXPR_PID_FD_FD:
+		ASSERT(p != NULL);
+		/* lxpr_realvp is set after we return */
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0700;	/* read-write-exe owner only */
+		break;
+
+	case LXPR_PID_FDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0500;	/* read-search by owner only */
+		break;
+
+	case LXPR_PIDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0511;
+		break;
+
+	case LXPR_NETDIR:
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555;	/* read-search by all */
+		break;
+
+	case LXPR_PID_ENV:
+	case LXPR_PID_MEM:
+		ASSERT(p != NULL);
+		/*FALLTHRU*/
+	case LXPR_KCORE:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0400;	/* read-only by owner only */
+		break;
+
+	default:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0444;	/* read-only by all */
+		break;
+	}
+
+	return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..9bcc0f7e8b --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. 
A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, 
ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. 
+ */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
+ */
+static int
+lxpr_sigmap[NSIG] = {
+	0,
+	LX_SIGHUP,
+	LX_SIGINT,
+	LX_SIGQUIT,
+	LX_SIGILL,
+	LX_SIGTRAP,
+	LX_SIGABRT,
+	LX_SIGSTKFLT,
+	LX_SIGFPE,
+	LX_SIGKILL,
+	LX_SIGBUS,
+	LX_SIGSEGV,
+	LX_SIGSYS,
+	LX_SIGPIPE,
+	LX_SIGALRM,
+	LX_SIGTERM,
+	LX_SIGUSR1,
+	LX_SIGUSR2,
+	LX_SIGCHLD,
+	LX_SIGPWR,
+	LX_SIGWINCH,
+	LX_SIGURG,
+	LX_SIGPOLL,
+	LX_SIGSTOP,
+	LX_SIGTSTP,
+	LX_SIGCONT,
+	LX_SIGTTIN,
+	LX_SIGTTOU,
+	LX_SIGVTALRM,
+	LX_SIGPROF,
+	LX_SIGXCPU,
+	LX_SIGXFSZ,
+	-1, /* 32: illumos SIGWAITING */
+	-1, /* 33: illumos SIGLWP */
+	-1, /* 34: illumos SIGFREEZE */
+	-1, /* 35: illumos SIGTHAW */
+	-1, /* 36: illumos SIGCANCEL */
+	-1, /* 37: illumos SIGLOST */
+	-1, /* 38: illumos SIGXRES */
+	-1, /* 39: illumos SIGJVM1 */
+	-1, /* 40: illumos SIGJVM2 */
+	-1, /* 41: illumos SIGINFO */
+	LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */
+	LX_SIGRTMIN + 1,
+	LX_SIGRTMIN + 2,
+	LX_SIGRTMIN + 3,
+	LX_SIGRTMIN + 4,
+	LX_SIGRTMIN + 5,
+	LX_SIGRTMIN + 6,
+	LX_SIGRTMIN + 7,
+	LX_SIGRTMIN + 8,
+	LX_SIGRTMIN + 9,
+	LX_SIGRTMIN + 10,
+	LX_SIGRTMIN + 11,
+	LX_SIGRTMIN + 12,
+	LX_SIGRTMIN + 13,
+	LX_SIGRTMIN + 14,
+	LX_SIGRTMIN + 15,
+	LX_SIGRTMIN + 16,
+	LX_SIGRTMIN + 17,
+	LX_SIGRTMIN + 18,
+	LX_SIGRTMIN + 19,
+	LX_SIGRTMIN + 20,
+	LX_SIGRTMIN + 21,
+	LX_SIGRTMIN + 22,
+	LX_SIGRTMIN + 23,
+	LX_SIGRTMIN + 24,
+	LX_SIGRTMIN + 25,
+	LX_SIGRTMIN + 26,
+	LX_SIGRTMIN + 27,
+	LX_SIGRTMIN + 28,
+	LX_SIGRTMIN + 29,
+	LX_SIGRTMIN + 30,
+	LX_SIGRTMAX
+};
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t *vp = *vpp;
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	vnode_t *rvp;
+	int error = 0;
+
+	/*
+	 * We only allow reading in this file system
+	 */
+	if (flag & FWRITE)
+		return (EROFS);
+
+	/*
+	 * If we are opening an underlying fd file, only allow regular files
+	 * and reject the open for anything else. If we are opening the
+	 * current or root directory, just pass the open through to the
+	 * real vnode.
+	 */
+	if (lxpnp->lxpr_realvp != NULL) {
+		rvp = lxpnp->lxpr_realvp;
+
+		if (type == LXPR_PID_FD_FD && rvp->v_type != VREG)
+			error = EACCES;
+		else {
+			/*
+			 * Need to hold rvp since VOP_OPEN() may release it.
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + 
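+	/*
+	 * statm reports its values in pages: a_resvsize is a byte count,
+	 * so round it up with btopr(), while rm_asrss() already returns
+	 * the resident set size as a page count.
+	 */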
vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + ulong_t total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = ptob(k_anoninfo.ani_max); + used_swap = ptob(k_anoninfo.ani_phys_resv); + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = 
vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. 
These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). 
If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. + */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? 
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. + */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 
1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..eadb2ccd27 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,278 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern 
void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index d6a88a97c3..f6c6b62925 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index b7354c168a..d3b12817ba 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -5523,8 +5522,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs3setattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index f0320aaee0..25088aafcb 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 4112cbee05..945d37533d 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -38,7 +38,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #include <sys/param.h> @@ -3745,8 +3745,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, */ error = nfs4setattr(vp, vap, flags, cr, NULL); - if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (vap->va_mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -8062,8 +8067,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8076,23 +8082,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { @@ -10997,8 +11000,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs4setattr(vp, &va, 0, cr, NULL); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 2851f8bef9..5fa0e6414f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -22,6 +22,7 @@ /* * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright (c) 2015 by Delphix. All rights reserved. 
*/ @@ -561,11 +562,16 @@ retry: *access = res.ares.auth_perm; *srv_uid = res.ares.auth_srv_uid; *srv_gid = res.ares.auth_srv_gid; - *srv_gids_cnt = res.ares.auth_srv_gids.len; - *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t), - KM_SLEEP); - bcopy(res.ares.auth_srv_gids.val, *srv_gids, - *srv_gids_cnt * sizeof (gid_t)); + + if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) { + *srv_gids = kmem_alloc(*srv_gids_cnt * + sizeof (gid_t), KM_SLEEP); + bcopy(res.ares.auth_srv_gids.val, *srv_gids, + *srv_gids_cnt * sizeof (gid_t)); + } else { + *srv_gids = NULL; + } + break; case NFSAUTH_DR_EFAIL: @@ -1054,9 +1060,13 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, if (gid != NULL) *gid = p->auth_srv_gid; if (ngids != NULL && gids != NULL) { - *ngids = p->auth_srv_ngids; - *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP); - bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t)); + if ((*ngids = p->auth_srv_ngids) != 0) { + size_t sz = *ngids * sizeof (gid_t); + *gids = kmem_alloc(sz, KM_SLEEP); + bcopy(p->auth_srv_gids, *gids, sz); + } else { + *gids = NULL; + } } access = p->auth_access; diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 476da6685a..c6ae29d220 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -2573,6 +2573,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_sys.c b/usr/src/uts/common/fs/nfs/nfs_sys.c index e6ff4a2e0b..b4fc9884b1 100644 --- a/usr/src/uts/common/fs/nfs/nfs_sys.c +++ b/usr/src/uts/common/fs/nfs/nfs_sys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. @@ -247,7 +248,7 @@ nfssys(enum nfssys_op opcode, void *arg) lsa.n_fmly = STRUCT_FGET(ulsa, n_fmly); lsa.n_proto = STRUCT_FGET(ulsa, n_proto); lsa.n_rdev = expldev(STRUCT_FGET(ulsa, n_rdev)); - lsa.debug = STRUCT_FGET(ulsa, debug); + lsa.n_v4_only = STRUCT_FGET(ulsa, n_v4_only); lsa.timout = STRUCT_FGET(ulsa, timout); lsa.grace = STRUCT_FGET(ulsa, grace); lsa.retransmittimeout = STRUCT_FGET(ulsa, diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index c9cc306f95..5041ebb6fe 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..ee3bac484f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, error = nfssetattr(vp, vap, flags, cr); - if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfssetattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index 976715e346..275330a0ae 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -826,8 +826,7 @@ top: if (error == 0) { vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } done: diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c index cb43f0fe59..b307fe11d7 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vnops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c @@ -782,8 +782,11 @@ pcfs_setattr( if (error) goto out; - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file modified times. diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 70f773ab55..04a2a421db 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/systm.h> #include <sys/cred.h> @@ -1379,12 +1383,18 @@ portnowait: if (model == DATAMODEL_NATIVE) { eventsz = sizeof (port_event_t); - kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp = NULL; + } else { + kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { @@ -1421,12 +1431,18 @@ portnowait: port_event32_t *kevp32; eventsz = sizeof (port_event32_t); - kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp32 == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp32 = NULL; + } else { + kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp32 == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp32; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..b09a9c8afc --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,441 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). + * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. + */ +int +prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. 
The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. + */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz, + NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. 
This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. + */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. + */ + if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. + */ + if (ev == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. 
+ */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 6b151a6369..07dcb1e7db 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. + */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index de816d49e7..706e3ad14d 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -123,6 +123,7 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -349,6 +350,8 @@ extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); @@ -375,6 +378,7 @@ extern int clear_watched_area(proc_t *, struct watched_area *); extern void pr_free_watchpoints(proc_t *); extern proc_t *pr_cancel_watch(prnode_t *); extern struct seg *break_seg(proc_t *); +extern void prgethold(kthread_t *, sigset_t *); /* * Machine-dependent routines (defined in prmachdep.c). 
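The prdata.h hunk above declares the new prreadargv()/prreadenvv() helpers that back the /proc/&lt;pid&gt;/argv file introduced elsewhere in this changeset (see the PR_ARGV node and the prmaxargvlen buffer in prvnops.c). As a rough illustration only — this sketch is not part of the diff, and the 4096-byte buffer simply mirrors the default prmaxargvlen rather than anything a consumer is required to use — a userspace reader would see the arguments come back as a single blob of NUL-terminated strings, one after another:

/*
 * Hypothetical userspace sketch: read /proc/<pid>/argv (added by this
 * changeset) and print one argument per line.  Each argument, including
 * the last, is NUL-terminated, per the prreadargv() comment above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	char path[64], buf[4096];	/* 4096 matches default prmaxargvlen */
	ssize_t n;
	size_t off = 0;
	int fd;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return (1);
	}

	(void) snprintf(path, sizeof (path), "/proc/%s/argv", argv[1]);
	if ((fd = open(path, O_RDONLY)) < 0) {
		perror("open");
		return (1);
	}
	if ((n = read(fd, buf, sizeof (buf))) < 0) {
		perror("read");
		(void) close(fd);
		return (1);
	}
	(void) close(fd);

	/* Walk the NUL-separated strings in the returned buffer. */
	while (off < (size_t)n) {
		(void) printf("%s\n", &buf[off]);
		off += strlen(&buf[off]) + 1;
	}
	return (0);
}

A consumer that cares about very long argument vectors might instead size the read from fstat() on the argv node, whose reported size is prmaxargvlen for ordinary processes.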
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c index 8202e49df0..08c5f6ffc0 100644 --- a/usr/src/uts/common/fs/proc/prioctl.c +++ b/usr/src/uts/common/fs/proc/prioctl.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -930,8 +930,7 @@ startover: } case PIOCGHOLD: /* get signal-hold mask */ - schedctl_finish_sigblock(t); - sigktou(&t->t_hold, &un.holdmask); + prgethold(t, &un.holdmask); prunlock(pnp); if (copyout(&un.holdmask, cmaddr, sizeof (un.holdmask))) error = EFAULT; @@ -944,7 +943,7 @@ startover: case PIOCNMAP: /* get number of memory mappings */ { - int n; + uint_t n; struct as *as = p->p_as; if ((p->p_flag & SSYS) || as == &kas) @@ -957,7 +956,7 @@ startover: mutex_enter(&p->p_lock); } prunlock(pnp); - if (copyout(&n, cmaddr, sizeof (int))) + if (copyout(&n, cmaddr, sizeof (uint_t))) error = EFAULT; break; } @@ -1395,8 +1394,7 @@ oprgetstatus32(kthread_t *t, prstatus32_t *sp, zone_t *zp) sp->pr_cursig = lwp->lwp_cursig; prassignset(&sp->pr_sigpend, &p->p_sig); prassignset(&sp->pr_lwppend, &t->t_sig); - schedctl_finish_sigblock(t); - prassignset(&sp->pr_sighold, &t->t_hold); + prgethold(t, &sp->pr_sighold); sp->pr_altstack.ss_sp = (caddr32_t)(uintptr_t)lwp->lwp_sigaltstack.ss_sp; sp->pr_altstack.ss_size = (size32_t)lwp->lwp_sigaltstack.ss_size; @@ -1673,14 +1671,8 @@ oprgetpsinfo32(proc_t *p, prpsinfo32_t *psp, kthread_t *tp) /*ARGSUSED*/ static int -prioctl32( - struct vnode *vp, - int cmd, - intptr_t arg, - int flag, - cred_t *cr, - int *rvalp, - caller_context_t *ct) +prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp, caller_context_t *ct) { int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG; caddr_t cmaddr = (caddr_t)arg; @@ -2557,8 +2549,7 @@ startover: } case PIOCGHOLD: /* get signal-hold mask */ - schedctl_finish_sigblock(t); - sigktou(&t->t_hold, &un32.holdmask); + prgethold(t, &un32.holdmask); prunlock(pnp); if (copyout(&un32.holdmask, cmaddr, sizeof (un32.holdmask))) error = EFAULT; @@ -2571,7 +2562,7 @@ startover: case PIOCNMAP: /* get number of memory mappings */ { - int n; + uint_t n; struct as *as = p->p_as; if ((p->p_flag & SSYS) || as == &kas) @@ -2584,7 +2575,7 @@ startover: mutex_enter(&p->p_lock); } prunlock(pnp); - if (copyout(&n, cmaddr, sizeof (int))) + if (copyout(&n, cmaddr, sizeof (uint_t))) error = EFAULT; break; } @@ -3235,8 +3226,7 @@ oprgetstatus(kthread_t *t, prstatus_t *sp, zone_t *zp) sp->pr_cursig = lwp->lwp_cursig; prassignset(&sp->pr_sigpend, &p->p_sig); prassignset(&sp->pr_lwppend, &t->t_sig); - schedctl_finish_sigblock(t); - prassignset(&sp->pr_sighold, &t->t_hold); + prgethold(t, &sp->pr_sighold); sp->pr_altstack = lwp->lwp_sigaltstack; prgetaction(p, up, lwp->lwp_cursig, &sp->pr_action); sp->pr_pid = p->p_pid; diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index a2ab06d769..3b4a7f36d0 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -148,6 +148,11 @@ prchoose(proc_t *p) continue; } + /* If this is a process kernel thread, ignore it. 
*/ + if ((t->t_proc_flag & TP_KTHREAD) != 0) { + continue; + } + thread_lock(t); /* make sure thread is in good state */ switch (t->t_state) { default: @@ -201,6 +206,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. @@ -534,6 +540,12 @@ prexecend(void) pcp->prc_tslot = tslot; } } + + /* + * There may be threads waiting for the flag change blocked behind the + * pr_pid_cv as well. + */ + cv_signal(&pr_pid_cv[p->p_slot]); } /* @@ -919,6 +931,29 @@ prgetstatus(proc_t *p, pstatus_t *sp, zone_t *zp) sp->pr_flags = sp->pr_lwp.pr_flags; } +/* + * Query mask of held signals for a given thread. + * + * This makes use of schedctl_sigblock() to query if userspace has requested + * that all maskable signals be held. While it would be tempting to call + * schedctl_finish_sigblock() and apply that update to t->t_hold, it cannot be + * done safely without the risk of racing with the thread under consideration. + */ +void +prgethold(kthread_t *t, sigset_t *sp) +{ + k_sigset_t set; + + if (schedctl_sigblock(t)) { + set.__sigbits[0] = FILLSET0 & ~CANTMASK0; + set.__sigbits[1] = FILLSET1 & ~CANTMASK1; + set.__sigbits[2] = FILLSET2 & ~CANTMASK2; + } else { + set = t->t_hold; + } + sigktou(&set, sp); +} + #ifdef _SYSCALL32_IMPL void prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp) @@ -980,8 +1015,7 @@ prgetlwpstatus32(kthread_t *t, lwpstatus32_t *sp, zone_t *zp) sp->pr_lwpid = t->t_tid; sp->pr_cursig = lwp->lwp_cursig; prassignset(&sp->pr_lwppend, &t->t_sig); - schedctl_finish_sigblock(t); - prassignset(&sp->pr_lwphold, &t->t_hold); + prgethold(t, &sp->pr_lwphold); if (t->t_whystop == PR_FAULTED) { siginfo_kto32(&lwp->lwp_siginfo, &sp->pr_info); if (t->t_whatstop == FLTPAGE) @@ -1212,8 +1246,7 @@ prgetlwpstatus(kthread_t *t, lwpstatus_t *sp, zone_t *zp) sp->pr_lwpid = t->t_tid; sp->pr_cursig = lwp->lwp_cursig; prassignset(&sp->pr_lwppend, &t->t_sig); - schedctl_finish_sigblock(t); - prassignset(&sp->pr_lwphold, &t->t_hold); + prgethold(t, &sp->pr_lwphold); if (t->t_whystop == PR_FAULTED) bcopy(&lwp->lwp_siginfo, &sp->pr_info, sizeof (k_siginfo_t)); @@ -1370,10 +1403,10 @@ prgetaction32(proc_t *p, user_t *up, uint_t sig, struct sigaction32 *sp) /* * Count the number of segments in this process's address space. */ -int +uint_t prnsegs(struct as *as, int reserved) { - int n = 0; + uint_t n = 0; struct seg *seg; ASSERT(as != &kas && AS_WRITE_HELD(as)); @@ -1390,8 +1423,21 @@ prnsegs(struct as *as, int reserved) for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) { (void) pr_getprot(seg, reserved, &tmp, &saddr, &naddr, eaddr); - if (saddr != naddr) + if (saddr != naddr) { n++; + /* + * prnsegs() was formerly designated to return + * an 'int' despite having no ability or use + * for negative results. As part of changing + * it to 'uint_t', keep the old effective limit + * of INT_MAX in place. 
+ */ + if (n == INT_MAX) { + pr_getprot_done(&tmp); + ASSERT(tmp == NULL); + return (n); + } + } } ASSERT(tmp == NULL); @@ -2591,7 +2637,6 @@ prgetlwpsinfo(kthread_t *t, lwpsinfo_t *psp) void prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp) { - proc_t *p = ttoproc(t); klwp_t *lwp = ttolwp(t); sobj_ops_t *sobj; char c, state; @@ -2599,7 +2644,7 @@ prgetlwpsinfo32(kthread_t *t, lwpsinfo32_t *psp) int retval, niceval; hrtime_t hrutime, hrstime; - ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); bzero(psp, sizeof (*psp)); diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 2cf007c42c..657cebf8c2 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -98,6 +98,11 @@ struct prdirect { #define PRSDSIZE (sizeof (struct prdirect)) /* + * Maximum length of the /proc/$$/argv file: + */ +int prmaxargvlen = 4096; + +/* * Directory characteristics. */ typedef struct prdirent { @@ -170,6 +175,8 @@ static prdirent_t piddir[] = { { PR_LDT, 28 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif + { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, }; #define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2) @@ -588,6 +595,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_lwpname(), @@ -617,6 +625,7 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -681,6 +690,41 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = prmaxargvlen, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. + */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1827,6 +1871,7 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2753,6 +2798,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. 
+ */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* Note we intentionally don't handle partial writes/updates. 
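pr_write_psinfo() above honors only two very specific writes: exactly PRFNSZ bytes at the pr_fname offset, or exactly PRARGSZ bytes at the pr_psargs offset; anything else is ignored. Because the kernel checks the offset against psinfo32_t for 32-bit callers, a natively compiled program can simply use offsetof() on its own psinfo_t. A minimal sketch of a process renaming itself:

#include <fcntl.h>
#include <procfs.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    char fname[PRFNSZ] = { 0 };
    int fd;

    (void) strlcpy(fname, "renamed", sizeof (fname));
    if ((fd = open("/proc/self/psinfo", O_WRONLY)) < 0) {
        perror("open");
        return (1);
    }
    /* The write must be exactly PRFNSZ bytes at the pr_fname offset. */
    if (pwrite(fd, fname, PRFNSZ,
        offsetof(psinfo_t, pr_fname)) != PRFNSZ) {
        perror("pwrite");
        return (1);
    }
    (void) close(fd);
    (void) pause();    /* observe the new name with ps(1) */
    return (0);
}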
*/ static int pr_write_lwpname(prnode_t *pnp, uio_t *uiop) @@ -2879,6 +3021,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + case PR_LWPNAME: return (pr_write_lwpname(pnp, uiop)); @@ -3168,6 +3313,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = prmaxargvlen; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3344,6 +3496,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3429,6 +3582,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4706,16 +4860,17 @@ prgetnode(vnode_t *dp, prnodetype_t type) pnp->pr_mode = 0600; /* read-write by owner only */ break; + case PR_PSINFO: case PR_LWPNAME: pnp->pr_mode = 0644; /* readable by all + owner can write */ break; - case PR_PSINFO: case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -4821,6 +4976,7 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -4972,6 +5128,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: break; default: continue; diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c index 126eb9f82e..62d2c080b6 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c @@ -362,6 +362,7 @@ smb_kshare_g_fini(void) kmem_cache_destroy(smb_kshare_cache_vfs); } + /* * A list of shares in nvlist format can be sent down * from userspace thourgh the IOCTL interface. The nvlist diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index cf6082e477..1c0010b2c2 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -847,6 +847,22 @@ smb_server_enum(smb_ioc_svcenum_t *ioc) smb_svcenum_t *svcenum = &ioc->svcenum; smb_server_t *sv; int rc; + uint32_t buflen_adjusted; + + /* + * Reality check that the buffer-length insize the enum doesn't + * overrun the ioctl's total length. + * + * NOTE: Assume se_buf is at the end of smb_svcenum_t. + */ + buflen_adjusted = svcenum->se_buflen + + offsetof(smb_svcenum_t, se_buf) + sizeof (ioc->hdr); + if (buflen_adjusted < svcenum->se_buflen || /* Overflow check 1, */ + buflen_adjusted < offsetof(smb_svcenum_t, se_buf) || /* check 2, */ + buflen_adjusted < sizeof (ioc->hdr) || /* check 3. 
*/ + buflen_adjusted > ioc->hdr.len) { + return (EINVAL); + } /* * Reality check that the buffer-length insize the enum doesn't diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index 87e29b21ae..e7d69f9896 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright 2017 Sebastian Wiedenroth */ @@ -504,6 +505,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + return (0); } @@ -657,6 +661,10 @@ sonode_fini(struct sonode *so) if (so->so_filter_top != NULL) sof_sonode_cleanup(so); + /* Clean up any remnants of krecv callbacks */ + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + ASSERT(list_is_empty(&so->so_acceptq_list)); ASSERT(list_is_empty(&so->so_acceptq_defer)); ASSERT(!list_link_active(&so->so_acceptq_node)); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index e5bc6dc845..9b8186a8a0 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -305,7 +305,7 @@ so_connect(struct sonode *so, struct sockaddr *name, * This can happen if a non blocking operation caused an error. 
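The smb_server_enum() change above validates a caller-supplied se_buflen by checking each partial sum for wraparound before comparing the adjusted length against the total ioctl length. The same idea reduced to a standalone sketch (names here are hypothetical, and the fixed overhead is folded into one compile-time constant so a single wrap check suffices):

#include <stdint.h>

/*
 * Return 0 if 'buflen' plus a fixed header/structure overhead fits in
 * 'total_len' without the 32-bit addition wrapping, otherwise -1.
 */
static int
enum_len_ok(uint32_t buflen, uint32_t overhead, uint32_t total_len)
{
    uint32_t adjusted = buflen + overhead;

    if (adjusted < buflen)          /* the addition wrapped around */
        return (-1);
    if (adjusted > total_len)       /* does not fit inside the ioctl */
        return (-1);
    return (0);
}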
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -404,7 +404,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -513,7 +513,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag, error = EPIPE; break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); - if ((so->so_mode & SM_SENDFILESUPP) == 0) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, B_FALSE); @@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -876,7 +871,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * If there is a pending error, return error * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -1329,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp, } } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + boolean_t cont; + so_krecv_f func = so->so_krecv_cb; + void *arg = so->so_krecv_arg; + + mutex_exit(&so->so_lock); + cont = func(so, mp, msg_size, flags & MSG_OOB, arg); + mutex_enter(&so->so_lock); + if (cont == B_TRUE) { + space_left = so->so_rcvbuf; + } else { + so->so_rcv_queued = so->so_rcvlowat; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + mutex_exit(&so->so_lock); + if (flags & MSG_OOB) { so_queue_oob(so, mp, msg_size); mutex_enter(&so->so_lock); @@ -1607,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, return (ENOTCONN); } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + mutex_exit(&so->so_lock); + return (EOPNOTSUPP); + } + mutex_exit(&so->so_lock); + if (msg->msg_flags & MSG_PEEK) msg->msg_flags &= ~MSG_WAITALL; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 957c8f93b4..df159a122c 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -24,6 +24,7 @@ */ /* * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -670,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop, int more = 0; int error; ssize_t oobmark; + ssize_t copied = 0; sodirect_t *sodp = so->so_direct; + xuio_t *xuio = NULL; partial_read = B_FALSE; *mctlp = NULL; + if ((uiop->uio_extflg & UIO_XUIO) != 0) { + xuio = (xuio_t *)uiop; + } again: mutex_enter(&so->so_lock); again1: @@ -784,8 +790,6 @@ again1: * enabled socket, uio_resid can be 0. */ if (uiop->uio_resid >= 0) { - ssize_t copied = 0; - if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) { mutex_enter(&so->so_lock); ASSERT(uiop == (uio_t *)&sodp->sod_uioa); @@ -843,6 +847,18 @@ again1: } if (mp != NULL) { /* more data blocks in msg */ more |= MOREDATA; + + /* + * If requested, tally up remaining data along with the + * amount already copied. + */ + if (xuio != NULL && + xuio->xu_type == UIOTYPE_PEEKSIZE) { + xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE; + xuio->xu_ext.xu_ps.xu_ps_size = + copied + msgdsize(mp); + } + if ((flags & (MSG_PEEK|MSG_TRUNC))) { if (flags & MSG_PEEK) { freemsg(mp); @@ -2276,9 +2292,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Cannot fallback if the socket has active filters + * Cannot fallback if the socket has active filters or a krecv callback. */ - if (so->so_filter_active > 0) + if (so->so_filter_active > 0 || so->so_krecv_cb != NULL) return (EINVAL); switch (so->so_family) { @@ -2456,3 +2472,53 @@ out: return (error); } + +int +so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg) +{ + int ret; + + if (cb == NULL && arg != NULL) + return (EINVAL); + + SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg)); + + mutex_enter(&so->so_lock); + if (so->so_state & SS_FALLBACK_COMP) { + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (ENOTSUP); + } + + ret = so_lock_read(so, 0); + VERIFY(ret == 0); + /* + * Other consumers may actually care about getting extant data delivered + * to them, when they come along, they should figure out the best API + * for that. 
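so_queue_msg_impl() above now hands inbound mblks to a registered kernel receive callback and treats its boolean return as a flow-control signal: B_TRUE keeps data flowing, while B_FALSE makes sockfs report ENOSPC and back-pressure the sender until so_krecv_unblock() is called. A sketch of a hypothetical in-kernel consumer, assuming so_krecv_f matches the call site above (boolean_t (*)(struct sonode *, mblk_t *, size_t, int, void *)) and that the usual stream and socket headers declare these types:

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/stream.h>
#include <sys/socketvar.h>    /* struct sonode; header location assumed */

/* Hypothetical consumer state; not part of the change above. */
typedef struct my_sink {
    kmutex_t    ms_lock;
    mblk_t      *ms_head;
    size_t      ms_bytes;
    size_t      ms_limit;
} my_sink_t;

static boolean_t
my_krecv(struct sonode *so, mblk_t *mp, size_t len, int oob, void *arg)
{
    my_sink_t *ms = arg;
    boolean_t keep_going;

    /* OOB data is not treated specially in this sketch. */
    mutex_enter(&ms->ms_lock);
    /* Take ownership of mp: stash it on a list for a worker to drain. */
    mp->b_next = ms->ms_head;
    ms->ms_head = mp;
    ms->ms_bytes += len;
    keep_going = (ms->ms_bytes < ms->ms_limit) ? B_TRUE : B_FALSE;
    mutex_exit(&ms->ms_lock);

    /* B_FALSE asks sockfs to assert flow control on this socket. */
    return (keep_going);
}

/*
 * Registration fails with ENOTSUP once a TPI fallback has completed:
 *     error = so_krecv_set(so, my_krecv, ms);
 * After the worker drains ms_head and drops ms_bytes, lift flow control:
 *     so_krecv_unblock(so);
 */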
+ */ + so_rcv_flush(so); + + so->so_krecv_cb = cb; + so->so_krecv_arg = arg; + + so_unlock_read(so); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + + return (0); +} + +void +so_krecv_unblock(sonode_t *so) +{ + mutex_enter(&so->so_lock); + VERIFY(so->so_krecv_cb != NULL); + + so->so_rcv_queued = 0; + (void) so_check_flow_control(so); + /* + * so_check_flow_control() always drops so->so_lock, so we won't + * need to drop it ourselves. + */ +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 971523945e..7dca6ae6fc 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occured. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. */ int @@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index 7f7aece1f1..cf2ad8b20d 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
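sof_register() above now retains the flags, so a filter registered with SOF_ATT_SAFE may still be attached after the "safe" initial operations (bind, getsockname, getsockopt, setsockopt) have been performed on a socket. A sketch of a filter module registering itself that way; the ops vector contents and the usual modlinkage boilerplate are elided, the module name is hypothetical, and sof_unregister() is assumed to be the matching teardown entry point:

#include <sys/sockfilter.h>

static sof_ops_t myfilt_ops;    /* populate the event callbacks as needed */

int
_init(void)
{
    /* SOF_ATT_SAFE: attaching after bind()/getsockopt() is still legal. */
    return (sof_register(SOF_VERSION, "myfilt", &myfilt_ops,
        SOF_ATT_SAFE));
}

int
_fini(void)
{
    return (sof_unregister("myfilt"));
}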
*/ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index ed3c5967e1..7a7651edb5 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ @@ -437,8 +438,10 @@ sogetoff(mblk_t *mp, t_uscalar_t offset, * * The underlying filesystem VSOCK vnode has a v_stream pointer that * references the actual stream head (hence indirectly the actual sonode). + * + * This function is non-static so it can be used by brand emulation. */ -static int +int so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, vnode_t **vpp) { @@ -1883,7 +1886,7 @@ ssize_t soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) { struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec aiov[1]; register vnode_t *vp; int ioflag, rwflag; ssize_t cnt; diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index 4cbd079539..e0b6b5de43 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -21,6 +21,8 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -54,6 +56,7 @@ #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/policy.h> +#include <sys/limits.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -86,12 +89,6 @@ extern void nl7c_init(void); extern int sockfs_defer_nl7c_init; /* - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? - */ -#define MSG_MAXIOVLEN 16 - -/* * Kernel component of socket creation. * * The socket library determines which version number to use. @@ -1021,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) STRUCT_HANDLE(nmsghdr, umsgptr); struct nmsghdr lmsg; struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; int *flagsp; model_t model; @@ -1066,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1089,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1105,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1119,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) (do_useracc == 0 || useracc(lmsg.msg_control, lmsg.msg_controllen, B_WRITE) != 0)) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } - return (recvit(sock, &lmsg, &auio, flags, + rval = recvit(sock, &lmsg, &auio, flags, STRUCT_FADDR(umsgptr, msg_namelen), - STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); + STRUCT_FADDR(umsgptr, msg_controllen), flagsp); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } /* @@ -1262,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) struct nmsghdr lmsg; STRUCT_DECL(nmsghdr, u_lmsg); struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; model_t model; @@ -1307,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { /* * Unless this is XPG 4.2 we allow iovcnt == 0 to * be compatible with SunOS 4.X and 4.4BSD. @@ -1316,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
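recvmsg() and sendmsg() above replace the fixed MSG_MAXIOVLEN (16) arrays with a small on-stack array that spills to kmem_alloc() only when the caller passes more than IOV_MAX_STACK entries, up to IOV_MAX. The shape of that pattern, reduced to a standalone user-level sketch (malloc stands in for kmem_alloc, and MY_IOV_STACK is a stand-in constant):

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define MY_IOV_STACK    16      /* stand-in for IOV_MAX_STACK */

static int
process_iov(const struct iovec *uiov, int iovcnt)
{
    struct iovec buf[MY_IOV_STACK], *aiov = buf;
    size_t iovsize = 0;
    int rc = 0;

    if (iovcnt <= 0 || iovcnt > IOV_MAX)
        return (-1);

    /* Small counts stay on the stack; large ones go to the heap. */
    if (iovcnt > MY_IOV_STACK) {
        iovsize = iovcnt * sizeof (struct iovec);
        if ((aiov = malloc(iovsize)) == NULL)
            return (-1);
    }
    memcpy(aiov, uiov, iovcnt * sizeof (struct iovec));

    /* ... operate on aiov[0..iovcnt-1] ... */

    if (iovsize != 0)
        free(aiov);
    return (rc);
}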
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + if (iovcnt != 0 && - copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1336,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (iovcnt != 0 && copyin(lmsg.msg_iov, aiov, (unsigned)iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1354,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1364,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) auio.uio_segflg = UIO_USERSPACE; auio.uio_limit = 0; - return (sendit(sock, &lmsg, &auio, flags)); + rval = sendit(sock, &lmsg, &auio, flags); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } ssize_t diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h index 6a515be122..24acb81a0a 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi_impl.h +++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKTPI_IMPL_H @@ -56,6 +57,8 @@ extern int sogetrderr(vnode_t *, int, int *); extern int sogetwrerr(vnode_t *, int, int *); extern int so_addr_verify(struct sonode *, const struct sockaddr *, socklen_t); +extern int so_ux_lookup(struct sonode *, struct sockaddr_un *, int, + vnode_t **); extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *, socklen_t, int, void **, socklen_t *); extern void so_unix_close(struct sonode *); diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. 
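swapfs_recalc() above keeps the one-eighth-of-memory default for swapfs_minfree but now clamps it at 256 MB. With 4 KB pages, a 16 GB system has pgs = 4194304, so pgs >> 3 = 524288 pages (2 GB), which the new MIN() reduces to btopr(256 MB) = 65536 pages; a 1 GB system is unaffected at 32768 pages (128 MB). The same computation as a tiny standalone sketch (page size assumed to be 4 KB):

#include <stdio.h>

#define PAGESIZE    4096ULL         /* assumed for illustration */
#define BTOPR(x)    (((x) + PAGESIZE - 1) / PAGESIZE)
#define MAX(a, b)   ((a) > (b) ? (a) : (b))
#define MIN(a, b)   ((a) < (b) ? (a) : (b))

int
main(void)
{
    unsigned long long pgs = (16ULL << 30) / PAGESIZE;  /* 16 GB of RAM */
    unsigned long long minfree = MIN(MAX(BTOPR(2ULL << 20), pgs >> 3),
        BTOPR(256ULL << 20));

    (void) printf("swapfs_minfree = %llu pages (%llu MB)\n",
        minfree, minfree * PAGESIZE >> 20);     /* 65536 pages, 256 MB */
    return (0);
}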
*/ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index f6621c8097..1a620642cc 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/sysmacros.h> @@ -383,20 +382,7 @@ tdirenter( /* * Unmake the inode we just made. */ - rw_enter(&tp->tn_rwlock, RW_WRITER); - if ((tp->tn_type) == VDIR) { - ASSERT(tdp == NULL); - /* - * cleanup allocs made by tdirinit() - */ - tdirtrunc(tp); - } - mutex_enter(&tp->tn_tlock); - tp->tn_nlink = 0; - mutex_exit(&tp->tn_tlock); - gethrestime(&tp->tn_ctime); - rw_exit(&tp->tn_rwlock); - tmpnode_rele(tp); + tmpnode_cleanup(tp); tp = NULL; } } else if (tpp) { @@ -431,6 +417,7 @@ tdirdelete( enum dr_op op, struct cred *cred) { + struct tmount *tm; struct tdirent *tpdp; int error; size_t namelen; @@ -516,7 +503,8 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + tm = TNTOTM(dir); + tmp_kmem_free(tm, tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -538,19 +526,27 @@ tdirdelete( * tdirinit is used internally to initialize a directory (dir) * with '.' and '..' entries without checking permissions and locking */ -void +int tdirinit( struct tmpnode *parent, /* parent of directory to initialize */ struct tmpnode *dir) /* the new directory */ { + struct tmount *tm; struct tdirent *dot, *dotdot; timestruc_t now; ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + tm = TNTOTM(parent); + dot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 2, KM_SLEEP); + if (dot == NULL) + return (ENOSPC); + dotdot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 3, KM_SLEEP); + if (dotdot == NULL) { + tmp_kmem_free(tm, dot, sizeof (struct tdirent) + 2); + return (ENOSPC); + } /* * Initialize the entries @@ -601,6 +597,8 @@ tdirinit( dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */ dir->tn_dirents = 2; dir->tn_nlink = 2; + + return (0); } @@ -612,6 +610,7 @@ tdirtrunc(struct tmpnode *dir) { struct tdirent *tdp; struct tmpnode *tp; + struct tmount *tm; size_t namelen; timestruc_t now; int isvattrdir, isdotdot, skip_decr; @@ -619,6 +618,8 @@ tdirtrunc(struct tmpnode *dir) ASSERT(RW_WRITE_HELD(&dir->tn_rwlock)); ASSERT(dir->tn_type == VDIR); + tm = TNTOTM(dir); + isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 
1 : 0; for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) { ASSERT(tdp->td_next != tdp); @@ -650,7 +651,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + tmp_kmem_free(tm, tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -903,6 +904,7 @@ tdiraddentry( enum de_op op, struct tmpnode *fromtp) { + struct tmount *tm; struct tdirent *tdp, *tpdp; size_t namelen, alloc_size; timestruc_t now; @@ -923,9 +925,10 @@ tdiraddentry( /* * Allocate and initialize directory entry */ + tm = TNTOTM(dir); namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP | KM_NORMALPRI); if (tdp == NULL) return (ENOSPC); @@ -1025,7 +1028,10 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + return (ENOSPC); + } tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ @@ -1087,8 +1093,13 @@ tdirmaketnode( if (va->va_mask & AT_MTIME) tp->tn_mtime = va->va_mtime; - if (op == DE_MKDIR) - tdirinit(dir, tp); + if (op == DE_MKDIR) { + int ret; + if ((ret = tdirinit(dir, tp)) != 0) { + tmpnode_cleanup(tp); + return (ret); + } + } *newnode = tp; return (0); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 8723631555..0c48c03a75 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -43,6 +43,7 @@ #include <sys/fs/tmpnode.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <vm/anon.h> #define KILOBYTE 1024 #define MEGABYTE (1024 * KILOBYTE) @@ -54,6 +55,80 @@ extern pgcnt_t swapfs_minfree; +void * +tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag) +{ + void *buf; + zone_t *zone; + size_t pages; + + mutex_enter(&tm->tm_contents); + zone = tm->tm_vfsp->vfs_zone; + if (tm->tm_anonmem + size > tm->tm_anonmax || + tm->tm_anonmem + size < tm->tm_anonmem || + size + ptob(tmpfs_minfree) <= size || + !anon_checkspace(size + ptob(tmpfs_minfree), zone)) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + /* + * Only make anonymous memory reservations when a page boundary is + * crossed. This is necessary since the anon_resv functions rounds up + * to PAGESIZE internally. + */ + pages = btopr(tm->tm_allocmem + size); + pages -= btopr(tm->tm_allocmem); + if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + tm->tm_allocmem += size; + tm->tm_anonmem += size; + mutex_exit(&tm->tm_contents); + + buf = kmem_zalloc(size, flag); + if (buf == NULL) { + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + if (pages > 0) { + /* + * Re-chasing the zone pointer is necessary since a + * forced umount could have been performed while the + * tm_contents lock was dropped during allocation. 
+ */ + anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); + } + + return (buf); +} + +void +tmp_kmem_free(struct tmount *tm, void *buf, size_t size) +{ + size_t pages; + + kmem_free(buf, size); + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + pages = btopr(tm->tm_allocmem); + tm->tm_allocmem -= size; + pages -= btopr(tm->tm_allocmem); + /* + * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when + * a page boundary has been crossed. + */ + if (pages > 0) { + anon_unresv_zone(size, tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); +} + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -99,42 +174,8 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. - */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* - * Convert a string containing a number (number of bytes) to a pgcnt_t, - * containing the corresponding number of pages. On 32-bit kernels, the + * Convert a string containing a number (number of bytes) to a size_t, + * containing the corresponding number of bytes. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * @@ -152,7 +193,7 @@ tmp_memfree(void *cp, size_t size) * error. */ int -tmp_convnum(char *str, pgcnt_t *maxpg) +tmp_convnum(char *str, size_t *maxbytes) { u_longlong_t num = 0; #ifdef _LP64 @@ -160,6 +201,7 @@ tmp_convnum(char *str, pgcnt_t *maxpg) #else u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif + size_t pages; char *c; const struct convchar { char *cc_char; @@ -250,13 +292,21 @@ valid_char: done: /* - * Since btopr() rounds up to page granularity, this round-up can - * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) - * and (max_bytes). In this case the resulting number is zero, which - * is what we check for below. + * We've been given a size in bytes; however, we want to make sure that + * we have at least one page worth no matter what. Therefore we use + * btopr to round up. However, this may cause an overflow only if 'num' + * is between (max_bytes - PAGESIZE) and (max_bytes). In this case the + * resulting number is zero, which is what we check for below. Note, we + * require at least one page, so if pages is zero, well, it wasn't going + * to work anyways. 
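tmp_kmem_zalloc() and tmp_kmem_free() above account allocations in bytes (tm_allocmem) but only reserve or release anonymous memory when the running total crosses a page boundary, because the anon_resv routines round to whole pages internally. The boundary arithmetic, isolated as a standalone sketch (page size assumed to be 4 KB):

#include <stdio.h>

#define PAGESIZE    4096UL          /* assumed for illustration */
#define BTOPR(x)    (((x) + PAGESIZE - 1) / PAGESIZE)

/*
 * Return how many additional pages must be reserved when 'size' bytes
 * are added to a running allocation total of 'allocmem' bytes.
 */
static unsigned long
pages_to_reserve(unsigned long allocmem, unsigned long size)
{
    return (BTOPR(allocmem + size) - BTOPR(allocmem));
}

int
main(void)
{
    /* 100 bytes on top of 4000 already allocated crosses one page. */
    (void) printf("%lu\n", pages_to_reserve(4000, 100));    /* 1 */
    /* 100 more within the same page needs no new reservation. */
    (void) printf("%lu\n", pages_to_reserve(4100, 100));    /* 0 */
    return (0);
}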
*/ - if ((*maxpg = (pgcnt_t)btopr(num)) == 0 && num != 0) + pages = btopr(num); + if (pages == 0) { return (EINVAL); + } + + *maxbytes = ptob(pages); + return (0); } diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 51e57b2611..13ea356924 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -64,21 +65,35 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + size_t pbytes = ptob(pages); zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* - * pagecreate is set only if we actually need to call anon_resv - * to reserve an additional page of anonymous memory. - * Since anon_resv always reserves a page at a time, - * it should only get called when we know we're growing the - * file into a new page or filling a hole. + * pagecreate is set only if we actually need to call anon_resv to + * reserve an additional page of anonymous memory. Since anon_resv + * always reserves a page at a time, it should only get called when we + * know we're growing the file into a new page or filling a hole. This + * is why we transform delta into a number of pages. However, because we + * track bytes and not pages, we convert that back to a number of bytes + * that we allocate against. * - * Deny if trying to reserve more than tmpfs can allocate + * Deny if trying to reserve more than tmpfs can allocate, the + * allocation causes an overflow, or the delta round up overflowed. + * Note, that btopr rounds up, so we need to catch the unsigned + * overflow. Note, rounding up when we are within a page of SIZE_MAX is + * done by adding a page, overflowing, which will then be rounded back + * to zero. Hence the following check. */ + if (pages == 0 && delta != 0) + return (1); + zone = tm->tm_vfsp->vfs_zone; - if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || + if (pagecreate && ((tm->tm_anonmem + pbytes > tm->tm_anonmax) || + (tm->tm_anonmem + pbytes < tm->tm_anonmem) || + (ptob(pages + tmpfs_minfree) <= pbytes) || (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || (anon_try_resv_zone(delta, zone) == 0))) { return (1); @@ -89,7 +104,7 @@ tmp_resv( */ if (pagecreate) { mutex_enter(&tm->tm_contents); - tm->tm_anonmem += pages; + tm->tm_anonmem += pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", @@ -110,13 +125,27 @@ tmp_unresv( struct tmpnode *tp, size_t delta) { + size_t pages, pbytes; + ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* + * If this is true, we have a grevious overflow bug and some size + * accounting has been messed with as having an amount to truncate at + * this size would imply that all of memory was used for this file. No + * matter how small the kernel, it will always need at least one page. 
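tmp_resv() above rejects a request when btopr(delta) comes back as zero for a nonzero delta: rounding a byte count in the last partial page below the type's maximum up to a page boundary wraps to zero, so that combination can only mean unsigned overflow. The same test as a standalone sketch (page size assumed to be 4 KB):

#include <stdio.h>

#define PAGESIZE    4096UL          /* assumed for illustration */
#define BTOPR(x)    (((x) + PAGESIZE - 1) / PAGESIZE)

/*
 * Rounding 'bytes' up to whole pages overflowed exactly when the result
 * is zero while 'bytes' itself is not.
 */
static int
roundup_overflowed(size_t bytes)
{
    return (BTOPR(bytes) == 0 && bytes != 0);
}

int
main(void)
{
    (void) printf("%d\n", roundup_overflowed(4096));        /* 0 */
    (void) printf("%d\n", roundup_overflowed((size_t)-1));  /* 1 */
    return (0);
}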
+ */ + pages = btopr(delta); + if (pages == 0 && delta != 0) + panic("tmpfs unsigned overflow detected"); + pbytes = ptob(pages); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); - tm->tm_anonmem -= btopr(delta); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - pbytes); + tm->tm_anonmem -= pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", tp, delta); @@ -154,6 +183,26 @@ tmpnode_growmap(struct tmpnode *tp, ulong_t newsize) } /* + * This is used to clean up a tmpnode that hasn't made it out the door. In other + * words, we allocated it and did a tmpnode_init; however, before it could get + * fully inserted into a directory, bad things happened and it failed. + */ +void +tmpnode_cleanup(struct tmpnode *tp) +{ + rw_enter(&tp->tn_rwlock, RW_WRITER); + if ((tp->tn_type) == VDIR) { + tdirtrunc(tp); + } + mutex_enter(&tp->tn_tlock); + tp->tn_nlink = 0; + mutex_exit(&tp->tn_tlock); + gethrestime(&tp->tn_ctime); + rw_exit(&tp->tn_rwlock); + tmpnode_rele(tp); +} + +/* * Initialize a tmpnode and add it to file list under mount point. */ void @@ -232,7 +281,6 @@ tmpnode_trunc( { size_t oldsize = tp->tn_size; size_t delta; - struct vnode *vp = TNTOV(tp); timestruc_t now; int error = 0; @@ -316,7 +364,7 @@ tmpnode_trunc( /* Delete anon array for tmpnode */ ASSERT(tp->tn_nblocks == 0); ASSERT(anon_get_ptr(tp->tn_anon, 0) == NULL); - ASSERT(!vn_has_cached_data(vp)); + ASSERT(!vn_has_cached_data(TNTOV(tp))); anon_release(tp->tn_anon, tp->tn_asize); tp->tn_anon = NULL; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index a7cf62cb99..c52a6f7c77 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -56,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. */ static int tmpfsinit(int, char *); @@ -65,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -123,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -141,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. 
tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -157,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. */ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -178,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -212,18 +221,12 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } @@ -234,7 +237,7 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) struct tmpnode *tp; struct pathname dpn; int error; - pgcnt_t anonmax; + size_t anonmax; struct vattr rattr; int got_attrs; boolean_t mode_arg = B_FALSE; @@ -278,7 +281,18 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { - anonmax = ULONG_MAX; + anonmax = SIZE_MAX; + } + + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. + */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; } /* @@ -311,7 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if ((tm = kmem_zalloc(sizeof (struct tmount), + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -343,17 +358,37 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* + * Preemptively set vfs_zone before any of the tmp_kmem_* functions are + * called. That field is not populated until after a successful + * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An + * accurate value is required for proper swap usage accounting. 
+ */ + ASSERT0(uap->flags & MS_REMOUNT); + ASSERT(vfsp->vfs_zone == NULL); + vfsp->vfs_zone = curproc->p_zone; + + /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + + pn_free(&dpn); + error = ENOMEM; + goto out; + } tmpnode_init(tm, tp, &rattr, cr); /* @@ -392,12 +427,34 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) tp->tn_nlink = 0; tm->tm_rootnode = tp; - tdirinit(tp, tp); + if (tdirinit(tp, tp) != 0) { + /* + * While we would normally let our VOP_INACTIVE function take + * care of cleaning up here, we're in a bit of a delicate + * situation, so we do so manually. While it's tempting to try + * and rely upon tmpfs_freevfs() and others, it's probably safer + * for the time to do this manually at the cost of duplication. + */ + vn_invalid(TNTOV(tp)); + rw_destroy(&tp->tn_rwlock); + mutex_destroy(&tp->tn_tlock); + vn_free(TNTOV(tp)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + pn_free(&dpn); + error = ENOMEM; + goto out; + } rw_exit(&tp->tn_rwlock); pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -413,36 +470,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. 
*/ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. + */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -452,14 +580,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_umount. 
+ */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -469,6 +633,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -535,15 +709,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -605,18 +780,19 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * If tm_anonmax for this mount is less than the available swap space * (minus the amount tmpfs can't use), use that instead */ - if (blocks > tmpfs_minfree) + if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) { sbp->f_bfree = MIN(blocks - tmpfs_minfree, - tm->tm_anonmax - tm->tm_anonmem); - else + btop(tm->tm_anonmax) - btopr(tm->tm_anonmem)); + } else { sbp->f_bfree = 0; + } sbp->f_bavail = sbp->f_bfree; /* * Total number of blocks is what's available plus what's been used */ - sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem)); if (eff_zid != GLOBAL_ZONEUNIQID && zp->zone_max_swap_ctl != UINT64_MAX) { @@ -646,13 +822,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index a09f206d88..a356f22750 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017 by Delphix. All rights reserved. @@ -586,6 +586,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. 
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -615,6 +619,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -788,8 +796,13 @@ tmp_setattr( rw_exit(&tp->tn_contents); rw_exit(&tp->tn_rwlock); - if (error == 0 && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } goto out1; } @@ -835,6 +848,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -853,6 +869,8 @@ tmp_lookup( rw_enter(&tp->tn_rwlock, RW_WRITER); if (tp->tn_xattrdp == NULL) { + int err; + if (!(flags & CREATE_XATTR_DIR)) { rw_exit(&tp->tn_rwlock); return (ENOENT); @@ -873,9 +891,13 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); tm = VTOTM(dvp); + xdp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), + KM_SLEEP); + if (xdp == NULL) { + rw_exit(&tp->tn_rwlock); + return (ENOSPC); + } tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* * Fix-up fields unique to attribute directories. @@ -893,7 +915,16 @@ tmp_lookup( } xdp->tn_vnode->v_type = VDIR; xdp->tn_vnode->v_flag |= V_XATTRDIR; - tdirinit(tp, xdp); + if ((err = tdirinit(tp, xdp)) != 0) { + rw_exit(&tp->tn_rwlock); + /* + * This never got properly initialized so we can + * just clean it up. + */ + xdp->tn_vnode->v_flag &= V_XATTRDIR; + tmpnode_cleanup(tp); + return (err); + } tp->tn_xattrdp = xdp; } else { VN_HOLD(tp->tn_xattrdp->tn_vnode); @@ -1302,10 +1333,8 @@ tmp_rename( vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); /* * vnevent_rename_dest is called in tdirenter(). - * Notify the target dir if not same as source dir. */ - if (ndvp != odvp) - vnevent_rename_dest_dir(ndvp, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); } done: @@ -1474,6 +1503,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1607,12 +1640,12 @@ tmp_symlink( rw_exit(&parent->tn_rwlock); if (error) { - if (self) + if (self != NULL) tmpnode_rele(self); return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP | KM_NORMALPRI); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1677,10 +1710,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - VN_RELE_LOCKED(vp); + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_vfsfree so we need to be sure we + * don't decrement in this case. 
+ */ + if (vp->v_count > 1) + VN_RELE_LOCKED(vp); + } else { + VN_RELE_LOCKED(vp); + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1705,7 +1755,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + tmp_kmem_free(tm, tp->tn_symlink, tp->tn_size + 1); } /* @@ -1739,7 +1789,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1861,6 +1915,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2082,6 +2140,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ @@ -2342,8 +2404,13 @@ tmp_space( return (EFBIG); error = tmp_freesp(vp, bfp, flag); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); } diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 054056c63a..51ce9b28af 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -569,8 +569,11 @@ udf_setattr( goto update_inode; } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file access or modified times. 
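The vnevent_truncate()/vnevent_resize() change above repeats across tmpfs, udfs, ufs, and zfs in this diff: setting a size of zero still raises VE_TRUNCATE, while any other explicit size change now raises the new VE_RESIZE event. A minimal sketch of the setattr form of the pattern, where myfs_setattr_size() and myfs_set_size() are hypothetical stand-ins for each filesystem's own code, not functions from the diff:

static int
myfs_setattr_size(vnode_t *vp, vattr_t *vap, caller_context_t *ct)
{
        int error;

        /* Apply the new size; the helper name is illustrative only. */
        error = myfs_set_size(vp, vap->va_size);
        if (error == 0) {
                if (vap->va_size == 0)
                        vnevent_truncate(vp, ct);  /* shrank to zero */
                else
                        vnevent_resize(vp, ct);    /* any other size change */
        }
        return (error);
}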
@@ -1649,8 +1652,13 @@ udf_space( } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { error = ud_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index 79ff1b7071..370c982f08 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -2084,8 +2084,13 @@ again: goto update_inode; } - if (error == 0 && vap->va_size) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } if (ulp) { @@ -3610,12 +3615,7 @@ retry_firstlock: if (error == 0) { vnevent_rename_src(ITOV(sip), sdvp, snm, ct); - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. - */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); } errout: @@ -4350,8 +4350,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, return (error); error = ufs_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else if (cmd == F_ALLOCSP) { error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FALLOCATE_MASK); @@ -5630,10 +5635,10 @@ ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp, struct ufsvfs *ufsvfsp; /* - * Regular files reject edge-triggered pollers. + * Regular files reject epollers (and edge-triggered pollers). * See the comment in fs_poll() for a more detailed explanation. */ - if (ev & POLLET) { + if (fs_reject_epoll() || (ev & POLLET) != 0) { return (EPERM); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 1bee02bfe6..77bc7817a8 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -857,9 +857,11 @@ vfs_mountroot(void) for (p = practive; p != NULL; p = p->p_next) { ASSERT(p == &p0 || p->p_parent == &p0); + mutex_enter(&p->p_lock); PTOU(p)->u_cdir = rootdir; VN_HOLD(PTOU(p)->u_cdir); PTOU(p)->u_rdir = NULL; + mutex_exit(&p->p_lock); } mutex_exit(&pidlock); @@ -3885,6 +3887,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 6e8f65cacb..6d6c4af5ca 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -207,6 +207,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
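The VOP_LATENCY_* constants added to vnode.c above are nanosecond thresholds, and fop_read()/fop_write() below treat them as cumulative buckets: an operation that takes 1.5 seconds increments the 10ms, 100ms, and 1s counters but not the 10s one. A hedged sketch of that bucketing, factored into a helper (the helper name is invented; the zone_vfs_kstat_t fields are the ones the diff uses):

static void
zone_vfs_latency_update(zone_vfs_kstat_t *zvp, hrtime_t lat)
{
        /* Buckets are cumulative; every threshold crossed gets a count. */
        if (lat < VOP_LATENCY_10MS)
                return;
        atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
        if (lat < VOP_LATENCY_100MS)
                return;
        atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
        if (lat < VOP_LATENCY_1S)
                return;
        atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
        if (lat >= VOP_LATENCY_10S)
                atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
}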
@@ -2543,6 +2548,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2557,12 +2563,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2649,6 +2656,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct) (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); } +void +vnevent_resize(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct); +} + /* * Vnode accessors. */ @@ -3424,14 +3440,58 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3443,14 +3503,63 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. 
+ */ + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index 0ab3513718..0dc61e4907 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -146,7 +146,10 @@ boolean_t zfs_abd_scatter_enabled = B_TRUE; * it at runtime would cause ABD iteration to work incorrectly for ABDs which * were allocated with the old size, so a safeguard has been put in place which * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. + * within a scattered ABD. Note that tuning this value to be smaller than the + * page size can induce heavy fragmentation in the slab layer, which may itself + * result in more memory waste than is saved by the smaller chunk size -- and + * will induces more computational work in the slab layer. Tune with caution! */ size_t zfs_abd_chunk_size = 4096; diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 844abbcd5d..1175faf65d 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -263,6 +263,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/zio_checksum.h> #include <sys/multilist.h> #include <sys/abd.h> @@ -324,7 +325,7 @@ int arc_grow_retry = 60; int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; +int zfs_arc_overflow_shift = 3; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; @@ -5342,6 +5343,14 @@ top: rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -6297,6 +6306,10 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + /* On larger-memory machines, we clamp the minimum at 1GB */ + if (zfs_arc_min == 0) + arc_c_min = MIN(arc_c_min, (1 << 30)); + if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 7421ea291b..979bb8848e 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -1008,8 +1008,17 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than max_bonuslen -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, max_bonuslen)); + } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 95ca9f76aa..966d155a9c 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -2178,7 +2178,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } - /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c index 542bb42f3f..bee41bd95e 100644 --- a/usr/src/uts/common/fs/zfs/dmu_recv.c +++ b/usr/src/uts/common/fs/zfs/dmu_recv.c @@ -1526,8 +1526,12 @@ receive_read_record(struct receive_arg *ra) { struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); + void *buf = NULL; dmu_object_info_t doi; + + if (size > 0) + buf = kmem_zalloc(size, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 6d65086079..d42a7c66de 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. 
* Copyright (c) 2014 Integros [integros.com] diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 53d5765bcb..6cb39d61a5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -213,6 +213,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 298516f8a4..35e76e273e 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -42,6 +42,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1398,7 +1399,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 54c88b1e3c..ce77b8c611 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -44,6 +44,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/vdev_impl.h> #include <sys/metaslab_impl.h> #include <sys/bptree.h> @@ -865,7 +866,7 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; - ASSERT3U(dp->dp_dirty_total, >=, space); + VERIFY3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 0044f37964..bfac5ddf1c 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -61,6 +62,11 @@ int zfs_metaslab_sm_blksz = (1 << 12); int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -152,6 +158,18 @@ int metaslab_load_pct = 50; int metaslab_unload_delay = TXG_SIZE * 2; /* + * Tunables used to reduce metaslab load/unload thrashing when selection + * algorithm is allocating across metaslabs very evenly. In addition to + * tracking when the slab was used for allocation (ms_selected_txg), we also + * track when it was loaded (ms_loaded_txg). 
If the slab would be unloaded, + * but the load txg is within the window of + * metaslab_unload_delay + metaslab_load_window + * then we ramp up metaslab_unload_delay instead of unloading the metaslab. + */ +int metaslab_load_window = 10; +int metaslab_unload_delay_max = 256; + +/* * Max number of metaslabs per group to preload. */ int metaslab_preload_limit = SPA_DVAS_PER_BP; @@ -713,6 +731,7 @@ metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; + char kstat_name[KSTAT_STRLEN]; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); @@ -737,6 +756,33 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } + + /* Create a kstat to monitor the loading and unloading of metaslabs. */ + (void) snprintf(kstat_name, sizeof (kstat_name), "%llx", + (unsigned long long) mg->mg_vd->vdev_guid); + + mutex_init(&mg->mg_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + if ((mg->mg_kstat = kstat_create("zfs_metaslab_group", 0, + kstat_name, "misc", KSTAT_TYPE_NAMED, + sizeof (metaslab_group_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + + metaslab_group_kstat_t *mg_kstat = kmem_zalloc( + sizeof (metaslab_group_kstat_t), KM_SLEEP); + kstat_named_init(&mg_kstat->mg_loads, "loads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_unloads, "unloads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_spa_name, "spa_name", + KSTAT_DATA_STRING); + kstat_named_setstr(&mg_kstat->mg_spa_name, + mg->mg_vd->vdev_spa->spa_name); + + mg->mg_kstat->ks_data = mg_kstat; + mg->mg_kstat->ks_lock = &mg->mg_kstat_lock; + kstat_install(mg->mg_kstat); + } + mc->mc_rotor = mg; } @@ -813,6 +859,14 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_prev = NULL; mg->mg_next = NULL; + + if (mg->mg_kstat != NULL) { + metaslab_group_kstat_t *data = mg->mg_kstat->ks_data; + + kstat_delete(mg->mg_kstat); + kmem_free(data, sizeof (metaslab_group_kstat_t)); + } + mutex_destroy(&mg->mg_kstat_lock); } boolean_t @@ -1773,8 +1827,9 @@ metaslab_load_impl(metaslab_t *msp) } int -metaslab_load(metaslab_t *msp) +metaslab_load(metaslab_t *msp, uint64_t txg) { + kstat_t *ksp; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* @@ -1787,9 +1842,16 @@ metaslab_load(metaslab_t *msp) VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + ksp = msp->ms_group->mg_kstat; + if (ksp != NULL) { + metaslab_group_kstat_t *mg_ksp = ksp->ks_data; + atomic_inc_64(&mg_ksp->mg_loads.value.ui64); + } + msp->ms_loading = B_TRUE; int error = metaslab_load_impl(msp); msp->ms_loading = B_FALSE; + msp->ms_loaded_txg = txg; cv_broadcast(&msp->ms_load_cv); return (error); @@ -1804,6 +1866,7 @@ metaslab_unload(metaslab_t *msp) range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_loaded_txg = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; @@ -1918,7 +1981,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, */ if (metaslab_debug_load && ms->ms_sm != NULL) { mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); + VERIFY0(metaslab_load(ms, txg)); mutex_exit(&ms->ms_lock); } @@ -2432,12 +2495,13 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, } static int -metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight, + uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = 
metaslab_load(msp); + int error = metaslab_load(msp, txg); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); @@ -2552,7 +2616,7 @@ metaslab_preload(void *arg) ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); - (void) metaslab_load(msp); + (void) metaslab_load(msp, spa_syncing_txg(spa)); msp->ms_selected_txg = spa_syncing_txg(spa); mutex_exit(&msp->ms_lock); } @@ -2625,6 +2689,9 @@ metaslab_should_condense(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + if (zfs_condense_never != 0) + return (B_FALSE); + /* * Allocations and frees in early passes are generally more space * efficient (in terms of blocks described in space map entries) @@ -3087,22 +3154,35 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. + * from it in 'metaslab_unload_delay' txgs, then we normally unload it. + * However, to prevent thrashing, if the metaslab was recently loaded, + * then instead of unloading it, we increase the unload delay (only up + * to the maximum). */ if (msp->ms_loaded && msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } + if (msp->ms_loaded_txg != 0 && msp->ms_loaded_txg + + metaslab_unload_delay + metaslab_load_window >= txg) { + if (metaslab_unload_delay + metaslab_load_window <= + metaslab_unload_delay_max) { + metaslab_unload_delay += metaslab_load_window; + } + DTRACE_PROBE1(zfs__metaslab__delay__unload, + metaslab_t *, msp); + } else { + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } - if (!metaslab_debug_unload) - metaslab_unload(msp); + if (!metaslab_debug_unload) + metaslab_unload(msp); + } } ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); @@ -3362,8 +3442,6 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); - /* Track the last successful allocation */ - msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } @@ -3545,7 +3623,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - if (metaslab_activate(msp, allocator, activation_weight) != 0) { + if (metaslab_activate(msp, allocator, activation_weight, + txg) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -4252,7 +4331,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM, txg); /* * No need to fail in that case; someone else has activated the * metaslab, but that doesn't preclude us from using it. diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index d3c0a3e8ef..1d7b72d72c 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,6 +24,7 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -400,15 +401,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; - int i; + int i, size; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + + if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) { + tb->lot_attrs = kmem_alloc(size, KM_SLEEP); + bcopy(attrs, tb->lot_attrs, size); + } + tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 403ace2d9d..0795e2c69b 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -27,7 +27,7 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <tsoome@me.com> - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. @@ -227,6 +227,13 @@ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; uint64_t zfs_max_missing_tvds_scan = 0; /* + * Interval in seconds at which to poll spare vdevs for health. + * Setting this to zero disables spare polling. + * Set to three hours by default. + */ +uint_t spa_spare_poll_interval_seconds = 60 * 60 * 3; + +/* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; @@ -1854,6 +1861,12 @@ spa_check_for_missing_logs(spa_t *spa) if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); + + /* Save the timestamp of the last completed txg. */ + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_last_ubsync_txg_ts) == 0); + return (SET_ERROR(ENXIO)); } } else { @@ -1862,10 +1875,21 @@ spa_check_for_missing_logs(spa_t *spa) if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + nvlist_t *rewind_info = fnvlist_alloc(); + spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); + + VERIFY(nvlist_add_uint64(rewind_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_uberblock.ub_timestamp) == 0); + + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_INFO, + rewind_info) == 0); + break; } } @@ -7150,6 +7174,8 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_probe(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -8156,6 +8182,14 @@ spa_sync(spa_t *spa, uint64_t txg) spa_handle_ignored_writes(spa); + /* Mark unused spares as needing a health check. */ + if (spa_spare_poll_interval_seconds != 0 && + NSEC2SEC(gethrtime() - spa->spa_spares_last_polled) > + spa_spare_poll_interval_seconds) { + spa_spare_poll(spa); + spa->spa_spares_last_polled = gethrtime(); + } + /* * If any async tasks have been requested, kick them off. 
*/ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 9a80f89a8a..f951f1cc97 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017 Datto Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -1023,6 +1024,41 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) * be completely consistent with respect to other vdev configuration changes. */ +/* + * Poll the spare vdevs to make sure they are not faulty. + * + * The probe operation will raise an ENXIO error and create an FM ereport if the + * probe fails. + */ +void +spa_spare_poll(spa_t *spa) +{ + boolean_t async_request = B_FALSE; + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + for (int i = 0; i < spa->spa_spares.sav_count; i++) { + spa_aux_t search, *found; + vdev_t *vd = spa->spa_spares.sav_vdevs[i]; + + search.aux_guid = vd->vdev_guid; + + mutex_enter(&spa_spare_lock); + found = avl_find(&spa_spare_avl, &search, NULL); + /* This spare is in use by a pool. */ + if (found != NULL && found->aux_pool != NULL) { + mutex_exit(&spa_spare_lock); + continue; + } + mutex_exit(&spa_spare_lock); + + vd->vdev_probe_wanted = B_TRUE; + async_request = B_TRUE; + } + if (async_request) + spa_async_request(spa, SPA_ASYNC_PROBE); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + static int spa_spare_compare(const void *a, const void *b) { diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index d26b095d14..567ab411e4 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -49,7 +49,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); -int metaslab_load(metaslab_t *); +int metaslab_load(metaslab_t *, uint64_t); void metaslab_unload(metaslab_t *); uint64_t metaslab_allocated_space(metaslab_t *); diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index f8d36f38f7..fe93fdc0d1 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -276,8 +276,17 @@ struct metaslab_group { boolean_t mg_initialize_updating; kmutex_t mg_ms_initialize_lock; kcondvar_t mg_ms_initialize_cv; + + kstat_t *mg_kstat; + kmutex_t mg_kstat_lock; }; +typedef struct metaslab_group_kstat { + kstat_named_t mg_loads; + kstat_named_t mg_unloads; + kstat_named_t mg_spa_name; +} metaslab_group_kstat_t; + /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. @@ -461,8 +470,8 @@ struct metaslab { * stay cached. */ uint64_t ms_selected_txg; + uint64_t ms_loaded_txg; /* track when metaslab was loaded */ - uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 4ff552447e..82a1514598 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -25,7 +25,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
* Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -679,6 +679,9 @@ extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); +/* spare polling */ +extern void spa_spare_poll(spa_t *spa); + /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index dcb6cc9f19..539ed4b43e 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -25,6 +25,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -252,6 +253,7 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + hrtime_t spa_spares_last_polled; /* time spares last polled */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 6ddbe55a0c..7ef03e0483 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -143,6 +143,7 @@ struct vdev_queue { avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 517764f1ce..fa45fd8385 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -24,7 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome <tsoome@me.com> */ @@ -378,8 +378,14 @@ typedef int zio_pipe_stage_t(zio_t *zio); * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NO_SUSPEND 0x04 + +#define ZIO_SHOULD_REEXECUTE(x) \ + ((x)->io_reexecute & ZIO_REEXECUTE_NOW || \ + ((x)->io_reexecute & ZIO_REEXECUTE_SUSPEND && \ + (((x)->io_reexecute & ZIO_REEXECUTE_NO_SUSPEND) == 0))) typedef struct zio_alloc_list { list_t zal_list; @@ -440,6 +446,7 @@ struct zio { hrtime_t io_timestamp; hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; @@ -472,6 +479,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index cb3d6a51cb..c97cfdb82c 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -32,6 +32,7 @@ #include <sys/dsl_scan.h> #include <sys/zil.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -535,6 +536,8 @@ txg_sync_thread(void *arg) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 2d431373ce..af653c2f28 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -26,6 +26,7 @@ */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -52,6 +53,11 @@ extern ldi_ident_t zfs_li; 
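The new sys/zfs_zone.h header above declares the hooks this change threads through the I/O path: zfs_zone_io_throttle() on the logical read/write paths (arc.c, dmu_tx.c), zfs_zone_zio_enqueue()/zfs_zone_zio_dequeue() and zfs_zone_schedule() in vdev_queue.c, and zfs_zone_zio_start()/zfs_zone_zio_done() around the physical I/O in vdev_disk.c below. A rough sketch of the start/done bracketing, collapsed into one illustrative function (in the real driver the buffer is issued with ldi_strategy() and the done hook runs later from the b_iodone completion path):

static void
example_disk_io(zio_t *zio, buf_t *bp, ldi_handle_t lh)
{
        /* Mark the moment the I/O is handed to the device. */
        zfs_zone_zio_start(zio);

        /* ldi_strategy() returns non-zero only on programming errors. */
        VERIFY(ldi_strategy(lh, bp) == 0);

        /*
         * In the driver this runs from the iodone path, not inline;
         * it lets the throttle account the elapsed device time to the
         * zone that originated the I/O.
         */
        zfs_zone_zio_done(zio);
}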
static void vdev_disk_close(vdev_t *); +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; @@ -150,6 +156,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. @@ -613,6 +621,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -847,6 +856,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } @@ -856,6 +867,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c index e1aa4e9523..9b103811d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_initialize.c +++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c @@ -474,7 +474,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * metaslab. Load it and walk the free tree for more accurate * progress estimation. */ - VERIFY0(metaslab_load(msp)); + VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { @@ -605,7 +605,7 @@ vdev_initialize_thread(void *arg) vdev_initialize_ms_mark(msp); mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp)); + VERIFY0(metaslab_load(msp, spa_syncing_txg(spa))); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index dff83e3108..74860c5c0a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013, Joyent, Inc. All rights reserved. 
*/ /* @@ -34,6 +35,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> @@ -144,7 +146,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -260,6 +262,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -298,6 +302,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -314,6 +319,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -693,7 +699,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c index ad78295a54..8c4c05a1d1 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2015, Joyent, Inc. diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 712abee22f..fcdc8bcbc7 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -25,7 +25,7 @@ * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright (c) 2014, 2019 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -634,9 +634,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
*/ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -966,6 +967,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2088,7 +2092,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2106,7 +2111,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2133,11 +2139,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2332,8 +2351,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? 
ESRCH : error); @@ -2363,8 +2395,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3049,6 +3083,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3095,8 +3130,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3105,13 +3141,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); @@ -5849,7 +5887,8 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void -zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, +zfs_ioctl_register_legacy(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { @@ -5860,6 +5899,7 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); + vec->zvec_name = name; vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; @@ -5901,7 +5941,7 @@ zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } @@ -5909,14 +5949,15 @@ static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void -zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +zfs_ioctl_register_pool_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func) { - zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + zfs_ioctl_register_legacy(name, ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -5924,7 +5965,7 @@ static void 
zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } @@ -5932,7 +5973,7 @@ static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } @@ -5944,10 +5985,10 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) } static void -zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) +zfs_ioctl_register_dataset_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(name, ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -6042,34 +6083,35 @@ zfs_ioctl_init(void) /* IOCTLS that use the legacy function signature */ - zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, - zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + zfs_ioctl_register_legacy("pool_freeze", ZFS_IOC_POOL_FREEZE, + zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioctl_register_pool_modify("pool_scan", ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioctl_register_pool_modify("pool_upgrade", ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioctl_register_pool_modify("vdev_add", ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioctl_register_pool_modify("vdev_remove", ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioctl_register_pool_modify("vdev_set_state", ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioctl_register_pool_modify("vdev_attach", ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioctl_register_pool_modify("vdev_detach", ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioctl_register_pool_modify("vdev_setpath", ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioctl_register_pool_modify("vdev_setfru", ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioctl_register_pool_modify("pool_set_props", ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioctl_register_pool_modify("vdev_split", ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioctl_register_pool_modify("pool_reguid", ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, @@ -6147,20 +6189,20 @@ zfs_ioctl_init(void) zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); - 
zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, - zfs_secpolicy_none); - zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, - zfs_secpolicy_destroy); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, - zfs_secpolicy_rename); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_ioctl_register_dataset_modify("set_prop", ZFS_IOC_SET_PROP, + zfs_ioc_set_prop, zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify("destroy", ZFS_IOC_DESTROY, + zfs_ioc_destroy, zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify("rename", ZFS_IOC_RENAME, + zfs_ioc_rename, zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify("recv", ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); - zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, - zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioctl_register_dataset_modify("promote", ZFS_IOC_PROMOTE, + zfs_ioc_promote, zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify("inherit_prop", ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, - zfs_secpolicy_set_fsacl); + zfs_ioctl_register_dataset_modify("set_fsacl", ZFS_IOC_SET_FSACL, + zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); @@ -6443,7 +6485,32 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) nvlist_free(outnvl); } else { + spa_t *spa; + uint64_t orig_cookie = zc->zc_cookie; + error = vec->zvec_legacy_func(zc); + + if (error == 0 && vec->zvec_allow_log && + vec->zvec_name != NULL && + spa_open(zc->zc_name, &spa, FTAG) == 0) { + nvlist_t *lognv = NULL; + char *msg; + uint_t len = strlen(vec->zvec_name) + + strlen(zc->zc_name) + 128; + + msg = kmem_alloc(len, KM_SLEEP); + + lognv = fnvlist_alloc(); + (void) snprintf(msg, len, + "%s pool: %s cookie: %lu guid: %lx", vec->zvec_name, + zc->zc_name, orig_cookie, zc->zc_guid); + fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, msg); + + (void) spa_history_log_nvl(spa, lognv); + spa_close(spa, FTAG); + fnvlist_free(lognv); + kmem_free(msg, len); + } } out: diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index f7beea4cc9..a5912b19ab 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. @@ -1912,6 +1913,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. 
+ */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index a68fc3dd34..96e03d9291 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -687,6 +687,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -741,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. */ locked_range_t *lr; @@ -996,9 +996,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } rangelock_exit(lr); @@ -2854,8 +2851,11 @@ top: return (err); } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } } if (mask & (AT_ATIME|AT_MTIME) || @@ -3783,9 +3783,7 @@ top: if (error == 0) { vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - /* notify the target dir if it is not the same as source dir */ - if (tdvp != sdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); } out: if (zl != NULL) @@ -4819,10 +4817,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } @@ -4888,8 +4882,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - if (error == 0 && off == 0 && len == 0) - vnevent_truncate(ZTOV(zp), ct); + if (error == 0 && len == 0) { + if (off == 0) { + vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } + } ZFS_EXIT(zfsvfs); return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..f151595095 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1419 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. All rights reserved. 
+ */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. + * + * I/O contention can be major pain point on a multi-tenant system. A single + * zone can issue a stream of I/O operations, usually synchronous writes, which + * disrupt I/O performance for all other zones. This problem is further + * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, + * a set of blocks which are atomically synced to disk. The process of + * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving + * out any pending read operations. + * + * There are two facets to this capability; the throttle and the scheduler. + * + * Throttle + * + * The requirements on the throttle are: + * + * 1) Ensure consistent and predictable I/O latency across all zones. + * 2) Sequential and random workloads have very different characteristics, + * so it is a non-starter to track IOPS or throughput. + * 3) A zone should be able to use the full disk bandwidth if no other zone + * is actively using the disk. + * + * The throttle has two components: one to track and account for each zone's + * I/O requests, and another to throttle each zone's operations when it + * exceeds its fair share of disk I/O. When the throttle detects that a zone is + * consuming more than is appropriate, each read or write system call is + * delayed by up to 100 microseconds, which we've found is sufficient to allow + * other zones to interleave I/O requests during those delays. + * + * Note: The throttle will delay each logical I/O (as opposed to the physical + * I/O which will likely be issued asynchronously), so it may be easier to + * think of the I/O throttle delaying each read/write syscall instead of the + * actual I/O operation. For each zone, the throttle tracks an ongoing average + * of read and write operations performed to determine the overall I/O + * utilization for each zone. + * + * The throttle calculates a I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by IO activity, but since IO does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). 
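To make the utilization formula above concrete, here is a minimal standalone sketch; the sample counts and latencies are invented, and in the kernel the inputs come from the per-zone counters and decayed system averages maintained later in this file.

/*
 * Illustrative userland sketch of the per-zone I/O utilization metric
 * described above; all numbers are invented.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t rops = 400;		/* read syscalls in the window */
	uint64_t wops = 100;		/* write syscalls in the window */
	uint64_t avg_rlat = 250;	/* average read latency (usec) */
	uint64_t avg_wlat = 1200;	/* average write latency (usec) */

	/* (# of reads) x (avg read lat) + (# of writes) x (avg write lat) */
	uint64_t util = rops * avg_rlat + wops * avg_wlat;

	(void) printf("zone I/O utilization: %llu\n",
	    (unsigned long long)util);
	return (0);
}

A zone whose metric sits well above the all-zone average would then see its per-syscall delay stepped up, as the throttle description above explains.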
+ * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the + * threshold at which the throttle will start delaying zones. When the number + * of vdevs is small, the calculation should correspond closely with the %b + * column from iostat -- but as the number of vdevs becomes large, it will + * correlate less and less to any single device (therefore making it a poor + * approximation for the actual I/O utilization on such systems). We + * therefore use our derived utilization conservatively: we know that low + * derived utilization does indeed correlate to low I/O use -- but that a high + * rate of derived utilization does not necesarily alone denote saturation; + * where we see a high rate of utilization, we also look for laggard I/Os to + * attempt to detect saturation. 
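One way to read the zfs_zone_rw_lat_limit description above: the two system-wide latency averages are clamped so that neither may exceed the other by more than that factor before they feed into the utilization calculation. A standalone sketch with invented numbers:

/*
 * Illustrative clamp of the read/write latency averages to within a
 * factor of rw_lat_limit of each other, per the tunable described above.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int rw_lat_limit = 10;
	unsigned int avg_rlat = 50;	/* usec, mostly sequential reads */
	unsigned int avg_wlat = 20000;	/* usec, random synchronous writes */

	if (avg_rlat > avg_wlat * rw_lat_limit)
		avg_rlat = avg_wlat * rw_lat_limit;
	else if (avg_rlat * rw_lat_limit < avg_wlat)
		avg_wlat = avg_rlat * rw_lat_limit;	/* 20000 -> 500 */

	(void) printf("clamped: rlat=%u wlat=%u\n", avg_rlat, avg_wlat);
	return (0);
}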
+ */ +uint_t zfs_zone_util_threshold = 80; +uint_t zfs_zone_underutil_threshold = 60; + +/* + * There are three important tunables here: zfs_zone_laggard_threshold denotes + * the threshold at which an I/O is considered to be of notably high latency; + * zfs_zone_laggard_recent denotes the number of microseconds before the + * current time after which the last laggard is considered to be sufficiently + * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes + * the microseconds before the current time before which the last laggard is + * considered to be sufficiently old to merit decreasing the throttle. The + * most important tunable of these three is the zfs_zone_laggard_threshold: in + * modeling data from a large public cloud, this tunable was found to have a + * much greater effect on the throttle than the two time-based thresholds. + * This must be set high enough to not result in spurious throttling, but not + * so high as to allow pathological I/O to persist in the system. + */ +uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ +uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ +uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +/* + * How often the I/O throttle will reevaluate each zone's utilization, in + * microseconds. Default is 1/4 sec. + */ +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; + uint_t cycle_cnt; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; + boolean_t zi_underutil; + boolean_t zi_overutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. The utilization info + * for all disks on the system is aggregated into these values. + * + * Overall disk utilization for the current cycle is calculated as: + * + * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) + * ---------------------------------------------- + * ((now - zfs_zone_last_checked) * 1000); + */ +kmutex_t zfs_disk_lock; /* protects the following: */ +uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ +hrtime_t zfs_disk_rtime = 0; /* cummulative sum of time performing IO */ +hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ + +hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ +/* time that we last updated per-zone throttle info */ +kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */ +hrtime_t zfs_zone_last_checked = 0; +hrtime_t zfs_disk_last_laggard = 0; + +/* + * Data used to keep track of how often txg sync is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_sync_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. 
+ * + * This threshold is based on the zfs_vdev_sync_read_max_active value for the + * number of I/Os that can be pending on a device. If there are more than the + * max_active ops already queued up, beyond those already issued to the vdev, + * then use zone-based scheduling to get the next synchronous zio. + */ +uint32_t zfs_zone_schedule_thresh = 10; + +/* + * On each pass of the scheduler we increment the zone's weight (up to this + * maximum). The weight is used by the scheduler to prevent starvation so + * that zones which haven't been able to do any IO over many iterations + * will max out thier weight to this value. + */ +#define SCHED_WEIGHT_MAX 20 + +/* + * Tunables for delay throttling when TXG sync is occurring. + * + * If the zone is performing a write and we're doing above normal TXG syncing, + * then throttle for longer than normal. The zone's wait time is multiplied + * by the scale (zfs_zone_txg_throttle_scale). + */ +int zfs_zone_txg_throttle_scale = 2; +hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); + +typedef struct { + int zq_qdepth; + zio_priority_t zq_queue; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * See the comment on the zfs_zone_io_throttle function for which/how IOPs are + * accounted for. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return a time delta indicating how far into the current cycle we are or 0 + * if the last IO was more than a cycle ago. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. 
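The decay rule just described can be restated as a small standalone sketch (the counts and the number of elapsed cycles are invented): a zone that managed only one op in a stale cycle keeps half of its historical count, and that result is then halved again for each further cycle with no I/O.

/*
 * Illustrative sketch of the historical-count decay described above.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int zone_avg_cnt = 64;	/* historical count */
	unsigned int cycle_cnt = 1;	/* ops seen in the stale cycle */
	int gen_cnt = 3;		/* cycles elapsed since last update */
	int i;

	if (gen_cnt > 5) {
		zone_avg_cnt = 0;
	} else {
		if (cycle_cnt > 1)
			zone_avg_cnt = cycle_cnt;
		else
			zone_avg_cnt = cycle_cnt + zone_avg_cnt / 2;
		/* decay further for each additional idle generation */
		for (i = 1; i < gen_cnt; i++)
			zone_avg_cnt /= 2;
	}

	/* 1 + 64/2 = 33, halved twice = 8 */
	(void) printf("decayed count: %u\n", zone_avg_cnt);
	return (0);
}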
+ */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op) +{ + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops); + iop->zpers_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops); + iop->zpers_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops); + iop->zpers_lwr_ops.cycle_cnt++; + break; + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +/* + * Use a decaying average to keep track of the overall system latency. + * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last + * update was made. If it was more than one cycle ago, then we need to decay + * the average by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static boolean_t +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (B_FALSE); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. 
*/ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (B_TRUE); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + atomic_inc_uint(&rd_lat.cycle_cnt); + atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat); + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + atomic_inc_uint(&wr_lat.cycle_cnt); + atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat); + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + DTRACE_PROBE3(zfs__zone__calc__wt__avg, + uintptr_t, cp->sys_avg_lat, + uintptr_t, cp->cycle_lat, + uintptr_t, cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op, + hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zpd, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + ASSERT3P(zpd, !=, NULL); + + *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops); + *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops); + *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd, + uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. 
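For the in-cycle case of calc_avg_lat() above, recent activity is weighted eight times as heavily as the decayed historical average; a worked example with invented numbers:

/*
 * Worked example of the 8x-weighted in-cycle latency average above.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long long sys_avg_lat = 900;	/* usec, decayed average */
	unsigned long long cycle_lat = 30000;	/* usec summed this cycle */
	unsigned long long cycle_cnt = 20;	/* ops so far this cycle */

	unsigned long long avg =
	    (sys_avg_lat + cycle_lat * 8) / (1 + cycle_cnt * 8);

	(void) printf("weighted average: %llu usec\n", avg);	/* ~1496 */
	return (0);
}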
+ */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, + uintptr_t, *wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + + ASSERT3P(iop, !=, NULL); + + mutex_enter(&zpd->zpers_zfs_lock); + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) + + (lwops * sp->zi_avgwlat); + sp->zi_totutil += iop->zpers_io_util; + + if (iop->zpers_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += iop->zpers_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, + uint_t, rops, uint_t, wops, uint_t, lwops, + uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri); + + mutex_exit(&zpd->zpers_zfs_lock); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_zfs_io_t *zpd) +{ + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) + zpd->zpers_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_zfs_io_t *zpd) +{ + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay > 0) + zpd->zpers_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. + */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + zoneio_stats_t *sp = arg; + uint8_t delay; + uint_t fairutil = 0; + + ASSERT3P(iop, !=, NULL); + + mutex_enter(&zpd->zpers_zfs_lock); + delay = iop->zpers_io_delay; + iop->zpers_io_util_above_avg = 0; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. 
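A worked example of the fair-share split used above (totals invented): with a total utilization of 900000 across zones whose configured I/O priorities sum to 150, a zone with priority 100 is entitled to two thirds of the total before it is considered over its share.

/*
 * Worked example of the priority-weighted fair-share split above.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t totutil = 900000;	/* summed utilization, all zones */
	uint64_t totpri = 150;		/* summed zone I/O priorities */
	uint64_t zone_pri = 100;	/* this zone's configured priority */

	uint64_t fairutil = (totutil * zone_pri) / totpri;

	(void) printf("fair share: %llu of %llu\n",
	    (unsigned long long)fairutil, (unsigned long long)totutil);
	return (0);
}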
+ */ + if (iop->zpers_io_util > fairutil && sp->zi_overutil) { + iop->zpers_io_util_above_avg = 1; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(iop); + } else if (iop->zpers_io_util < fairutil || sp->zi_underutil || + sp->zi_active <= 1) { + zfs_zone_delay_dec(iop); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, + uintptr_t, delay, uintptr_t, iop->zpers_io_delay, + uintptr_t, fairutil, uintptr_t, iop->zpers_io_util); + + mutex_exit(&zpd->zpers_zfs_lock); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. + */ +static void +zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) +{ + zoneio_stats_t stats; + hrtime_t laggard_udelta = 0; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + if (unow > zfs_disk_last_laggard) + laggard_udelta = unow - zfs_disk_last_laggard; + + /* + * To minimize porpoising, we have three separate states for our + * assessment of I/O performance: overutilized, underutilized, and + * neither overutilized nor underutilized. We will increment the + * throttle if a zone is using more than its fair share _and_ I/O + * is overutilized; we will decrement the throttle if a zone is using + * less than its fair share _or_ I/O is underutilized. + */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. 
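To make the derived disk-utilization figure computed in zfs_zone_wait_adjust() above concrete (values invented; zfs_disk_rtime is kept in nanoseconds while the timestamps are microseconds, hence the extra factor of 1000): 200 ms of accumulated busy time over a 250 ms window works out to 80, which is exactly the default zfs_zone_util_threshold.

/*
 * Worked example of the derived disk utilization described above.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t rtime_delta = 200000000ULL;	/* ns of busy time accrued */
	uint64_t elapsed_usec = 250000ULL;	/* usec since last check */

	uint64_t diskutil = (rtime_delta * 100) / (elapsed_usec * 1000);

	(void) printf("derived disk utilization: %llu\n",
	    (unsigned long long)diskutil);	/* 80 */
	return (0);
}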
+ */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + cnt = iop->zpers_zfs_queued[p]; + if (cnt == 0) { + iop->zpers_zfs_weight = 0; + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX) + iop->zpers_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + iop->zpers_zfs_io_pri * iop->zpers_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = iop->zpers_zfs_weight; + } + mutex_exit(&zpd->zpers_zfs_lock); + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. 
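A worked example of the get_sched_pri_cb() priority calculation above (all values invented): with 50 zios queued (scaled to 500), a busy zone holding 40 of them at priority 100 and a freshly reset weight scores far below a mostly idle zone holding 2 at priority 50 with an accumulated weight of 4, so the idle zone's zio is the one bumped to the head of the queue.

/*
 * Worked example of the zone scheduling priority described above.
 */
#include <stdio.h>

int
main(void)
{
	int qdepth = 50 * 10;	/* queue depth, scaled by 10 */

	/* busy zone: many zios queued, default priority, weight reset */
	int busy = (qdepth / 40) * 100 * 1;

	/* quiet zone: few zios queued, lower priority, accumulated weight */
	int quiet = (qdepth / 2) * 50 * 4;

	(void) printf("busy=%d quiet=%d -> %s wins\n", busy, quiet,
	    quiet > busy ? "quiet" : "busy");
	return (0);
}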
+ */ +static zio_t * +get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, + avl_tree_t *tree) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + qbump.zq_queue = p; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(tree); zp != NULL; + zp = avl_walk(tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, + uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out TX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track and throttle IO operations per zone. Called from: + * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes + * go through this path) + * - arc_read for read ops that miss the ARC (both dataset and zvol) + * For each operation, increment that zone's counter based on the type of + * operation, then delay the operation, if necessary. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. We only see logical writes in this + * function, we see the physcial writes in the zfs_zone_zio_start and + * zfs_zone_zio_done functions. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. 
+ * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zoneid_t zid = curzone->zone_id; + zone_persist_t *zpd = &zone_pdata[zid]; + zone_zfs_io_t *iop; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + add_iop(zpd, unow, type, 0); + } + + if (!zfs_zone_delay_enable) + return; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. + */ + if (iop->zpers_zfs_io_pri == 0) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + + /* Handle periodically updating the per-zone I/O parameters */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + hrtime_t last_checked; + boolean_t do_update = B_FALSE; + + /* Recheck under mutex */ + mutex_enter(&zfs_last_check_lock); + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + do_update = B_TRUE; + } + mutex_exit(&zfs_last_check_lock); + + if (do_update) { + mutex_exit(&zpd->zpers_zfs_lock); + + zfs_zone_wait_adjust(unow, last_checked); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + } + } + + wait = iop->zpers_io_delay; + mutex_exit(&zpd->zpers_zfs_lock); + + if (wait > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (curzone->zone_vfs_stats != NULL) { + atomic_inc_64(&curzone->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&curzone->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. 
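Roughly, the sync-rate bookkeeping described above behaves like the following standalone sketch (counts invented): ten TXG syncs in a five second window yield a lagging rate of 5, which zfs_zone_io_throttle() treats as a signal to multiply a writing zone's delay by zfs_zone_txg_throttle_scale; at the normal one sync per zfs_txg_timeout the rate stays at 0 and no extra scaling happens.

/*
 * Illustrative sketch of the TXG sync-rate calculation described above.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int txg_cnt = 10;	/* syncs seen in the 5 second window */
	unsigned int txg_sync_rate = txg_cnt / 2;

	(void) printf("sync rate: %u (%s)\n", txg_sync_rate,
	    txg_sync_rate > 1 ? "scale write delays up" : "normal");
	return (0);
}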
+ */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + zone_persist_t *zpd = &zone_pdata[curzone->zone_id]; + zone_zfs_io_t *iop; + uint8_t above; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + above = iop->zpers_io_util_above_avg; + mutex_exit(&zpd->zpers_zfs_lock); + + if (above) { + return (zfs_zone_txg_delay_nsec); + } + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&iop->zpers_zfs_rwstats); + iop->zpers_zfs_weight = 0; + } + mutex_exit(&zpd->zpers_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_persist_t *zpd; + zone_zfs_io_t *iop; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + zpd = &zone_pdata[zp->io_zoneid]; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + /* + * To calculate the wsvc_t average, keep a cumulative sum of + * all the wait time before each I/O was dispatched. Since most + * writes are asynchronous, only track the wait time for + * read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + iop->zpers_zfs_rwstats.reads++; + iop->zpers_zfs_rwstats.nread += zp->io_size; + iop->zpers_zfs_rd_waittime += + zp->io_dispatched - zp->io_timestamp; + kstat_runq_exit(&iop->zpers_zfs_rwstats); + } else { + iop->zpers_zfs_rwstats.writes++; + iop->zpers_zfs_rwstats.nwritten += zp->io_size; + } + } + mutex_exit(&zpd->zpers_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? 
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + } + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + ASSERT(iop->zpers_zfs_queued[p] > 0); + if (iop->zpers_zfs_queued[p] == 0) { + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + } else { + iop->zpers_zfs_queued[p]--; + } + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + iop->zpers_zfs_queued[p]++; + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. + * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. 
+ */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 546e4f3d1e..547ebac383 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -3064,13 +3065,20 @@ zil_close(zilog_t *zilog) txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); mutex_exit(&zilog->zl_lock); - /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. - */ - if (txg != 0) + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg != 0) { + /* + * We need to use txg_wait_synced() to wait long enough for the + * ZIL to be clean, and to wait for all pending lwbs to be + * written out. + */ txg_wait_synced(zilog->zl_dmu_pool, txg); + } if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 619dad47f3..8aaa2e19a2 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,7 +22,9 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -41,6 +43,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> #include <sys/cityhash.h> @@ -621,6 +624,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) @@ -628,6 +632,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -3827,6 +3833,24 @@ zio_done(zio_t *zio) } } + /* + * When we have an error on a slog vdev, we must ensure that the + * zio is not suspended. Suspending the zio will cause dataset deletion + * or an attempt to remove the slog to hang. In both cases, the code + * might be trying to clean up the zil blocks on the slog, but because + * the slog is dead, the suspended zio causes this to hang indefinitely. 
+ * The system properly switches over to using zils on regular storage + * when the slog dies. + * + * This is a reasonable point in the stack to detect that the vdev is + * a slog. The 'no_suspend' flag will propagate up to the logical zio + * via zio_notify_parent. + */ + if (zio->io_error && vd != NULL && vd->vdev_islog && + !vdev_accessible(vd, zio)) { + zio->io_reexecute |= ZIO_REEXECUTE_NO_SUSPEND; + } + if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. This will @@ -3871,7 +3895,7 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if ((zio->io_error || zio->io_reexecute) && + if ((zio->io_error || ZIO_SHOULD_REEXECUTE(zio)) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); @@ -3885,7 +3909,7 @@ zio_done(zio_t *zio) (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; - if (zio->io_reexecute) { + if (ZIO_SHOULD_REEXECUTE(zio)) { /* * This is a logical I/O that wants to reexecute. * @@ -3956,7 +3980,7 @@ zio_done(zio_t *zio) } ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); + ASSERT(!ZIO_SHOULD_REEXECUTE(zio)); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 10ea804f8d..33bac61d21 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,7 +25,6 @@ * * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2019, Joyent, Inc. */ @@ -85,11 +84,13 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> #include <sys/zio_checksum.h> #include <sys/zil_impl.h> +#include <sys/ht.h> #include <sys/dkioc_free_util.h> #include <sys/zfs_rlock.h> @@ -142,6 +143,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. */ @@ -1272,6 +1278,8 @@ zvol_strategy(buf_t *bp) (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && !doread && !is_dumpified; + ht_begin_unsafe(); + /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. 
@@ -1319,6 +1327,8 @@ zvol_strategy(buf_t *bp) zil_commit(zv->zv_zilog, ZVOL_OBJ); biodone(bp); + ht_end_unsafe(); + return (0); } @@ -1380,6 +1390,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) zvol_state_t *zv; uint64_t volsize; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1396,6 +1409,16 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + ht_begin_unsafe(); + + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1405,6 +1428,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1415,6 +1439,40 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } rangelock_exit(lr); + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + + ht_end_unsafe(); + return (error); } @@ -1427,6 +1485,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1443,6 +1504,21 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + ht_begin_unsafe(); + + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. 
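The latency accounting added to zvol_read() above (and mirrored for zvol_write() below) is cumulative: an operation that takes, say, 1.5 seconds bumps the 10 ms, 100 ms and 1 s buckets but not the 10 s one. A standalone sketch of the same bucketing, with local names standing in for the zone_vfs_kstat_t counters:

/*
 * Cumulative latency-bucket sketch mirroring the zvol accounting above.
 */
#include <stdio.h>
#include <stdint.h>

#define	LAT_10MS	10000000LL
#define	LAT_100MS	100000000LL
#define	LAT_1S		1000000000LL
#define	LAT_10S		10000000000LL

int
main(void)
{
	int64_t lat = 1500000000LL;	/* 1.5 s, in nanoseconds */
	uint64_t b10ms = 0, b100ms = 0, b1s = 0, b10s = 0;

	if (lat >= LAT_10MS)
		b10ms++;
	if (lat >= LAT_100MS)
		b100ms++;
	if (lat >= LAT_1S)
		b1s++;
	if (lat >= LAT_10S)
		b10s++;

	(void) printf("10ms=%llu 100ms=%llu 1s=%llu 10s=%llu\n",
	    (unsigned long long)b10ms, (unsigned long long)b100ms,
	    (unsigned long long)b1s, (unsigned long long)b10s);
	return (0);
}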
+ */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1456,6 +1532,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1474,6 +1551,41 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + + ht_end_unsafe(); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } @@ -1714,11 +1826,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; mutex_exit(&zfsdev_state_lock); + + ht_begin_unsafe(); + zil_commit(zv->zv_zilog, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } + + ht_end_unsafe(); + return (error); case DKIOCGETWCE: @@ -1743,7 +1861,9 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) } else { zv->zv_flags &= ~ZVOL_WCE; mutex_exit(&zfsdev_state_lock); + ht_begin_unsafe(); zil_commit(zv->zv_zilog, ZVOL_OBJ); + ht_end_unsafe(); } return (0); } @@ -1796,6 +1916,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) mutex_exit(&zfsdev_state_lock); + ht_begin_unsafe(); + for (int i = 0; i < dfl->dfl_num_exts; i++) { uint64_t start = dfl->dfl_exts[i].dfle_start, length = dfl->dfl_exts[i].dfle_length, @@ -1851,6 +1973,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) if (!(flag & FKIOCTL)) dfl_free(dfl); + ht_end_unsafe(); + return (error); } diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h new file mode 100644 index 0000000000..e3eac799e5 --- /dev/null +++ b/usr/src/uts/common/inet/bpf.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _INET_BPF_H +#define _INET_BPF_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef _KERNEL + +#include <sys/types.h> + +/* + * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason + * about struct sizing. + */ +typedef struct ip_bpf_insn { + uint16_t code; + uint8_t jt; + uint8_t jf; + uint32_t k; +} ip_bpf_insn_t; + +extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t); +extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t); + + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_BPF_H */ diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c index db5b224a5e..5a9ba38da6 100644 --- a/usr/src/uts/common/io/bpf/bpf_filter.c +++ b/usr/src/uts/common/inet/bpf_filter.c @@ -38,6 +38,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -45,11 +46,12 @@ #include <sys/stream.h> #include <sys/byteorder.h> #include <sys/sdt.h> +#include <inet/bpf.h> +#include <net/bpf.h> #define EXTRACT_SHORT(p) BE_IN16(p) #define EXTRACT_LONG(p) BE_IN32(p) -#ifdef _KERNEL #define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) #define mtod(_a, _t) ((_t)((_a)->b_rptr)) #define MINDEX(len, m, k) \ @@ -123,11 +125,7 @@ m_xhalf(mblk_t *m, uint32_t k, int *err) *err = 0; return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); } -#else /* _KERNEL */ -#include <stdlib.h> -#endif /* !_KERNEL */ -#include <net/bpf.h> /* * Execute the filter program starting at pc on the packet p @@ -137,8 +135,8 @@ m_xhalf(mblk_t *m, uint32_t k, int *err) * packet is only in one mblk_t. * When buflen is 0, p is an mblk_t pointer. */ -uint_t -bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +uint32_t +ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen) { uint32_t A, X, k; uint32_t mem[BPF_MEMWORDS]; @@ -147,7 +145,7 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) /* * No filter means accept all. */ - return ((uint_t)-1); + return ((uint32_t)-1); A = 0; X = 0; --pc; @@ -165,10 +163,10 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) abort(); #endif case BPF_RET|BPF_K: - return ((uint_t)pc->k); + return (pc->k); case BPF_RET|BPF_A: - return ((uint_t)A); + return (A); case BPF_LD|BPF_W|BPF_ABS: k = pc->k; @@ -456,7 +454,6 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) /* NOTREACHED */ } -#ifdef _KERNEL /* * Return true if the 'fcode' is a valid filter program. * The constraints are that each jump be forward and to a valid @@ -468,14 +465,14 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) * The kernel needs to be able to verify an application's filter code. * Otherwise, a bogus program could easily crash the system. 
*/ -int -bpf_validate(struct bpf_insn *f, int len) +boolean_t +ip_bpf_validate(ip_bpf_insn_t *f, uint_t len) { uint_t i, from; - struct bpf_insn *p; + ip_bpf_insn_t *p; if (len < 1 || len > BPF_MAXINSNS) - return (0); + return (B_FALSE); for (i = 0; i < len; ++i) { p = &f[i]; @@ -489,7 +486,7 @@ bpf_validate(struct bpf_insn *f, int len) switch (BPF_MODE(p->code)) { case BPF_MEM: if (p->k >= BPF_MEMWORDS) - return (0); + return (B_FALSE); break; case BPF_ABS: case BPF_IND: @@ -498,13 +495,13 @@ bpf_validate(struct bpf_insn *f, int len) case BPF_LEN: break; default: - return (0); + return (B_FALSE); } break; case BPF_ST: case BPF_STX: if (p->k >= BPF_MEMWORDS) - return (0); + return (B_FALSE); break; case BPF_ALU: switch (BPF_OP(p->code)) { @@ -522,10 +519,10 @@ bpf_validate(struct bpf_insn *f, int len) * Check for constant division by 0. */ if (BPF_RVAL(p->code) == BPF_K && p->k == 0) - return (0); + return (B_FALSE); break; default: - return (0); + return (B_FALSE); } break; case BPF_JMP: @@ -549,17 +546,17 @@ bpf_validate(struct bpf_insn *f, int len) switch (BPF_OP(p->code)) { case BPF_JA: if (from + p->k < from || from + p->k >= len) - return (0); + return (B_FALSE); break; case BPF_JEQ: case BPF_JGT: case BPF_JGE: case BPF_JSET: if (from + p->jt >= len || from + p->jf >= len) - return (0); + return (B_FALSE); break; default: - return (0); + return (B_FALSE); } break; case BPF_RET: @@ -567,10 +564,9 @@ bpf_validate(struct bpf_insn *f, int len) case BPF_MISC: break; default: - return (0); + return (B_FALSE); } } return (BPF_CLASS(f[len - 1].code) == BPF_RET); } -#endif diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h new file mode 100644 index 0000000000..a790a797d1 --- /dev/null +++ b/usr/src/uts/common/inet/inet_hash.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _INET_INET_HASH_H +#define _INET_INET_HASH_H + +/* + * Common packet hashing routines shared across MAC, UDP, and others. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define INET_PKT_HASH_L2 0x01 +#define INET_PKT_HASH_L3 0x02 +#define INET_PKT_HASH_L4 0x04 + +extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_INET_HASH_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index f67ade9060..e9a3fcdeeb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1415,6 +1415,7 @@ typedef union ill_g_head_u { #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */ /* * Per-ill Hardware Checksumming capbilities. @@ -1729,6 +1730,8 @@ typedef struct ill_s { * Capabilities related fields. 
*/ uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */ + kcondvar_t ill_dlpi_capab_cv; /* CV for broadcasting state changes */ + kmutex_t ill_dlpi_capab_lock; /* Lock for accessing above Cond Var */ uint_t ill_capab_pending_cnt; uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ @@ -1770,6 +1773,10 @@ typedef struct ill_s { * Used to save errors that occur during plumbing */ uint_t ill_ifname_pending_err; + /* + * Used to save errors that occur during binding + */ + uint_t ill_dl_bind_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */ list_t ill_nce; /* pointer to nce_s list */ @@ -1936,6 +1943,7 @@ typedef struct ill_s { * ill_nd_lla_len ipsq + down ill only when ill is up * ill_phys_addr_pend ipsq + down ill only when ill is up * ill_ifname_pending_err ipsq ipsq + * ill_dl_bind_err ipsq ipsq * ill_avl_byppa ipsq, ill_g_lock write once * * ill_fastpath_list ill_lock ill_lock @@ -3578,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t, + in6_addr_t *); typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); @@ -3630,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; +/* IP - DLD direct function call to check if an IP is allowed */ +typedef struct ill_dld_ipcheck_s { + ip_mac_ipcheck_t idi_allowed_df; + void *idi_allowed_dh; +} ill_dld_ipcheck_t; + /* IP - DLD polling capability */ typedef struct ill_dld_poll_s { ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; @@ -3641,6 +3657,7 @@ struct ill_dld_capab_s { void *idc_capab_dh; /* dld_str_t *dsp */ ill_dld_direct_t idc_direct; ill_dld_poll_t idc_poll; + ill_dld_ipcheck_t idc_ipcheck; }; /* diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index bcbc1c4949..b4bff4d7b4 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. 
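[Editor's aside] The allow-list that follows (IP_TOS, T_IP_TOS, IP_TTL, IP_DONTFRAG) lets an AF_INET6 socket that is still speaking IPv4 -- one connected to a v4-mapped peer -- tune IPv4-level options. A minimal userland sketch of the usage this enables; it assumes the connection has in fact stayed IPv4 (conn_ipversion == IPV4_VERSION) and trims error handling.

	#include <sys/socket.h>
	#include <netinet/in.h>

	/*
	 * Illustrative only: set the IPv4 TTL on an AF_INET6 UDP socket
	 * (fd from socket(AF_INET6, SOCK_DGRAM, 0)) whose peer is a
	 * v4-mapped address (::ffff:a.b.c.d).  Before this change the
	 * IPPROTO_IP-level option was rejected with EINVAL because the
	 * socket's family is not AF_INET.
	 */
	int
	set_ttl_on_mapped_socket(int fd, const struct sockaddr_in6 *v4mapped)
	{
		int ttl = 64;

		if (connect(fd, (const struct sockaddr *)v4mapped,
		    sizeof (*v4mapped)) != 0)
			return (-1);
		return (setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl,
		    sizeof (ttl)));
	}
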
+ */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } switch (name) { case IP_TTL: diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 36eb88d743..b1a77ae0cc 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -81,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1013,6 +1014,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. @@ -1966,6 +1973,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + rw_exit(&icmp->icmp_bpf_lock); + return (B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. 
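[Editor's aside] icmp_attach_filter() above expects a classic struct bpf_program from userland, validates it with ip_bpf_validate(), and icmp_input() below drops any packet for which the attached program returns 0. A hedged userland sketch of attaching such a filter to a raw ICMP socket follows; it assumes SO_ATTACH_FILTER and <net/bpf.h> are visible to user code and that the IPv4 header is 20 bytes (the program sees the whole IP packet, starting at the IP header). SO_DETACH_FILTER with a zero-length payload removes the program again.

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <net/bpf.h>

	/* Accept only ICMPv4 echo replies (type 0); drop everything else. */
	static struct bpf_insn echo_reply_only[] = {
		/* A <- ICMP type byte; 20-byte IPv4 header assumed */
		BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 20),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept */
		BPF_STMT(BPF_RET | BPF_K, 0),		/* drop */
	};

	int
	attach_echo_reply_filter(int fd)
	{
		/* fd from socket(AF_INET, SOCK_RAW, IPPROTO_ICMP) */
		struct bpf_program prog;

		prog.bf_len = sizeof (echo_reply_only) /
		    sizeof (echo_reply_only[0]);
		prog.bf_insns = echo_reply_only;
		return (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
		    &prog, sizeof (prog)));
	}
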
*/ @@ -2055,6 +2160,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2600,6 +2709,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. @@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 46272b2b22..5c256729dc 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -4123,6 +4123,8 @@ ip_modclose(ill_t *ill) rw_destroy(&ill->ill_mcast_lock); mutex_destroy(&ill->ill_mcast_serializer); list_destroy(&ill->ill_nce); + cv_destroy(&ill->ill_dlpi_capab_cv); + mutex_destroy(&ill->ill_dlpi_capab_lock); /* * Now we are done with the module close pieces that @@ -8197,7 +8199,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) conn_t *connp = NULL; t_uscalar_t paddrreq; mblk_t *mp_hw; - boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; @@ -8297,7 +8298,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Something went wrong with the bind. We presumably + * Something went wrong with the bind. If this was the + * result of a DL_NOTE_REPLUMB, then we presumably * have an IOCTL hanging out waiting for completion. * Find it, take down the interface that was coming * up, and complete the IOCTL with the error noted. @@ -8314,6 +8316,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ + } else { + /* + * There's no pending IOCTL, so the bind was + * most likely started by ill_dl_up(). We save + * the error and let it take care of responding + * to the IOCTL. + */ + ill->ill_dl_bind_err = dlea->dl_unix_errno ? 
+ dlea->dl_unix_errno : ENXIO; } break; case DL_ENABMULTI_REQ: @@ -8437,55 +8448,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); - /* - * Now bring up the resolver; when that is complete, we'll - * create IREs. Note that we intentionally mirror what - * ipif_up() would have done, because we got here by way of - * ill_dl_up(), which stopped ipif_up()'s processing. - */ - if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } else if (ill->ill_net_type == IRE_IF_RESOLVER) { - /* - * ARP and other v4 external resolvers. - * Leave the pending mblk intact so that - * the ioctl completes in ip_rput(). - */ - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - mp1 = ipsq_pending_mp_get(ipsq, &connp); - } else { - /* The conn has started closing */ - err = EINTR; - } - } else { - /* - * This one is complete. Reply to pending ioctl. - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done(ipif); - } - - if ((err == 0) && (ill->ill_up_ipifs)) { + if (ill->ill_up_ipifs) { err = ill_up_ipifs(ill, q, mp1); if (err == EINPROGRESS) { freemsg(mp); @@ -8493,25 +8456,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - /* - * If we have a moved ipif to bring up, and everything has - * succeeded to this point, bring it up on the IPMP ill. - * Otherwise, leave it down -- the admin can try to bring it - * up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - if (err != 0) { - ill->ill_move_ipif = NULL; - } else { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - } - } break; case DL_NOTIFY_IND: { @@ -9635,12 +9579,18 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req) if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) { return (1); } + if (level == MIB2_UDP) { + goto done; + } } if (level != MIB2_UDP) { if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) { return (1); } + if (level == MIB2_TCP) { + goto done; + } } if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl, @@ -9717,6 +9667,7 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req) if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) { return (1); } +done: freemsg(mpctl); return (1); } @@ -12573,6 +12524,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12693,6 +12645,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. 
+ */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12700,9 +12659,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c index c7c241f944..96cc281da5 100644 --- a/usr/src/uts/common/inet/ip/ip6_input.c +++ b/usr/src/uts/common/inet/ip/ip6_input.c @@ -23,6 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -143,11 +144,9 @@ static void ip_input_multicast_v6(ire_t *, mblk_t *, ip6_t *, * The ill will always be valid if this function is called directly from * the driver. * - * If ip_input_v6() is called from GLDv3: - * - * - This must be a non-VLAN IP stream. - * - 'mp' is either an untagged or a special priority-tagged packet. - * - Any VLAN tag that was in the MAC header has been stripped. + * If this chain is part of a VLAN stream, then the VLAN tag is + * stripped from the MAC header before being delivered to this + * function. * * If the IP header in packet is not 32-bit aligned, every message in the * chain will be aligned before further operations. This is required on SPARC @@ -1892,6 +1891,16 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h, return (B_TRUE); } + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HW_LOCAL_MAC) { + /* + * The packet is from a same-machine sender in which + * case we assume data integrity. + */ + return (B_TRUE); + } + /* * Revert to software checksum calculation if the interface * isn't capable of checksum offload. @@ -1908,9 +1917,6 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h, * We apply this for all ULP protocols. Does the HW know to * not set the flags for SCTP and other protocols. */ - - hck_flags = DB_CKSUMFLAGS(mp); - if (hck_flags & HCK_FULLCKSUM_OK) { /* * Hardware has already verified the checksum. diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c index b023a2fe6a..dc074454e3 100644 --- a/usr/src/uts/common/inet/ip/ip6_output.c +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { - cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); - cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + /* + * Currently we assume no HW support for ICMP checksum calc. 
+ * + * When HW support is advertised for ICMP, we'll want the + * following to be set: + * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum + */ + + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } else { ip_hdr_cksum: /* No IP header checksum for IPv6 */ diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 31789fb8de..11a9024053 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, static int ill_alloc_ppa(ill_if_t *, ill_t *); static void ill_delete_interface_type(ill_if_t *); -static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static int ill_dl_up(ill_t *ill, ipif_t *ipif); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_down_ipifs(ill_t *, boolean_t); @@ -1380,6 +1380,35 @@ ill_capability_probe(ill_t *ill) ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } +static boolean_t +ill_capability_wait(ill_t *ill) +{ + /* + * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can + * only be set by someone who is the writer. Since we + * drop-and-reacquire the squeue in this loop, we need to check for + * ILL_CONDEMNED, which if set means nothing can signal our capability + * condition variable. + */ + ASSERT(IAM_WRITER_ILL(ill)); + + while (ill->ill_capab_pending_cnt != 0 && + (ill->ill_state_flags & ILL_CONDEMNED) == 0) { + mutex_enter(&ill->ill_dlpi_capab_lock); + ipsq_exit(ill->ill_phyint->phyint_ipsq); + cv_wait(&ill->ill_dlpi_capab_cv, &ill->ill_dlpi_capab_lock); + mutex_exit(&ill->ill_dlpi_capab_lock); + /* + * If ipsq_enter() fails, someone set ILL_CONDEMNED + * while we dropped the squeue. Indicate such to the caller. + */ + if (!ipsq_enter(ill, B_FALSE, CUR_OP)) + return (B_FALSE); + } + + return ((ill->ill_state_flags & ILL_CONDEMNED) == 0); +} + void ill_capability_reset(ill_t *ill, boolean_t reneg) { @@ -1390,6 +1419,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + ASSERT(ill->ill_capab_reset_mp != NULL); + ill_capability_send(ill, ill->ill_capab_reset_mp); ill->ill_capab_reset_mp = NULL; /* @@ -2108,6 +2139,49 @@ ill_capability_lso_enable(ill_t *ill) } } +/* + * Check whether or not mac will prevent us from sending with a given IP + * address. This requires having the IPCHECK capability, which we should + * always be able to successfully negotiate, but if it's somehow missing + * then we just permit the caller to use the address, since mac does the + * actual enforcement and ip is just performing a courtesy check to help + * prevent users from unwittingly setting and attempting to use blocked + * addresses. 
+ */ +static boolean_t +ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr) +{ + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0) + return (B_TRUE); + + ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck; + ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df; + return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr)); +} + +static void +ill_capability_ipcheck_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_ipcheck_t *idi = &idc->idc_ipcheck; + dld_capab_ipcheck_t spoof; + int rc; + + ASSERT(IAM_WRITER_ILL(ill)); + + bzero(&spoof, sizeof (spoof)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + &spoof, DLD_ENABLE)) == 0) { + idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df; + idi->idi_allowed_dh = spoof.ipc_allowed_dh; + ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK; + } else { + cmn_err(CE_WARN, "warning: could not enable IPCHECK " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc); + } +} + static void ill_capability_dld_enable(ill_t *ill) { @@ -2115,15 +2189,15 @@ ill_capability_dld_enable(ill_t *ill) ASSERT(IAM_WRITER_ILL(ill)); - if (ill->ill_isv6) - return; - ill_mac_perim_enter(ill, &mph); if (!ill->ill_isv6) { ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); ill_capability_lso_enable(ill); } + + ill_capability_ipcheck_enable(ill); + ill->ill_capabilities |= ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -2188,6 +2262,15 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) { + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL); + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL); + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + NULL, DLD_DISABLE); + } + ill->ill_capabilities &= ~ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -3430,6 +3513,9 @@ ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback, ill->ill_max_buf = ND_MAX_Q; ill->ill_refcnt = 0; + cv_init(&ill->ill_dlpi_capab_cv, NULL, NULL, NULL); + mutex_init(&ill->ill_dlpi_capab_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); } @@ -9677,7 +9763,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; boolean_t need_up = B_FALSE; ill_t *ill; - int i; ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -9752,20 +9837,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - /* - * verify that the address being configured is permitted by the - * ill_allowed_ips[] for the interface. 
- */ - if (ill->ill_allowed_ips_cnt > 0) { - for (i = 0; i < ill->ill_allowed_ips_cnt; i++) { - if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], - &v6addr)) - break; - } - if (i == ill->ill_allowed_ips_cnt) { - pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr); - return (EPERM); - } + /* verify that the address being configured is permitted by mac */ + if (!ill_ipcheck_addr(ill, &v6addr)) { + return (EPERM); } /* * Even if there is no change we redo things just to rerun @@ -12705,6 +12779,12 @@ ill_dl_down(ill_t *ill) } ill->ill_unbind_mp = NULL; + + mutex_enter(&ill->ill_lock); + ill->ill_dl_up = 0; + ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); + mutex_exit(&ill->ill_lock); + if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, @@ -12727,11 +12807,13 @@ ill_dl_down(ill_t *ill) ill_capability_dld_disable(ill); ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); + + /* + * Wait for the capability reset to finish. + * In this case, it doesn't matter WHY or HOW it finished. + */ + (void) ill_capability_wait(ill); } - mutex_enter(&ill->ill_lock); - ill->ill_dl_up = 0; - ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); - mutex_exit(&ill->ill_lock); } void @@ -12860,6 +12942,10 @@ ill_capability_done(ill_t *ill) if (ill->ill_capab_pending_cnt == 0 && ill->ill_dlpi_capab_state == IDCS_OK) ill_capability_reset_alloc(ill); + + mutex_enter(&ill->ill_dlpi_capab_lock); + cv_broadcast(&ill->ill_dlpi_capab_cv); + mutex_exit(&ill->ill_dlpi_capab_lock); } /* @@ -14481,7 +14567,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) * address/netmask etc cause a down/up dance, but * does not cause an unbind (DL_UNBIND) with the driver */ - return (ill_dl_up(ill, ipif, mp, q)); + if ((err = ill_dl_up(ill, ipif)) != 0) { + return (err); + } + } + + /* Reject bringing up interfaces with unusable IP addresses */ + if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) { + return (EPERM); } /* @@ -14594,24 +14687,22 @@ ill_delete_ires(ill_t *ill) /* * Perform a bind for the physical device. - * When the routine returns EINPROGRESS then mp has been consumed and - * the ioctl will be acked from ip_rput_dlpi. - * Allocate an unbind message and save it until ipif_down. + * + * When the routine returns successfully then dlpi has been bound and + * capabilities negotiated. An unbind message will have been allocated + * for later use in ipif_down. */ static int -ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) +ill_dl_up(ill_t *ill, ipif_t *ipif) { mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; - conn_t *connp; - boolean_t success; int err; DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(mp != NULL); /* * Make sure we have an IRE_MULTICAST in case we immediately @@ -14646,19 +14737,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) if (unbind_mp == NULL) goto bad; } - /* - * Record state needed to complete this operation when the - * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. - */ - connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; - ASSERT(connp != NULL || !CONN_Q(q)); - GRAB_CONN_LOCK(q); - mutex_enter(&ipif->ipif_ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); - mutex_exit(&ipif->ipif_ill->ill_lock); - RELEASE_CONN_LOCK(q); - if (!success) - goto bad; /* * Save the unbind message for ill_dl_down(); it will be consumed when @@ -14670,6 +14748,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) ill_dlpi_send(ill, bind_mp); /* Send down link-layer capabilities probe if not already done. */ ill_capability_probe(ill); + /* + * Wait for DLPI to be bound and the capability probe to finish. + * The call drops-and-reacquires the squeue. If it couldn't because + * ILL_CONDEMNED got set, bail. + */ + if (!ill_capability_wait(ill)) + return (ENXIO); + + /* DLPI failed to bind. Return the saved error */ + if (!ill->ill_dl_up) { + return (ill->ill_dl_bind_err); + } /* * Sysid used to rely on the fact that netboots set domainname @@ -14687,11 +14777,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) cmn_err(CE_WARN, "no cached dhcp response"); } - /* - * This operation will complete in ip_rput_dlpi with either - * a DL_BIND_ACK or DL_ERROR_ACK. - */ - return (EINPROGRESS); + return (0); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c index 6aa70b014a..e2e7dca22c 100644 --- a/usr/src/uts/common/inet/ip/ip_input.c +++ b/usr/src/uts/common/inet/ip/ip_input.c @@ -23,6 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -56,6 +57,7 @@ #include <sys/vtrace.h> #include <sys/isa_defs.h> #include <sys/mac.h> +#include <sys/mac_client.h> #include <net/if.h> #include <net/if_arp.h> #include <net/route.h> @@ -146,11 +148,9 @@ static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *, * The ill will always be valid if this function is called directly from * the driver. * - * If ip_input() is called from GLDv3: - * - * - This must be a non-VLAN IP stream. - * - 'mp' is either an untagged or a special priority-tagged packet. - * - Any VLAN tag that was in the MAC header has been stripped. + * If this chain is part of a VLAN stream, then the VLAN tag is + * stripped from the MAC header before being delivered to this + * function. * * If the IP header in packet is not 32-bit aligned, every message in the * chain will be aligned before further operations. This is required on SPARC @@ -660,11 +660,13 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, } /* - * If there is a good HW IP header checksum we clear the need + * If the packet originated from a same-machine sender or + * there is a good HW IP header checksum, we clear the need * look at the IP header checksum. */ - if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && - ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { + if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) || + ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && + ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) { /* Header checksum was ok. Clear the flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; @@ -2241,6 +2243,17 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, /* No ULP checksum to verify. 
*/ return (B_TRUE); } + + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HW_LOCAL_MAC) { + /* + * The packet is from a same-machine sender in which + * case we assume data integrity. + */ + return (B_TRUE); + } + /* * Revert to software checksum calculation if the interface * isn't capable of checksum offload. @@ -2257,9 +2270,6 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, * We apply this for all ULP protocols. Does the HW know to * not set the flags for SCTP and other protocols. */ - - hck_flags = DB_CKSUMFLAGS(mp); - if (hck_flags & HCK_FULLCKSUM_OK) { /* * Hardware has already verified the checksum. diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c index ea69412933..169859707e 100644 --- a/usr/src/uts/common/inet/ip/ip_output.c +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1737,6 +1738,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); goto ip_hdr_cksum; + } else if (protocol == IPPROTO_ICMP) { + /* + * Note that we always calculate a SW checksum for ICMP. In the + * future, if HW support for ICMP is advertised, we can change + * this. + */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); } else { ip_hdr_cksum: /* Calculate IPv4 header checksum */ diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index ee7c7b0f1d..b6565d9c1f 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ /* @@ -101,10 +102,6 @@ * * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or * /dev/ip. - * - * ip_squeue_worker_wait: global value for the sq_wait field for all squeues * - * created. This is the time squeue code waits before waking up the worker - * thread after queuing a request. */ #include <sys/types.h> @@ -142,13 +139,6 @@ kmutex_t sqset_lock; static void (*ip_squeue_create_callback)(squeue_t *) = NULL; -/* - * ip_squeue_worker_wait: global value for the sq_wait field for all squeues - * created. This is the time squeue code waits before waking up the worker - * thread after queuing a request. - */ -uint_t ip_squeue_worker_wait = 10; - static squeue_t *ip_squeue_create(pri_t); static squeue_set_t *ip_squeue_set_create(processorid_t); static int ip_squeue_cpu_setup(cpu_setup_t, int, void *); @@ -163,7 +153,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(ip_squeue_worker_wait, pri); + sqp = squeue_create(pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index bc2173ff24..a59027801f 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* @@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. + */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. 
Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - 
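[Editor's aside] The ordering rules above keep entries that share conn_lport adjacent in the bound/wildcard fanout, newest first, which is what makes SO_REUSEPORT lookups workable. For reference, the userland pattern this supports looks roughly like the following -- a hedged sketch only, since the set-side plumbing for SO_REUSEPORT is outside this hunk; every socket sharing the port must set the option before bind().

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <string.h>

	/* Illustrative: open one of several sockets sharing one UDP port. */
	static int
	bind_shared(in_port_t port)
	{
		struct sockaddr_in sin;
		int fd, on = 1;

		if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
			return (-1);
		if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on,
		    sizeof (on)) != 0)
			return (-1);
		(void) memset(&sin, 0, sizeof (sin));
		sin.sin_family = AF_INET;
		sin.sin_port = htons(port);
		sin.sin_addr.s_addr = htonl(INADDR_ANY);
		if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) != 0)
			return (-1);
		return (fd);
	}

	/* e.g. bind_shared(5353) called twice returns two usable sockets. */
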
IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2092,6 +2117,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2116,6 +2142,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c index 40d5078526..44ebb21db3 100644 --- a/usr/src/uts/common/inet/ip/sadb.c +++ b/usr/src/uts/common/inet/ip/sadb.c @@ -3767,7 +3767,8 @@ sadb_expire_assoc(queue_t *pfkey_q, ipsa_t *assoc) } alloclen = sizeof (*samsg) + sizeof (*current) + sizeof (*expire) + - 2 * sizeof (sadb_address_t) + sizeof (*saext); + 2 * sizeof (sadb_address_t) + sizeof (*saext) + + sizeof (sadb_x_kmc_t); af = assoc->ipsa_addrfam; switch (af) { @@ -3896,6 +3897,10 @@ sadb_expire_assoc(queue_t *pfkey_q, ipsa_t *assoc) ASSERT(mp->b_wptr != NULL); } + mp->b_wptr = sadb_make_kmc_ext(mp->b_wptr, end, assoc->ipsa_kmp, + assoc->ipsa_kmc); + ASSERT(mp->b_wptr != NULL); + /* Can just putnext, we're ready to go! */ putnext(pfkey_q, mp1); } diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 2b37528eb9..fc90e6f217 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #ifndef _INET_IP_IMPL_H @@ -159,9 +160,27 @@ extern "C" { #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -/* This macro is used by the mac layer */ +/* + * Determine if a mblk needs to take the "slow path", aka OTH + * softring. There are multiple reasons why a mblk might take the slow + * path. + * + * o The mblk is not a data message. + * + * o There is more than one outstanding reference to the mblk and it + * does not originate from a local MAC client. If the mblk does + * originate from a local MAC then allow it to pass through with + * more than one reference and leave the copying up to the consumer. 
+ * + * o The IP header is not aligned (we assume alignment in the checksum + * routine). + * + * o The mblk doesn't contain enough data to populate a simple IP header. + */ #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ - (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \ + (DB_TYPE(mp) != M_DATA || \ + (DB_REF(mp) != 1 && ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) == 0)) || \ + !OK_32PTR(ipha) || \ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr)) /* diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index f6466434f6..c3139d9288 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _INET_IPCLASSIFIER_H @@ -293,7 +294,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c index d1c5dfdb9b..25e0b699c5 100644 --- a/usr/src/uts/common/inet/ipd/ipd.c +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -9,7 +9,7 @@ * http://www.illumos.org/license/CDDL. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -222,7 +222,7 @@ typedef struct ipd_netstack { net_handle_t ipdn_v6hdl; /* IPv4 net handle */ int ipdn_hooked; /* are hooks registered */ hook_t *ipdn_v4in; /* IPv4 traffic in hook */ - hook_t *ipdn_v4out; /* IPv4 traffice out hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ hook_t *ipdn_v6in; /* IPv6 traffic in hook */ hook_t *ipdn_v6out; /* IPv6 traffic out hook */ int ipdn_enabled; /* which perturbs are on */ @@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) /* * If ipd_check_hooks_failed, that must mean that we failed to set up * the hooks, so we are going to effectively zero out and fail the - * request to enable corruption. + * request to enable packet delays. */ if (rval != 0) ins->ipdn_delay = 0; diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index f958ca2261..4cb67a2dab 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ #if !defined(lint) @@ -22,11 +22,13 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21 #include <sys/filio.h> #include <sys/systm.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/cred.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/ksynch.h> #include <sys/kmem.h> +#include <sys/mac_provider.h> #include <sys/mkdev.h> #include <sys/protosw.h> #include <sys/socket.h> @@ -83,9 +85,27 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); + +static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); +static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, + void *)); + extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); +static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); +static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); + #if SOLARIS2 < 10 #if SOLARIS2 >= 7 u_int *ip_ttl_ptr = NULL; @@ -152,6 +172,22 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; + +/* viona hook names */ +char *hook_viona_in = "ipfilter_hookviona_in"; +char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; +char *hook_viona_out = "ipfilter_hookviona_out"; +char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. 
*/ @@ -248,8 +284,65 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + + /* + * Remove notification of viona hooks + */ + net_instance_notify_unregister(ifs->ifs_netid, + ipf_hook_instance_notify); + #undef UNDO_HOOK + /* + * Normally, viona will unregister itself before ipldetach() is called, + * so these will be no-ops, but out of caution, we try to make sure + * we've removed any of our references. + */ + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_IN); + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_OUT); + + { + char netidstr[12]; /* Large enough for INT_MAX + NUL */ + (void) snprintf(netidstr, sizeof (netidstr), "%d", + ifs->ifs_netid); + + /* + * The notify callbacks expect the netid value passed as a + * string in the third argument. To prevent confusion if + * traced, we pass the same value the nethook framework would + * pass, even though the callback does not currently use the + * value. + */ + (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, + NULL, Hn_VIONA); + } + #ifdef IPFDEBUG cmn_err(CE_CONT, "ipldetach()\n"); #endif @@ -445,6 +538,64 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + + /* + * VIONA INET hooks. 
While the nethook framework allows us to register + * hooks for events that haven't been registered yet, we instead + * register and unregister our hooks in response to notifications + * about the viona hooks from the nethook framework. This prevents + * problems when the viona module gets unloaded while the ipf module + * does not. If we do not unregister our hooks after the viona module + * is unloaded, the viona module cannot later re-register them if it + * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded + * even on DEBUG kernels, they do not experience this issue. + */ + if (net_instance_notify_register(id, ipf_hook_instance_notify, + ifs) != 0) + goto hookup_failed; + + /* * Reacquire ipf_global, now it is safe. */ WRITE_ENTER(&ifs->ifs_ipf_global); @@ -507,6 +658,155 @@ hookup_failed: return -1; } +/* ------------------------------------------------------------------------ */ +/* + * Called whenever a nethook protocol is registered or unregistered. Currently + * only used to add or remove the hooks for viona. + * + * While the function signature requires returning int, nothing + * in usr/src/uts/common/io/hook.c that invokes the callbacks + * captures the return value (nor is there currently any documentation + * on what return values should be). For now at least, we'll return 0 + * on success (or 'not applicable') or an error value. Even if the + * nethook framework doesn't use the return address, it can be observed via + * dtrace if needed. + */ +static int +ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy __unused, const char *he_name) +{ + ipf_stack_t *ifs = arg; + hook_t **hookpp; + char *hook_name, *hint_name; + hook_func_t hookfn; + boolean_t *hookedp; + hook_hint_t hint; + boolean_t out; + int ret = 0; + + const boolean_t gz = ifs->ifs_gz_controlled; + + /* We currently only care about viona hooks notifications */ + if (strcmp(name, Hn_VIONA) != 0) + return (0); + + if (strcmp(he_name, NH_PHYSICAL_IN) == 0) { + out = B_FALSE; + } else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) { + out = B_TRUE; + } else { + /* + * If we've added more hook events to viona, we must add + * the corresponding handling here (even if it's just to + * ignore it) to prevent the firewall from not working as + * intended. + */ + cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__, + he_name); + + return (0); + } + + if (out) { + hookpp = &ifs->ifs_ipfhookviona_out; + hookfn = ipf_hookviona_out; + hookedp = &ifs->ifs_hookviona_physical_out; + name = gz ? hook_viona_out_gz : hook_viona_out; + hint = gz ? HH_AFTER : HH_BEFORE; + hint_name = gz ? hook_viona_out : hook_viona_out_gz; + } else { + hookpp = &ifs->ifs_ipfhookviona_in; + hookfn = ipf_hookviona_in; + hookedp = &ifs->ifs_hookviona_physical_in; + name = gz ? hook_viona_in_gz : hook_viona_in; + hint = gz ? HH_BEFORE : HH_AFTER; + hint_name = gz ? hook_viona_in : hook_viona_in_gz; + } + + switch (command) { + default: + case HN_NONE: + break; + case HN_REGISTER: + HOOK_INIT(*hookpp, hookfn, (char *)name, ifs); + (*hookpp)->h_hint = hint; + (*hookpp)->h_hintvalue = (uintptr_t)hint_name; + ret = net_hook_register(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp); + if (ret != 0) { + cmn_err(CE_NOTE, "%s: could not register hook " + "(hook family=%s hook=%s) err=%d", __func__, + name, he_name, ret); + *hookedp = B_FALSE; + return (ret); + } + *hookedp = B_TRUE; + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + + ret = *hookedp ? 
net_hook_unregister(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp) : 0; + if ((ret == 0 || ret == ENXIO)) { + if (*hookpp != NULL) { + hook_free(*hookpp); + *hookpp = NULL; + } + *hookedp = B_FALSE; + } + break; + } + + return (ret); +} + +/* + * Called whenever a new nethook instance is created. Currently only used + * with the Hn_VIONA nethooks. Similar to ipf_hook_protocol_notify, the out + * function signature must return an int, though the result is never used. + * We elect to return 0 on success (or not applicable) or a non-zero value + * on error. + */ +static int +ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy __unused, const char *instance) +{ + ipf_stack_t *ifs = arg; + int ret = 0; + + /* We currently only care about viona hooks */ + if (strcmp(instance, Hn_VIONA) != 0) + return (0); + + switch (command) { + case HN_NONE: + default: + return (0); + case HN_REGISTER: + ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid, + NHF_VIONA); + + if (ifs->ifs_ipf_viona == NULL) + return (EPROTONOSUPPORT); + + ret = net_protocol_notify_register(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify, ifs); + VERIFY(ret == 0 || ret == ESHUTDOWN); + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify)); + VERIFY0(net_protocol_release(ifs->ifs_ipf_viona)); + ifs->ifs_ipf_viona = NULL; + break; + } + + return (ret); +} + static int fr_setipfloopback(set, ifs) int set; ipf_stack_t *ifs; @@ -1011,7 +1311,6 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; - /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -1715,8 +2014,7 @@ int len; * Need to preserve checksum information by copying them * to newmp which heads the pulluped message. */ - hcksum_retrieve(m, NULL, NULL, &start, &stuff, &end, - &value, &flags); + mac_hcksum_get(m, &start, &stuff, &end, &value, &flags); if (pullupmsg(m, len + ipoff + inc) == 0) { ATOMIC_INCL(ifs->ifs_frstats[out].fr_pull[1]); @@ -1729,8 +2027,7 @@ int len; return NULL; } - (void) hcksum_assoc(m, NULL, NULL, start, stuff, end, - value, flags, 0); + mac_hcksum_set(m, start, stuff, end, value, flags); m->b_prev = m2; m->b_rptr += inc; @@ -1856,8 +2153,12 @@ frdest_t *fdp; return (-1); } - /* Check the src here, fin_ifp is the src interface. */ - if (!fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) + /* + * If we're forwarding (vs. injecting), check the src here, fin_ifp is + * the src interface. + */ + if (fdp != NULL && + !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) return (-1); inj = net_inject_alloc(NETINFO_VERSION); @@ -1924,8 +2225,8 @@ frdest_t *fdp; inj->ni_physical = net_routeto(net_data_p, sinp, NULL); } - /* we're checking the destinatation here */ - if (!fr_forwarding_enabled(inj->ni_physical, net_data_p)) + /* If we're forwarding (vs. injecting), check the destinatation here. */ + if (fdp != NULL && !fr_forwarding_enabled(inj->ni_physical, net_data_p)) goto bad_fastroute; /* @@ -2045,6 +2346,160 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. 
They represent a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* Static constants used by ipf_hook_ether */ +static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; +static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hook_ether */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: token(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The ipf_hook_ether hook is currently private to illumos. It represents */ +/* a layer 2 datapath generally used by virtual machines. Currently the */ +/* hook is only used by the viona driver to pass along L2 frames for */ +/* inspection. It requires that the L2 ethernet header is contained within */ +/* a single dblk_t (however layers above the L2 header have no restrictions */ +/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */ +/* filtering on a MAC address or ethertype), however virtual machines do */ +/* not have native IP stack instances where ipf traditionally hooks in. */ +/* Instead this entry point is used to determine if the packet is unicast, */ +/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */ +/* traditional ip hooks for filtering. Non IPv4 or non IPv6 packets are */ +/* not subject to examination. */ +/* ------------------------------------------------------------------------ */ +int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, + boolean_t out) +{ + struct ether_header *ethp; + hook_pkt_event_t *hpe = (hook_pkt_event_t *)info; + mblk_t *mp; + size_t offset, len; + uint16_t etype; + boolean_t v6; + + /* + * viona will only pass us mblks with the L2 header contained in a + * single data block. + */ + mp = *hpe->hpe_mp; + len = MBLKL(mp); + + VERIFY3S(len, >=, sizeof (struct ether_header)); + + ethp = (struct ether_header *)mp->b_rptr; + if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = + (struct ether_vlan_header *)ethp; + + VERIFY3S(len, >=, sizeof (struct ether_vlan_header)); + + etype = ntohs(evh->ether_type); + offset = sizeof (*evh); + } else { + offset = sizeof (*ethp); + } + + /* + * ipf only supports filtering IPv4 and IPv6. Ignore other types.
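As context for the classification done here, the following sketch (hypothetical helper name, not part of this change) mirrors the VLAN-aware ethertype extraction the hook performs before deciding whether the frame carries IPv4 or IPv6 and handing the L3 payload to the existing ip hooks.

/* Sketch only: assumes the <sys/ethernet.h> definitions used by the hook. */
#include <stddef.h>
#include <sys/types.h>
#include <sys/ethernet.h>	/* ether_header, ether_vlan_header, ETHERTYPE_* */
#include <arpa/inet.h>		/* ntohs() */

static int
sketch_l3_offset(const void *frame, size_t len, uint16_t *etypep, size_t *offp)
{
	const struct ether_header *ethp = frame;
	uint16_t etype;
	size_t off = sizeof (struct ether_header);

	if (len < sizeof (struct ether_header))
		return (-1);
	etype = ntohs(ethp->ether_type);
	if (etype == ETHERTYPE_VLAN) {
		const struct ether_vlan_header *evh = frame;

		if (len < sizeof (*evh))
			return (-1);
		etype = ntohs(evh->ether_type);	/* inner type after the tag */
		off = sizeof (*evh);
	}
	if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
		return (-1);	/* not IP; not subject to examination */
	*etypep = etype;
	*offp = off;
	return (0);
}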
+ */ + if (etype == ETHERTYPE_IP) + v6 = B_FALSE; + else if (etype == ETHERTYPE_IPV6) + v6 = B_TRUE; + else + return (0); + + if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0) + hpe->hpe_flags |= HPE_BROADCAST; + else if (bcmp(ipf_eth_ipv4_mcast, ethp, + sizeof (ipf_eth_ipv4_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + else if (bcmp(ipf_eth_ipv6_mcast, ethp, + sizeof (ipf_eth_ipv6_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + + /* Find the start of the IPv4 or IPv6 header */ + for (; offset >= len; len = MBLKL(mp)) { + offset -= len; + mp = mp->b_cont; + if (mp == NULL) { + freemsg(*hpe->hpe_mp); + *hpe->hpe_mp = NULL; + return (-1); + } + } + hpe->hpe_mb = mp; + hpe->hpe_hdr = mp->b_rptr + offset; + + return (v6 ? ipf_hook6(info, out, 0, arg) : + ipf_hook(info, out, 0, arg)); +} + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hookviona_{in,out} */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The viona hooks are private hooks to illumos. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. */ +/* along L2 packets. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +int +ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_FALSE)); +} + +int +ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_TRUE)); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ @@ -2387,7 +2842,7 @@ fr_info_t *fin; #ifdef USE_INET6 struct in6_addr tmp_src6; #endif - + ASSERT(fin->fin_p == IPPROTO_TCP); /* @@ -2429,7 +2884,7 @@ fr_info_t *fin; #endif if (tcp != NULL) { - /* + /* * Adjust TCP header: * swap ports, * set flags, diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..5c156e9c44 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. 
*/ #ifndef __IPF_STACK_H__ @@ -87,8 +87,8 @@ struct ipf_stack { #endif int ifs_ipf_locks_done; - ipftoken_t *ifs_ipftokenhead; - ipftoken_t **ifs_ipftokentail; + ipftoken_t *ifs_ipftokenhead; + ipftoken_t **ifs_ipftokentail; ipfmutex_t ifs_ipl_mutex; ipfmutex_t ifs_ipf_authmx; @@ -126,6 +126,14 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; + + hook_t *ifs_ipfhookviona_in; + hook_t *ifs_ipfhookviona_out; + /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; boolean_t ifs_hook4_physical_out; @@ -137,10 +145,19 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; + boolean_t ifs_hookviona_physical_in; + boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; + net_handle_t ifs_ipf_viona; /* ip_auth.c */ int ifs_fr_authsize; @@ -167,8 +184,8 @@ struct ipf_stack { ipfr_t **ifs_ipfr_nattail; ipfr_t **ifs_ipfr_nattab; - ipfr_t *ifs_ipfr_ipidlist; - ipfr_t **ifs_ipfr_ipidtail; + ipfr_t *ifs_ipfr_ipidlist; + ipfr_t **ifs_ipfr_ipidtail; ipfr_t **ifs_ipfr_ipidtab; ipfrstat_t ifs_ipfr_stats; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5d56debc31 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index f6b6b996a8..847ad1c560 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -20,7 +20,10 @@ * * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. */ -/* Copyright (c) 1990 Mentat Inc. */ +/* + * Copyright (c) 1990 Mentat Inc. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. 
+ */ #ifndef _INET_MIB2_H #define _INET_MIB2_H @@ -1354,25 +1357,46 @@ typedef struct mib2_tcpConnEntry { /* remote port for this connection { tcpConnEntry 5 } */ int tcpConnRemPort; /* In host byte order */ struct tcpConnEntryInfo_s { - /* seq # of next segment to send */ + Counter64 ce_in_data_inorder_bytes; + Counter64 ce_in_data_inorder_segs; + Counter64 ce_in_data_unorder_bytes; + Counter64 ce_in_data_unorder_segs; + Counter64 ce_in_zwnd_probes; + + Counter64 ce_out_data_bytes; + Counter64 ce_out_data_segs; + Counter64 ce_out_retrans_bytes; + Counter64 ce_out_retrans_segs; + Counter64 ce_out_zwnd_probes; + Counter64 ce_rtt_sum; + + /* seq # of next segment to send */ Gauge ce_snxt; /* seq # of of last segment unacknowledged */ Gauge ce_suna; - /* currect send window size */ + /* current send window size */ Gauge ce_swnd; + /* current congestion window size */ + Gauge ce_cwnd; /* seq # of next expected segment */ Gauge ce_rnxt; /* seq # of last ack'd segment */ Gauge ce_rack; - /* currenct receive window size */ + /* # of unsent bytes in the xmit queue */ + Gauge ce_unsent; + /* current receive window size */ Gauge ce_rwnd; - /* current rto (retransmit timeout) */ + /* round-trip time smoothed average (us) */ + Gauge ce_rtt_sa; + /* current rto (retransmit timeout) */ Gauge ce_rto; - /* current max segment size */ + /* round-trip time count */ + Gauge ce_rtt_cnt; + /* current max segment size */ Gauge ce_mss; /* actual internal state */ int ce_state; - } tcpConnEntryInfo; + } tcpConnEntryInfo; /* pid of the processes that created this connection */ uint32_t tcpConnCreationProcess; @@ -1408,26 +1432,7 @@ typedef struct mib2_tcp6ConnEntry { DeviceIndex tcp6ConnIfIndex; /* state of tcp6 connection { ipv6TcpConnEntry 6 } RW */ int tcp6ConnState; - struct tcp6ConnEntryInfo_s { - /* seq # of next segment to send */ - Gauge ce_snxt; - /* seq # of of last segment unacknowledged */ - Gauge ce_suna; - /* currect send window size */ - Gauge ce_swnd; - /* seq # of next expected segment */ - Gauge ce_rnxt; - /* seq # of last ack'd segment */ - Gauge ce_rack; - /* currenct receive window size */ - Gauge ce_rwnd; - /* current rto (retransmit timeout) */ - Gauge ce_rto; - /* current max segment size */ - Gauge ce_mss; - /* actual internal state */ - int ce_state; - } tcp6ConnEntryInfo; + struct tcpConnEntryInfo_s tcp6ConnEntryInfo; /* pid of the processes that created this connection */ uint32_t tcp6ConnCreationProcess; diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 6fb72d1d08..ddb482db78 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -43,6 +44,7 @@ extern "C" { #include <inet/ip.h> #include <inet/optcom.h> #include <inet/tunables.h> +#include <inet/bpf.h> /* * ICMP stack instances @@ -84,6 +86,10 @@ typedef struct icmp_s { mblk_t *icmp_fallback_queue_head; mblk_t *icmp_fallback_queue_tail; struct sockaddr_storage icmp_delayed_addr; + + krwlock_t icmp_bpf_lock; /* protects icmp_bpf */ + ip_bpf_insn_t *icmp_bpf_prog; /* SO_ATTACH_FILTER bpf */ + uint_t icmp_bpf_len; } icmp_t; /* diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. 
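As a usage note (not part of this change): besides automatic attachment via soconfig(1M), an application can request this deferred-accept behaviour per listener. A minimal sketch, assuming the SOL_FILTER/FIL_ATTACH options provided by the illumos socket-filter framework:

/* Sketch: attach the "datafilt" filter so accept() on this listener is
 * deferred until the accepted connection has data ready to read. */
#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
attach_datafilt(int listen_fd)
{
	const char *name = "datafilt";

	return (setsockopt(listen_fd, SOL_FILTER, FIL_ATTACH,
	    name, strlen(name)));
}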
+ */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c index 586d7f06f8..76191e93b8 100644 --- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c +++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -51,6 +51,7 @@ #include <sys/mac_client.h> #include <sys/mac_provider.h> #include <sys/mac_client_priv.h> +#include <inet/bpf.h> #include <netpacket/packet.h> @@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag) buffer = (uchar_t *)mp; } rw_enter(&ps->ps_bpflock, RW_READER); - if (bpf_filter(ps->ps_bpf.bf_insns, buffer, + if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer, hdr.mhi_pktsize, buflen) == 0) { rw_exit(&ps->ps_bpflock); ps->ps_stats.tp_drops++; @@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, const void *optval, socklen_t optlen) { struct bpf_program prog; - struct bpf_insn *fcode; + ip_bpf_insn_t *fcode; struct pfpsock *ps; struct sock_proto_props sopp; int error = 0; @@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, return (EFAULT); } - if (bpf_validate(fcode, (int)prog.bf_len)) { + if (ip_bpf_validate(fcode, prog.bf_len)) { rw_enter(&ps->ps_bpflock, RW_WRITER); pfp_release_bpf(ps); - ps->ps_bpf.bf_insns = fcode; + ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode; ps->ps_bpf.bf_len = size; rw_exit(&ps->ps_bpflock); diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 2e08dc359b..a1c0dbe697 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,7 +23,7 @@ */ /* - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". * + * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is + * false and it will not have an associated conn_t, which means many aspects of + * the system, such as polling and swtiching squeues will not be used. 
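To make the widened squeue interface concrete, a hypothetical non-IP consumer would use the squeue_create()/squeue_destroy() pair whose new signatures appear further below in this file's diff (sketch only):

/* Sketch: a non-TCP/IP squeue; sq_isip == B_FALSE means no conn_t handling,
 * no polling and no squeue switching for items processed on it. */
squeue_t *sqp;

sqp = squeue_create(minclsyspri, B_FALSE);
/* ... consumer queues work via squeue_enter(); for such squeues the item's
 * b_queue callback and b_prev argument are consumer-defined ... */
squeue_destroy(sqp);	/* sets SQS_EXIT, joins the worker and poll threads */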
+ * * Since the processing of the connection cuts across multiple layers * but still allows packets for different connnection to be processed on * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or @@ -132,21 +136,20 @@ #include <sys/squeue_impl.h> -static void squeue_fire(void *); static void squeue_drain(squeue_t *, uint_t, hrtime_t); static void squeue_worker(squeue_t *sqp); static void squeue_polling_thread(squeue_t *sqp); +static void squeue_worker_wakeup(squeue_t *sqp); +static void squeue_try_drain_one(squeue_t *, conn_t *); kmem_cache_t *squeue_cache; #define SQUEUE_MSEC_TO_NSEC 1000000 int squeue_drain_ms = 20; -int squeue_workerwait_ms = 0; /* The values above converted to ticks or nano seconds */ -static int squeue_drain_ns = 0; -static int squeue_workerwait_tick = 0; +static uint_t squeue_drain_ns = 0; uintptr_t squeue_drain_stack_needed = 10240; uint_t squeue_drain_stack_toodeep; @@ -239,19 +242,16 @@ squeue_init(void) sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0); squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC; - squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms); } -/* ARGSUSED */ squeue_t * -squeue_create(clock_t wait, pri_t pri) +squeue_create(pri_t pri, boolean_t isip) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); bzero(sqp, sizeof (squeue_t)); sqp->sq_bind = PBIND_NONE; sqp->sq_priority = pri; - sqp->sq_wait = MSEC_TO_TICK(wait); sqp->sq_worker = thread_create(NULL, 0, squeue_worker, sqp, 0, &p0, TS_RUN, pri); @@ -260,11 +260,36 @@ squeue_create(clock_t wait, pri_t pri) sqp->sq_enter = squeue_enter; sqp->sq_drain = squeue_drain; + sqp->sq_isip = isip; return (sqp); } /* + * We need to kill the threads and then clean up. We should VERIFY that + * polling is disabled so we don't have to worry about disassociating from + * MAC/IP/etc. + */ +void +squeue_destroy(squeue_t *sqp) +{ + kt_did_t worker, poll; + mutex_enter(&sqp->sq_lock); + VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); + worker = sqp->sq_worker->t_did; + poll = sqp->sq_poll_thr->t_did; + sqp->sq_state |= SQS_EXIT; + cv_signal(&sqp->sq_poll_cv); + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + + thread_join(poll); + thread_join(worker); + kmem_cache_free(squeue_cache, sqp); +} + +/* * Bind squeue worker thread to the specified CPU, given by CPU id. * If the CPU id value is -1, bind the worker thread to the value * specified in sq_bind field. If a thread is already bound to a @@ -309,97 +334,6 @@ squeue_unbind(squeue_t *sqp) mutex_exit(&sqp->sq_lock); } -void -squeue_worker_wakeup(squeue_t *sqp) -{ - timeout_id_t tid = (sqp)->sq_tid; - - ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); - - if (sqp->sq_wait == 0) { - ASSERT(tid == 0); - ASSERT(!(sqp->sq_state & SQS_TMO_PROG)); - sqp->sq_awaken = ddi_get_lbolt(); - cv_signal(&sqp->sq_worker_cv); - mutex_exit(&sqp->sq_lock); - return; - } - - /* - * Queue isn't being processed, so take - * any post enqueue actions needed before leaving. - */ - if (tid != 0) { - /* - * Waiting for an enter() to process mblk(s). - */ - clock_t now = ddi_get_lbolt(); - clock_t waited = now - sqp->sq_awaken; - - if (TICK_TO_MSEC(waited) >= sqp->sq_wait) { - /* - * Times up and have a worker thread - * waiting for work, so schedule it. 
- */ - sqp->sq_tid = 0; - sqp->sq_awaken = now; - cv_signal(&sqp->sq_worker_cv); - mutex_exit(&sqp->sq_lock); - (void) untimeout(tid); - return; - } - mutex_exit(&sqp->sq_lock); - return; - } else if (sqp->sq_state & SQS_TMO_PROG) { - mutex_exit(&sqp->sq_lock); - return; - } else { - clock_t wait = sqp->sq_wait; - /* - * Wait up to sqp->sq_wait ms for an - * enter() to process this queue. We - * don't want to contend on timeout locks - * with sq_lock held for performance reasons, - * so drop the sq_lock before calling timeout - * but we need to check if timeout is required - * after re acquiring the sq_lock. Once - * the sq_lock is dropped, someone else could - * have processed the packet or the timeout could - * have already fired. - */ - sqp->sq_state |= SQS_TMO_PROG; - mutex_exit(&sqp->sq_lock); - tid = timeout(squeue_fire, sqp, wait); - mutex_enter(&sqp->sq_lock); - /* Check again if we still need the timeout */ - if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) == - SQS_TMO_PROG) && (sqp->sq_tid == 0) && - (sqp->sq_first != NULL)) { - sqp->sq_state &= ~SQS_TMO_PROG; - sqp->sq_tid = tid; - mutex_exit(&sqp->sq_lock); - return; - } else { - if (sqp->sq_state & SQS_TMO_PROG) { - sqp->sq_state &= ~SQS_TMO_PROG; - mutex_exit(&sqp->sq_lock); - (void) untimeout(tid); - } else { - /* - * The timer fired before we could - * reacquire the sq_lock. squeue_fire - * removes the SQS_TMO_PROG flag - * and we don't need to do anything - * else. - */ - mutex_exit(&sqp->sq_lock); - } - } - } - - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); -} - /* * squeue_enter() - enter squeue sqp with mblk mp (which can be * a chain), while tail points to the end and cnt in number of @@ -475,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -497,23 +434,28 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, sqp->sq_run = NULL; if (sqp->sq_first == NULL || process_flag == SQ_NODRAIN) { - if (sqp->sq_first != NULL) { - squeue_worker_wakeup(sqp); - return; + /* + * Even if SQ_NODRAIN was specified, it may + * still be best to process a single queued + * item if it matches the active connection. + */ + if (sqp->sq_first != NULL && sqp->sq_isip) { + squeue_try_drain_one(sqp, connp); } + /* - * We processed inline our packet and nothing - * new has arrived. We are done. In case any - * control actions are pending, wake up the - * worker. + * If work or control actions are pending, wake + * up the worker thread. 
*/ - if (sqp->sq_state & SQS_WORKER_THR_CONTROL) - cv_signal(&sqp->sq_worker_cv); + if (sqp->sq_first != NULL || + sqp->sq_state & SQS_WORKER_THR_CONTROL) { + squeue_worker_wakeup(sqp); + } mutex_exit(&sqp->sq_lock); return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -565,10 +507,9 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * up the worker. */ sqp->sq_run = NULL; - if (sqp->sq_state & SQS_WORKER_THR_CONTROL) - cv_signal(&sqp->sq_worker_cv); - mutex_exit(&sqp->sq_lock); - return; + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) { + squeue_worker_wakeup(sqp); + } } else { /* * We let a thread processing a squeue reenter only @@ -587,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -602,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -631,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -657,54 +605,33 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, tail = mp = attrmp; } ENQUEUE_CHAIN(sqp, mp, tail, cnt); - if (!(sqp->sq_state & SQS_PROC)) { - squeue_worker_wakeup(sqp); - return; - } /* - * In case any control actions are pending, wake - * up the worker. + * If the worker isn't running or control actions are pending, + * wake it it up now. */ - if (sqp->sq_state & SQS_WORKER_THR_CONTROL) - cv_signal(&sqp->sq_worker_cv); - mutex_exit(&sqp->sq_lock); - return; + if ((sqp->sq_state & SQS_PROC) == 0 || + (sqp->sq_state & SQS_WORKER_THR_CONTROL) != 0) { + squeue_worker_wakeup(sqp); + } } + mutex_exit(&sqp->sq_lock); } /* * PRIVATE FUNCTIONS */ + +/* + * Wake up worker thread for squeue to process queued work. + */ static void -squeue_fire(void *arg) +squeue_worker_wakeup(squeue_t *sqp) { - squeue_t *sqp = arg; - uint_t state; - - mutex_enter(&sqp->sq_lock); - - state = sqp->sq_state; - if (sqp->sq_tid == 0 && !(state & SQS_TMO_PROG)) { - mutex_exit(&sqp->sq_lock); - return; - } - - sqp->sq_tid = 0; - /* - * The timeout fired before we got a chance to set it. - * Process it anyway but remove the SQS_TMO_PROG so that - * the guy trying to set the timeout knows that it has - * already been processed. 
- */ - if (state & SQS_TMO_PROG) - sqp->sq_state &= ~SQS_TMO_PROG; + ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); - if (!(state & SQS_PROC)) { - sqp->sq_awaken = ddi_get_lbolt(); - cv_signal(&sqp->sq_worker_cv); - } - mutex_exit(&sqp->sq_lock); + cv_signal(&sqp->sq_worker_cv); + sqp->sq_awoken = gethrtime(); } static void @@ -714,10 +641,8 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) mblk_t *head; sqproc_t proc; conn_t *connp; - timeout_id_t tid; ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring; hrtime_t now; - boolean_t did_wakeup = B_FALSE; boolean_t sq_poll_capable; ip_recv_attr_t *ira, iras; @@ -729,8 +654,7 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() - (uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) { ASSERT(mutex_owned(&sqp->sq_lock)); - sqp->sq_awaken = ddi_get_lbolt(); - cv_signal(&sqp->sq_worker_cv); + squeue_worker_wakeup(sqp); squeue_drain_stack_toodeep++; return; } @@ -746,9 +670,6 @@ again: sqp->sq_last = NULL; sqp->sq_count = 0; - if ((tid = sqp->sq_tid) != 0) - sqp->sq_tid = 0; - sqp->sq_state |= SQS_PROC | proc_type; /* @@ -765,9 +686,6 @@ again: SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring); mutex_exit(&sqp->sq_lock); - if (tid != 0) - (void) untimeout(tid); - while ((mp = head) != NULL) { head = mp->b_next; @@ -779,7 +697,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -804,20 +722,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -864,11 +787,9 @@ again: if (proc_type == SQS_WORKER) SQS_POLL_RING(sqp); goto again; - } else { - did_wakeup = B_TRUE; - sqp->sq_awaken = ddi_get_lbolt(); - cv_signal(&sqp->sq_worker_cv); } + + squeue_worker_wakeup(sqp); } /* @@ -927,17 +848,14 @@ again: SQS_POLL_QUIESCE_DONE))); SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring); sqp->sq_state &= ~(SQS_PROC | proc_type); - if (!did_wakeup && sqp->sq_first != NULL) { - squeue_worker_wakeup(sqp); - mutex_enter(&sqp->sq_lock); - } /* * If we are not the worker and there is a pending quiesce * event, wake up the worker */ if ((proc_type != SQS_WORKER) && - (sqp->sq_state & SQS_WORKER_THR_CONTROL)) - cv_signal(&sqp->sq_worker_cv); + (sqp->sq_state & SQS_WORKER_THR_CONTROL)) { + squeue_worker_wakeup(sqp); + } } } @@ -1051,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -1076,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1137,7 +1063,6 @@ poll_again: */ } - sqp->sq_awaken = ddi_get_lbolt(); /* * Put the SQS_PROC_HELD on so the worker * thread can distinguish where its called from. We @@ -1153,7 +1078,7 @@ poll_again: */ sqp->sq_state |= SQS_PROC_HELD; sqp->sq_state &= ~SQS_GET_PKTS; - cv_signal(&sqp->sq_worker_cv); + squeue_worker_wakeup(sqp); } else if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_WORKER)) { /* @@ -1173,8 +1098,9 @@ poll_again: * wake up the worker, since it is currently * not running. */ - if (sqp->sq_state & SQS_WORKER_THR_CONTROL) - cv_signal(&sqp->sq_worker_cv); + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) { + squeue_worker_wakeup(sqp); + } } else { /* * Worker thread is already running. We don't need @@ -1205,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1316,6 +1243,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
@@ -1412,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1483,36 +1416,109 @@ again: } } -void -squeue_synch_exit(conn_t *connp) +/* + * If possible, attempt to immediately process a single queued request, should + * it match the supplied conn_t reference. This is primarily intended to elide + * squeue worker thread wake-ups during local TCP connect() or close() + * operations where the response is placed on the squeue during processing. + */ +static void +squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) { - squeue_t *sqp = connp->conn_sqp; + mblk_t *next, *mp = sqp->sq_first; + conn_t *connp; + sqproc_t proc = (sqproc_t)mp->b_queue; + ip_recv_attr_t iras, *ira = NULL; - mutex_enter(&sqp->sq_lock); - if (sqp->sq_run == curthread) { - ASSERT(sqp->sq_state & SQS_PROC); + ASSERT(MUTEX_HELD(&sqp->sq_lock)); + ASSERT((sqp->sq_state & SQS_PROC) == 0); + ASSERT(sqp->sq_run == NULL); + ASSERT(sqp->sq_isip); + VERIFY(mp != NULL); - sqp->sq_state &= ~SQS_PROC; - sqp->sq_run = NULL; - connp->conn_on_sqp = B_FALSE; + /* + * There is no guarantee that compare_conn references a valid object at + * this time, so under no circumstance may it be deferenced unless it + * matches the squeue entry. + */ + connp = (conn_t *)mp->b_prev; + if (connp != compare_conn) { + return; + } - if (sqp->sq_first == NULL) { - mutex_exit(&sqp->sq_lock); - } else { - /* - * If this was a normal thread, then it would - * (most likely) continue processing the pending - * requests. Since the just completed operation - * was executed synchronously, the thread should - * not be delayed. To compensate, wake up the - * worker thread right away when there are outstanding - * requests. 
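Taken together with the new flag argument to squeue_synch_exit() below, the intended calling pattern is roughly the following (sketch; the conn_t and the work performed are placeholders):

/* Sketch: a synchronous TCP connect()/close() path whose response may be
 * queued on its own squeue can process that one entry inline rather than
 * waking the worker thread for it. */
squeue_synch_enter(connp, NULL);
/* ... perform the operation; a matching response may be enqueued ... */
squeue_synch_exit(connp, SQ_PROCESS);	/* drain one matching item, if any */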
- */ - sqp->sq_awaken = ddi_get_lbolt(); - cv_signal(&sqp->sq_worker_cv); - mutex_exit(&sqp->sq_lock); - } + next = mp->b_next; + proc = (sqproc_t)mp->b_queue; + + ASSERT(proc != NULL); + ASSERT(sqp->sq_count > 0); + + /* Dequeue item from squeue */ + if (next == NULL) { + sqp->sq_first = NULL; + sqp->sq_last = NULL; } else { + sqp->sq_first = next; + } + sqp->sq_count--; + + sqp->sq_state |= SQS_PROC; + sqp->sq_run = curthread; + mutex_exit(&sqp->sq_lock); + + /* Prep mblk_t and retrieve ira if needed */ + mp->b_prev = NULL; + mp->b_queue = NULL; + mp->b_next = NULL; + if (ip_recv_attr_is_mblk(mp)) { + mblk_t *attrmp = mp; + + ASSERT(attrmp->b_cont != NULL); + + mp = attrmp->b_cont; + attrmp->b_cont = NULL; + + ASSERT(mp->b_queue == NULL); + ASSERT(mp->b_prev == NULL); + + if (!ip_recv_attr_from_mblk(attrmp, &iras)) { + /* ill_t or ip_stack_t disappeared */ + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); + ira_cleanup(&iras, B_TRUE); + CONN_DEC_REF(connp); + goto done; + } + ira = &iras; + } + + SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, + conn_t *, connp); + (*proc)(connp, mp, sqp, ira); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + SQUEUE_DBG_CLEAR(sqp); + + if (ira != NULL) + ira_cleanup(ira, B_TRUE); + +done: + mutex_enter(&sqp->sq_lock); + sqp->sq_state &= ~(SQS_PROC); + sqp->sq_run = NULL; +} + +void +squeue_synch_exit(conn_t *connp, int flag) +{ + squeue_t *sqp = connp->conn_sqp; + + VERIFY(sqp->sq_isip == B_TRUE); + ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS); + + mutex_enter(&sqp->sq_lock); + if (sqp->sq_run != curthread) { /* * The caller doesn't own the squeue, clear the SQS_PAUSE flag, * and wake up the squeue owner, such that owner can continue @@ -1524,5 +1530,23 @@ squeue_synch_exit(conn_t *connp) /* There should be only one thread blocking on sq_synch_cv. */ cv_signal(&sqp->sq_synch_cv); mutex_exit(&sqp->sq_lock); + return; } + + ASSERT(sqp->sq_state & SQS_PROC); + + sqp->sq_state &= ~SQS_PROC; + sqp->sq_run = NULL; + connp->conn_on_sqp = B_FALSE; + + /* If the caller opted in, attempt to process the head squeue item. */ + if (flag == SQ_PROCESS && sqp->sq_first != NULL) { + squeue_try_drain_one(sqp, connp); + } + + /* Wake up the worker if further requests are pending. */ + if (sqp->sq_first != NULL) { + squeue_worker_wakeup(sqp); + } + mutex_exit(&sqp->sq_lock); } diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index b2b9973291..68404716b9 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,9 +20,9 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -134,6 +134,7 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; +struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -177,16 +178,11 @@ typedef struct tcp_s { mblk_t *tcp_xmit_tail; /* Last data sent */ uint32_t tcp_unsent; /* # of bytes in hand that are unsent */ uint32_t tcp_xmit_tail_unsent; /* # of unsent bytes in xmit_tail */ - uint32_t tcp_suna; /* Sender unacknowledged */ uint32_t tcp_rexmit_nxt; /* Next rexmit seq num */ uint32_t tcp_rexmit_max; /* Max retran seq num */ uint32_t tcp_cwnd; /* Congestion window */ int32_t tcp_cwnd_cnt; /* cwnd cnt in congestion avoidance */ - - uint32_t tcp_ibsegs; /* Inbound segments on this stream */ - uint32_t tcp_obsegs; /* Outbound segments on this stream */ - uint32_t tcp_naglim; /* Tunable nagle limit */ uint32_t tcp_valid_bits; #define TCP_ISS_VALID 0x1 /* Is the tcp_iss seq num active? */ @@ -194,8 +190,6 @@ typedef struct tcp_s { #define TCP_URG_VALID 0x4 /* Is the tcp_urg seq num active? */ #define TCP_OFO_FIN_VALID 0x8 /* Has TCP received an out of order FIN? */ - - timeout_id_t tcp_timer_tid; /* Control block for timer service */ uchar_t tcp_timer_backoff; /* Backoff shift count. */ int64_t tcp_last_recv_time; /* Last time we receive a segment. */ @@ -282,9 +276,11 @@ typedef struct tcp_s { uint32_t tcp_cwnd_max; uint32_t tcp_csuna; /* Clear (no rexmits in window) suna */ - clock_t tcp_rtt_sa; /* Round trip smoothed average */ - clock_t tcp_rtt_sd; /* Round trip smoothed deviation */ - clock_t tcp_rtt_update; /* Round trip update(s) */ + hrtime_t tcp_rtt_sum; /* Round trip sum */ + uint32_t tcp_rtt_cnt; /* Round trip count (non_dup ACKs) */ + hrtime_t tcp_rtt_sa; /* Round trip smoothed average */ + hrtime_t tcp_rtt_sd; /* Round trip smoothed deviation */ + uint32_t tcp_rtt_update; /* Round trip update(s) */ clock_t tcp_ms_we_have_waited; /* Total retrans time */ uint32_t tcp_swl1; /* These help us avoid using stale */ @@ -404,6 +400,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; + /* + * Group of tcp_t entries bound to the same adress and port via + * SO_REUSEPORT. The pointer itself is protected by tf_lock in the + * containing tcps_bind_fanout slot. + */ + struct tcp_rg_s *tcp_rg_bind; + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ @@ -493,6 +496,8 @@ typedef struct tcp_s { /* FIN-WAIT-2 flush timeout */ uint32_t tcp_fin_wait_2_flush_interval; + tcp_conn_stats_t tcp_cs; + #ifdef DEBUG pc_t tcmp_stk[15]; #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index d340aff2a5..ba66be0b2b 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -21,9 +21,9 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013,2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -266,8 +266,6 @@ typedef struct tcpt_s { /* * Functions called directly via squeue having a prototype of edesc_t. 
*/ -void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *ira); void tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira); static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, @@ -640,15 +638,9 @@ tcp_set_destination(tcp_t *tcp) tcp->tcp_localnet = uinfo.iulp_localnet; if (uinfo.iulp_rtt != 0) { - clock_t rto; - - tcp->tcp_rtt_sa = uinfo.iulp_rtt; - tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5); - - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt); + tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd); + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); } if (uinfo.iulp_ssthresh != 0) tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; @@ -967,8 +959,7 @@ void tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -996,7 +987,7 @@ tcp_stop_lingering(tcp_t *tcp) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcps, tcp_detach_time_wait); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait); goto finish; } @@ -1239,11 +1230,6 @@ tcp_closei_local(tcp_t *tcp) if (!TCP_IS_SOCKET(tcp)) tcp_acceptor_hash_remove(tcp); - TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - /* * This can be called via tcp_time_wait_processing() if TCP gets a * SYN with sequence number outside the TIME-WAIT connection's @@ -1423,6 +1409,21 @@ tcp_free(tcp_t *tcp) tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); /* + * Destroy any association with SO_REUSEPORT group. + */ + if (tcp->tcp_rg_bind != NULL) { + /* + * This is only necessary for connections which enabled + * SO_REUSEPORT but were never bound. Such connections should + * be the one and only member of the tcp_rg_tp to which they + * have been associated. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = NULL; + } + + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is @@ -1912,15 +1913,6 @@ tcp_reinit(tcp_t *tcp) /* Cancel outstanding timers */ tcp_timers_stop(tcp); - /* - * Reset everything in the state vector, after updating global - * MIB data from instance counters. 
- */ - TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - tcp_close_mpp(&tcp->tcp_xmit_head); if (tcp->tcp_snd_zcopy_aware) tcp_zcopy_notify(tcp); @@ -2092,9 +2084,6 @@ tcp_reinit_values(tcp_t *tcp) tcp->tcp_swnd = 0; DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ - ASSERT(tcp->tcp_ibsegs == 0); - ASSERT(tcp->tcp_obsegs == 0); - if (connp->conn_ht_iphc != NULL) { kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); connp->conn_ht_iphc = NULL; @@ -2186,6 +2175,8 @@ tcp_reinit_values(tcp_t *tcp) DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ tcp->tcp_rtt_update = 0; + tcp->tcp_rtt_sum = 0; + tcp->tcp_rtt_cnt = 0; DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ @@ -2334,7 +2325,6 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; - clock_t rto; ASSERT((connp->conn_family == AF_INET && connp->conn_ipversion == IPV4_VERSION) || @@ -2403,12 +2393,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) * during first few transmissions of a connection as seen in slow * links. */ - tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; - tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + - tcps->tcps_conn_grace_period; - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; + tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, + tcps->tcps_conn_grace_period); tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; @@ -2455,8 +2443,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) * Path MTU might have changed by either increase or decrease, so need to * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny * or negative MSS, since tcp_mss_set() will do it. + * + * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE. */ -void +boolean_t tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { uint32_t pmtu; @@ -2466,10 +2456,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) iaflags_t ixaflags; if (tcp->tcp_tcps->tcps_ignore_path_mtu) - return; + return (B_FALSE); if (tcp->tcp_state < TCPS_ESTABLISHED) - return; + return (B_FALSE); /* * Always call ip_get_pmtu() to make sure that IP has updated @@ -2489,13 +2479,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) * Nothing to change, so just return. */ if (mss == tcp->tcp_mss) - return; + return (B_FALSE); /* * Currently, for ICMP errors, only PMTU decrease is handled. 
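The removed call sites above show the formula that used to be computed inline; a sketch of what the new tcp_calculate_rto() helper presumably encapsulates (an assumption, since the helper body is not part of these hunks), now fed by nanosecond-resolution tcp_rtt_sa/tcp_rtt_sd:

/* Sketch reconstructing the removed RTO arithmetic; clamping to the stack's
 * min/max retransmit intervals is assumed behaviour. */
static uint32_t
sketch_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
{
	uint64_t rto;

	/* rto = (sa >> 3) + sd + (sa >> 5), converted to ms, plus extras */
	rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
	    (tcp->tcp_rtt_sa >> 5)) + tcps->tcps_rexmit_interval_extra + extra;

	if (rto > tcps->tcps_rexmit_interval_max)
		rto = tcps->tcps_rexmit_interval_max;
	else if (rto < tcps->tcps_rexmit_interval_min)
		rto = tcps->tcps_rexmit_interval_min;
	return ((uint32_t)rto);
}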
*/ if (mss > tcp->tcp_mss && decrease_only) - return; + return (B_FALSE); DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); @@ -2530,6 +2520,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } ixa->ixa_flags = ixaflags; + return (B_TRUE); } int @@ -3400,7 +3391,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, tcp_update_lso(tcp, connp->conn_ixa); break; case IXAN_PMTU: - tcp_update_pmtu(tcp, B_FALSE); + (void) tcp_update_pmtu(tcp, B_FALSE); break; case IXAN_ZCOPY: tcp_update_zcopy(tcp); @@ -3731,7 +3722,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) { tcp_stack_t *tcps; int i; - int error = 0; major_t major; size_t arrsz; @@ -3795,8 +3785,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_mibkp = tcp_kstat_init(stackid); major = mod_name_to_major(INET_NAME); - error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); - ASSERT(error == 0); + VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident)); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 72093af2f2..ec2a5d4e29 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. Each hash bucket @@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -637,13 +649,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. 
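Since tcp_bindi() is where the new SO_REUSEPORT semantics take effect, a short userland illustration (not from this change) of what the rules below permit: two sockets owned by the same user in the same zone may bind the identical address and port, provided both set the option before bind(3SOCKET).

/* Sketch: a listener that can share its address/port via SO_REUSEPORT. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

static int
reuseport_listener(in_port_t port)
{
	struct sockaddr_in sin;
	int on = 1;
	int s = socket(AF_INET, SOCK_STREAM, 0);

	if (s < 0)
		return (-1);
	/* Both sharers must enable SO_REUSEPORT before bind(). */
	if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on)) != 0)
		goto fail;
	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (bind(s, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
	    listen(s, 128) != 0)
		goto fail;
	return (s);
fail:
	(void) close(s);
	return (-1);
}

Calling reuseport_listener() twice with the same port under the same credential yields two working listeners; the kernel joins them into a single reuseport group, as implemented below.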
@@ -663,6 +674,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -699,6 +711,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -725,6 +738,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -830,22 +844,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too had SO_REUSEPORT enabled + * when it was bound. + */ + attempt_reuse = (ltcp->tcp_rg_bind != NULL); + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -860,27 +887,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + struct tcp_rg_s *rg; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. 
+ */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } + /* * This port is ours. Insert in fanout and mark as * bound to prevent others from getting the port @@ -945,3 +994,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 6acc02d769..e73c34de34 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ 
b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -645,14 +646,16 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; TCPS_BUMP_MIB(tcps, tcpOutDataSegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size); + tcp->tcp_cs.tcp_out_data_bytes += send_size; + tcp->tcp_cs.tcp_out_data_segs++; TCPS_BUMP_MIB(tcps, tcpHCInSegs); TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size); - - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_LOCAL(peer_tcp->tcp_ibsegs); + peer_tcp->tcp_cs.tcp_in_data_inorder_bytes += send_size; + peer_tcp->tcp_cs.tcp_in_data_inorder_segs++; DTRACE_TCP5(send, void, NULL, ip_xmit_attr_t *, connp->conn_ixa, __dtrace_tcp_void_ip_t *, NULL, tcp_t *, tcp, diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index e917f7c774..11b40e7280 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ /* This file contains all TCP input processing functions. */ @@ -166,7 +166,7 @@ static void tcp_process_options(tcp_t *, tcpha_t *); static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); -static void tcp_set_rto(tcp_t *, time_t); +static void tcp_set_rto(tcp_t *, hrtime_t); static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); /* @@ -559,7 +559,7 @@ tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) static mblk_t * tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) { - uint32_t end; + uint32_t end, bytes; mblk_t *mp1; mblk_t *mp2; mblk_t *next_mp; @@ -578,26 +578,26 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) freeb(mp); continue; } + bytes = end - start; mp->b_cont = NULL; TCP_REASS_SET_SEQ(mp, start); TCP_REASS_SET_END(mp, end); mp1 = tcp->tcp_reass_tail; - if (!mp1) { - tcp->tcp_reass_tail = mp; - tcp->tcp_reass_head = mp; - TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); - TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, - end - start); - continue; - } - /* New stuff completely beyond tail? */ - if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { - /* Link it on end. */ - mp1->b_cont = mp; + if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) { + if (mp1 != NULL) { + /* + * New stuff is beyond the tail; link it on the + * end. 
+ */ + mp1->b_cont = mp; + } else { + tcp->tcp_reass_head = mp; + } tcp->tcp_reass_tail = mp; TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); - TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, - end - start); + TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes); + tcp->tcp_cs.tcp_in_data_unorder_segs++; + tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes; continue; } mp1 = tcp->tcp_reass_head; @@ -2414,7 +2414,7 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) flags = (unsigned int)tcpha->tha_flags & 0xFF; - BUMP_LOCAL(tcp->tcp_ibsegs); + TCPS_BUMP_MIB(tcps, tcpHCInSegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); if ((flags & TH_URG) && sqp != NULL) { @@ -2659,7 +2659,7 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) tcp->tcp_ack_tid = 0; } tcp_send_data(tcp, ack_mp); - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutAck); if (!IPCL_IS_NONSTR(connp)) { @@ -3048,6 +3048,7 @@ try_again:; if (tcp->tcp_rwnd == 0) { TCPS_BUMP_MIB(tcps, tcpInWinProbe); + tcp->tcp_cs.tcp_in_zwnd_probes++; } else { TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); @@ -3297,6 +3298,9 @@ ok:; } else if (seg_len > 0) { TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); + tcp->tcp_cs.tcp_in_data_inorder_segs++; + tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len; + /* * If an out of order FIN was received before, and the seq * num and len of the new segment match that of the FIN, @@ -3362,7 +3366,7 @@ ok:; * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent * byte was at seg_seq - 1, in which case we ignore the urgent flag. */ - if (flags & TH_URG && urp >= 0) { + if ((flags & TH_URG) && urp >= 0) { if (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { /* @@ -4146,7 +4150,7 @@ process_ack: } mp = tcp_ack_mp(tcp); if (mp != NULL) { - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutAck); tcp_send_data(tcp, mp); } @@ -4304,36 +4308,29 @@ process_ack: SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID; - /* Can we update the RTT estimates? */ - if (tcp->tcp_snd_ts_ok) { - /* Ignore zero timestamp echo-reply. */ - if (tcpopt.tcp_opt_ts_ecr != 0) { - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)tcpopt.tcp_opt_ts_ecr); - } - - /* If needed, restart the timer. */ - if (tcp->tcp_set_timer == 1) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_set_timer = 0; - } - /* - * Update tcp_csuna in case the other side stops sending - * us timestamps. - */ - tcp->tcp_csuna = tcp->tcp_snxt; - } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { + /* + * Update the RTT estimates. Note that we don't use the TCP + * timestamp option to calculate RTT even if one is present. This is + * because the timestamp option's resolution (CPU tick) is + * too coarse to measure modern datacenter networks' microsecond + * latencies. The timestamp field's resolution is limited by its + * 4-byte width (see RFC1323), and since we always store a + * high-resolution nanosecond presision timestamp along with the data, + * there is no point to ever using the timestamp option. + */ + if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { /* * An ACK sequence we haven't seen before, so get the RTT * and update the RTO. But first check if the timestamp is * valid to use. 
*/ if ((mp1->b_next != NULL) && - SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)(intptr_t)mp1->b_prev); - else + SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { + tcp_set_rto(tcp, gethrtime() - + (hrtime_t)(intptr_t)mp1->b_prev); + } else { TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); + } /* Remeber the last sequence to be ACKed */ tcp->tcp_csuna = seg_ack; @@ -4362,7 +4359,7 @@ process_ack: if (SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { mp1->b_prev = - (mblk_t *)(uintptr_t)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); mp1->b_next = NULL; } break; @@ -4839,11 +4836,13 @@ xmit_check: if (mp1 != NULL) { tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, snd_size); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += snd_size; tcp_send_data(tcp, mp1); } } @@ -4873,9 +4872,10 @@ xmit_check: * timer is used to avoid a timeout before the * limited transmitted segment's ACK gets back. */ - if (tcp->tcp_xmit_head != NULL) + if (tcp->tcp_xmit_head != NULL) { tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); + } } /* Anything more to do? */ @@ -4918,7 +4918,7 @@ ack_check: if (mp1 != NULL) { tcp_send_data(tcp, mp1); - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutAck); } if (tcp->tcp_ack_tid != 0) { @@ -5211,38 +5211,53 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, return (mp); } -/* The minimum of smoothed mean deviation in RTO calculation. */ -#define TCP_SD_MIN 400 +/* The minimum of smoothed mean deviation in RTO calculation (nsec). */ +#define TCP_SD_MIN 400000000 /* - * Set RTO for this connection. The formula is from Jacobson and Karels' - * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names - * are the same as those in Appendix A.2 of that paper. + * Set RTO for this connection based on a new round-trip time measurement. + * The formula is from Jacobson and Karels' "Congestion Avoidance and Control" + * in SIGCOMM '88. The variable names are the same as those in Appendix A.2 + * of that paper. * * m = new measurement * sa = smoothed RTT average (8 * average estimates). * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). */ static void -tcp_set_rto(tcp_t *tcp, clock_t rtt) +tcp_set_rto(tcp_t *tcp, hrtime_t rtt) { - long m = TICK_TO_MSEC(rtt); - clock_t sa = tcp->tcp_rtt_sa; - clock_t sv = tcp->tcp_rtt_sd; - clock_t rto; - tcp_stack_t *tcps = tcp->tcp_tcps; + hrtime_t m = rtt; + hrtime_t sa = tcp->tcp_rtt_sa; + hrtime_t sv = tcp->tcp_rtt_sd; + tcp_stack_t *tcps = tcp->tcp_tcps; TCPS_BUMP_MIB(tcps, tcpRttUpdate); tcp->tcp_rtt_update++; + tcp->tcp_rtt_sum += m; + tcp->tcp_rtt_cnt++; /* tcp_rtt_sa is not 0 means this is a new sample. 
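Editor's note: the hunks above replace lbolt-tick timestamps with gethrtime() values smuggled through the transmit mblk's pointer fields (b_prev carries the send time, b_next the sequence number), and the RTT sample is simply gethrtime() minus the stashed value. The stand-alone sketch below shows just that cast round-trip, using a hypothetical struct in place of mblk_t and clock_gettime() in place of gethrtime(); it assumes an LP64 environment so the 64-bit nanosecond value survives the pointer cast.

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Hypothetical stand-in for an mblk_t with a spare pointer field. */
    struct msg {
        void *b_prev;   /* transmit timestamp, smuggled as a pointer */
    };

    static uint64_t
    now_ns(void)
    {
        struct timespec ts;

        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
    }

    int
    main(void)
    {
        struct msg m;
        uint64_t rtt;

        /* On transmit: stash the high-resolution send time. */
        m.b_prev = (void *)(uintptr_t)now_ns();

        /* ... later, when the segment is acknowledged ... */
        rtt = now_ns() - (uint64_t)(uintptr_t)m.b_prev;
        (void) printf("measured rtt: %llu ns\n", (unsigned long long)rtt);
        return (0);
    }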
*/ if (sa != 0) { /* - * Update average estimator: - * new rtt = 7/8 old rtt + 1/8 Error + * Update average estimator (see section 2.3 of RFC6298): + * SRTT = 7/8 SRTT + 1/8 rtt + * + * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to: + * tcp_rtt_sa = 7 * SRTT + rtt + * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt + * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8)) + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3)) + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3)) + * + * (rtt - tcp_rtt_sa / 8) is simply the difference + * between the new rtt measurement and the existing smoothed + * RTT average. This is referred to as "Error" in subsequent + * calculations. */ - /* m is now Error in estimate. */ + /* m is now Error. */ m -= sa >> 3; if ((sa += m) <= 0) { /* @@ -5255,7 +5270,13 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) /* * Update deviation estimator: - * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) + * mdev = 3/4 mdev + 1/4 abs(Error) + * + * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to: + * tcp_rtt_sd = 3 * mdev + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error) */ if (m < 0) m = -m; @@ -5275,33 +5296,21 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) } if (sv < TCP_SD_MIN) { /* - * We do not know that if sa captures the delay ACK - * effect as in a long train of segments, a receiver - * does not delay its ACKs. So set the minimum of sv - * to be TCP_SD_MIN, which is default to 400 ms, twice - * of BSD DATO. That means the minimum of mean + * Since a receiver doesn't delay its ACKs during a long run of + * segments, sa may not have captured the effect of delayed ACK + * timeouts on the RTT. To make sure we always account for the + * possible delay (and avoid the unnecessary retransmission), + * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of + * 200ms on older SunOS/BSD systems and modern Windows systems + * (as of 2019). This means that the minimum possible mean * deviation is 100 ms. - * */ sv = TCP_SD_MIN; } tcp->tcp_rtt_sa = sa; tcp->tcp_rtt_sd = sv; - /* - * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) - * - * Add tcp_rexmit_interval extra in case of extreme environment - * where the algorithm fails to work. The default value of - * tcp_rexmit_interval_extra should be 0. - * - * As we use a finer grained clock than BSD and update - * RTO for every ACKs, add in another .25 of RTT to the - * deviation of RTO to accomodate burstiness of 1/4 of - * window size. - */ - rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); - TCP_SET_RTO(tcp, rto); + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); /* Now, we can reset tcp_timer_backoff to use the new RTO... */ tcp->tcp_timer_backoff = 0; @@ -5563,10 +5572,12 @@ noticmpv4: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the + * connection is altered, retransmit outstanding data. 
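Editor's note: the estimator update above is easier to follow pulled out of the diff. The sketch below mirrors the Jacobson/Karels arithmetic with the same scaling (sa holds 8 x SRTT, sv holds 4 x RTTVAR, both in nanoseconds). The first-sample branch is elided from the hunk above, so the version here simply assumes RFC 6298's SRTT = R, RTTVAR = R/2 seeding; treat it as an illustration, not the exact kernel code.

    #include <stdint.h>

    #define RTT_SD_MIN  400000000LL   /* 400 ms floor, as TCP_SD_MIN above */

    /* sa: 8 * SRTT, sv: 4 * RTTVAR, m: new measurement; all nanoseconds. */
    static void
    rtt_update(int64_t m, int64_t *sa, int64_t *sv)
    {
        if (*sa != 0) {
            m -= *sa >> 3;          /* m is now the error term */
            if ((*sa += m) <= 0)    /* sa = 7/8 sa + 1/8 rtt */
                *sa = 1;            /* keep the average positive */
            if (m < 0)
                m = -m;
            m -= *sv >> 2;
            *sv += m;               /* sv = 3/4 sv + 1/4 |error| */
        } else {
            /* First sample (assumed seeding, not shown in the hunk above). */
            *sa = m << 3;           /* SRTT = m */
            *sv = m << 1;           /* RTTVAR = m / 2 */
        }
        if (*sv < RTT_SD_MIN)
            *sv = RTT_SD_MIN;
    }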
*/ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -5609,7 +5620,7 @@ noticmpv4: break; } break; - case ICMP_SOURCE_QUENCH: { + case ICMP_SOURCE_QUENCH: /* * use a global boolean to control * whether TCP should respond to ICMP_SOURCE_QUENCH. @@ -5630,7 +5641,6 @@ noticmpv4: } break; } - } freemsg(mp); } @@ -5683,10 +5693,12 @@ noticmpv6: switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the connection + * is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 40148b416a..4774412992 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -62,7 +64,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -484,6 +487,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. + * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + */ +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. 
+ */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. + */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. 
*/ @@ -653,6 +754,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -869,9 +975,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_cork = onoff; } break; - case TCP_RTO_INITIAL: { - clock_t rto; - + case TCP_RTO_INITIAL: if (checkonly || val == 0) break; @@ -901,15 +1005,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (tcp->tcp_state >= TCPS_SYN_SENT) break; - tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; - tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5) + - tcps->tcps_conn_grace_period; - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; + tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, + tcps->tcps_conn_grace_period); break; - } case TCP_RTO_MIN: if (checkonly || val == 0) break; @@ -976,10 +1076,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c index 60840a3d54..f54ab3fb33 100644 --- a/usr/src/uts/common/inet/tcp/tcp_output.c +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -21,7 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* This file contains all TCP output processing functions. */ @@ -58,12 +59,12 @@ static void tcp_wput_flush(tcp_t *, mblk_t *); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); static int tcp_xmit_end(tcp_t *); static int tcp_send(tcp_t *, const int, const int, const int, - const int, int *, uint_t *, int *, mblk_t **, mblk_t *); + const int, int *, uint32_t *, int *, mblk_t **, mblk_t *); static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, int, ip_recv_attr_t *, ip_stack_t *, conn_t *); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); -static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int); +static void tcp_fill_header(tcp_t *, uchar_t *, int); /* * Functions called directly via squeue having a prototype of edesc_t. @@ -454,7 +455,7 @@ data_null: } } - local_time = (mblk_t *)now; + local_time = (mblk_t *)(intptr_t)gethrtime(); /* * "Our" Nagle Algorithm. This is not the same as in the old @@ -1183,12 +1184,13 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) snxt = tcp->tcp_snxt; /* - * Check to see if this connection has been idled for some - * time and no ACK is expected. If it is, we need to slow - * start again to get back the connection's "self-clock" as - * described in VJ's paper. + * Check to see if this connection has been idle for some time and no + * ACK is expected. If so, then the congestion window size is no longer + * meaningfully tied to current network conditions. * - * Reinitialize tcp_cwnd after idle. 
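Editor's note: the TCP_RTO_INITIAL handling above seeds the scaled estimators directly from the requested value (tcp_rtt_sa = 4 x initial, tcp_rtt_sd = initial / 2, converted to nanoseconds), so the first computed RTO works out to 1/2 + 1/8 + 1/2 = 1.125 times the requested value, before the grace period is added. The small self-contained check below reproduces that arithmetic; 1000 ms is purely an example input, not a claim about the default.

    #include <stdio.h>
    #include <stdint.h>

    #define MSEC2NSEC(m)  ((int64_t)(m) * 1000000LL)
    #define NSEC2MSEC(n)  ((n) / 1000000LL)

    int
    main(void)
    {
        int64_t rto_initial_ms = 1000;  /* example input, not a default */
        int64_t sa = MSEC2NSEC(rto_initial_ms) << 2;   /* 8 * SRTT */
        int64_t sd = MSEC2NSEC(rto_initial_ms) >> 1;   /* 4 * RTTVAR */
        int64_t rto_ms = NSEC2MSEC((sa >> 3) + (sa >> 5) + sd);

        /* Prints 1125: 1/2 + 1/8 + 1/2 of the requested initial RTO. */
        (void) printf("seeded RTO = %lld ms (plus grace period)\n",
            (long long)rto_ms);
        return (0);
    }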
+ * We reinitialize tcp_cwnd, and slow start again to get back the + * connection's "self-clock" as described in Van Jacobson's 1988 paper + * "Congestion avoidance and control". */ now = LBOLT_FASTPATH; if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && @@ -1256,7 +1258,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) if ((mp1 = dupb(mp)) == 0) goto no_memory; - mp->b_prev = (mblk_t *)(uintptr_t)now; + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ @@ -1271,7 +1273,9 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); + tcp->tcp_cs.tcp_out_data_segs++; + tcp->tcp_cs.tcp_out_data_bytes += len; /* Update the latest receive window size in TCP header. */ tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); @@ -1311,12 +1315,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) /* Fill in the timestamp option. */ if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(now, + (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } @@ -1771,7 +1773,7 @@ tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) static int tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, const int tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) + uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) { int num_lso_seg = 1; uint_t lso_usable; @@ -1960,16 +1962,21 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, } *snxt += len; *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + tcp->tcp_cs.tcp_out_data_segs++; + tcp->tcp_cs.tcp_out_data_bytes += len; tcp_send_data(tcp, mp); continue; } *snxt += len; /* Adjust later if we don't send all of len */ + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutDataSegs); TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + tcp->tcp_cs.tcp_out_data_segs++; + tcp->tcp_cs.tcp_out_data_bytes += len; if (*tail_unsent) { /* Are the bytes above us in flight? */ @@ -2066,7 +2073,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, * Fill in the header using the template header, and add * options such as time-stamp, ECN and/or SACK, as needed. 
*/ - tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); + tcp_fill_header(tcp, rptr, num_sack_blk); mp->b_rptr = rptr; @@ -2145,6 +2152,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, *snxt += spill; tcp->tcp_last_sent_len += spill; TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill); + tcp->tcp_cs.tcp_out_data_bytes += spill; /* * Adjust the checksum */ @@ -2193,7 +2201,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, */ ixa->ixa_fragsize = ixa->ixa_pmtu; ixa->ixa_extra_ident = 0; - tcp->tcp_obsegs += num_lso_seg; + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { @@ -2204,7 +2212,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, */ lso_info_cleanup(mp); tcp_send_data(tcp, mp); - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); } } @@ -2284,8 +2292,8 @@ tcp_xmit_end(tcp_t *tcp) * So don't do any update. */ bzero(&uinfo, sizeof (uinfo)); - uinfo.iulp_rtt = tcp->tcp_rtt_sa; - uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa); + uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd); /* * Note that uinfo is kept for conn_faddr in the DCE. Could update even @@ -2420,7 +2428,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) tcp->tcp_rack_cnt = 0; TCPS_BUMP_MIB(tcps, tcpOutAck); } - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); tcpha->tha_seq = htonl(seq); tcpha->tha_ack = htonl(ack); /* @@ -3389,11 +3397,13 @@ tcp_sack_rexmit(tcp_t *tcp, uint_t *flags) /* * Update the send timestamp to avoid false retransmission. */ - snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += seg_len; /* * Update tcp_rexmit_max to extend this SACK recovery phase. * This happens when new data sent during fast recovery is @@ -3461,9 +3471,11 @@ tcp_ss_rexmit(tcp_t *tcp) * Update the send timestamp to avoid false * retransmission. */ - old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += cnt; tcp->tcp_rexmit_nxt = snxt; } @@ -3621,7 +3633,7 @@ tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) * ECN and/or SACK. */ static void -tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) +tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk) { tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; @@ -3643,7 +3655,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { - U32_TO_BE32((uint32_t)now, + U32_TO_BE32(LBOLT_FASTPATH, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index a431bf63d1..2de76ea060 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. 
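Editor's note: the U32_TO_BE32() stores at TCP_MIN_HEADER_LENGTH + 4 and + 8 in the hunks above presuppose the usual template layout for the timestamp option: the 20-byte base header followed by NOP, NOP, kind 8, length 10, then the 4-byte TSval and TSecr fields (RFC 7323). That layout is built elsewhere when the option is negotiated and is an assumption here; the hypothetical helper below only illustrates why those two fixed offsets land on TSval and TSecr.

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    #define TCP_MIN_HEADER_LENGTH  20

    /* Hypothetical helper: lay down "NOP, NOP, TS" after the base header. */
    static void
    fill_ts_option(uint8_t *tcph, uint32_t tsval, uint32_t tsecr)
    {
        uint8_t *opt = tcph + TCP_MIN_HEADER_LENGTH;
        uint32_t v;

        opt[0] = 1;    /* TCPOPT_NOP */
        opt[1] = 1;    /* TCPOPT_NOP */
        opt[2] = 8;    /* TCPOPT_TSTAMP */
        opt[3] = 10;   /* option length */
        v = htonl(tsval);
        (void) memcpy(opt + 4, &v, sizeof (v));  /* TCP_MIN_HEADER_LENGTH + 4 */
        v = htonl(tsecr);
        (void) memcpy(opt + 8, &v, sizeof (v));  /* TCP_MIN_HEADER_LENGTH + 8 */
    }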
*/ /* This file contains all TCP kernel socket related functions. */ @@ -221,7 +222,7 @@ tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, error = tcp_do_bind(connp, sa, len, cr, B_TRUE); } - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); if (error < 0) { if (error == -TOUTSTATE) @@ -268,7 +269,7 @@ tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) else error = proto_tlitosyserr(-error); } - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); return (error); } @@ -332,7 +333,13 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, connp->conn_upper_handle, &sopp); } done: - squeue_synch_exit(connp); + /* + * Indicate (via SQ_PROCESS) that it is acceptable for the squeue to + * attempt to drain a pending request relevant to this connection when + * exiting the synchronous context. This can improve the performance + * and efficiency of TCP connect(2) operations to localhost. + */ + squeue_synch_exit(connp, SQ_PROCESS); return ((error == 0) ? EINPROGRESS : error); } @@ -401,7 +408,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, } len = tcp_opt_get(connp, level, option_name, optvalp_buf); - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); @@ -462,14 +469,14 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, if (error < 0) { error = proto_tlitosyserr(-error); } - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); return (error); } error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, NULL, cr); - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); ASSERT(error >= 0); @@ -645,7 +652,7 @@ tcp_clr_flowctrl(sock_lower_handle_t proto_handle) } } - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); } /* ARGSUSED */ @@ -1022,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp, SQ_NODRAIN); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. @@ -1051,7 +1068,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, * There should be atleast two ref's (IP + TCP) */ ASSERT(connp->conn_ref >= 2); - squeue_synch_exit(connp); + squeue_synch_exit(connp, SQ_NODRAIN); return (0); } diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index e6b13fe6c9..dbf320d09d 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -86,6 +87,50 @@ tcp_snmp_state(tcp_t *tcp) } } +static void +tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv) +{ + /* Don't want just anybody seeing these... 
*/ + if (ispriv) { + tcei->ce_snxt = tcp->tcp_snxt; + tcei->ce_suna = tcp->tcp_suna; + tcei->ce_rnxt = tcp->tcp_rnxt; + tcei->ce_rack = tcp->tcp_rack; + } else { + /* + * Netstat, unfortunately, uses this to get send/receive queue + * sizes. How to fix? Why not compute the difference only? + */ + tcei->ce_snxt = tcp->tcp_snxt - tcp->tcp_suna; + tcei->ce_suna = 0; + tcei->ce_rnxt = tcp->tcp_rnxt - tcp->tcp_rack; + tcei->ce_rack = 0; + } + + tcei->ce_in_data_inorder_bytes = tcp->tcp_cs.tcp_in_data_inorder_bytes; + tcei->ce_in_data_inorder_segs = tcp->tcp_cs.tcp_in_data_inorder_segs; + tcei->ce_in_data_unorder_bytes = tcp->tcp_cs.tcp_in_data_unorder_bytes; + tcei->ce_in_data_unorder_segs = tcp->tcp_cs.tcp_in_data_unorder_segs; + tcei->ce_in_zwnd_probes = tcp->tcp_cs.tcp_in_zwnd_probes; + + tcei->ce_out_data_bytes = tcp->tcp_cs.tcp_out_data_bytes; + tcei->ce_out_data_segs = tcp->tcp_cs.tcp_out_data_segs; + tcei->ce_out_retrans_bytes = tcp->tcp_cs.tcp_out_retrans_bytes; + tcei->ce_out_retrans_segs = tcp->tcp_cs.tcp_out_retrans_segs; + tcei->ce_out_zwnd_probes = tcp->tcp_cs.tcp_out_zwnd_probes; + + tcei->ce_unsent = tcp->tcp_unsent; + tcei->ce_swnd = tcp->tcp_swnd; + tcei->ce_cwnd = tcp->tcp_cwnd; + tcei->ce_rwnd = tcp->tcp_rwnd; + tcei->ce_rto = tcp->tcp_rto; + tcei->ce_mss = tcp->tcp_mss; + tcei->ce_state = tcp->tcp_state; + tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); + tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum); + tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt; +} + /* * Return SNMP stuff in buffer in mpdata. */ @@ -183,11 +228,6 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req) continue; /* not in this zone */ tcp = connp->conn_tcp; - TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - tce6.tcp6ConnState = tce.tcpConnState = tcp_snmp_state(tcp); if (tce.tcpConnState == MIB2_TCP_established || @@ -243,35 +283,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req) } else { tce6.tcp6ConnIfIndex = connp->conn_bound_if; } - /* Don't want just anybody seeing these... */ - if (ispriv) { - tce6.tcp6ConnEntryInfo.ce_snxt = - tcp->tcp_snxt; - tce6.tcp6ConnEntryInfo.ce_suna = - tcp->tcp_suna; - tce6.tcp6ConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt; - tce6.tcp6ConnEntryInfo.ce_rack = - tcp->tcp_rack; - } else { - /* - * Netstat, unfortunately, uses this to - * get send/receive queue sizes. How to fix? - * Why not compute the difference only? - */ - tce6.tcp6ConnEntryInfo.ce_snxt = - tcp->tcp_snxt - tcp->tcp_suna; - tce6.tcp6ConnEntryInfo.ce_suna = 0; - tce6.tcp6ConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt - tcp->tcp_rack; - tce6.tcp6ConnEntryInfo.ce_rack = 0; - } - tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; - tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; - tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; - tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; - tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; + tcp_set_conninfo(tcp, &tce6.tcp6ConnEntryInfo, + ispriv); tce6.tcp6ConnCreationProcess = (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : @@ -307,37 +321,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl, boolean_t legacy_req) } tce.tcpConnLocalPort = ntohs(connp->conn_lport); tce.tcpConnRemPort = ntohs(connp->conn_fport); - /* Don't want just anybody seeing these... 
*/ - if (ispriv) { - tce.tcpConnEntryInfo.ce_snxt = - tcp->tcp_snxt; - tce.tcpConnEntryInfo.ce_suna = - tcp->tcp_suna; - tce.tcpConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt; - tce.tcpConnEntryInfo.ce_rack = - tcp->tcp_rack; - } else { - /* - * Netstat, unfortunately, uses this to - * get send/receive queue sizes. How - * to fix? - * Why not compute the difference only? - */ - tce.tcpConnEntryInfo.ce_snxt = - tcp->tcp_snxt - tcp->tcp_suna; - tce.tcpConnEntryInfo.ce_suna = 0; - tce.tcpConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt - tcp->tcp_rack; - tce.tcpConnEntryInfo.ce_rack = 0; - } - tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; - tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; - tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; - tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; - tce.tcpConnEntryInfo.ce_state = - tcp->tcp_state; + tcp_set_conninfo(tcp, &tce.tcpConnEntryInfo, + ispriv); tce.tcpConnCreationProcess = (connp->conn_cpid < 0) ? diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c index 72997de24a..caf7aeda50 100644 --- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -608,7 +608,7 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - BUMP_LOCAL(tcp->tcp_ibsegs); + TCPS_BUMP_MIB(tcps, tcpHCInSegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); flags = (unsigned int)tcpha->tha_flags & 0xFF; @@ -794,6 +794,8 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, TCPS_BUMP_MIB(tcps, tcpInClosed); TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); + tcp->tcp_cs.tcp_in_data_inorder_segs++; + tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len; } if (flags & TH_RST) { (void) tcp_clean_death(tcp, 0); diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index e3dba42c9b..81cf5c57a5 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -23,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -594,7 +594,7 @@ tcp_ack_timer(void *arg) mp = tcp_ack_mp(tcp); if (mp != NULL) { - BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpHCOutSegs); TCPS_BUMP_MIB(tcps, tcpOutAck); TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); tcp_send_data(tcp, mp); @@ -751,15 +751,14 @@ tcp_timer(void *arg) case TCPS_LAST_ACK: /* If we have data to rexmit */ if (tcp->tcp_suna != tcp->tcp_snxt) { - clock_t time_to_wait; + clock_t time_to_wait; TCPS_BUMP_MIB(tcps, tcpTimRetrans); if (!tcp->tcp_xmit_head) break; - time_to_wait = ddi_get_lbolt() - - (clock_t)tcp->tcp_xmit_head->b_prev; - time_to_wait = tcp->tcp_rto - - TICK_TO_MSEC(time_to_wait); + time_to_wait = NSEC2MSEC(gethrtime() - + (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev); + time_to_wait = tcp->tcp_rto - time_to_wait; /* * If the timer fires too early, 1 clock tick earlier, * restart the timer. @@ -854,6 +853,7 @@ tcp_timer(void *arg) tcp->tcp_swnd++; tcp->tcp_zero_win_probe = B_TRUE; TCPS_BUMP_MIB(tcps, tcpOutWinProbe); + tcp->tcp_cs.tcp_out_zwnd_probes++; } else { /* * Handle timeout from sender SWS avoidance. 
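Editor's note: tcp_set_conninfo() above exports the internally scaled values in consumer-friendly units: ce_rtt_sa is the smoothed RTT in microseconds (tcp_rtt_sa holds 8 x SRTT in nanoseconds, hence the >> 3 plus NSEC2USEC), while ce_rtt_sum and ce_rtt_cnt let a reader such as netstat derive a simple arithmetic-mean RTT. The sketch below shows what a consumer might do with those fields; the struct is a hypothetical subset, not the real tcpConnEntryInfo_s definition.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical subset of the connection-entry fields filled in above. */
    struct conn_rtt {
        uint64_t ce_rtt_sa;   /* smoothed RTT, microseconds */
        uint64_t ce_rtt_sum;  /* sum of RTT samples, microseconds */
        uint64_t ce_rtt_cnt;  /* number of RTT samples */
    };

    static void
    print_rtt(const struct conn_rtt *ce)
    {
        (void) printf("srtt %llu us", (unsigned long long)ce->ce_rtt_sa);
        if (ce->ce_rtt_cnt != 0) {
            (void) printf(", mean %llu us over %llu samples",
                (unsigned long long)(ce->ce_rtt_sum / ce->ce_rtt_cnt),
                (unsigned long long)ce->ce_rtt_cnt);
        }
        (void) printf("\n");
    }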
@@ -1012,8 +1012,8 @@ tcp_timer(void *arg) * window probe. */ if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { - tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + - (tcp->tcp_rtt_sa >> 5); + tcp->tcp_rtt_sd += tcp->tcp_rtt_sa >> 3 + + tcp->tcp_rtt_sa >> 5; tcp->tcp_rtt_sa = 0; tcp_ip_notify(tcp); tcp->tcp_rtt_update = 0; @@ -1022,24 +1022,14 @@ tcp_timer(void *arg) timer_rexmit: tcp->tcp_timer_backoff++; - if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < - tcp->tcp_rto_min) { - /* - * This means the original RTO is tcp_rexmit_interval_min. - * So we will use tcp_rexmit_interval_min as the RTO value - * and do the backoff. - */ - ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff; - } else { - ms <<= tcp->tcp_timer_backoff; - } + /* + * Calculate the backed off retransmission timeout. If the shift brings + * us back over the max, then we repin the value, and decrement the + * backoff to avoid overflow. + */ + ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff; if (ms > tcp->tcp_rto_max) { ms = tcp->tcp_rto_max; - /* - * ms is at max, decrement tcp_timer_backoff to avoid - * overflow. - */ tcp->tcp_timer_backoff--; } tcp->tcp_ms_we_have_waited += ms; @@ -1059,8 +1049,9 @@ timer_rexmit: if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) mss = tcp->tcp_swnd; - if ((mp = tcp->tcp_xmit_head) != NULL) - mp->b_prev = (mblk_t *)ddi_get_lbolt(); + if ((mp = tcp->tcp_xmit_head) != NULL) { + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + } mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, B_TRUE); @@ -1091,6 +1082,8 @@ timer_rexmit: tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss); + tcp->tcp_cs.tcp_out_retrans_segs++; + tcp->tcp_cs.tcp_out_retrans_bytes += mss; tcp_send_data(tcp, mp); } diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 4ef1886bae..d2e24a71fb 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -20,9 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _INET_TCP_IMPL_H @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and has function. It has to be a power of 2 for @@ -300,17 +300,6 @@ typedef struct tcp_squeue_priv_s { } /* - * Set tcp_rto with boundary checking. - */ -#define TCP_SET_RTO(tcp, rto) \ - if ((rto) < (tcp)->tcp_rto_min) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_min; \ - else if ((rto) > (tcp)->tcp_rto_max) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_max; \ - else \ - (tcp)->tcp_rto = (rto); - -/* * TCP options struct returned from tcp_parse_options. */ typedef struct tcp_opt_s { @@ -406,6 +395,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. 
+ * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -574,6 +579,61 @@ extern uint32_t tcp_early_abort; #define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval #define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval + +/* + * As defined in RFC 6298, the RTO is the average estimates (SRTT) plus a + * multiple of the deviation estimates (K * RTTVAR): + * + * RTO = SRTT + max(G, K * RTTVAR) + * + * K is defined in the RFC as 4, and G is the clock granularity. We constrain + * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this + * becomes: + * + * RTO = SRTT + 4 * RTTVAR + * + * In practice, however, we make several additions to it. As we use a finer + * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of + * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size: + * + * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR + * + * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR, + * this becomes: + * + * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd + * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + * + * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are + * used to help account for extreme environments where the algorithm fails to + * work; by default they should be 0. (The latter tunable is only used for + * calculating the initial RTO, and so is optionally passed in as "extra".) We + * add them here: + * + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + + * tcps_rexmit_interval_extra + tcps_conn_grace_period + * + * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5 + * of RFC 6298).
+ */ +static __GNU_INLINE clock_t +tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra) +{ + clock_t rto; + + rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) + + tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra; + + if (rto < tcp->tcp_rto_min) { + rto = tcp->tcp_rto_min; + } else if (rto > tcp->tcp_rto_max) { + rto = tcp->tcp_rto_max; + } + + return (rto); +} + extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; @@ -632,7 +692,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t); extern int tcp_set_destination(tcp_t *); extern void tcp_set_ws_value(tcp_t *); extern void tcp_stop_lingering(tcp_t *); -extern void tcp_update_pmtu(tcp_t *, boolean_t); +extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t); extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); extern boolean_t tcp_zcopy_check(tcp_t *); extern void tcp_zcopy_notify(tcp_t *); @@ -649,6 +709,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c. diff --git a/usr/src/uts/common/inet/tcp_stats.h b/usr/src/uts/common/inet/tcp_stats.h index 487d0d3414..704102e9d6 100644 --- a/usr/src/uts/common/inet/tcp_stats.h +++ b/usr/src/uts/common/inet/tcp_stats.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _INET_TCP_STATS_H @@ -205,6 +206,26 @@ typedef struct { tcp_stat_counter_t tcp_sc_stats; } tcp_stats_cpu_t; +/* + * Per-connection statistics. Some of these are also kept globally in the + * per-cpu tcp_sc_mib entry (see tcp_stats_cpu_t above). We need not maintain + * per-cpu versions of these stats since a connection is typically processed + * on the same CPU. + */ +typedef struct tcp_conn_stats { + uint64_t tcp_in_data_inorder_bytes; + uint64_t tcp_in_data_inorder_segs; + uint64_t tcp_in_data_unorder_bytes; + uint64_t tcp_in_data_unorder_segs; + uint64_t tcp_in_zwnd_probes; + + uint64_t tcp_out_data_bytes; + uint64_t tcp_out_data_segs; + uint64_t tcp_out_retrans_bytes; + uint64_t tcp_out_retrans_segs; + uint64_t tcp_out_zwnd_probes; +} tcp_conn_stats_t; + #define TCPS_BUMP_MIB(tcps, x) \ BUMP_MIB(&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_mib, x) diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index d233ea14de..165adcb852 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2018, Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -76,7 +77,8 @@ #include <inet/ipclassifier.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> -#include <sys/ethernet.h> +#include <sys/vxlan.h> +#include <inet/inet_hash.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> @@ -346,6 +348,85 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol, typedef union T_primitives *t_primp_t; /* + * Various protocols that encapsulate UDP have no real use for the source port. 
+ * Instead, they want to vary the source port to provide better equal-cost + * multipathing and other systems that use fanout. Consider something like + * VXLAN. If you're actually sending multiple different streams to a single + * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP, + * SRC Port, DST Port) will always be the same. + * + * Here, we return a port to hash this to, if we know how to hash it. If for + * some reason we can't perform an L4 hash, then we just return the default + * value, usually the default port. After we determine the hash we transform it + * so that it's in the range of [ min, max ]. + * + * We'd like to avoid a pull up for the sake of performing the hash. If the + * first mblk_t doesn't have the full protocol header, then we just send it to + * the default. If for some reason we have an encapsulated packet that has its + * protocol header in different parts of an mblk_t, then we'll go with the + * default port. This means that that if a driver isn't consistent about how it + * generates the frames for a given flow, it will not always be consistently + * hashed. That should be an uncommon event. + */ +uint16_t +udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, + uint16_t def) +{ + size_t szused = 0; + ip6_t *ip6h; + ipha_t *ipha; + uint16_t sap; + uint64_t hash; + uint32_t mod; + + ASSERT(min <= max); + + if (type != UDP_HASH_VXLAN) + return (def); + + if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) + return (def); + + /* + * The following logic is VXLAN specific to get at the header, if we + * have formats, eg. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { + return (def); + } else { + szused = VXLAN_HDR_LEN; + } + + /* Can we hold a MAC header? */ + if (MBLKL(mp) + szused < sizeof (struct ether_header)) + return (def); + + /* + * We need to lie about the starting offset into the message block for + * convenience. Undo it at the end. We know that inet_pkt_hash() won't + * modify the mblk_t. + */ + mp->b_rptr += szused; + hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | + INET_PKT_HASH_L3 | INET_PKT_HASH_L4); + mp->b_rptr -= szused; + + if (hash == 0) + return (def); + + mod = max - min + 1; + return ((hash % mod) + min); +} + +/* * Return the next anonymous port in the privileged port range for * bind checking. * @@ -1584,6 +1665,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_rcvhdr ? 1 : 0; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SRCPORT_HASH: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_vxlanhash; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1719,6 +1810,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, udp->udp_rcvhdr = onoff; mutex_exit(&connp->conn_lock); return (0); + case UDP_SRCPORT_HASH: + /* + * This should have already been verified, but double + * check. 
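Editor's note: udp_srcport_hash() above ends by folding the computed packet hash into the caller-supplied port range; anything that defeats the hash (misaligned read pointer, a header split across mblks, an unknown encapsulation, or a zero hash) falls back to the default source port. The range reduction itself is just a modulo fold; the small sketch below isolates it. Modulo bias is not a concern here, since the goal is only flow entropy for equal-cost multipathing, not uniformity.

    #include <stdint.h>

    /*
     * Fold an arbitrary 64-bit flow hash into the inclusive port range
     * [min, max]; return the default port when no hash could be computed.
     */
    static uint16_t
    hash_to_port(uint64_t hash, uint16_t min, uint16_t max, uint16_t def)
    {
        uint32_t mod = (uint32_t)max - (uint32_t)min + 1;

        if (hash == 0)
            return (def);
        return ((uint16_t)(hash % mod) + min);
    }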
+ */ + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + + /* First see if the val is something we understand */ + if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) + return (EINVAL); + + if (!checkonly) { + mutex_enter(&connp->conn_lock); + udp->udp_vxlanhash = *i1; + mutex_exit(&connp->conn_lock); + } + /* Fully handled this option. */ + return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -2002,13 +2118,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, uint32_t cksum; udp_t *udp = connp->conn_udp; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t ulp_hdr_len; + uint16_t srcport; data_len = msgdsize(data_mp); ulp_hdr_len = UDPH_SIZE; if (insert_spi) ulp_hdr_len += sizeof (uint32_t); + /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. + */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); if (mp == NULL) { @@ -2020,7 +2148,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); - udpha->uha_src_port = connp->conn_lport; + if (hash_srcport == B_TRUE) { + udpha->uha_src_port = htons(srcport); + } else { + udpha->uha_src_port = connp->conn_lport; + } udpha->uha_dst_port = dstport; udpha->uha_checksum = 0; udpha->uha_length = htons(data_len); @@ -3195,6 +3327,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t pktlen; uint_t alloclen; uint_t copylen; @@ -3203,10 +3336,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udpha_t *udpha; uint32_t cksum; ip_pkt_t *ipp; + uint16_t srcport; ASSERT(MUTEX_HELD(&connp->conn_lock)); /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. + */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + + /* * Copy the header template and leave space for an SPI */ copylen = connp->conn_ht_iphc_len; @@ -3304,6 +3448,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, *((uint32_t *)(udpha + 1)) = 0; udpha->uha_dst_port = dstport; + if (hash_srcport == B_TRUE) + udpha->uha_src_port = htons(srcport); + return (mp); } @@ -5952,10 +6099,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. The posix spec. allows both behaviors but + * historically we've returned an error if already connected. The + * client can allow this via a sockopt. 
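Editor's note: enabling the source-port hashing from userland is a plain setsockopt() at IPPROTO_UDP level; per the secpolicy_ip_config() check above it is restricted to privileged callers, and only UDP_HASH_VXLAN and UDP_HASH_DISABLE are accepted values. The sketch below is illustrative only and assumes the option constants are exposed by the platform's UDP headers on a system carrying this change.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/udp.h>   /* assumed to define UDP_SRCPORT_HASH et al. */

    static int
    enable_vxlan_srcport_hash(int fd)
    {
        int val = UDP_HASH_VXLAN;

        /* Requires privilege; see the secpolicy check above. */
        return (setsockopt(fd, IPPROTO_UDP, UDP_SRCPORT_HASH,
            &val, sizeof (val)));
    }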
+ */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c279bb4a21..847e2cdde6 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 1e5204bb15..ef11973707 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _UDP_IMPL_H @@ -178,8 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ + udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ + /* Because there's only VXLAN, cheat */ + /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 29; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 7e930c89e8..c5d6f09b0c 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -32,39 +32,69 @@ * module. The hash key is the linkid associated with the link * aggregation group. * - * A set of MAC ports are associated with each association group. + * Each aggregation contains a set of ports. The port is represented + * by the aggr_port_t structure. A port consists of a single MAC + * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying + * MAC. This client is used by the aggr to send and receive LACP + * traffic. Each port client takes on the same MAC unicast address -- + * the address of the aggregation itself (taken from the first port by + * default). * - * Aggr pseudo TX rings - * -------------------- - * The underlying ports (NICs) in an aggregation can have TX rings. To - * enhance aggr's performance, these TX rings are made available to the - * aggr layer as pseudo TX rings. The concept of pseudo rings are not new. - * They are already present and implemented on the RX side. It is called - * as pseudo RX rings. The same concept is extended to the TX side where - * each TX ring of an underlying port is reflected in aggr as a pseudo - * TX ring. Thus each pseudo TX ring will map to a specific hardware TX - * ring. Even in the case of a NIC that does not have a TX ring, a pseudo - * TX ring is given to the aggregation layer. 
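Editor's note: on the udp_send() change just above, historically a sendto()/sendmsg() carrying an explicit destination on an already-connected UDP socket failed with EISCONN; with UDP_SND_TO_CONNECTED set the send is permitted (POSIX allows either behaviour). A minimal opt-in sketch follows; the option constant is assumed to come from the platform's UDP headers, and note that the option table above marks it OP_CONFIG, so setting it may also require privilege.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/udp.h>   /* assumed to define UDP_SND_TO_CONNECTED */

    static int
    allow_sendto_when_connected(int fd)
    {
        int on = 1;

        /*
         * After this, sendto() with an explicit address no longer fails
         * with EISCONN once the socket has been connected.
         */
        return (setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED,
            &on, sizeof (on)));
    }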
+ * The MAC client that hangs off each aggr port is not your typical + * MAC client. Not only does it have exclusive control of the MAC, but + * it also has no Tx or Rx SRSes. An SRS is designed to queue and + * fanout traffic among L4 protocols; but the aggr is an intermediary, + * not a consumer. Instead of using SRSes, the aggr puts the + * underlying hardware rings into passthru mode and ships packets up + * via a direct call to aggr_recv_cb(). This allows aggr to enforce + * LACP while passing all other traffic up to clients of the aggr. + * + * Pseudo Rx Groups and Rings + * -------------------------- + * + * It is imperative for client performance that the aggr provide as + * many MAC groups as possible. In order to use the underlying HW + * resources, aggr creates pseudo groups to aggregate the underlying + * HW groups. Every HW group gets mapped to a pseudo group; and every + * HW ring in that group gets mapped to a pseudo ring. The pseudo + * group at index 0 combines all the HW groups at index 0 from each + * port, etc. The aggr's MAC then creates normal MAC groups and rings + * out of these pseudo groups and rings to present to the aggr's + * clients. To the clients, the aggr's groups and rings are absolutely + * no different than a NIC's groups or rings. + * + * Pseudo Tx Rings + * --------------- + * + * The underlying ports (NICs) in an aggregation can have Tx rings. To + * enhance aggr's performance, these Tx rings are made available to + * the aggr layer as pseudo Tx rings. The concept of pseudo rings are + * not new. They are already present and implemented on the Rx side. + * The same concept is extended to the Tx side where each Tx ring of + * an underlying port is reflected in aggr as a pseudo Tx ring. Thus + * each pseudo Tx ring will map to a specific hardware Tx ring. Even + * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring + * is given to the aggregation layer. * * With this change, the outgoing stack depth looks much better: * * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() * - * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings: + * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: * SRS_TX_AGGR and SRS_TX_BW_AGGR. * * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine - * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX + * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx * ring belonging to a port on which the packet has to be sent. * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 - * policy and then uses the fanout_hint passed to it to pick a TX ring from + * policy and then uses the fanout_hint passed to it to pick a Tx ring from * the selected port. * * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where * bandwidth limit is applied first on the outgoing packet and the packets * allowed to go out would call mac_tx_aggr_mode() to send the packet on a - * particular TX ring. + * particular Tx ring. 
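+ *
+ * To restate the Rx-side mapping described above with a rough
+ * sketch (assuming two ports, each exposing two HW Rx groups):
+ *
+ *	pseudo Rx group 0 = { port0 HW group 0, port1 HW group 0 }
+ *	pseudo Rx group 1 = { port0 HW group 1, port1 HW group 1 }
+ *
+ * and each pseudo ring inside a pseudo group maps 1:1 onto a HW
+ * ring from the corresponding HW group of some port.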
*/ #include <sys/types.h> @@ -121,9 +151,12 @@ static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static int aggr_pseudo_disable_intr(mac_intr_handle_t); static int aggr_pseudo_enable_intr(mac_intr_handle_t); -static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); +static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); static int aggr_addmac(void *, const uint8_t *); static int aggr_remmac(void *, const uint8_t *); +static int aggr_addvlan(mac_group_driver_t, uint16_t); +static int aggr_remvlan(mac_group_driver_t, uint16_t); static mblk_t *aggr_rx_poll(void *, int); static void aggr_fill_ring(void *, mac_ring_type_t, const int, const int, mac_ring_info_t *, mac_ring_handle_t); @@ -324,6 +357,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) return (B_FALSE); } + mutex_enter(&grp->lg_stat_lock); if (grp->lg_ifspeed == 0) { /* * The group inherits the speed of the first link being @@ -337,8 +371,10 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) * the group link speed, as per 802.3ad. Since it is * not, the attach is cancelled. */ + mutex_exit(&grp->lg_stat_lock); return (B_FALSE); } + mutex_exit(&grp->lg_stat_lock); grp->lg_nattached_ports++; @@ -347,7 +383,9 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) */ if (grp->lg_link_state != LINK_STATE_UP) { grp->lg_link_state = LINK_STATE_UP; + mutex_enter(&grp->lg_stat_lock); grp->lg_link_duplex = LINK_DUPLEX_FULL; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -359,9 +397,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) aggr_grp_multicst_port(port, B_TRUE); /* - * Set port's receive callback + * The port client doesn't have an Rx SRS; instead of calling + * mac_rx_set() we set the client's flow callback directly. + * This datapath is used only when the port's driver doesn't + * support MAC_CAPAB_RINGS. Drivers with ring support will + * deliver traffic to the aggr via ring passthru. */ - mac_rx_set(port->lp_mch, aggr_recv_cb, port); + mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -391,7 +433,7 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_clear(port->lp_mch); + mac_client_clear_flow_cb(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); @@ -405,9 +447,11 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) grp->lg_nattached_ports--; if (grp->lg_nattached_ports == 0) { /* the last attached MAC port of the group is being detached */ - grp->lg_ifspeed = 0; grp->lg_link_state = LINK_STATE_DOWN; + mutex_enter(&grp->lg_stat_lock); + grp->lg_ifspeed = 0; grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -528,26 +572,27 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, zoneid_t port_zoneid = ALL_ZONES; int err; - /* The port must be int the same zone as the aggregation. */ + /* The port must be in the same zone as the aggregation. */ if (zone_check_datalink(&port_zoneid, port_linkid) != 0) port_zoneid = GLOBAL_ZONEID; if (grp->lg_zoneid != port_zoneid) return (EBUSY); /* - * lg_mh could be NULL when the function is called during the creation - * of the aggregation. 
+ * If we are creating the aggr, then there is no MAC handle + * and thus no perimeter to hold. If we are adding a port to + * an existing aggr, then the perimiter of the aggr's MAC must + * be held. */ ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); - /* create new port */ err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); mac_perim_enter_by_mh(port->lp_mh, &mph); - /* add port to list of group constituent ports */ + /* Add the new port to the end of the list. */ cport = &grp->lg_ports; while (*cport != NULL) cport = &((*cport)->lp_next); @@ -629,6 +674,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; ring->arr_port = port; + ring->arr_grp = rx_grp; rx_grp->arg_ring_cnt++; /* @@ -639,10 +685,15 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; } else { - mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, - mac_find_ring(rx_grp->arg_gh, j)); + /* + * This must run after the MAC is registered. + */ + ASSERT3P(ring->arr_rh, !=, NULL); + mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, + (void *)port, (mac_resource_handle_t)ring); } return (err); } @@ -653,11 +704,9 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, static void aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) { - aggr_pseudo_rx_ring_t *ring; - int j; + for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { + aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; - for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { - ring = rx_grp->arg_rings + j; if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || ring->arr_hw_rh != hw_rh) { continue; @@ -668,134 +717,140 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; - mac_hwring_teardown(hw_rh); + mac_hwring_clear_passthru(hw_rh); break; } } /* - * This function is called to create pseudo rings over the hardware rings of - * the underlying device. Note that there is a 1:1 mapping between the pseudo - * RX rings of the aggr and the hardware rings of the underlying port. + * Create pseudo rings over the HW rings of the port. + * + * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. + * + * o Program existing unicast filters on the pseudo group into the HW group. + * + * o Program existing VLAN filters on the pseudo group into the HW group. */ static int aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr, *a; mac_perim_handle_t pmph; - int hw_rh_cnt, i = 0, j; + aggr_vlan_t *avp; + uint_t hw_rh_cnt, i; int err = 0; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * This function must be called after the aggr registers its mac - * and its RX group has been initialized. + * This function must be called after the aggr registers its + * MAC and its Rx groups have been initialized. */ ASSERT(rx_grp->arg_gh != NULL); /* - * Get the list the the underlying HW rings. + * Get the list of the underlying HW rings. 
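+ * The lookup is done by group index so that HW group g_idx on
+ * the port backs pseudo group g_idx on the aggr.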
*/ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); - - if (port->lp_hwgh != NULL) { - /* - * Quiesce the HW ring and the mac srs on the ring. Note - * that the HW ring will be restarted when the pseudo ring - * is started. At that time all the packets will be - * directly passed up to the pseudo RX ring and handled - * by mac srs created over the pseudo RX ring. - */ - mac_rx_client_quiesce(port->lp_mch); - mac_srs_perm_quiesce(port->lp_mch, B_TRUE); - } + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, + &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); /* - * Add all the unicast addresses to the newly added port. + * Add existing VLAN and unicast address filters to the port. */ + for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) + goto err; + } + for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { - if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) - break; + if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) + goto err; } - for (i = 0; err == 0 && i < hw_rh_cnt; i++) + for (i = 0; i < hw_rh_cnt; i++) { err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); + if (err != 0) + goto err; + } - if (err != 0) { - for (j = 0; j < i; j++) - aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + mac_perim_exit(pmph); + return (0); + +err: + ASSERT(err != 0); + + for (uint_t j = 0; j < i; j++) + aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + + for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) + aggr_port_remmac(port, g_idx, a->aua_addr); - for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) - aggr_port_remmac(port, a->aua_addr); + if (avp != NULL) + avp = list_prev(&rx_grp->arg_vlans, avp); - if (port->lp_hwgh != NULL) { - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); - port->lp_hwgh = NULL; + for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { + int err2; + + if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err2); } - } else { - port->lp_rx_grp_added = B_TRUE; } -done: + + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); return (err); } /* - * This function is called by aggr to remove pseudo RX rings over the - * HW rings of the underlying port. + * Destroy the pseudo rings mapping to this port and remove all VLAN + * and unicast filters from this port. Even if there are no underlying + * HW rings we must still remove the unicast filters to take the port + * out of promisc mode. 
*/ static void aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr; - mac_group_handle_t hwgh; mac_perim_handle_t pmph; - int hw_rh_cnt, i; + uint_t hw_rh_cnt; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); + ASSERT3P(rx_grp->arg_gh, !=, NULL); mac_perim_enter_by_mh(port->lp_mh, &pmph); - if (!port->lp_rx_grp_added) - goto done; - - ASSERT(rx_grp->arg_gh != NULL); - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &hwgh, hw_rh, MAC_RING_TYPE_RX); + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, + MAC_RING_TYPE_RX); - /* - * If hw_rh_cnt is 0, it means that the underlying port does not - * support RX rings. Directly return in this case. - */ - for (i = 0; i < hw_rh_cnt; i++) + for (uint_t i = 0; i < hw_rh_cnt; i++) aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) - aggr_port_remmac(port, addr->aua_addr); + aggr_port_remmac(port, g_idx, addr->aua_addr); - if (port->lp_hwgh != NULL) { - port->lp_hwgh = NULL; + for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + int err; - /* - * First clear the permanent-quiesced flag of the RX srs then - * restart the HW ring and the mac srs on the ring. Note that - * the HW ring and associated SRS will soon been removed when - * the port is removed from the aggr. - */ - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); + if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err); + } } - port->lp_rx_grp_added = B_FALSE; -done: + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); } @@ -899,8 +954,8 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) /* * Get the list the the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - NULL, hw_rh, MAC_RING_TYPE_TX); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, + MAC_RING_TYPE_TX); /* * Even if the underlying NIC does not have TX rings, we @@ -1006,21 +1061,45 @@ aggr_pseudo_enable_intr(mac_intr_handle_t ih) } /* - * Here we need to start the pseudo-ring. As MAC already ensures that the - * underlying device is set up, all we need to do is save the ring generation. - * - * Note, we don't end up wanting to use the underlying mac_hwring_start/stop - * functions here as those don't actually stop and start the ring, they just - * quiesce the ring. Regardless of whether the aggr is logically up or not, we - * want to make sure that we can receive traffic for LACP. + * Start the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to start the underlying HW + * ring. */ static int -aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) +aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) { + int err; aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; + err = mac_hwring_start(rr_ring->arr_hw_rh); + + if (err != 0) + return (err); + rr_ring->arr_gen = mr_gen; - return (0); + return (err); +} + +/* + * Stop the pseudo ring. 
Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to stop the underlying HW + * ring. + */ +static void +aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) +{ + aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; + + /* + * The rings underlying the default group must stay up to + * continue receiving LACP traffic. We would normally never + * stop the default Rx rings because of the primary MAC + * client; but aggr's primary MAC client doesn't call + * mac_unicast_add() and thus mi_active is 0 when the last + * non-primary client is deleted. + */ + if (rr_ring->arr_grp->arg_index != 0) + mac_hwring_stop(rr_ring->arr_hw_rh); } /* @@ -1030,13 +1109,15 @@ int aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, laioc_port_t *ports) { - int rc, i, nadded = 0; + int rc; + uint_t port_added = 0; + uint_t grp_added; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; mac_perim_handle_t mph, pmph; - /* get group corresponding to linkid */ + /* Get the aggr corresponding to linkid. */ rw_enter(&aggr_grp_lock, RW_READER); if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), (mod_hash_val_t *)&grp) != 0) { @@ -1046,20 +1127,22 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, AGGR_GRP_REFHOLD(grp); /* - * Hold the perimeter so that the aggregation won't be destroyed. + * Hold the perimeter so that the aggregation can't be destroyed. */ mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); - /* add the specified ports to group */ - for (i = 0; i < nports; i++) { - /* add port to group */ + /* Add the specified ports to the aggr. */ + for (uint_t i = 0; i < nports; i++) { + grp_added = 0; + if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port)) != 0) { goto bail; } + ASSERT(port != NULL); - nadded++; + port_added++; /* check capabilities */ if (!aggr_grp_capab_check(grp, port) || @@ -1076,9 +1159,16 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); if (rc != 0) goto bail; - rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); - if (rc != 0) - goto bail; + + for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { + rc = aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + + if (rc != 0) + goto bail; + + grp_added++; + } mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -1096,7 +1186,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, /* * Turn on the promiscuous mode over the port when it * is requested to be turned on to receive the - * non-primary address over a port, or the promiscous + * non-primary address over a port, or the promiscuous * mode is enabled over the aggr. */ if (grp->lg_promisc || port->lp_prom_addr != NULL) { @@ -1131,17 +1221,33 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded; i++) { + for (uint_t i = 0; i < port_added; i++) { + uint_t grp_remove; + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); + if (grp->lg_started) { mac_perim_enter_by_mh(port->lp_mh, &pmph); (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); mac_perim_exit(pmph); } + aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + + /* + * Only the last port could have a partial set + * of groups added. + */ + grp_remove = (i + 1 == port_added) ? 
grp_added : + grp->lg_rx_group_count; + + for (uint_t j = 0; j < grp_remove; j++) { + aggr_rem_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + } + (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } @@ -1303,7 +1409,8 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP), KM_SLEEP); grp->lg_tx_blocked_cnt = 0; - bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); + bzero(&grp->lg_rx_groups, + sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); aggr_lacp_init_grp(grp); @@ -1323,11 +1430,48 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_key = key; for (i = 0; i < nports; i++) { - err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); + err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); if (err != 0) goto bail; } + grp->lg_rx_group_count = 1; + + for (i = 0, port = grp->lg_ports; port != NULL; + i++, port = port->lp_next) { + uint_t num_rgroups; + + mac_perim_enter_by_mh(port->lp_mh, &mph); + num_rgroups = mac_get_num_rx_groups(port->lp_mh); + mac_perim_exit(mph); + + /* + * Utilize all the groups in a port. If some ports + * have less groups than others, then traffic destined + * for the same unicast address may be HW classified + * on some ports but SW classified by aggr when + * arriving on other ports. + */ + grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, + num_rgroups); + } + + /* + * There could be cases where the hardware provides more + * groups than aggr can support. Make sure we never go above + * the max aggr can support. + */ + grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, + MAX_GROUPS_PER_PORT); + + ASSERT3U(grp->lg_rx_group_count, >, 0); + for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { + grp->lg_rx_groups[i].arg_index = i; + grp->lg_rx_groups[i].arg_untagged = 0; + list_create(&(grp->lg_rx_groups[i].arg_vlans), + sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); + } + /* * If no explicit MAC address was specified by the administrator, * set it to the MAC address of the first port. @@ -1345,7 +1489,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* set the initial group capabilities */ + /* Set the initial group capabilities. */ aggr_grp_capab_set(grp); if ((mac = mac_alloc(MAC_VERSION)) == NULL) { @@ -1380,14 +1524,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, * Update the MAC address of the constituent ports. * None of the port is attached at this time, the link state of the * aggregation will not change. + * + * All ports take on the primary MAC address of the aggr + * (lg_aggr). At this point, none of the ports are attached; + * thus the link state of the aggregation will not change. */ link_state_changed = aggr_grp_update_ports_mac(grp); ASSERT(!link_state_changed); - /* update outbound load balancing policy */ + /* Update outbound load balancing policy. */ aggr_send_update_policy(grp, policy); - /* set LACP mode */ + /* Set LACP mode. */ aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* @@ -1395,12 +1543,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { /* - * Create the pseudo ring for each HW ring of the underlying - * port. Note that this is done after the aggr registers the - * mac. 
+ * Create the pseudo ring for each HW ring of the + * underlying port. Note that this is done after the + * aggr registers its MAC. */ - VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); - VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); + VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), + ==, 0); + + for (i = 0; i < grp->lg_rx_group_count; i++) { + VERIFY3S(aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[i]), ==, 0); + } + if (aggr_port_notify_link(grp, port)) link_state_changed = B_TRUE; @@ -1545,7 +1699,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } for (i = 0; i < ETHER_NSTAT; i++) { stat = i + MACTYPE_STAT_MIN; @@ -1553,7 +1709,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_ether_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_ether_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } grp->lg_nports--; @@ -1678,7 +1836,8 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) * aggr_find_tx_ring() will not return any rings * belonging to it. */ - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, @@ -1783,7 +1942,8 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) (void) aggr_grp_detach_port(grp, port); mac_perim_exit(pmph); aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); aggr_port_delete(port); port = cport; } @@ -1802,6 +1962,10 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) VERIFY(mac_unregister(grp->lg_mh) == 0); grp->lg_mh = NULL; + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { + list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); + } + AGGR_GRP_REFRELE(grp); return (0); } @@ -1884,6 +2048,8 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) aggr_port_t *port; uint_t stat_index; + ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); + /* We only aggregate counter statistics. 
*/ if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { @@ -1952,10 +2118,9 @@ static int aggr_m_stat(void *arg, uint_t stat, uint64_t *val) { aggr_grp_t *grp = arg; - mac_perim_handle_t mph; int rval = 0; - mac_perim_enter_by_mh(grp->lg_mh, &mph); + mutex_enter(&grp->lg_stat_lock); switch (stat) { case MAC_STAT_IFSPEED: @@ -1975,7 +2140,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val) rval = aggr_grp_stat(grp, stat, val); } - mac_perim_exit(mph); + mutex_exit(&grp->lg_stat_lock); return (rval); } @@ -2165,17 +2330,15 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) return (!grp->lg_zcopy); case MAC_CAPAB_RINGS: { mac_capab_rings_t *cap_rings = cap_data; + uint_t ring_cnt = 0; + + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; if (cap_rings->mr_type == MAC_RING_TYPE_RX) { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; - - /* - * An aggregation advertises only one (pseudo) RX - * group, which virtualizes the main/primary group of - * the underlying devices. - */ - cap_rings->mr_gnum = 1; + cap_rings->mr_rnum = ring_cnt; + cap_rings->mr_gnum = grp->lg_rx_group_count; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; } else { @@ -2207,19 +2370,17 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) } /* - * Callback funtion for MAC layer to register groups. + * Callback function for MAC layer to register groups. */ static void aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { aggr_grp_t *grp = arg; - aggr_pseudo_rx_group_t *rx_group; - aggr_pseudo_tx_group_t *tx_group; - ASSERT(index == 0); if (rtype == MAC_RING_TYPE_RX) { - rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; + rx_group->arg_gh = gh; rx_group->arg_grp = grp; @@ -2229,8 +2390,18 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_addmac = aggr_addmac; infop->mgi_remmac = aggr_remmac; infop->mgi_count = rx_group->arg_ring_cnt; + + /* + * Always set the HW VLAN callbacks. They are smart + * enough to know when a port has HW VLAN filters to + * program and when it doesn't. 
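+ * (For a port with no backing HW group, aggr_port_addvlan() and
+ * aggr_port_remvlan() simply return success, since such a port
+ * already passes tagged traffic.)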
+ */ + infop->mgi_addvlan = aggr_addvlan; + infop->mgi_remvlan = aggr_remvlan; } else { - tx_group = &grp->lg_tx_group; + aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; + + ASSERT3S(index, ==, 0); tx_group->atg_gh = gh; } } @@ -2246,13 +2417,13 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, switch (rtype) { case MAC_RING_TYPE_RX: { - aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group; aggr_pseudo_rx_ring_t *rx_ring; mac_intr_t aggr_mac_intr; - ASSERT(rg_index == 0); - - ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_group = &grp->lg_rx_groups[rg_index]; + ASSERT3S(index, >=, 0); + ASSERT3S(index, <, rx_group->arg_ring_cnt); rx_ring = rx_group->arg_rings + index; rx_ring->arr_rh = rh; @@ -2266,8 +2437,8 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, aggr_mac_intr.mi_ddi_handle = NULL; infop->mri_driver = (mac_ring_driver_t)rx_ring; - infop->mri_start = aggr_pseudo_start_ring; - infop->mri_stop = NULL; + infop->mri_start = aggr_pseudo_start_rx_ring; + infop->mri_stop = aggr_pseudo_stop_rx_ring; infop->mri_intr = aggr_mac_intr; infop->mri_poll = aggr_rx_poll; @@ -2354,6 +2525,7 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(grp->lg_mh, &mph); @@ -2380,12 +2552,12 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) *pprev = addr; for (port = grp->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_addmac(port, mac_addr)) != 0) + if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) break; if (err != 0) { for (p = grp->lg_ports; p != port; p = p->lp_next) - aggr_port_remmac(p, mac_addr); + aggr_port_remmac(p, idx, mac_addr); *pprev = NULL; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2430,7 +2602,7 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } for (port = grp->lg_ports; port != NULL; port = port->lp_next) - aggr_port_remmac(port, mac_addr); + aggr_port_remmac(port, rx_group->arg_index, mac_addr); *pprev = addr->aua_next; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2440,6 +2612,188 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } /* + * Search for VID in the Rx group's list and return a pointer if + * found. Otherwise return NULL. + */ +static aggr_vlan_t * +aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) +{ + ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); + for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; + avp = list_next(&rx_group->arg_vlans, avp)) { + if (avp->av_vid == vid) + return (avp); + } + + return (NULL); +} + +/* + * Accept traffic on the specified VID. + * + * Persist VLAN state in the aggr so that ports added later will + * receive the correct filters. In the future it would be nice to + * allow aggr to iterate its clients instead of duplicating state. + */ +static int +aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + if (vid == MAC_VLAN_UNTAGGED) { + /* + * Aggr is both a MAC provider and MAC client. As a + * MAC provider it is passed MAC_VLAN_UNTAGGED by its + * client. As a client itself, it should pass + * VLAN_ID_NONE to its ports. 
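+ *
+ * Untagged traffic is tracked with the arg_untagged counter
+ * rather than an aggr_vlan_t entry in the VID-keyed arg_vlans
+ * list.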
+ */ + vid = VLAN_ID_NONE; + rx_group->arg_untagged++; + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp != NULL) { + avp->av_refs++; + mac_perim_exit(mph); + return (0); + } + + avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); + avp->av_vid = vid; + avp->av_refs = 1; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_addvlan(port, idx, vid)) != 0) + break; + + if (err != 0) { + /* + * If any of these calls fail then we are in a + * situation where the ports have different HW state. + * There's no reasonable action the MAC client can + * take in this scenario to rectify the situation. + */ + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u" + " from port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + + } + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged--; + + if (avp != NULL) { + kmem_free(avp, sizeof (aggr_vlan_t)); + avp = NULL; + } + } + + if (avp != NULL) + list_insert_tail(&rx_group->arg_vlans, avp); + +done: + mac_perim_exit(mph); + return (err); +} + +/* + * Stop accepting traffic on this VLAN if it's the last use of this VLAN. + */ +static int +aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + /* + * See the comment in aggr_addvlan(). + */ + if (vid == MAC_VLAN_UNTAGGED) { + vid = VLAN_ID_NONE; + rx_group->arg_untagged--; + + if (rx_group->arg_untagged > 0) + goto done; + + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp == NULL) { + err = ENOENT; + goto done; + } + + avp->av_refs--; + + if (avp->av_refs > 0) + goto done; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_remvlan(port, idx, vid)) != 0) + break; + + /* + * See the comment in aggr_addvlan() for justification of the + * use of VERIFY here. + */ + if (err != 0) { + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to add VLAN %u" + " to port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + } + + if (avp != NULL) + avp->av_refs++; + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged++; + + goto done; + } + + if (err == 0 && avp != NULL) { + VERIFY3U(avp->av_refs, ==, 0); + list_remove(&rx_group->arg_vlans, avp); + kmem_free(avp, sizeof (aggr_vlan_t)); + } + +done: + mac_perim_exit(mph); + return (err); +} + +/* * Add or remove the multicast addresses that are defined for the group * to or from the specified port. * diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..c8dbe00336 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -69,10 +71,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - ASSERT(port->lp_mnh == NULL); - ASSERT(port->lp_mphp == NULL); - ASSERT(!port->lp_rx_grp_added && !port->lp_tx_grp_added); - ASSERT(port->lp_hwgh == NULL); + ASSERT3P(port->lp_mnh, ==, NULL); + ASSERT(!port->lp_tx_grp_added); + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) + ASSERT3P(port->lp_hwghs[i], ==, NULL); } void @@ -126,7 +128,6 @@ aggr_port_init_callbacks(aggr_port_t *port) aggr_grp_port_hold(port); } -/* ARGSUSED */ int aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) @@ -195,9 +196,9 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, } /* - * As the underlying mac's current margin size is used to determine + * As the underlying MAC's current margin size is used to determine * the margin size of the aggregation itself, request the underlying - * mac not to change to a smaller size. + * MAC not to change to a smaller size. */ if ((err = mac_margin_add(mh, &margin, B_TRUE)) != 0) { id_free(aggr_portids, portid); @@ -206,7 +207,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, if ((err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, &mah, 0, &diag)) != 0) { - VERIFY(mac_margin_remove(mh, margin) == 0); + VERIFY3S(mac_margin_remove(mh, margin), ==, 0); id_free(aggr_portids, portid); goto fail; } @@ -261,6 +262,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, fail: if (mch != NULL) mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); + mac_close(mh); return (err); } @@ -270,13 +272,11 @@ aggr_port_delete(aggr_port_t *port) { aggr_lacp_port_t *pl = &port->lp_lacp; - ASSERT(port->lp_mphp == NULL); ASSERT(!port->lp_promisc_on); - port->lp_closing = B_TRUE; + VERIFY0(mac_margin_remove(port->lp_mh, port->lp_margin)); + mac_client_clear_flow_cb(port->lp_mch); - VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_clear(port->lp_mch); /* * If the notification callback is already in process and waiting for * the aggr grp's mac perimeter, don't wait (otherwise there would be @@ -307,8 +307,10 @@ aggr_port_delete(aggr_port_t *port) * port's MAC_NOTE_UNICST notify callback function being called. */ (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + if (port->lp_mah != NULL) (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); @@ -373,10 +375,14 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port) /* link speed changes? */ ifspeed = aggr_port_stat(port, MAC_STAT_IFSPEED); if (port->lp_ifspeed != ifspeed) { + mutex_enter(&grp->lg_stat_lock); + if (port->lp_state == AGGR_PORT_STATE_ATTACHED) do_detach |= (ifspeed != grp->lg_ifspeed); else do_attach |= (ifspeed == grp->lg_ifspeed); + + mutex_exit(&grp->lg_stat_lock); } port->lp_ifspeed = ifspeed; @@ -515,6 +521,10 @@ aggr_port_stop(aggr_port_t *port) port->lp_started = B_FALSE; } +/* + * Set the promisc mode of the port. If the port is already in the + * requested mode then do nothing. 
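+ *
+ * Promiscuous mode is toggled directly on the underlying MAC via
+ * mac_set_promisc(); in either mode, inbound traffic still
+ * reaches aggr_recv_cb() (through the flow callback or ring
+ * passthru).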
+ */ int aggr_port_promisc(aggr_port_t *port, boolean_t on) { @@ -523,27 +533,14 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) - /* already in desired promiscous mode */ return (0); - if (on) { - mac_rx_clear(port->lp_mch); - rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, - MAC_PROMISC_FLAGS_NO_TX_LOOP); - if (rc != 0) { - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - return (rc); - } - } else { - mac_promisc_remove(port->lp_mphp); - port->lp_mphp = NULL; - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - } + rc = mac_set_promisc(port->lp_mh, on); - port->lp_promisc_on = on; + if (rc == 0) + port->lp_promisc_on = on; - return (0); + return (rc); } /* @@ -583,35 +580,45 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) } /* - * Add a non-primary unicast address to the underlying port. If the port - * supports HW Rx group, try to add the address into the HW Rx group of - * the port first. If that fails, or if the port does not support HW Rx - * group, enable the port's promiscous mode. + * Add a non-primary unicast address to the underlying port. If the + * port supports HW Rx groups, then try to add the address filter to + * the HW group first. If that fails, or if the port does not support + * RINGS capab, then enable the port's promiscous mode. */ int -aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_addmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * If the underlying port support HW Rx group, add the mac to its - * RX group directly. + * If the port doesn't have a HW group to back the aggr's + * pseudo group, then try using the port's default group and + * let the aggr SW classify its traffic. This scenario happens + * when mixing ports with a different number of HW groups. */ - if ((port->lp_hwgh != NULL) && - ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * If there is an underlying HW Rx group, then try adding this + * unicast address to it. + */ + if ((port->lp_hwghs[idx] != NULL) && + ((mac_hwgroup_addmac(port->lp_hwghs[idx], mac_addr)) == 0)) { mac_perim_exit(pmph); return (0); } /* - * If that fails, or if the port does not support HW Rx group, enable - * the port's promiscous mode. (Note that we turn on the promiscous - * mode only if the port is already started. + * If the port doesn't have HW groups, or we failed to add the + * HW filter, then enable the port's promiscuous mode. We + * enable promiscuous mode only if the port is already started. */ if (port->lp_started && ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { @@ -643,13 +650,14 @@ aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) * promiscous mode. 
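+ *
+ * (As in aggr_port_addmac(), fall back to the port's default
+ * group when the pseudo group at 'idx' has no HW backing.)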
*/ void -aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_remmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_grp_t *grp = port->lp_grp; aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* @@ -662,6 +670,7 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) break; pprev = &addr->aua_next; } + if (addr != NULL) { /* * This unicast address put the port into the promiscous mode, @@ -674,8 +683,65 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) if (port->lp_prom_addr == NULL && !grp->lg_promisc) (void) aggr_port_promisc(port, B_FALSE); } else { - ASSERT(port->lp_hwgh != NULL); - (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + ASSERT3P(port->lp_hwghs[idx], !=, NULL); + (void) mac_hwgroup_remmac(port->lp_hwghs[idx], mac_addr); } + mac_perim_exit(pmph); } + +int +aggr_port_addvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * Add the VLAN filter to the HW group if the port has a HW + * group. If the port doesn't have a HW group, then it will + * implicitly allow tagged traffic to pass and there is + * nothing to do. + */ + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_addvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} + +int +aggr_port_remvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_remvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..b6b3e6de1f 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,7 +57,7 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; - /* in promiscuous mode, send copy of packet up */ + /* In promiscuous mode, pass copy of packet up. */ if (grp->lg_promisc) { mblk_t *nmp = copymsg(mp); @@ -68,11 +70,11 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. 
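+ *
+ * aggr_recv_cb() remains the external entry point; it simply
+ * forwards to aggr_recv_path_cb() so that the same function can
+ * serve both as the port's flow callback and as the ring
+ * passthru callback.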
*/ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; @@ -161,3 +163,10 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback); +} diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c new file mode 100644 index 0000000000..6cbde58a20 --- /dev/null +++ b/usr/src/uts/common/io/bpf/bpf_wrap.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <net/bpf.h> +#include <inet/bpf.h> + +/* + * With BPF filter validation and evaluation moved into the 'ip' module, these + * wrapper functions are provided to expose the original interface. + */ + +uint_t +bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen)); +} + +int +bpf_validate(struct bpf_insn *f, int len) +{ + return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len)); +} diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c index bc54527515..375d166972 100644 --- a/usr/src/uts/common/io/bridge.c +++ b/usr/src/uts/common/io/bridge.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -41,6 +42,7 @@ #include <sys/modctl.h> #include <sys/note.h> #include <sys/param.h> +#include <sys/pattr.h> #include <sys/policy.h> #include <sys/sdt.h> #include <sys/stat.h> @@ -1693,7 +1695,8 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick, * The passed-in tci is the "impossible" value 0xFFFF when no tag is present. */ static mblk_t * -reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) +reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid, + boolean_t keep_flags) { boolean_t source_has_tag = (tci != 0xFFFF); mblk_t *mpcopy; @@ -1705,8 +1708,13 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) if (mp == NULL) return (mp); - /* No forwarded packet can have hardware checksum enabled */ - DB_CKSUMFLAGS(mp) = 0; + /* + * A forwarded packet cannot have HW offloads enabled unless + * the destination is known to be local to the host and HW + * offloads haven't been emulated. 
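+ * For example, the BFF_LOCALADDR path in bridge_forward() passes
+ * keep_flags == B_TRUE before handing the packet to
+ * mac_rx_common().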
+ */ + if (!keep_flags) + DB_CKSUMFLAGS(mp) = 0; /* Get the no-modification cases out of the way first */ if (!source_has_tag && vlanid == pvid) /* 1a */ @@ -1907,17 +1915,46 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, blp->bl_trillthreads++; mutex_exit(&blp->bl_trilllock); update_header(mp, hdr_info, B_FALSE); - if (is_xmit) - mp = mac_fix_cksum(mp); - /* all trill data frames have Inner.VLAN */ - mp = reform_vlan_header(mp, vlanid, tci, 0); - if (mp == NULL) { - KIINCR(bki_drops); - fwd_unref(bfp); - return (NULL); + + if (is_xmit) { + mac_hw_emul(&mp, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp == NULL) { + KIINCR(bki_drops); + goto done; + } } - trill_encap_fn(tdp, blp, hdr_info, mp, - bfp->bf_trill_nick); + + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + + /* + * All trill data frames have + * Inner.VLAN. + */ + mp = reform_vlan_header(mp, vlanid, tci, + 0, B_FALSE); + + if (mp == NULL) { + /* + * Make sure to free + * any remaining + * segments. + */ + freemsgchain(next); + KIINCR(bki_drops); + goto done; + } + + trill_encap_fn(tdp, blp, hdr_info, mp, + bfp->bf_trill_nick); + mp = next; + } + +done: mutex_enter(&blp->bl_trilllock); if (--blp->bl_trillthreads == 0 && blp->bl_trilldata == NULL) @@ -1959,31 +1996,68 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * If the destination is not local to the host + * then we need to emulate HW offloads because + * we can't guarantee the forwarding + * destination provides them. + */ + if (!from_trill && is_xmit && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while ((mpsend != NULL) && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + KIINCR(bki_forwards); + KLPINCR(blpsend, bkl_xmit); + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - KIINCR(bki_forwards); /* * No need to bump up the link reference count, as * the forwarding entry itself holds a reference to * the link. */ if (bfp->bf_flags & BFF_LOCALADDR) { + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_TRUE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + + KIINCR(bki_forwards); mac_rx_common(blpsend->bl_mh, NULL, mpsend); - } else { - KLPINCR(blpsend, bkl_xmit); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, - mpsend); - freemsg(mpsend); } } + /* * Handle a special case: if we're transmitting to the original * link, then check whether the localaddr flag is set. 
If it @@ -2019,7 +2093,7 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, * Inner.VLAN */ mpsend = reform_vlan_header(mpsend, - vlanid, tci, 0); + vlanid, tci, 0, B_FALSE); if (mpsend == NULL) { KIINCR(bki_drops); } else { @@ -2070,25 +2144,57 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * In this case, send to all links connected + * to the bridge. Some of these destinations + * may not provide HW offload -- so just + * emulate it here. + */ + if (!from_trill && is_xmit) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while (mpsend != NULL) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + if (hdr_info->mhi_dsttype == + MAC_ADDRTYPE_UNICAST) + KIINCR(bki_unknown); + else + KIINCR(bki_mbcast); + + KLPINCR(blpsend, bkl_xmit); + if ((mpcopy = copymsg(mpsend)) != NULL) { + mac_rx_common(blpsend->bl_mh, NULL, + mpcopy); + } + + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST) - KIINCR(bki_unknown); - else - KIINCR(bki_mbcast); - KLPINCR(blpsend, bkl_xmit); - if ((mpcopy = copymsg(mpsend)) != NULL) - mac_rx_common(blpsend->bl_mh, NULL, mpcopy); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend); - freemsg(mpsend); link_unref(blpsend); } } diff --git a/usr/src/uts/common/io/chxge/ch.c b/usr/src/uts/common/io/chxge/ch.c index e7ea942405..46920a1ea2 100644 --- a/usr/src/uts/common/io/chxge/ch.c +++ b/usr/src/uts/common/io/chxge/ch.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -59,6 +60,7 @@ #include <sys/sunddi.h> #include <sys/dlpi.h> #include <sys/ethernet.h> +#include <sys/mac_provider.h> #include <sys/strsun.h> #include <sys/strsubr.h> #include <inet/common.h> @@ -1377,8 +1379,7 @@ ch_send_up(ch_t *chp, mblk_t *mp, uint32_t cksum, int flg) * set in /etc/system (see sge.c). */ if (flg) - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, cksum, - HCK_FULLCKSUM, 0); + mac_hcksum_set(mp, 0, 0, 0, cksum, HCK_FULLCKSUM); gld_recv(chp->ch_macp, mp); } else { freemsg(mp); @@ -1693,8 +1694,7 @@ ch_send(gld_mac_info_t *macinfo, mblk_t *mp) msg_flg = 0; if (chp->ch_config.cksum_enabled) { if (is_T2(chp)) { - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, - NULL, &msg_flg); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &msg_flg); flg = (msg_flg & HCK_FULLCKSUM)? CH_NO_CPL: CH_NO_HWCKSUM|CH_NO_CPL; } else diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c index 507f918d8f..8635023fe3 100644 --- a/usr/src/uts/common/io/cons.c +++ b/usr/src/uts/common/io/cons.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ /* @@ -53,6 +54,7 @@ #include <sys/vnode.h> #include <sys/uio.h> #include <sys/stat.h> +#include <sys/limits.h> #include <sys/console.h> #include <sys/consdev.h> @@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred) */ if (vsconsvp != NULL && vsconsvp->v_stream != NULL) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; + + if (uio->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uio->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } /* * strwrite modifies uio so need to make copy. */ - (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt); (void) strwrite(vsconsvp, &uiod.d_uio, cred); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); } if (rconsvp->v_stream != NULL) diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c index 622f0dcf68..f67d77b3d2 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3.c @@ -41,7 +41,7 @@ extern cpqary3_driver_info_t gdriver_info; * Global Variables Definitions */ -static char cpqary3_brief[] = "HP Smart Array Driver"; +static char cpqary3_brief[] = "HP Smart Array (Legacy)"; void *cpqary3_state; /* HPQaculi Changes */ diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 6f9bf93226..7368c9b43d 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! */ static int -dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, - pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds, + int *fdcntp) { - int start, ostart, end; - int fdcnt, fd; - boolean_t done; - file_t *fp; - short revent; - boolean_t no_wrap; - pollhead_t *php; - polldat_t *pdp; + int start, ostart, end, fdcnt, error = 0; + boolean_t done, no_wrap; pollfd_t *pfdp; epoll_event_t *epoll; - int error = 0; - short mask = POLLRDHUP | POLLWRBAND; - boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; + const short mask = POLLRDHUP | POLLWRBAND; + const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { - /* - * No Need to search because no poll fd - * has been cached. - */ - return (error); + /* No Need to search because no poll fd has been cached. */ + return (0); } if (is_epoll) { @@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; - php = NULL; if (start == 0) { /* @@ -294,8 +283,11 @@ retry: done = B_FALSE; fdcnt = 0; while ((fdcnt < nfds) && !done) { - php = NULL; - revent = 0; + pollhead_t *php = NULL; + short revent = 0; + uf_entry_gen_t gen; + int fd; + /* * Examine the bit map in a circular fashion * to avoid starvation. 
Always resume from @@ -305,6 +297,9 @@ retry: fd = bt_getlowbit(pcp->pc_bitmap, start, end); ASSERT(fd <= end); if (fd >= 0) { + file_t *fp; + polldat_t *pdp; + if (fd == end) { if (no_wrap) { done = B_TRUE; @@ -328,28 +323,14 @@ repoll: */ continue; } - if ((fp = getf(fd)) == NULL) { - /* - * The fd has been closed, but user has not - * done a POLLREMOVE on this fd yet. Instead - * of cleaning it here implicitly, we return - * POLLNVAL. This is consistent with poll(2) - * polling a closed fd. Hope this will remind - * user to do a POLLREMOVE. - */ - if (!is_epoll && pfdp != NULL) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; - continue; - } - - /* - * In the epoll compatibility case, we actually - * perform the implicit removal to remain - * closer to the epoll semantics. - */ + if ((fp = getf_gen(fd, &gen)) == NULL) { if (is_epoll) { + /* + * In the epoll compatibility case, we + * actually perform the implicit + * removal to remain closer to the + * epoll semantics. + */ pdp->pd_fp = NULL; pdp->pd_events = 0; @@ -360,30 +341,36 @@ repoll: } BT_CLEAR(pcp->pc_bitmap, fd); - continue; + } else if (pfdp != NULL) { + /* + * The fd has been closed, but user has + * not done a POLLREMOVE on this fd + * yet. Instead of cleaning it here + * implicitly, we return POLLNVAL. This + * is consistent with poll(2) polling a + * closed fd. Hope this will remind + * user to do a POLLREMOVE. + */ + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; } + continue; } - if (fp != pdp->pd_fp) { + /* + * Detect a change to the resource underlying a cached + * file descriptor. While the fd generation comparison + * will catch nearly all cases, the file_t comparison + * is maintained as a failsafe as well. + */ + if (gen != pdp->pd_gen || fp != pdp->pd_fp) { /* * The user is polling on a cached fd which was * closed and then reused. Unfortunately there * is no good way to communicate this fact to * the consumer. * - * If the file struct is also reused, we may - * not be able to detect the fd reuse at all. - * As long as this does not cause system - * failure and/or memory leaks, we will play - * along. The man page states that if the user - * does not clean up closed fds, polling - * results will be indeterministic. - * - * XXX: perhaps log the detection of fd reuse? - */ - pdp->pd_fp = fp; - - /* * When this situation has been detected, it's * likely that any existing pollhead is * ill-suited to perform proper wake-ups. @@ -396,7 +383,42 @@ repoll: pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } + + /* + * Since epoll is expected to act on the + * underlying 'struct file' (in Linux terms, + * our vnode_t would be a closer analog) rather + * than the fd itself, an implicit remove + * is necessary under these circumstances to + * suppress any results (or errors) from the + * new resource occupying the fd. + */ + if (is_epoll) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + BT_CLEAR(pcp->pc_bitmap, fd); + releasef(fd); + continue; + } else { + /* + * Regular /dev/poll is unbothered + * about the fd reassignment. + */ + pdp->pd_fp = fp; + pdp->pd_gen = gen; + } } + + /* + * Skip entries marked with the sentinal value for + * having already fired under oneshot conditions. + */ + if (pdp->pd_events == POLLONESHOT) { + releasef(fd); + BT_CLEAR(pcp->pc_bitmap, fd); + continue; + } + /* * XXX - pollrelock() logic needs to know which * which pollcache lock to grab. It'd be a @@ -537,18 +559,19 @@ repoll: /* Handle special polling modes. 
*/ if (pdp->pd_events & POLLONESHOT) { /* - * If POLLONESHOT is set, perform the - * implicit POLLREMOVE. + * Entries operating under POLLONESHOT + * will be marked with a sentinel value + * to indicate that they have "fired" + * when emitting an event. This will + * disable them from polling until a + * later add/modify event rearms them. */ - pdp->pd_fp = NULL; - pdp->pd_events = 0; - + pdp->pd_events = POLLONESHOT; if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } - BT_CLEAR(pcp->pc_bitmap, fd); } else if (pdp->pd_events & POLLET) { /* @@ -700,14 +723,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pollfd_t *pollfdp, *pfdp; dvpoll_epollfd_t *epfdp; uintptr_t limit; - int error, size; - ssize_t uiosize; - size_t copysize; + int error; + uint_t size; + size_t copysize, uiosize; nfds_t pollfdnum; - struct pollhead *php = NULL; - polldat_t *pdp; - int fd; - file_t *fp; boolean_t is_epoll, fds_added = B_FALSE; minor = getminor(dev); @@ -732,10 +751,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pcp->pc_pid = curproc->p_pid; } - uiosize = uiop->uio_resid; + if (uiop->uio_resid < 0) { + /* No one else is this careful, but maybe they should be. */ + return (EINVAL); + } + + uiosize = (size_t)uiop->uio_resid; pollfdnum = uiosize / size; /* + * For epoll-enabled handles, restrict the allowed write size to 2. + * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD + * operation which is expanded into two operations (DEL and ADD). + * + * All other operations performed through epoll_ctl(3C) will consist of + * a single entry. + */ + if (is_epoll && pollfdnum > 2) { + return (EINVAL); + } + + /* * We want to make sure that pollfdnum isn't large enough to DoS us, * but we also don't want to grab p_lock unnecessarily -- so we * perform the full check against our resource limits if and only if @@ -794,6 +830,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) { ASSERT(dpep->dpe_refcnt != 0); + /* + * The epoll API does not allow EINTR as a result when making + * modifications to the set of polled fds. Given that write + * activity is relatively quick and the size of accepted writes + * is limited above to two entries, a signal-ignorant wait is + * used here to avoid the EINTR. + */ + if (is_epoll) { + cv_wait(&dpep->dpe_cv, &dpep->dpe_lock); + continue; + } + + /* + * Non-epoll writers to /dev/poll handles can tolerate EINTR. + */ if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); @@ -828,7 +879,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } for (pfdp = pollfdp; (uintptr_t)pfdp < limit; pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { - fd = pfdp->fd; + int fd = pfdp->fd; + polldat_t *pdp; + if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { /* * epoll semantics demand that we return EBADF if our @@ -844,76 +897,60 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { + uf_entry_gen_t gen; + file_t *fp = NULL; + struct pollhead *php = NULL; - fp = NULL; - - if (pdp == NULL) { - /* - * If we're in epoll compatibility mode, check - * that the fd is valid before allocating - * anything for it; epoll semantics demand that - * we return EBADF if our specified fd is - * invalid. 
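[Editor's sketch] The POLLONESHOT handling above changes /dev/poll so that a oneshot entry is parked behind a sentinel value after it fires instead of being removed outright; a later add/modify rearms it. The dpwrite() hunk further below complements this by capping a single write from the epoll emulation at two entries, since EPOLL_CTL_MOD expands into a DEL plus an ADD. From the epoll(3C) consumer's side this is the familiar EPOLLONESHOT rearm pattern; a small self-contained user-level example (error handling abbreviated), using only the standard epoll interfaces illumos provides:

#include <sys/epoll.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int epfd = epoll_create1(0);
	struct epoll_event ev, out;

	if (epfd < 0)
		abort();

	/* Watch stdin; the event auto-disarms once delivered. */
	ev.events = EPOLLIN | EPOLLONESHOT;
	ev.data.fd = STDIN_FILENO;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) != 0)
		abort();

	if (epoll_wait(epfd, &out, 1, -1) == 1)
		(void) printf("fd %d ready\n", out.data.fd);

	/*
	 * Until rearmed with EPOLL_CTL_MOD the entry stays disarmed --
	 * the "fired" sentinel state the kernel change above implements.
	 */
	if (epoll_ctl(epfd, EPOLL_CTL_MOD, STDIN_FILENO, &ev) != 0)
		abort();

	(void) close(epfd);
	return (0);
}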
- */ - if (is_epoll) { - if ((fp = getf(fd)) == NULL) { - error = EBADF; - break; - } + /* + * If we're in epoll compatibility mode, check that the + * fd is valid before allocating anything for it; epoll + * semantics demand that we return EBADF if our + * specified fd is invalid. + */ + if (is_epoll) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + error = EBADF; + break; } - + } + if (pdp == NULL) { pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); - } else { + } + + if (is_epoll) { /* - * epoll semantics demand that we error out if - * a file descriptor is added twice, which we - * check (imperfectly) by checking if we both - * have the file descriptor cached and the - * file pointer that correponds to the file - * descriptor matches our cached value. If - * there is a pointer mismatch, the file - * descriptor was closed without being removed. - * The converse is clearly not true, however, - * so to narrow the window by which a spurious - * EEXIST may be returned, we also check if - * this fp has been added to an epoll control - * descriptor in the past; if it hasn't, we - * know that this is due to fp reuse -- it's - * not a true EEXIST case. (By performing this - * additional check, we limit the window of - * spurious EEXIST to situations where a single - * file descriptor is being used across two or - * more epoll control descriptors -- and even - * then, the file descriptor must be closed and - * reused in a relatively tight time span.) + * If the fd is already a member of the epoll + * set, error emission is needed only when the + * fd assignment generation matches the one + * recorded in the polldat_t. Absence of such + * a generation match indicates that a new + * resource has been assigned at that fd. + * + * Caveat: It is possible to force a generation + * update while keeping the same backing + * resource. This is possible via dup2, but + * does not represent real-world use cases, + * making the lack of error acceptable. */ - if (is_epoll) { - if (pdp->pd_fp != NULL && - (fp = getf(fd)) != NULL && - fp == pdp->pd_fp && - (fp->f_flag2 & FEPOLLED)) { - error = EEXIST; - releasef(fd); - break; - } - - /* - * We have decided that the cached - * information was stale: it either - * didn't match, or the fp had never - * actually been epoll()'d on before. - * We need to now clear our pd_events - * to assure that we don't mistakenly - * operate on cached event disposition. - */ - pdp->pd_events = 0; + if (pdp->pd_fp != NULL && pdp->pd_gen == gen) { + error = EEXIST; + releasef(fd); + break; } - } - if (is_epoll) { + /* + * We have decided that the cached information + * was stale. Reset pd_events to assure that + * we don't mistakenly operate on cached event + * disposition. This configures the implicit + * subscription to HUP and ERR events which + * epoll features. + */ + pdp->pd_events = POLLERR|POLLHUP; + epfdp = (dvpoll_epollfd_t *)pfdp; pdp->pd_epolldata = epfdp->dpep_data; } @@ -928,39 +965,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } - if (fp == NULL && (fp = getf(fd)) == NULL) { - /* - * The fd is not valid. Since we can't pass - * this error back in the write() call, set - * the bit in bitmap to force DP_POLL ioctl - * to examine it. - */ - BT_SET(pcp->pc_bitmap, fd); - pdp->pd_events |= pfdp->events; - continue; - } - /* - * To (greatly) reduce EEXIST false positives, we - * denote that this fp has been epoll()'d. 
We do this - * regardless of epoll compatibility mode, as the flag - * is harmless if not in epoll compatibility mode. - */ - fp->f_flag2 |= FEPOLLED; + if (!is_epoll) { + ASSERT(fp == NULL); - /* - * Don't do VOP_POLL for an already cached fd with - * same poll events. - */ - if ((pdp->pd_events == pfdp->events) && - (pdp->pd_fp == fp)) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + /* + * The fd is not valid. Since we can't + * pass this error back in the write() + * call, set the bit in bitmap to force + * DP_POLL ioctl to examine it. + */ + BT_SET(pcp->pc_bitmap, fd); + pdp->pd_events |= pfdp->events; + continue; + } /* - * the events are already cached + * Don't do VOP_POLL for an already cached fd + * with same poll events. */ - releasef(fd); - continue; + if ((pdp->pd_events == pfdp->events) && + (pdp->pd_fp == fp)) { + /* + * the events are already cached + */ + releasef(fd); + continue; + } } + /* * do VOP_POLL and cache this poll fd. */ @@ -992,11 +1026,11 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * wake-ups. * * Drivers which never emit a pollhead will simply - * disobey the exectation of edge-triggered behavior. + * disobey the expectation of edge-triggered behavior. * This includes recursive epoll which, even on Linux, * yields its events in a level-triggered fashion only. */ - if ((pdp->pd_events & POLLET) && error == 0 && + if ((pfdp->events & POLLET) != 0 && error == 0 && php == NULL) { short levent = 0; @@ -1018,6 +1052,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) break; } pdp->pd_fp = fp; + pdp->pd_gen = gen; pdp->pd_events |= pfdp->events; if (php != NULL) { if (pdp->pd_php == NULL) { @@ -1143,8 +1178,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * to turn it off for a particular open. */ dpep->dpe_flag |= DP_ISEPOLLCOMPAT; - mutex_exit(&dpep->dpe_lock); + /* Record the epoll-enabled nature in the pollcache too */ + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_EPOLL; + mutex_exit(&pcp->pc_lock); + + mutex_exit(&dpep->dpe_lock); return (0); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index cfe0f78415..00b5f0e3de 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -347,8 +347,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = mac_perim_enter_by_macname( - dls_devnet_mac(dlh), &mph)) != 0) { + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } @@ -360,7 +360,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_link_rele(dlp); mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); @@ -702,7 +701,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -717,8 +717,18 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, else err = drv_ioc_clrap(linkid); } else { - if (kprop->pr_valsize == 0) - return (ENOBUFS); + /* + * You might think that the earlier call to + * mac_prop_check_size() should catch this but + * it can't. 
The autopush prop uses 0 as a + * sentinel value to clear the prop. This + * check ensures we don't allow a get with a + * valsize of 0. + */ + if (kprop->pr_valsize == 0) { + err = ENOBUFS; + goto done; + } kprop->pr_perm_flags = MAC_PROP_PERM_RW; err = drv_ioc_getap(linkid, dlap); @@ -866,7 +876,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1321,10 +1331,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, dls_link_t *dlp = NULL; dld_ioc_gettran_t *dgt = karg; - if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1343,13 +1356,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1373,10 +1387,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, if (dti->dti_nbytes != 256 || dti->dti_off != 0) return (EINVAL); - if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1396,13 +1413,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1499,7 +1517,6 @@ done: return (ret); } - /* * Note that ioctls that modify links have a NULL di_priv_func(), as * privileges can only be checked after we know the class of the link being @@ -1575,7 +1592,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index cadd2a76d3..1371fa47c0 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void 
proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -606,6 +609,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + new_flags |= DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + new_flags |= DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -693,6 +704,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -1184,7 +1211,6 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) uint16_t sap; uint_t addr_length; mblk_t *bp, *payload; - uint32_t start, stuff, end, value, flags; t_uscalar_t dl_err; uint_t max_sdu; @@ -1253,9 +1279,7 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) /* * Transfer the checksum offload information if it is present. */ - hcksum_retrieve(payload, NULL, NULL, &start, &stuff, &end, &value, - &flags); - (void) hcksum_assoc(bp, NULL, NULL, start, stuff, end, value, flags, 0); + mac_hcksum_clone(payload, bp); /* * Link the payload onto the new header. @@ -1296,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1350,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1377,24 +1410,22 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) } /* - * dld_capab_poll_enable() - * - * This function is misnamed. All polling and fanouts are run out of the - * lower mac (in case of VNIC and the only mac in case of NICs). The - * availability of Rx ring and promiscous mode is all taken care between - * the soft ring set (mac_srs), the Rx ring, and S/W classifier. Any - * fanout necessary is done by the soft rings that are part of the - * mac_srs (by default mac_srs sends the packets up via a TCP and - * non TCP soft ring). + * This function is misnamed. All polling and fanouts are run out of + * the lower MAC for VNICs and out of the MAC for NICs. The + * availability of Rx rings and promiscous mode is taken care of + * between the soft ring set (mac_srs), the Rx ring, and the SW + * classifier. 
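[Editor's sketch] The proto_promiscon_req()/proto_promiscoff_req() hunks above add two DLPI promiscuity levels: DL_PROMISC_RX_ONLY (do not loop transmitted packets back to the promiscuous stream) and DL_PROMISC_FIXUPS (ask MAC to apply checksum fixups to packets delivered to the promiscuous callback); dls_promisc() later in this diff maps them onto MAC_PROMISC_FLAGS_NO_TX_LOOP and MAC_PROMISC_FLAGS_DO_FIXUPS. A sketch of how a raw DLPI consumer would request one of these levels on an already opened and attached stream; it uses only the standard dl_promiscon_req_t message and putmsg(), and the level names are taken from the hunk.

#include <sys/dlpi.h>
#include <stropts.h>
#include <string.h>

/* Turn on a promiscuity level on an open, attached DLPI stream. */
static int
promisc_on(int fd, t_uscalar_t level)
{
	dl_promiscon_req_t req;
	struct strbuf ctl;

	(void) memset(&req, 0, sizeof (req));
	req.dl_primitive = DL_PROMISCON_REQ;
	req.dl_level = level;		/* e.g. DL_PROMISC_RX_ONLY */

	ctl.maxlen = 0;
	ctl.len = sizeof (req);
	ctl.buf = (char *)&req;

	/* A real consumer would wait for DL_OK_ACK / DL_ERROR_ACK here. */
	return (putmsg(fd, &ctl, NULL, 0));
}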
Fanout, if necessary, is done by the soft rings that + * are part of the SRS. By default the SRS divvies up the packets + * based on protocol: TCP, UDP, or Other (OTH). * - * The mac_srs (or its associated soft rings) always store the ill_rx_ring + * The SRS (or its associated soft rings) always store the ill_rx_ring * (the cookie returned when they registered with IP during plumb) as their * 2nd argument which is passed up as mac_resource_handle_t. The upcall * function and 1st argument is what the caller registered when they * called mac_rx_classify_flow_add() to register the flow. For VNIC, * the function is vnic_rx and argument is vnic_t. For regular NIC * case, it mac_rx_default and mac_handle_t. As explained above, the - * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) + * SRS (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) * from its stored 2nd argument. */ static int @@ -1407,11 +1438,11 @@ dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll) return (ENOTSUP); /* - * Enable client polling if and only if DLS bypass is possible. - * Special cases like VLANs need DLS processing in the Rx data path. - * In such a case we can neither allow the client (IP) to directly - * poll the softring (since DLS processing hasn't been done) nor can - * we allow DLS bypass. + * Enable client polling if and only if DLS bypass is + * possible. Some traffic requires DLS processing in the Rx + * data path. In such a case we can neither allow the client + * (IP) to directly poll the soft ring (since DLS processing + * hasn't been done) nor can we allow DLS bypass. */ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg)) return (ENOTSUP); @@ -1456,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: return (dld_capab_poll_enable(dsp, poll)); @@ -1466,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) } static int +dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_ipcheck_t *ipc = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: + ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr; + ipc->ipc_allowed_dh = dsp->ds_mch; + return (0); + case DLD_DISABLE: + return (0); + } + + return (ENOTSUP); +} + +static int dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) { dld_capab_lso_t *lso = data; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: { mac_capab_lso_t mac_lso; @@ -1517,8 +1573,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. 
*/ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (dsp->ds_sap != ETHERTYPE_IP || - !check_mod_above(dsp->ds_rq, "ip"))) { + (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1539,6 +1596,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) err = dld_capab_lso(dsp, data, flags); break; + case DLD_CAPAB_IPCHECK: + err = dld_capab_ipcheck(dsp, data, flags); + break; + default: err = ENOTSUP; break; @@ -1600,9 +1661,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. */ - if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) { + if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1721,3 +1788,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 9f89165455..5efbe0576d 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -857,6 +857,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. + */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. + */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? 
dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? */ + freemsg(mp); + return (NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -905,7 +976,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -951,38 +1021,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. - */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != 0) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index d6bc723371..b71d95bd44 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -171,16 +171,16 @@ dls_bind(dld_str_t *dsp, uint32_t sap) /* * The MAC layer does the VLAN demultiplexing and will only pass up * untagged packets to non-promiscuous primary MAC clients. In order to - * support the binding to the VLAN SAP which is required by DLPI, dls + * support binding to the VLAN SAP, which is required by DLPI, DLS * needs to get a copy of all tagged packets when the client binds to * the VLAN SAP. We do this by registering a separate promiscuous - * callback for each dls client binding to that SAP. + * callback for each DLS client binding to that SAP. * * Note: even though there are two promiscuous handles in dld_str_t, * ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle - * to receive VLAN pkt when promiscuous mode is not on. Only one of - * them can be non-NULL at the same time, to avoid receiving dup copies - * of pkts. + * to receive VLAN traffic when promiscuous mode is not on. Only one of + * them can be non-NULL at the same time, to avoid receiving duplicate + * copies of packets. 
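[Editor's sketch] dld_proto.c above grows a DL_EXCLUSIVE_REQ handler (proto_exclusive_req()), and the dls.c changes later in this diff enforce it through dls_exclusive_set(): once a stream has taken the link exclusively, no other client may activate it, and an already-active link refuses to go exclusive. This is consumed by vnd. A heavily hedged sketch of issuing the primitive from user level; it assumes, per DLPI convention and the MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE check in the hunk, that the request carries nothing beyond dl_primitive and that this tree's dlpi.h defines dl_exclusive_req_t accordingly.

#include <sys/dlpi.h>
#include <stropts.h>
#include <string.h>

/*
 * Ask for exclusive active use of a DLPI stream's link. The struct
 * layout is an assumption (dl_primitive only).
 */
static int
request_exclusive(int fd)
{
	dl_exclusive_req_t req;
	struct strbuf ctl;

	(void) memset(&req, 0, sizeof (req));
	req.dl_primitive = DL_EXCLUSIVE_REQ;

	ctl.maxlen = 0;
	ctl.len = sizeof (req);
	ctl.buf = (char *)&req;

	/* Expect DL_OK_ACK on success, DL_ERROR_ACK otherwise. */
	return (putmsg(fd, &ctl, NULL, 0));
}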
*/ if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) { int err; @@ -250,19 +250,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; + uint32_t new_type = new_flags & + ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS); mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; + uint16_t mac_flags = 0; + boolean_t doremove = B_FALSE; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | - DLS_PROMISC_PHYS))); + DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS))); + + /* + * If we only have the non-data receive flags set or are only changing + * them, then there's nothing to do other than update the flags here. + * Basically when we only have something in the set of + * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's + * nothing else for us to do other than toggle it, as there's no need to + * talk to MAC and we don't have to do anything else. + */ + if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) { + dsp->ds_promisc = new_flags; + return (0); + } /* * If the user has only requested DLS_PROMISC_MULTI then we need to make * sure that they don't see all packets. */ - if (new_flags == DLS_PROMISC_MULTI) + if (new_type == DLS_PROMISC_MULTI) mptype = MAC_CLIENT_PROMISC_MULTI; + /* + * Look at new flags and figure out the correct mac promisc flags. + * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS, + * don't turn on physical promisc mode. + */ + if (new_flags & DLS_PROMISC_RX_ONLY) + mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_flags & DLS_PROMISC_FIXUPS) + mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS; + if (new_type == DLS_PROMISC_SAP) + mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; + + /* + * If we're coming in and we're being asked to transition to a state + * where the only DLS flags would be enabled are flags that change what + * we do with promiscuous packets (DLS_PROMISC_RX_ONLY and + * DLS_PROMISC_FIXUPS) and not which packets we should receive, then we + * need to remove the MAC layer promiscuous handler. + */ + if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 && + new_flags != 0) { + doremove = B_TRUE; + } + + /* + * There are three cases we care about here with respect to MAC. Going + * from nothing to something, something to nothing, something to + * something where we need to change how we're getting stuff from mac. + * In the last case, as long as they're not equal, we need to assume + * something has changed and do something about it. + */ if (dsp->ds_promisc == 0 && new_flags != 0) { /* * If only DLS_PROMISC_SAP, we don't turn on the @@ -270,9 +320,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, - (new_flags != DLS_PROMISC_SAP) ? 
0 : - MAC_PROMISC_FLAGS_NO_PHYS); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) { dsp->ds_promisc = old_flags; return (err); @@ -283,7 +331,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) mac_promisc_remove(dsp->ds_vlan_mph); dsp->ds_vlan_mph = NULL; } - } else if (dsp->ds_promisc != 0 && new_flags == 0) { + } else if (dsp->ds_promisc != 0 && + (new_flags == 0 || doremove == B_TRUE)) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); @@ -298,19 +347,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); } - } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 && - new_flags != dsp->ds_promisc) { - /* - * If the old flag is PROMISC_SAP, but the current flag has - * changed to some new non-zero value, we need to turn the - * physical promiscuous mode. - */ + } else if (new_flags != 0 && new_flags != old_flags) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); /* Honors both after-remove and before-add semantics! */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, 0); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) dsp->ds_promisc = old_flags; } else { @@ -631,6 +674,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -652,8 +711,8 @@ dls_mac_active_set(dls_link_t *dlp) /* request the primary MAC address */ if ((err = mac_unicast_add(dlp->dl_mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_TAG_DISABLE | - MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, 0, - &diag)) != 0) { + MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, + VLAN_ID_NONE, &diag)) != 0) { return (err); } @@ -661,7 +720,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -687,7 +749,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. */ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -696,7 +762,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? 
+ DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -727,7 +794,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. + */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 6c8ffcb0a9..c792251052 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -30,11 +30,15 @@ #include <sys/sysmacros.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/strsun.h> #include <sys/vlan.h> #include <sys/dld_impl.h> #include <sys/sdt.h> #include <sys/atomic.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/datalink.h> static kmem_cache_t *i_dls_link_cachep; mod_hash_t *i_dls_link_hash; @@ -159,6 +163,18 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, uint16_t cvid, cpri; int err; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. + */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err); if (err != 0) break; @@ -353,6 +369,22 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, int err, rval; /* + * The mac_hw_emul() function, by design, doesn't predicate on + * HW_LOCAL_MAC. But since we are in Rx context we know that + * any LSO packet must also be from a same-machine sender. We + * take advantage of that and forgoe writing a manual loop to + * predicate on HW_LOCAL_MAC. + * + * But for checksum emulation we need to predicate on + * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that + * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM + * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we + * do the checksum emulation in the second loop and in + * subchain matching. + */ + mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL); + + /* * Walk the packet chain. */ for (; mp != NULL; mp = nextp) { @@ -361,6 +393,18 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, */ accepted = B_FALSE; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. 
+ */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); if (err != 0) { atomic_inc_32(&(dlp->dl_unknowns)); @@ -379,7 +423,16 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, vid = VLAN_ID(mhi.mhi_tci); + /* + * This condition is true only when a sun4v vsw client + * is on the scene; as it is the only type of client + * that multiplexes VLANs on a single client instance. + * All other types of clients have one VLAN per client + * instance. In that case, MAC strips the VLAN tag + * before delivering it to DLS (see mac_rx_deliver()). + */ if (mhi.mhi_istagged) { + /* * If it is tagged traffic, send it upstream to * all dld_str_t which are attached to the physical @@ -554,7 +607,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, dls_head_t *dhp; mod_hash_key_t key; + /* + * We expect to deal with only a single packet. + */ + ASSERT3P(mp->b_next, ==, NULL); + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); + if (err != 0) goto drop; @@ -580,6 +639,67 @@ drop: freemsg(mp); } +/* + * We'd like to notify via sysevents that a link state change has occurred. + * There are a couple of challenges associated with this. The first is that if + * the link is flapping a lot, we may not see an accurate state when we launch + * the notification, we're told it changed, not what it changed to. + * + * The next problem is that all of the information that a user has associated + * with this device is the exact opposite of what we have on the dls_link_t. We + * have the name of the mac device, which has no bearing on what users see. + * Likewise, we don't have the datalink id either. So we're going to have to get + * this from dls. + * + * This is all further complicated by the fact that this could be going on in + * another thread at the same time as someone is tearing down the dls_link_t + * that we're associated with. We need to be careful not to grab the mac + * perimeter, otherwise we stand a good chance of deadlock. + */ +static void +dls_link_notify(void *arg, mac_notify_type_t type) +{ + dls_link_t *dlp = arg; + dls_dl_handle_t dhp; + nvlist_t *nvp; + sysevent_t *event; + sysevent_id_t eid; + + if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK) + return; + + /* + * If we can't find a devnet handle for this link, then there is no user + * knowable device for this at the moment and there's nothing we can + * really share with them that will make sense. + */ + if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0) + return; + + /* + * Because we're attaching this nvlist_t to the sysevent, it'll get + * cleaned up when we call sysevent_free. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID, + dls_devnet_linkid(dhp)) == 0); + VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME, + dls_devnet_link(dhp)) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID, + dls_devnet_getzid(dhp)) == 0); + + dls_devnet_rele_tmp(dhp); + + event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE, + ILLUMOS_KERN_PUB"dls", SE_SLEEP); + VERIFY(event != NULL); + (void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp); + + (void) log_sysevent(event, SE_SLEEP, &eid); + sysevent_free(event); + +} + static void i_dls_link_destroy(dls_link_t *dlp) { @@ -590,6 +710,9 @@ i_dls_link_destroy(dls_link_t *dlp) /* * Free the structure back to the cache. 
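[Editor's sketch] dls_link_notify() above publishes an EC_DATALINK / ESC_DATALINK_LINK_STATE sysevent carrying the link id, link name, and zone id whenever MAC reports a link or lowlink state change. A sketch of a user-level consumer; the event class, subclass, and attribute names come from the hunk, while the libsysevent calls are Consolidation Private interfaces and their exact use here should be treated as an assumption.

#include <libsysevent.h>
#include <libnvpair.h>
#include <stdio.h>
#include <unistd.h>

static void
link_state_handler(sysevent_t *ev)
{
	nvlist_t *attrs;
	int32_t linkid, zid;
	char *name;

	if (sysevent_get_attr_list(ev, &attrs) != 0)
		return;

	if (nvlist_lookup_int32(attrs, DATALINK_EV_LINK_ID, &linkid) == 0 &&
	    nvlist_lookup_string(attrs, DATALINK_EV_LINK_NAME, &name) == 0 &&
	    nvlist_lookup_int32(attrs, DATALINK_EV_ZONE_ID, &zid) == 0) {
		(void) printf("link %s (id %d, zone %d) changed state\n",
		    name, linkid, zid);
	}

	nvlist_free(attrs);
}

int
main(void)
{
	const char *subclasses[] = { ESC_DATALINK_LINK_STATE };
	sysevent_handle_t *sh;

	if ((sh = sysevent_bind_handle(link_state_handler)) == NULL)
		return (1);

	if (sysevent_subscribe_event(sh, EC_DATALINK, subclasses, 1) != 0) {
		sysevent_unbind_handle(sh);
		return (1);
	}

	(void) pause();	/* events arrive via the handler */
	return (0);
}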
*/ + if (dlp->dl_mnh != NULL) + mac_notify_remove(dlp->dl_mnh, B_TRUE); + if (dlp->dl_mch != NULL) mac_client_close(dlp->dl_mch, 0); @@ -601,8 +724,10 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mh = NULL; dlp->dl_mch = NULL; dlp->dl_mip = NULL; + dlp->dl_mnh = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } @@ -641,6 +766,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) if (err != 0) goto bail; + dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp); + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, dlp->dl_mch); diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 05620698ca..f813acaac6 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2017 Joyent, Inc. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. @@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL; /* dls_devnet_t dd_flags */ #define DD_CONDEMNED 0x1 #define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */ +#define DD_INITIALIZING 0x4 + +/* + * If the link is marked as initializing or condemned then it should + * not be visible outside of the DLS framework. + */ +#define DD_NOT_VISIBLE(flags) ( \ + (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0) /* * This structure is used to keep the <linkid, macname> mapping. @@ -108,13 +117,14 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); -static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); +static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t); /*ARGSUSED*/ static int @@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg) { dls_devnet_t *ddp = buf; - ASSERT(ddp->dd_ksp == NULL); - ASSERT(ddp->dd_ref == 0); - ASSERT(ddp->dd_tref == 0); + VERIFY(ddp->dd_ksp == NULL); + VERIFY(ddp->dd_ref == 0); + VERIFY(ddp->dd_tref == 0); mutex_destroy(&ddp->dd_mutex); cv_destroy(&ddp->dd_cv); } @@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -529,6 +544,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -537,6 +553,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, + if (dls_stat_create("link", 0, nm, zoneid, dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid, - &ksp) == 0) { + &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* - * Associate a linkid with a given link (identified by macname) + * Associate the linkid with the link identified by macname. If this + * is called on behalf of a physical link then linkid may be + * DATALINK_INVALID_LINKID. 
Otherwise, if called on behalf of a + * virtual link, linkid must have a value. */ static int -dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, +dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid, dls_devnet_t **ddpp) { + const char *macname = mac_name(mh); dls_devnet_t *ddp = NULL; datalink_class_t class; int err; @@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, } /* - * This might be a physical link that has already - * been created, but which does not have a linkid - * because dlmgmtd was not running when it was created. + * If we arrive here we know we are attempting to set + * the linkid on a physical link. A virtual link + * should never arrive here because it should never + * call this function without a linkid. Virtual links + * are created through dlgmtmd and thus we know + * dlmgmtd is alive to assign it a linkid (search for + * uses of dladm_create_datalink_id() to prove this to + * yourself); we don't have the same guarantee for a + * physical link which may perform an upcall for a + * linkid while dlmgmtd is down but will continue + * creating a devnet without the linkid (see + * softmac_create_datalink() to see how physical link + * creation works). That is why there is no entry in + * the id hash but there is one in the macname hash -- + * softmac couldn't acquire a linkid the first time it + * called this function. + * + * Because of the check above, we also know that + * ddp->dd_linkid is not set. Following this, the link + * must still be in the DD_INITIALIZING state because + * that flag is removed IFF dd_linkid is set. This is + * why we can ASSERT the DD_INITIALIZING flag below if + * the call to i_dls_devnet_setzid() fails. */ if (linkid == DATALINK_INVALID_LINKID || class != DATALINK_CLASS_PHYS) { err = EINVAL; goto done; } + + ASSERT(ddp->dd_flags & DD_INITIALIZING); + } else { ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); + ddp->dd_flags = DD_INITIALIZING; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_owner_zid = zoneid; @@ -875,8 +961,19 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) - (void) dls_devnet_unset(macname, &linkid, B_TRUE); + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) { + /* + * At this point the link is marked as + * DD_INITIALIZING -- there can be no + * outstanding temp refs and therefore no need + * to wait for them. + */ + ASSERT(ddp->dd_flags & DD_INITIALIZING); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); + return (err); + } + /* * The kstat subsystem holds its own locks (rather perimeter) * before calling the ks_update (dls_devnet_stat_update) entry @@ -884,20 +981,35 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. */ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; + + mutex_enter(&ddp->dd_mutex); + if (linkid != DATALINK_INVALID_LINKID && + !ddp->dd_prop_loaded && ddp->dd_prop_taskid == NULL) { + ddp->dd_prop_taskid = taskq_dispatch(system_taskq, + dls_devnet_prop_task, ddp, TQ_SLEEP); + } + mutex_exit(&ddp->dd_mutex); + } return (err); } /* - * Disassociate a linkid with a given link (identified by macname) - * This waits until temporary references to the dls_devnet_t are gone. + * Disassociate the linkid from the link identified by macname. 
If + * wait is B_TRUE, wait until all temporary refs are released and the + * prop task is finished. + * + * If waiting then you SHOULD NOT call this from inside the MAC perim + * as deadlock will ensue. Otherwise, this function is safe to call + * from inside or outside the MAC perim. */ static int -dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) +dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait) { + const char *macname = mac_name(mh); dls_devnet_t *ddp; int err; mod_hash_val_t val; @@ -918,21 +1030,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) * deadlock. Return EBUSY if the asynchronous thread started for * property loading as part of the post attach hasn't yet completed. */ - ASSERT(ddp->dd_ref != 0); + VERIFY(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != 0))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) its possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. + * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphained VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, reset ref to 1; + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); - /* * Remove this dls_devnet_t from the hash table. */ @@ -947,18 +1100,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) } rw_exit(&i_dls_devnet_lock); + /* + * It is important to call i_dls_devnet_setzid() WITHOUT the + * i_dls_devnet_lock held. The setzid call grabs the MAC + * perim; thus causing DLS -> MAC lock ordering if performed + * with the i_dls_devnet_lock held. This forces consumers to + * grab the MAC perim before calling dls_devnet_unset() (the + * locking rules state MAC -> DLS order). By performing the + * setzid outside of the i_dls_devnet_lock consumers can + * safely call dls_devnet_unset() outside the MAC perim. + */ + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } + if (wait) { /* * Wait until all temporary references are released. + * The holders of the tref need the MAC perim to + * perform their work and release the tref. To avoid + * deadlock, assert that the perim is never held here. 
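[Editor's sketch] The comments above pin down the locking contract for dls_devnet_unset(): the waiting form must not be called with the MAC perimeter held, because the holders of temporary references need the perimeter to finish their work and drop those references, while the non-waiting form is safe inside or outside the perimeter. A minimal kernel-context sketch of the two legal calling shapes; it assumes dls_devnet_destroy() remains the exported wrapper with the (mac_handle_t, datalink_id_t *, boolean_t wait) signature used elsewhere in this tree, and header includes are omitted.

/* Waiting teardown: must run outside the MAC perimeter. */
static int
teardown_wait(mac_handle_t mh, datalink_id_t *idp)
{
	ASSERT0(MAC_PERIM_HELD(mh));	/* waiting inside would deadlock */
	return (dls_devnet_destroy(mh, idp, B_TRUE));
}

/* Non-waiting teardown: may run inside the perimeter. */
static int
teardown_nowait(mac_handle_t mh, datalink_id_t *idp)
{
	mac_perim_handle_t mph;
	int err;

	mac_perim_enter_by_mh(mh, &mph);
	err = dls_devnet_destroy(mh, idp, B_FALSE);	/* EBUSY if refs remain */
	mac_perim_exit(mph);
	return (err);
}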
*/ + ASSERT0(MAC_PERIM_HELD(mh)); while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != 0)) cv_wait(&ddp->dd_cv, &ddp->dd_mutex); } else { - ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); + VERIFY(ddp->dd_tref == 0); + VERIFY(ddp->dd_prop_taskid == NULL); } - if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid); + } ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; @@ -969,6 +1144,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) return (0); } +/* + * This is a private hold routine used when we already have the dls_link_t, thus + * we know that it cannot go away. + */ +int +dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp) +{ + int err; + dls_devnet_t *ddp = NULL; + + rw_enter(&i_dls_devnet_lock, RW_WRITER); + if ((err = mod_hash_find(i_dls_devnet_hash, + (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) { + ASSERT(err == MH_ERR_NOTFOUND); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + + mutex_enter(&ddp->dd_mutex); + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + ddp->dd_tref++; + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + + *ddhp = ddp; + return (0); +} + static int dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, boolean_t tmp_hold) @@ -985,8 +1193,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1053,8 +1261,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) return (ENOENT); } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1071,7 +1279,7 @@ void dls_devnet_rele(dls_devnet_t *ddp) { mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 1); + VERIFY(ddp->dd_ref > 1); ddp->dd_ref--; if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) { mutex_exit(&ddp->dd_mutex); @@ -1083,7 +1291,7 @@ dls_devnet_rele(dls_devnet_t *ddp) } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) { char drv[MAXLINKNAMELEN]; uint_t ppa; @@ -1093,7 +1301,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) dls_dev_handle_t ddh; int err; - if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0) + if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0) return (dls_devnet_hold(linkid, ddpp)); /* @@ -1236,9 +1444,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zoneinit parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. 
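+ * In the zoneinit case the MAC client is renamed to a temporary + * z<zoneid>_<link> name (see the snprintf() below) rather than + * directly to the requested name, and the usual single-reference + * check is skipped.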
*/ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1283,10 +1497,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) } mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } mutex_exit(&ddp->dd_mutex); @@ -1297,7 +1513,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1364,7 +1588,7 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (mph != NULL) mac_perim_exit(mph); @@ -1373,7 +1597,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1402,10 +1627,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) sizeof (retval)); if (err != 0) goto done; + + /* + * We set upcall_done only if the upcall is + * successful. This way, if dls_link_setzid() fails, + * we know another upcall must be done to reset the + * dlmgmtd state. + */ upcall_done = B_TRUE; } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1420,7 +1653,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1442,7 +1675,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1459,7 +1692,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1497,15 +1730,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. 
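+ * The zid argument selects the zone whose link namespace is + * searched; a caller in a non-global zone may only open links + * belonging to its own zone (see the getzoneid() check below).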
*/ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1538,6 +1775,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ @@ -1594,13 +1837,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) * we need to use the linkid to get the user name for the link * when we create the MAC client. */ - if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) { + if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) { if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { mac_perim_exit(mph); - (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); return (err); } + + /* + * If dd_linkid is set then the link was successfully + * initialized. In this case we can remove the + * initializing flag and make the link visible to the + * rest of the system. + * + * If not set then we were called by softmac and it + * was unable to obtain a linkid for the physical link + * because dlmgmtd is down. In that case softmac will + * eventually obtain a linkid and call + * dls_devnet_recreate() to complete initialization. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + mac_perim_exit(mph); return (err); } @@ -1614,8 +1876,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { - ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL)); + dls_devnet_t *ddp; + int err; + + VERIFY(linkid != DATALINK_INVALID_LINKID); + if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) { + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + + return (err); + } int @@ -1625,15 +1898,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), idp, wait); - if (err != 0 && err != ENOENT) + err = dls_devnet_unset(mh, idp, wait); + + /* + * We continue on in the face of ENOENT because the devnet + * unset and DLS link release are not atomic and we may have a + * scenario where there is no entry in i_dls_devnet_hash for + * the MAC name but there is an entry in i_dls_link_hash. For + * example, if the following occurred: + * + * 1. dls_devnet_unset() returns success, and + * + * 2. dls_link_rele_by_name() fails with ENOTEMPTY because + * flows still exist, and + * + * 3. dls_devnet_set() fails to set the zone id and calls + * dls_devnet_unset() -- leaving an entry in + * i_dls_link_hash but no corresponding entry in + * i_dls_devnet_hash. 
+ * + * Even if #3 wasn't true the dls_devnet_set() may fail for + * different reasons in the future; the point is that it _can_ + * fail as part of its contract. We can't rely on it working + * so we must assume that these two pieces of state (devnet + * and link hashes), which should always be in sync, can get + * out of sync and thus even if we get ENOENT from the devnet + * hash we should still try to delete from the link hash just + * in case. + * + * We could prevent the ENOTEMPTY from dls_link_rele_by_name() + * by calling mac_disable() before calling + * dls_devnet_destroy() but that's not currently possible due + * to a long-standing bug. OpenSolaris 6791335: The semantics + * of mac_disable() were modified by Crossbow such that + * dls_devnet_destroy() needs to be called before + * mac_disable() can succeed. This is because of the implicit + * reference that dls has on the mac_impl_t. + */ + if (err != 0 && err != ENOENT) { return (err); + } mac_perim_enter_by_mh(mh, &mph); err = dls_link_rele_by_name(mac_name(mh)); - mac_perim_exit(mph); - if (err != 0) { + dls_devnet_t *ddp; + /* * XXX It is a general GLDv3 bug that dls_devnet_set() has to * be called to re-set the link when destroy fails. The @@ -1641,9 +1951,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) * called from kernel context or from a zone other than that * which initially created the link. */ - (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()), - NULL); + (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp); + + /* + * You might think dd_linkid should always be set + * here, but in the case where dls_devnet_unset() + * returns ENOENT it will be DATALINK_INVALID_LINKID. + * Stay consistent with the rest of DLS and only + * remove the initializing flag if linkid is set. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); } + + mac_perim_exit(mph); return (err); } @@ -1717,6 +2040,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid) } const char * +dls_devnet_link(dls_dl_handle_t ddh) +{ + return (ddh->dd_linkname); +} + +const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..02354c9b16 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5510 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+ * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
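+ * drsas_isr() queues completed commands on the completed pool and, + * when running as a high-level interrupt, triggers this soft + * interrupt to perform the completion callbacks instead of calling + * drsas_softintr() directly.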
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)instance; + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + 
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
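+ * (If the caller passed in an existing pkt, this block is skipped + * and only the DMA allocation or move in step #2 below is done.)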
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct 
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
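+ * The request is acknowledged (rval keeps its initial value of 1) + * but nothing is changed.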
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, 
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+ */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. 
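+ *
+ * (Editorial note, not part of the original source.)  The
+ * reference to wake_up() appears to be inherited from the Linux
+ * megaraid lineage of this code; here the waiter is released with
+ * cv_broadcast() on instance->int_cmd_cv.  A minimal sketch of the
+ * waiting side, assuming a protecting mutex named int_cmd_mtx
+ * (the mutex name is illustrative, not taken from this file):
+ *
+ *     mutex_enter(&instance->int_cmd_mtx);
+ *     while (cmd->sync_cmd == DRSAS_TRUE)
+ *             cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx);
+ *     mutex_exit(&instance->int_cmd_mtx);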
+ */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context __unused; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
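+ * (Editorial note, not part of the original source.)
+ * acmd->islogical selects the frame type built further down:
+ * READ/WRITE CDBs aimed at a logical drive are turned into a
+ * drsas_io_frame (MFI_CMD_OP_LD_READ or MFI_CMD_OP_LD_WRITE),
+ * while every other request is wrapped in a drsas_pthru_frame
+ * carrying the raw CDB (MFI_CMD_OP_LD_SCSI for logical targets,
+ * MFI_CMD_OP_PD_SCSI for physical ones).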
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? 
pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
+ con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, 
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new + * AEN request we currently have. If it is, then we don't have + * to do anything. In other words, whichever events the current + * AEN request is subscribing to, have already been subscribed + * to. + * + * If the old_cmd is _not_ inclusive, then we have to abort + * that command, form a class_locale that is superset of both + * old and current and re-issue to the FW + */ + + curr_aen.word = class_locale_word; + aen_cmd = instance->aen_cmd; + if (aen_cmd) { + prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle, + &aen_cmd->frame->dcmd.mbox.w[1]); + + /* + * A class whose enum value is smaller is inclusive of all + * higher values. If a PROGRESS (= -1) was previously + * registered, then a new registration requests for higher + * classes need not be sent to FW. They are automatically + * included. + * + * Locale numbers don't have such hierarchy. They are bitmap + * values + */ + if ((prev_aen.members.class <= curr_aen.members.class) && + !((prev_aen.members.locale & curr_aen.members.locale) ^ + curr_aen.members.locale)) { + /* + * Previously issued event registration includes + * current request. Nothing to do. + */ + + return (0); + } else { + curr_aen.members.locale |= prev_aen.members.locale; + + if (prev_aen.members.class < curr_aen.members.class) + curr_aen.members.class = prev_aen.members.class; + + ret_val = abort_aen_cmd(instance, aen_cmd); + + if (ret_val) { + con_log(CL_ANN, (CE_WARN, "register_mfi_aen: " + "failed to abort prevous AEN command")); + + return (ret_val); + } + } + } else { + curr_aen.word = class_locale_word; + } + + cmd = get_mfi_pkt(instance); + + if (!cmd) + return (ENOMEM); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + /* Prepare DCMD for aen registration */ + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_detail)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_WAIT); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1], + curr_aen.word); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_detail)); + + instance->aen_seq_num = seq_num; + + + /* + * Store reference to the cmd used to register for AEN. 
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask __unused; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
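As the comment says, callers populate size and dma_attr before handing the object to the allocator declared near the end of this header, drsas_alloc_dma_obj(). A hedged sketch of that calling pattern follows; the wrapper name, the attribute-template argument, the 4 KiB size and the meaning of the final uchar_t flag are assumptions made for illustration, and the DDI_SUCCESS/DDI_FAILURE return convention is assumed rather than taken from this header.

    /*
     * Sketch only: allocate one internal DMA buffer through dma_obj_t.
     * "attr_template" stands in for whatever ddi_dma_attr_t the driver keeps.
     */
    static int
    example_alloc_internal_buf(struct drsas_instance *instance, dma_obj_t *obj,
        ddi_dma_attr_t attr_template, uchar_t endian_flags)
    {
            bzero(obj, sizeof (dma_obj_t));
            obj->size = 4096;                       /* caller sets the size first... */
            obj->dma_attr = attr_template;          /* ...and the DMA attributes */
            obj->dma_attr.dma_attr_sgllen = 1;      /* one cookie is enough here */

            if (drsas_alloc_dma_obj(instance, obj, endian_flags) != DDI_SUCCESS)
                    return (DDI_FAILURE);

            /* obj->buffer, obj->acc_handle and obj->dma_cookie[0] are now usable */
            return (DDI_SUCCESS);
    }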
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
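Because con_log() (defined just below) hands its second argument straight to cmn_err(9F), the severity code and the format string with its arguments must arrive as one parenthesized group, which is why every call site uses doubled parentheses, for example:

    /* emitted only when debug_level_g is at least CL_ANN1 */
    con_log(CL_ANN1, (CE_NOTE,
        "dr_sas: config request for tgt %d lun %d", tgt, lun));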
+ * + * + * console messages debug levels + */ +#define CL_NONE 0 /* No debug information */ +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ + +#ifdef __SUNPRO_C +#define __func__ "" +#endif + +#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; } + +/* + * ### SCSA definitions ### + */ +#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target) +#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun) +#define PKT2TRAN(pkt) ((pkt)->pkt_adress.a_hba_tran) +#define ADDR2TRAN(ap) ((ap)->a_hba_tran) + +#define TRAN2MR(tran) (struct drsas_instance *)(tran)->tran_hba_private) +#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap)) + +#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private) +#define CMD2PKT(sp) ((sp)->cmd_pkt) +#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request)) + +#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address) +#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran) +#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd))) + +#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */ +#define CFLAG_DMASEND 0x0002 /* Transfer from the device */ +#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */ + +/* + * ### Data structures for ioctl inteface and internal commands ### + */ + +/* + * Data direction flags + */ +#define UIOC_RD 0x00001 +#define UIOC_WR 0x00002 + +#define SCP2HOST(scp) (scp)->device->host /* to host */ +#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */ +#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */ +#define SCP2TARGET(scp) (scp)->device->id /* to target */ +#define SCP2LUN(scp) (scp)->device->lun /* to LUN */ + +#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0]) +#define SCP2ADAPTER(scp) \ + (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp)) + +#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \ + (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0 +#define MRDRV_IS_LOGICAL(ap) \ + ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All stings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * 
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
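Tying the register macros above to the MFI state values defined earlier: the firmware publishes its state in the upper four bits of outbound_msg_0, so RD_OB_MSG_0() combined with MFI_STATE_MASK is enough to watch it come ready. The sketch below is only loosely modeled on mfi_state_transition_to_ready() (declared further down in this header); the real routine also performs handshake writes for several intermediate states, and the 60-second budget and 100 ms poll interval here are illustrative, not values from this header.

    /* Illustrative only: wait for the firmware to reach MFI_STATE_READY. */
    static int
    example_wait_for_ready(struct drsas_instance *instance)
    {
            int i;
            uint32_t state;

            for (i = 0; i < 60 * 10; i++) {          /* ~60 seconds total */
                    state = RD_OB_MSG_0(instance) & MFI_STATE_MASK;
                    if (state == MFI_STATE_READY)
                            return (DDI_SUCCESS);
                    if (state == MFI_STATE_FAULT)
                            return (DDI_FAILURE);
                    delay(drv_usectohz(100 * 1000)); /* poll every 100 ms */
            }
            return (DDI_FAILURE);
    }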
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t 
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
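The routines defined below (mlist_add(), mlist_del_init(), mlist_empty(), mlist_entry() and friends) are what dr_sas uses for its command pools; the usual consumer pattern is to embed an mlist_t in a structure and pop entries off a list head under a lock. A sketch of that pattern, with names mirroring the drsas_cmd and drsas_instance definitions in dr_sas.h and with the locking assumed rather than prescribed here:

    /* Sketch: take the first free command off the pool, or NULL if empty. */
    static struct drsas_cmd *
    example_get_cmd(struct drsas_instance *instance)
    {
            struct drsas_cmd *cmd = NULL;
            mlist_t *head = &instance->cmd_pool_list;

            mutex_enter(&instance->cmd_pool_mtx);
            if (!mlist_empty(head)) {
                    /* recover the drsas_cmd that embeds this list node */
                    cmd = mlist_entry(head->next, struct drsas_cmd, list);
                    mlist_del_init(&cmd->list);
            }
            mutex_exit(&instance->cmd_pool_mtx);
            return (cmd);
    }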
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. + */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/elxl/elxl.c b/usr/src/uts/common/io/elxl/elxl.c index 2ffe96aff3..42552225f8 100644 --- a/usr/src/uts/common/io/elxl/elxl.c +++ b/usr/src/uts/common/io/elxl/elxl.c @@ -1,6 +1,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
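The elxl hunks below (and the matching gld change later in this diff) swap the old hcksum_retrieve()/hcksum_assoc() accessors for the mac(9E) helpers; the unused multidata and packet-descriptor arguments, and hcksum_assoc()'s trailing kmflags argument, simply disappear. An illustrative fragment of the new getter on a transmit path (the function name is hypothetical; the declarations typically come from <sys/mac_provider.h> and <sys/pattr.h>):

    /* Does the stack want hardware IPv4 header checksumming for this mblk? */
    static boolean_t
    example_wants_hw_ipcksum(mblk_t *mp)
    {
            uint32_t pflags;

            mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
            return ((pflags & HCK_IPV4_HDRCKSUM) != 0);
    }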
*/ /* @@ -1163,8 +1164,7 @@ elxl_m_tx(void *arg, mblk_t *mp) cflags = 0; if ((sc->ex_conf & CONF_90XB) != 0) { uint32_t pflags; - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, - &pflags); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags); if (pflags & HCK_IPV4_HDRCKSUM) { cflags |= EX_DPD_IPCKSUM; } @@ -1327,7 +1327,7 @@ elxl_recv(elxl_t *sc, ex_desc_t *rxd, uint32_t stat) if (stat & (EX_UPD_TCPCHECKED | EX_UPD_UDPCHECKED)) { pflags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); } - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, pflags, 0); + mac_hcksum_set(mp, 0, 0, 0, 0, pflags); } return (mp); diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c index 32f875917f..efc1f9233f 100644 --- a/usr/src/uts/common/io/eventfd.c +++ b/usr/src/uts/common/io/eventfd.c @@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) * transitions from EVENTFD_VALMAX to a lower value. At all other * times, it is already considered writable by poll. */ - if (oval == EVENTFD_VALMAX) { + if (oval >= EVENTFD_VALMAX) { pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); } return (err); } -/*ARGSUSED*/ static int -eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async, + boolean_t file_nonblock) { - eventfd_state_t *state; - minor_t minor = getminor(dev); - uint64_t val, oval; - int err; - - if (uio->uio_resid < sizeof (val)) - return (EINVAL); - - if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) - return (err); - - if (val > EVENTFD_VALMAX) - return (EINVAL); - - state = ddi_get_soft_state(eventfd_softstate, minor); + uint64_t oval; + boolean_t overflow = B_FALSE; mutex_enter(&state->efd_lock); while (val > EVENTFD_VALMAX - state->efd_value) { - if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + + /* + * When called from (LX) AIO, expectations about overflow and + * blocking are different than normal operation. If the + * incoming value would cause overflow, it is clamped to reach + * the overflow value exactly. This is added to the existing + * value without blocking. Any pollers of the eventfd will see + * POLLERR asserted when this occurs. + */ + if (is_async) { + val = EVENTFD_VALOVERFLOW - state->efd_value; + overflow = B_TRUE; + break; + } + + if (file_nonblock) { mutex_exit(&state->efd_lock); return (EAGAIN); } @@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) } /* - * We now know that we can add the value without overflowing. + * We now know that we can safely add the value. */ state->efd_value = (oval = state->efd_value) + val; @@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) mutex_exit(&state->efd_lock); /* - * Notify pollers as well if the eventfd is now readable. + * Notify pollers as well if the eventfd has become readable or has + * transitioned into overflow. 
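The overflow clamp above applies only to the LX AIO path; ordinary writers keep the long-standing semantics this function otherwise implements: a write adds to the 64-bit counter, a value that cannot fit either blocks or fails with EAGAIN on a non-blocking descriptor, and a read drains (or, in semaphore mode, decrements) the counter. A small userland illustration, assuming the eventfd(3C) interface and with error handling trimmed:

    #include <sys/eventfd.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <errno.h>

    int
    main(void)
    {
            int fd = eventfd(0, EFD_NONBLOCK);
            uint64_t val = UINT64_MAX - 1;          /* fill to EVENTFD_VALMAX */

            (void) write(fd, &val, sizeof (val));   /* counter is now full */

            val = 1;
            if (write(fd, &val, sizeof (val)) == -1 && errno == EAGAIN)
                    (void) printf("counter full: non-blocking write fails\n");

            (void) read(fd, &val, sizeof (val));    /* drains the counter... */

            val = 1;
            (void) write(fd, &val, sizeof (val));   /* ...so this now succeeds */
            return (0);
    }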
*/ if (oval == 0) { pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); + } else if (overflow && val != 0) { + pollwakeup(&state->efd_pollhd, POLLERR); } return (0); @@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) /*ARGSUSED*/ static int +eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +{ + eventfd_state_t *state; + boolean_t file_nonblock; + uint64_t val; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) + return (err); + + if (val > EVENTFD_VALMAX) + return (EINVAL); + + file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0; + state = ddi_get_soft_state(eventfd_softstate, getminor(dev)); + return (eventfd_post(state, val, B_FALSE, file_nonblock)); +} + +/*ARGSUSED*/ +static int eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { @@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, if (state->efd_value < EVENTFD_VALMAX) revents |= POLLWRNORM | POLLOUT; + if (state->efd_value == EVENTFD_VALOVERFLOW) + revents |= POLLERR; + *reventsp = revents & events; if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { *phpp = &state->efd_pollhd; @@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { eventfd_state_t *state; minor_t minor = getminor(dev); + uint64_t *valp; state = ddi_get_soft_state(eventfd_softstate, minor); switch (cmd) { - case EVENTFDIOC_SEMAPHORE: { + case EVENTFDIOC_SEMAPHORE: mutex_enter(&state->efd_lock); state->efd_semaphore ^= 1; mutex_exit(&state->efd_lock); + return (0); + case EVENTFDIOC_POST: + /* + * This ioctl is expected to be kernel-internal, used only by + * the AIO emulation in LX. + */ + if ((md & FKIOCTL) == 0) { + break; + } + valp = (uint64_t *)arg; + VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0); return (0); - } default: break; diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c index 4c2a39013a..eb2a0c2ec5 100644 --- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c +++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c @@ -24,6 +24,7 @@ */ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ /* * Fibre channel Transport Library (fctl) @@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count) maxPorts ++; } + if (maxPorts == 0) { + mutex_exit(&fctl_port_lock); + return (0); + } + /* Now allocate a buffer to store all the pointers for comparisons */ portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP); diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c index c6c6b65900..5502ea54af 100644 --- a/usr/src/uts/common/io/gld.c +++ b/usr/src/uts/common/io/gld.c @@ -22,6 +22,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -4550,8 +4551,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) ifp = ((gld_mac_pvt_t *)macinfo->gldm_mac_pvt)->interfacep; /* grab any checksum information that may be present */ - hcksum_retrieve(mp->b_cont, NULL, NULL, &start, &stuff, &end, - &value, &flags); + mac_hcksum_get(mp->b_cont, &start, &stuff, &end, &value, &flags); /* * Prepend a valid header for transmission @@ -4567,8 +4567,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) } /* apply any checksum information to the first block in the chain */ - (void) hcksum_assoc(nmp, NULL, NULL, start, stuff, end, value, - flags, 0); + mac_hcksum_set(nmp, start, stuff, end, value, flags); GLD_CLEAR_MBLK_VTAG(nmp); if (gld_start(q, nmp, GLD_WSRV, upri) == GLD_NORESOURCES) { diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c new file mode 100644 index 0000000000..03bb799499 --- /dev/null +++ b/usr/src/uts/common/io/gsqueue/gsqueue.c @@ -0,0 +1,608 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Serialization queues are a technique used in illumos to provide what's + * commonly known as a 'vertical' perimeter. The idea (described a bit in + * uts/common/inet/squeue.c) is to provide a means to make sure that message + * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd + * consume these on different policies, ip on a conn_t basis, vnd on a per + * device basis, and use this to ensure that only one packet is being processed + * at a given time. + * + * Serialization queues were originally used by ip. As part of that + * implementation, many of the details of ip were baked into it. That includes + * things like conn_t, ip receive attributes, and the notion of sets. While an + * individual serialization queue, or gsqueue_t, is a useful level of + * abstraction, it isn't the basis on which monst consumers want to manage them. + * Instead, we have the notion of a set of serialization queues. These sets are + * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a + * gsqueue_t per CPU to fanout on without managing them all itself. In the + * original implementation, this existed, but they were heavily tied into the + * infrastructure of IP, and its notion of polling on the underlying MAC + * devices. + * + * The result of that past is a new interface to serialization queues and a + * similar, but slightly different, abstraction to sets of these + * (gsqueue_set_t). When designing this there are two different approaches that + * one could consider. The first is that the system has one gsqueue_set_t that + * the entire world shares, whether IP or some other consumer. The other is that + * every consumer has their own set. + * + * The trade offs between these two failure modes are the pathological failure + * modes. There is no guarantee that any two consumers here are equivalent. In + * fact, they very likely have very different latency profiles. If they are + * being processed in the same queue, that can lead to very odd behaviors. 
More + * generally, if we have a series of processing functions from one consumer + * which are generally short, and another which are generally long, that'll + * cause undue latency that's harder to observe. If we instead take the approach + * that each consumer should have its own set that it fans out over then we + * won't end up with the problem that a given serialization queue will have + * multiple latency profiles, but instead we'll see cpu contention for the bound + * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker + * thread is bound and it is in fact possible for it to be processed by other + * threads on other CPUs. + * + * We've opted to go down the second path, so each consumer has its own + * independent set of serialization queues that it is bound over. + * + * Structure Hierarchies + * --------------------- + * + * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t + * encapsulates all the per-CPU gsqueue_t that exist in the form of + * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could + * accommodate more than one gsqueue_t, but today there is a one to one mapping. + * + * We maintain two different lists of gsqueue_cpu_t, the active and defunct + * sets. The active set is maintained in the array `gs_cpus`. There are NCPU + * entries available in `gs_cpus` with the total number of currently active cpus + * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When + * there is no longer a need for a given binding (see the following section for + * more explanation on when this is the case) then we move the entry to the + * `gs_defunct` list which is just a list_t of gsqueue_cpu_t. + * + * In addition, each gsqueue_set_t can have a series of callbacks registered + * with it. These are described in the following section. Graphically, a given + * gsqueue_set_t looks roughly like the following: + * + * +---------------+ + * | gsqueue_set_t | + * +---------------+ + * | | | + * | | * . . . gs_cpus + * | | | + * | | | +-------------------------------------------------+ + * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |... + * | | +-------------------------------------------------+ + * | | + * | * . . . gs_defunct + * | | + * | | +---------------+ +---------------+ +---------------+ + * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |... + * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... + * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. 
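The CPU DR discussion picks up again just below; first, to make the set abstraction concrete, here is a minimal consumer-side sketch built only from the interfaces defined later in this file (gsqueue_set_create(), gsqueue_set_get(), gsqueue_enter_one()). The worker priority, the GSQUEUE_PROCESS flag, and the gsqueue_proc_f callback signature are assumptions that should be checked against sys/gsqueue.h; the fanout hash is supplied by the caller.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/disp.h>
#include <sys/gsqueue.h>

static gsqueue_set_t *my_gsqueue_set;

/*
 * Per-queue processing function.  The exact gsqueue_proc_f signature shown
 * here is an assumption; consult sys/gsqueue.h.  It runs under the vertical
 * perimeter of the chosen gsqueue_t, so at most one mblk for that queue is
 * being processed at any given time.
 */
static void
my_gsqueue_proc(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
{
	freemsg(mp);
}

static void
my_attach(void)
{
	/* One set per consumer, per the design discussion above. */
	my_gsqueue_set = gsqueue_set_create(minclsyspri);
}

static void
my_dispatch(mblk_t *mp, uint_t flow_hash)
{
	/* Pick a per-CPU queue from the set and serialize the mblk on it. */
	gsqueue_t *gsp = gsqueue_set_get(my_gsqueue_set, flow_hash);

	/* GSQUEUE_PROCESS is an assumed flag name from sys/gsqueue.h. */
	gsqueue_enter_one(gsp, mp, my_gsqueue_proc, NULL, GSQUEUE_PROCESS, 0);
}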
When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uninterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. + * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback. + * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + list_node_t gqc_lnode; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + list_t gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + list_link_init(&scp->gqc_lnode); + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(pri_t wpri) +{ + int i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits.
+ */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. + */ + mutex_enter(&cpu_lock); + mutex_enter(&gsqueue_lock); + list_remove(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + + mutex_enter(&gssp->gs_lock); + + for (i = 0; i < gssp->gs_ncpus; i++) { + scp = gssp->gs_cpus[i]; + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = NULL; + } + gssp->gs_ncpus = 0; + + mutex_exit(&gssp->gs_lock); + mutex_exit(&cpu_lock); + + while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) { + gsqueue_cpu_destroy(scp); + } + + while (gssp->gs_cbs != NULL) { + gsqueue_cb_t *cbp; + + cbp = gssp->gs_cbs; + gssp->gs_cbs = cbp->gcb_next; + kmem_cache_free(gsqueue_cb_cache, cbp); + } + + ASSERT3U(gssp->gs_ncpus, ==, 0); + ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL); + ASSERT3P(gssp->gs_cbs, ==, NULL); + kmem_cache_free(gsqueue_set_cache, gssp); +} + +gsqueue_t * +gsqueue_set_get(gsqueue_set_t *gssp, uint_t index) +{ + squeue_t *sqp; + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_cpus[index % gssp->gs_ncpus]; + sqp = scp->gqc_head; + mutex_exit(&gssp->gs_lock); + return ((gsqueue_t *)sqp); +} + +uintptr_t +gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg) +{ + gsqueue_cb_t *cbp; + + cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP); + cbp->gcb_func = cb; + cbp->gcb_arg = arg; + + mutex_enter(&gssp->gs_lock); + cbp->gcb_next = gssp->gs_cbs; + gssp->gs_cbs = cbp; + mutex_exit(&gssp->gs_lock); + return ((uintptr_t)cbp); +} + +int +gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id) +{ + gsqueue_cb_t *cbp, *prev; + mutex_enter(&gssp->gs_lock); + cbp = gssp->gs_cbs; + prev = NULL; + while (cbp != NULL) { + if ((uintptr_t)cbp != id) { + prev = cbp; + cbp = cbp->gcb_next; + continue; + } + + if (prev == NULL) { + gssp->gs_cbs = cbp->gcb_next; + } else { + prev->gcb_next = cbp->gcb_next; + } + + mutex_exit(&gssp->gs_lock); + kmem_cache_free(gsqueue_cb_cache, cbp); + return (0); + } + mutex_exit(&gssp->gs_lock); + return (-1); +} + +void +gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg, + int flags, uint8_t tag) +{ + squeue_t *sqp = (squeue_t *)gsp; + + ASSERT(mp->b_next == NULL); + ASSERT(mp->b_prev == NULL); + mp->b_queue = (queue_t *)func; + mp->b_prev = arg; + sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag); +} + +static void +gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online) +{ + gsqueue_cb_t *cbp; + + ASSERT(MUTEX_HELD(&gssp->gs_lock)); + cbp = gssp->gs_cbs; + while (cbp != NULL) { + cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online); + cbp = cbp->gcb_next; + } + +} + +/* + * When we online a processor we need to go through and either bind a defunct + * squeue or create a new one. 
We'll try to reuse a gsqueue_cpu_t from the + * defunct list that used to be on that processor. If no such gsqueue_cpu_t + * exists, then we'll create a new one. We'd rather avoid taking over an + * existing defunct one that used to be on another CPU, as its not unreasonable + * to believe that its CPU will come back. More CPUs are offlined and onlined by + * the administrator or by creating cpu sets than actually get offlined by FMA. + */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + for (scp = list_head(&gssp->gs_defunct); scp != NULL; + scp = list_next(&gssp->gs_defunct, scp)) { + if (scp->gqc_cpuid == id) { + list_remove(&gssp->gs_defunct, scp); + break; + } + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + gsqueue_handle_offline(cp->cpu_id); + break; + default: + break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t), + offsetof(gsqueue_cpu_t, gqc_lnode)); + gssp->gs_ncpus = 0; + gssp->gs_cbs = NULL; + + return (0); +} + +/* ARGSUSED */ +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + list_destroy(&gssp->gs_defunct); + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + 
gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); + mutex_exit(&gsqueue_lock); + + mutex_enter(&cpu_lock); + unregister_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + kmem_cache_destroy(gsqueue_set_cache); + kmem_cache_destroy(gsqueue_cpu_cache); + kmem_cache_destroy(gsqueue_cb_cache); + + mutex_destroy(&gsqueue_lock); + + return (0); +} + +static struct modlmisc gsqueue_modmisc = { + &mod_miscops, + "gsqueue" +}; + +static struct modlinkage gsqueue_modlinkage = { + MODREV_1, + &gsqueue_modmisc, + NULL +}; + +int +_init(void) +{ + int ret; + + gsqueue_ddiinit(); + if ((ret = mod_install(&gsqueue_modlinkage)) != 0) { + VERIFY(gsqueue_ddifini() == 0); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&gsqueue_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = gsqueue_ddifini()) != 0) + return (ret); + + if ((ret = mod_remove(&gsqueue_modlinkage)) != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index c3ebfa0e47..6726f72147 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c index d34057d64f..ccf814be0b 100644 --- a/usr/src/uts/common/io/i40e/i40e_gld.c +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -39,7 +39,8 @@ char *i40e_priv_props[] = { static int i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; struct i40e_aqc_remove_macvlan_element_data filt; struct i40e_hw *hw = &i40e->i40e_hw_space; int ret, i, last; @@ -107,10 +108,11 @@ done: static int i40e_group_add_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; - struct i40e_hw *hw = &i40e->i40e_hw_space; - int i, ret; - i40e_uaddr_t *iua; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; struct i40e_aqc_add_macvlan_element_data filt; if (I40E_IS_MULTICAST(mac_addr)) @@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) } } - /* - * Note, the general use of the i40e_vsi_id will have to be refactored - * when we have proper group support.
- */ bzero(&filt, sizeof (filt)); bcopy(mac_addr, filt.mac_addr, ETHERADDRL); filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", @@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; bcopy(mac_addr, iua->iua_mac, ETHERADDRL); - iua->iua_vsi = i40e->i40e_vsi_id; + iua->iua_vsi = rxg->irg_vsi_seid; i40e->i40e_resources.ifr_nmacfilt_used++; ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= i40e->i40e_resources.ifr_nmacfilt); @@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on) } - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " @@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on) goto done; } - ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s multicast promiscuity on " @@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on) * Try our best to put us back into a state that MAC expects us * to be in. */ - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, - !on, NULL, B_FALSE); + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, + I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " "the default VSI after toggling multicast failed: " @@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 0 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_TRUE, NULL); + I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to enable multicast " "promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", @@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; - if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, - &filt, 1, NULL) != I40E_SUCCESS) { + if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, + 1, NULL) != I40E_SUCCESS) { i40e_error(i40e, "failed to remove mac address " "%2x:%2x:%2x:%2x:%2x:%2x from multicast " "filter: %d", @@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 1 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_FALSE, NULL); + I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL); if (ret != 
I40E_SUCCESS) { i40e_error(i40e, "failed to disable " "multicast promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, * we're not actually grouping things tx-wise at this time. */ ASSERT(group_index == -1); - ASSERT(ring_index < i40e->i40e_num_trqpairs); + ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi); itrq->itrq_mactxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, { i40e_t *i40e = arg; mac_intr_t *mintr = &infop->mri_intr; - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + uint_t trqpair_index; + i40e_trqpair_t *itrq; - /* - * We assert the group number and ring index to help sanity check - * ourselves and mark that we'll need to rework this when we have - * multiple groups. - */ - ASSERT3S(group_index, ==, 0); - ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + /* This assumes static groups. */ + ASSERT3S(group_index, >=, 0); + ASSERT3S(ring_index, >=, 0); + trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) + + ring_index; + ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[trqpair_index]; itrq->itrq_macrxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { i40e_t *i40e = arg; + i40e_rx_group_t *rxg; if (rtype != MAC_RING_TYPE_RX) return; - /* - * Note, this is a simplified view of a group, given that we only have a - * single group and a single ring at the moment. We'll want to expand - * upon this as we leverage more hardware functionality. - */ - i40e->i40e_rx_group_handle = gh; - infop->mgi_driver = (mac_group_driver_t)i40e; + rxg = &i40e->i40e_rx_groups[index]; + rxg->irg_grp_hdl = gh; + + infop->mgi_driver = (mac_group_driver_t)rxg; infop->mgi_start = NULL; infop->mgi_stop = NULL; infop->mgi_addmac = i40e_group_add_mac; infop->mgi_remmac = i40e_group_remove_mac; - ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); - infop->mgi_count = i40e->i40e_num_trqpairs; + ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi; } static int @@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (i40e->i40e_tx_lso_enable == B_TRUE) { + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN; + } else { + return (B_FALSE); + } + break; + } + case MAC_CAPAB_RINGS: cap_rings = cap_data; cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; switch (cap_rings->mr_type) { case MAC_RING_TYPE_TX: /* - * Note, saying we have no rings, but some number of - * groups indicates to MAC that it should create - * psuedo-groups with one for each TX ring. This may not - * be the long term behavior we want, but it'll work for - * now. + * Note, saying we have no groups, but some + * number of rings indicates to MAC that it + * should create psuedo-groups with one for + * each TX ring. This may not be the long term + * behavior we want, but it'll work for now. 
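To put concrete numbers on the ring and group plumbing in i40e_fill_rx_ring(), i40e_fill_rx_group(), and the MAC_CAPAB_RINGS cases here (the counts below are purely illustrative, not values fixed by this change): if an instance ends up with 4 RX groups and 8 transmit/receive queue pairs per VSI, then i40e_num_trqpairs is 32. For RX, MAC is told mr_gnum = 4 and mr_rnum = 32, and each group reports mgi_count = 8; when MAC later asks to fill group 2, ring 3, i40e_fill_rx_ring() selects trqpair index 2 * 8 + 3 = 19. For TX, mr_gnum is 0 and mr_rnum = 8 (i40e_num_trqpairs_per_vsi), so MAC fabricates one pseudo-group per TX ring as the comment above describes.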
*/ cap_rings->mr_gnum = 0; - cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi; cap_rings->mr_rget = i40e_fill_tx_ring; cap_rings->mr_gget = NULL; cap_rings->mr_gaddring = NULL; @@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_RING_TYPE_RX: cap_rings->mr_rnum = i40e->i40e_num_trqpairs; cap_rings->mr_rget = i40e_fill_rx_ring; - cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gnum = i40e->i40e_num_rx_groups; cap_rings->mr_gget = i40e_fill_rx_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c index 51d1bbac92..170bef7ec6 100644 --- a/usr/src/uts/common/io/i40e/i40e_intr.c +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e) I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); } +/* + * The next two functions enable/disable the reception of interrupts + * on the given vector. Only vectors 1..N are programmed by these + * functions; vector 0 is special and handled by a different register. + * We must subtract one from the vector because i40e implicitly adds + * one to the vector value. See section 10.2.2.10.13 for more details. + */ static void i40e_intr_io_enable(i40e_t *i40e, int vector) { uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector) uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); } @@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e) } /* - * Enable all of the queues and set the corresponding LNKLSTN registers. Note - * that we always enable queues as interrupt sources, even though we don't - * enable the MSI-X interrupt vectors. + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N] + * register actually refers to the 'N + 1' interrupt vector. E.g., + * PFINT_LNKLSTN[0] refers to interrupt vector 1. + */ +static void +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg); + DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg); +} + +/* + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the + * vector should be the actual vector this queue is on -- i.e., it + * should be equal to itrq_rx_intrvec. 
+ */ +static void +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec); + + reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); + DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is + * either the Rx queue of another TRQP, or EOL. + */ +static void +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec); + + reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg); + DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Program the interrupt linked list. Each vector has a linked list of + * queues which act as event sources for that vector. When one of + * those sources has an event the associated interrupt vector is + * fired. This mapping must match the mapping found in + * i40e_map_intrs_to_vectors(). + * + * See section 7.5.3 for more information about the configuration of + * the interrupt linked list. */ static void i40e_intr_init_queue_msix(i40e_t *i40e) { - i40e_hw_t *hw = &i40e->i40e_hw_space; - uint32_t reg; - int i; + uint_t intr_count; /* - * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1. - * Note that we skip the ITR logic for the moment, just to make our - * lives as explicit and simple as possible. + * The 0th vector is for 'Other Interrupts' only (subject to + * change in the future). */ - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + intr_count = i40e->i40e_intr_count - 1; - reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << - I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); - I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg); + for (uint_t vec = 0; vec < intr_count; vec++) { + boolean_t head = B_TRUE; - reg = - (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_RQCTL_CAUSE_ENA_MASK; + for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs; + qidx += intr_count) { + uint_t next_qidx = qidx + intr_count; - I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg); + next_qidx = (next_qidx > i40e->i40e_num_trqpairs) ? 
+ I40E_QUEUE_TYPE_EOL : next_qidx; - reg = - (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_TQCTL_CAUSE_ENA_MASK; + if (head) { + i40e_set_lnklstn(i40e, vec, qidx); + head = B_FALSE; + } - I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg); + i40e_set_rqctl(i40e, vec + 1, qidx); + i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx); + } } - } /* @@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e) } static void -i40e_intr_rx_work(i40e_t *i40e, int queue) +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { mblk_t *mp = NULL; - i40e_trqpair_t *itrq; - - ASSERT(queue < i40e->i40e_num_trqpairs); - itrq = &i40e->i40e_trqpairs[queue]; mutex_enter(&itrq->itrq_rx_lock); if (!itrq->itrq_intr_poll) mp = i40e_ring_rx(itrq, I40E_POLL_NULL); mutex_exit(&itrq->itrq_rx_lock); - if (mp != NULL) { - mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, - itrq->itrq_rxgen); - } + if (mp == NULL) + return; + + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); } +/* ARGSUSED */ static void -i40e_intr_tx_work(i40e_t *i40e, int queue) +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { - i40e_trqpair_t *itrq; - - itrq = &i40e->i40e_trqpairs[queue]; i40e_tx_recycle_ring(itrq); } @@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e) i40e_intr_adminq_enable(i40e); } +/* + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of + * the MSI-X interrupt sequence. + */ uint_t i40e_intr_msix(void *arg1, void *arg2) { i40e_t *i40e = (i40e_t *)arg1; - int vector_idx = (int)(uintptr_t)arg2; + uint_t vector_idx = (uint_t)(uintptr_t)arg2; + + ASSERT3U(vector_idx, <, i40e->i40e_intr_count); /* * When using MSI-X interrupts, vector 0 is always reserved for the @@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2) return (DDI_INTR_CLAIMED); } - i40e_intr_rx_work(i40e, vector_idx - 1); - i40e_intr_tx_work(i40e, vector_idx - 1); - i40e_intr_io_enable(i40e, vector_idx); + ASSERT3U(vector_idx, >, 0); + /* + * We determine the queue indexes via simple arithmetic (as + * opposed to keeping explicit state like a bitmap). While + * convenient, it does mean that i40e_map_intrs_to_vectors(), + * i40e_intr_init_queue_msix(), and this function must be + * modified as a unit. + * + * We subtract 1 from the vector to offset the addition we + * performed during i40e_map_intrs_to_vectors().
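For illustration (the counts are assumed, not fixed by this change): suppose the instance ends up with 4 I/O vectors (i40e_intr_count == 5) and 16 transmit/receive queue pairs in total. i40e_map_intrs_to_vectors() assigns trqpair i to vector (i % 4) + 1, so vector 3 owns trqpairs 2, 6, 10, and 14. In the loop that follows, an interrupt on vector_idx == 3 starts at i = 3 - 1 = 2 and steps by i40e_intr_count - 1 = 4, visiting exactly those four trqpairs. i40e_intr_init_queue_msix() programs the same relationship into hardware: PFINT_LNKLSTN[2] (the register for vector 3) points at Rx queue 2, and the QINT_RQCTL/QINT_TQCTL chain then runs Rx 2 -> Tx 2 -> Rx 6 -> Tx 6 -> Rx 10 -> Tx 10 -> Rx 14 -> Tx 14 -> EOL.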
+ */ + for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs; + i += (i40e->i40e_intr_count - 1)) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + ASSERT3U(i, <, i40e->i40e_num_trqpairs); + ASSERT3P(itrq, !=, NULL); + i40e_intr_rx_work(i40e, itrq); + i40e_intr_tx_work(i40e, itrq); + } + + i40e_intr_io_enable(i40e, vector_idx); return (DDI_INTR_CLAIMED); } @@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) { i40e_hw_t *hw = &i40e->i40e_hw_space; uint32_t reg; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0]; int ret = DDI_INTR_CLAIMED; if (shared == B_TRUE) { @@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) i40e_intr_adminq_work(i40e); if (reg & I40E_INTR_NOTX_RX_MASK) - i40e_intr_rx_work(i40e, 0); + i40e_intr_rx_work(i40e, itrq); if (reg & I40E_INTR_NOTX_TX_MASK) - i40e_intr_tx_work(i40e, 0); + i40e_intr_tx_work(i40e, itrq); done: i40e_intr_adminq_enable(i40e); diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c index 54aef43424..0623aee513 100644 --- a/usr/src/uts/common/io/i40e/i40e_main.c +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -188,14 +188,15 @@ * VSI Management * -------------- * - * At this time, we currently only support a single MAC group, and thus a single - * VSI. This VSI is considered the default VSI and should be the only one that - * exists after a reset. Currently it is stored as the member - * i40e_t`i40e_vsi_id. While this works for the moment and for an initial - * driver, it's not sufficient for the longer-term path of the driver. Instead, - * we'll want to actually have a unique i40e_vsi_t structure which is used - * everywhere. Note that this means that every place that uses the - * i40e_t`i40e_vsi_id will need to be refactored. + * The PFs share 384 VSIs. The firmware creates one VSI per PF by default. + * During chip start we retrieve the SEID of this VSI and assign it as the + * default VSI for our VEB (one VEB per PF). We then add additional VSIs to + * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups. + * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can + * allocate the same number of VSIs, and b) to keep the interrupt multiplexing + * under control. In the future, when we improve the interrupt allocation, we + * may want to revisit this cap to make better use of the available VSIs. The + * VSI allocation and configuration can be found in i40e_chip_start(). 
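Putting that paragraph's pieces in order (every function named here appears later in this diff): i40e_chip_start() first reads the uplink MAC SEID via i40e_get_mac_seid(), then calls i40e_aq_add_veb() to create the per-PF VEB with the firmware-provided default VSI as its downlink, then configures that default VSI through i40e_config_def_vsi() as RX group 0, and finally calls i40e_add_vsi() for groups 1 through i40e_num_rx_groups - 1, each of which is exposed as its own MAC group via i40e_fill_rx_group(). Teardown in i40e_stop() runs in the opposite direction: the non-default VSIs are removed with i40e_delete_vsi(), then the VEB itself is deleted with i40e_aq_delete_element(); the default VSI is never deleted because the firmware owns it.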
* * ---------------- * Structure Layout @@ -240,7 +241,7 @@ * | i40e_hw_t --+---> Intel common code structure * | mac_handle_t --+---> GLDv3 handle to MAC * | ddi_periodic_t --+---> Link activity timer - * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_vsi_t * --+---> Array of VSIs * | i40e_func_rsrc_t --+---> Available hardware resources * | i40e_switch_rsrc_t * --+---> Switch resource snapshot * | i40e_sdu --+---> Current MTU @@ -249,11 +250,10 @@ * | i40e_maddr_t * --+---> Array of assigned multicast MACs * | i40e_mcast_promisccount --+---> Active multicast state * | i40e_promisc_on --+---> Current promiscuous mode state - * | int --+---> Number of transmit/receive pairs + * | uint_t --+---> Number of transmit/receive pairs + * | i40e_rx_group_t * --+---> Array of Rx groups * | kstat_t * --+---> PF kstats - * | kstat_t * --+---> VSI kstats * | i40e_pf_stats_t --+---> PF kstat backing data - * | i40e_vsi_stats_t --+---> VSI kstat backing data * | i40e_trqpair_t * --+---------+ * +---------------------------+ | * | @@ -359,8 +359,6 @@ * While bugs have been filed to cover this future work, the following gives an * overview of expected work: * - * o TSO support - * o Multiple group support * o DMA binding and breaking up the locking in ring recycling. * o Enhanced detection of device errors * o Participation in IRM @@ -371,7 +369,7 @@ #include "i40e_sw.h" -static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1"; +static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.3"; /* * The i40e_glock primarily protects the lists below and the i40e_device_t @@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail) } /* - * Here we're trying to get the ID of the default VSI. In general, when we come - * through and look at this shortly after attach, we expect there to only be a - * single element present, which is the default VSI. Importantly, each PF seems - * to not see any other devices, in part because of the simple switch mode that - * we're using. If for some reason, we see more artifact, we'll need to revisit - * what we're doing here. + * Here we're trying to set the SEID of the default VSI. In general, + * when we come through and look at this shortly after attach, we + * expect there to only be a single element present, which is the + * default VSI. Importantly, each PF seems to not see any other + * devices, in part because of the simple switch mode that we're + * using. If for some reason, we see more artifacts, we'll need to + * revisit what we're doing here. */ -static int -i40e_get_vsi_id(i40e_t *i40e) +static boolean_t +i40e_set_def_vsi_seid(i40e_t *i40e) { i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_aqc_get_switch_config_resp *sw_config; @@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e) if (rc != I40E_SUCCESS) { i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", rc, hw->aq.asq_last_status); - return (-1); + return (B_FALSE); } if (LE_16(sw_config->header.num_reported) != 1) { i40e_error(i40e, "encountered multiple (%d) switching units " "during attach, not proceeding", LE_16(sw_config->header.num_reported)); + return (B_FALSE); + } + + I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid; + return (B_TRUE); +} + +/* + * Get the SEID of the uplink MAC. 
+ */ +static int +i40e_get_mac_seid(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", + rc, hw->aq.asq_last_status); return (-1); } - return (sw_config->element[0].seid); + return (LE_16(sw_config->element[0].uplink_seid)); } /* @@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e) static void i40e_free_trqpairs(i40e_t *i40e) { - int i; i40e_trqpair_t *itrq; + if (i40e->i40e_rx_groups != NULL) { + kmem_free(i40e->i40e_rx_groups, + sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups); + i40e->i40e_rx_groups = NULL; + } + if (i40e->i40e_trqpairs != NULL) { - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { itrq = &i40e->i40e_trqpairs[i]; mutex_destroy(&itrq->itrq_rx_lock); mutex_destroy(&itrq->itrq_tx_lock); @@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e) static boolean_t i40e_alloc_trqpairs(i40e_t *i40e) { - int i; void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); /* @@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e) i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs, KM_SLEEP); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; itrq->itrq_i40e = i40e; @@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e) itrq->itrq_index = i; } + i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) * + i40e->i40e_num_rx_groups, KM_SLEEP); + + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i]; + + rxg->irg_index = i; + rxg->irg_i40e = i40e; + } + return (B_TRUE); } @@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e) /* * Unless a .conf file already overrode i40e_t structure values, they will * be 0, and need to be set in conjunction with the now-available HW report. - * - * However, at the moment, we cap all of these resources as we only support a - * single receive ring and a single group. */ /* ARGSUSED */ static void i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) { - if (i40e->i40e_num_trqpairs == 0) { - i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + if (i40e->i40e_num_trqpairs_per_vsi == 0) { + if (i40e_is_x722(i40e)) { + i40e->i40e_num_trqpairs_per_vsi = + I40E_722_MAX_TC_QUEUES; + } else { + i40e->i40e_num_trqpairs_per_vsi = + I40E_710_MAX_TC_QUEUES; + } } if (i40e->i40e_num_rx_groups == 0) { @@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw) } /* - * We need to obtain the Virtual Station ID (VSI) before we can - * perform other operations on the device. + * We need to obtain the Default Virtual Station SEID (VSI) + * before we can perform other operations on the device. 
*/ - i40e->i40e_vsi_id = i40e_get_vsi_id(i40e); - if (i40e->i40e_vsi_id == -1) { - i40e_error(i40e, "failed to obtain VSI ID"); + if (!i40e_set_def_vsi_seid(i40e)) { + i40e_error(i40e, "failed to obtain Default VSI SEID"); return (B_FALSE); } @@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e) i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable", + B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); @@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) } i40e->i40e_intr_type = 0; + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + /* + * We need to determine the number of queue pairs per traffic + * class. We only have one traffic class (TC0), so we'll base + * this off the number of interrupts provided. Furthermore, + * since we only use one traffic class, the number of queues + * per traffic class and per VSI are the same. + */ if ((intr_types & DDI_INTR_TYPE_MSIX) && - i40e->i40e_intr_force <= I40E_INTR_MSIX) { - if (i40e_alloc_intr_handles(i40e, devinfo, - DDI_INTR_TYPE_MSIX)) { - i40e->i40e_num_trqpairs = - MIN(i40e->i40e_intr_count - 1, max_trqpairs); - return (B_TRUE); - } + (i40e->i40e_intr_force <= I40E_INTR_MSIX) && + (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) { + uint32_t n; + + /* + * While we want the number of queue pairs to match + * the number of interrupts, we must keep stay in + * bounds of the maximum number of queues per traffic + * class. We subtract one from i40e_intr_count to + * account for interrupt zero; which is currently + * restricted to admin queue commands and other + * interrupt causes. + */ + n = MIN(i40e->i40e_intr_count - 1, max_trqpairs); + ASSERT3U(n, >, 0); + + /* + * Round up to the nearest power of two to ensure that + * the QBASE aligns with the TC size which must be + * programmed as a power of two. See the queue mapping + * description in section 7.4.9.5.5.1. + * + * If i40e_intr_count - 1 is not a power of two then + * some queue pairs on the same VSI will have to share + * an interrupt. + * + * We may want to revisit this logic in a future where + * we have more interrupts and more VSIs. Otherwise, + * each VSI will use as many interrupts as possible. + * Using more QPs per VSI means better RSS for each + * group, but at the same time may require more + * sharing of interrupts across VSIs. This may be a + * good candidate for a .conf tunable. + */ + n = 0x1 << ddi_fls(n); + i40e->i40e_num_trqpairs_per_vsi = n; + ASSERT3U(i40e->i40e_num_rx_groups, >, 0); + i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi * + i40e->i40e_num_rx_groups; + return (B_TRUE); } /* @@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) * single MSI interrupt. */ i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs; i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; if ((intr_types & DDI_INTR_TYPE_MSI) && @@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) static boolean_t i40e_map_intrs_to_vectors(i40e_t *i40e) { - int i; - if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { return (B_TRUE); } /* - * Each queue pair is mapped to a single interrupt, so transmit - * and receive interrupts for a given queue share the same vector. 
- * The number of queue pairs is one less than the number of interrupt - * vectors and is assigned the vector one higher than its index. - * Vector zero is reserved for the admin queue. + * Each queue pair is mapped to a single interrupt, so + * transmit and receive interrupts for a given queue share the + * same vector. Vector zero is reserved for the admin queue. */ - ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1); + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint_t vector = i % (i40e->i40e_intr_count - 1); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1; - i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1; + i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1; + i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1; } return (B_TRUE); @@ -1923,89 +2005,282 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) } /* - * Configure the hardware for the Virtual Station Interface (VSI). Currently - * we only support one, but in the future we could instantiate more than one - * per attach-point. + * Set the properties which have common values across all the VSIs. + * Consult the "Add VSI" command section (7.4.9.5.5.1) for a + * complete description of these properties. */ -static boolean_t -i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +static void +i40e_set_shared_vsi_props(i40e_t *i40e, + struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx) { - struct i40e_vsi_context context; - int err, tc_queues; + uint_t tc_queues; + uint16_t vsi_qp_base; - bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; - context.pf_num = hw->pf_id; - err = i40e_aq_get_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "get VSI params failed with %d", err); - return (B_FALSE); - } - - i40e->i40e_vsi_num = context.vsi_number; + /* + * It's important that we use bitwise-OR here; callers to this + * function might enable other sections before calling this + * function. + */ + info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID | + I40E_AQ_VSI_PROP_VLAN_VALID); /* - * Set the queue and traffic class bits. Keep it simple for now. + * Calculate the starting QP index for this VSI. This base is + * relative to the PF queue space; so a value of 0 for PF#1 + * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1. */ - context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; - context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; - context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi; + info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG); + info->queue_mapping[0] = + LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) & + I40E_AQ_VSI_QUEUE_MASK); /* - * tc_queues determines the size of the traffic class, where the size is - * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722. + * tc_queues determines the size of the traffic class, where + * the size is 2^^tc_queues to a maximum of 64 for the X710 + * and 128 for the X722. * * Some examples: - * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1. - * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16. - * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32. - * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64. + * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1. 
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16. + * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32. + * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64. */ - tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1); + tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1); - context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & - I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | - ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & - I40E_AQ_VSI_TC_QUE_NUMBER_MASK); + /* + * The TC queue mapping is in relation to the VSI queue space. + * Since we are only using one traffic class (TC0) we always + * start at queue offset 0. + */ + info->tc_mapping[0] = + LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & + I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | + ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & + I40E_AQ_VSI_TC_QUE_NUMBER_MASK)); - context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; - context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + /* + * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode") + * + * Allow tagged and untagged packets to be sent to this + * VSI from the host. + * + * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode") + * + * Leave the tag on the frame and place no VLAN + * information in the descriptor. We want this mode + * because our MAC layer will take care of the VLAN tag, + * if there is one. + */ + info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | I40E_AQ_VSI_PVLAN_EMOD_NOTHING; +} - context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); +/* + * Delete the VSI at this index, if one exists. We assume there is no + * action we can take if this command fails but to log the failure. + */ +static void +i40e_delete_vsi(i40e_t *i40e, uint_t idx) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint16_t seid = i40e->i40e_vsis[idx].iv_seid; - i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); - if (i40e_stat_vsi_init(i40e) == B_FALSE) - return (B_FALSE); + if (seid != 0) { + int rc; - err = i40e_aq_update_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "Update VSI params failed with %d", err); + rc = i40e_aq_delete_element(hw, seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VSI %d: %d", + rc, hw->aq.asq_last_status); + } + + i40e->i40e_vsis[idx].iv_seid = 0; + } +} + +/* + * Add a new VSI. + */ +static boolean_t +i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx) +{ + struct i40e_vsi_context ctx; + i40e_rx_group_t *rxg; + int rc; + + /* + * The default VSI is created by the controller. This function + * creates new, non-default VSIs only.
+ */ + ASSERT3U(idx, !=, 0); + + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.uplink_seid = i40e->i40e_veb_seid; + ctx.pf_num = hw->pf_id; + ctx.flags = I40E_AQ_VSI_TYPE_PF; + ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + i40e_set_shared_vsi_props(i40e, &ctx.info, idx); + + rc = i40e_aq_add_vsi(hw, &ctx, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); } + rxg = &i40e->i40e_rx_groups[idx]; + rxg->irg_vsi_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_number = ctx.vsi_number; + i40e->i40e_vsis[idx].iv_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + + if (i40e_stat_vsi_init(i40e, idx) == B_FALSE) + return (B_FALSE); return (B_TRUE); } /* - * Configure the RSS key. For the X710 controller family, this is set on a - * per-PF basis via registers. For the X722, this is done on a per-VSI basis - * through the admin queue. + * Configure the hardware for the Default Virtual Station Interface (VSI). */ static boolean_t -i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw) { - uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + struct i40e_vsi_context ctx; + i40e_rx_group_t *def_rxg; + int err; + struct i40e_aqc_remove_macvlan_element_data filt; - (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.seid = I40E_DEF_VSI_SEID(i40e); + ctx.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d", err); + return (B_FALSE); + } - if (i40e_is_x722(i40e)) { + ctx.info.valid_sections = 0; + i40e->i40e_vsis[0].iv_number = ctx.vsi_number; + i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e, 0) == B_FALSE) + return (B_FALSE); + + i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX); + + err = i40e_aq_update_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + def_rxg = &i40e->i40e_rx_groups[0]; + def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e); + + /* + * We have seen three different behaviors in regards to the + * Default VSI and its implicit L2 MAC+VLAN filter. + * + * 1. It has an implicit filter for the factory MAC address + * and this filter counts against 'ifr_nmacfilt_used'. + * + * 2. It has an implicit filter for the factory MAC address + * and this filter DOES NOT count against 'ifr_nmacfilt_used'. + * + * 3. It DOES NOT have an implicit filter. + * + * All three of these cases are accounted for below. If we + * fail to remove the L2 filter (ENOENT) then we assume there + * wasn't one. Otherwise, if we successfully remove the + * filter, we make sure to update the 'ifr_nmacfilt_used' + * count accordingly. + * + * We remove this filter to prevent duplicate delivery of + * packets destined for the primary MAC address as DLS will + * create the same filter on a non-default VSI for the primary + * MAC client. + * + * If you change the following code please test it across as + * many X700 series controllers and firmware revisions as you + * can. 
+ */ + bzero(&filt, sizeof (filt)); + bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + filt.vlan_tag = 0; + + ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1); + i40e_log(i40e, "Num L2 filters: %u", + i40e->i40e_resources.ifr_nmacfilt_used); + + err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, + NULL); + if (err == I40E_SUCCESS) { + i40e_log(i40e, + "Removed L2 filter from Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else if (hw->aq.asq_last_status == ENOENT) { + i40e_log(i40e, + "No L2 filter for Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else { + i40e_error(i40e, "Failed to remove L2 filter from" + " Default VSI with SEID %u: %d (%d)", + I40E_DEF_VSI_SEID(i40e), err, hw->aq.asq_last_status); + + return (B_FALSE); + } + + /* + * As mentioned above, the controller created an implicit L2 + * filter for the primary MAC. We want to remove both the + * filter and decrement the filter count. However, not all + * controllers count this implicit filter against the total + * MAC filter count. So here we are making sure it is either + * one or zero. If it is one, then we know it is for the + * implicit filter and we should decrement since we just + * removed the filter above. If it is zero then we know the + * controller does not count the implicit filter, and it + * was enough to just remove it; we leave the count alone. + * But if it is neither, then we have never seen a controller + * like this before and we should fail to attach. + * + * It is unfortunate that this code must exist but the + * behavior of this implicit L2 filter and its corresponding + * count were discovered through empirical testing. The + * programming manuals hint at this filter but do not + * explicitly call out the exact behavior. + */ + if (i40e->i40e_resources.ifr_nmacfilt_used == 1) { + i40e->i40e_resources.ifr_nmacfilt_used--; + } else { + if (i40e->i40e_resources.ifr_nmacfilt_used != 0) { + i40e_error(i40e, "Unexpected L2 filter count: %u" + " (expected 0)", + i40e->i40e_resources.ifr_nmacfilt_used); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw) +{ + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; struct i40e_aqc_get_set_rss_key_data key; - const char *u8seed = (char *)seed; + const char *u8seed; enum i40e_status_code status; + uint16_t vsi_number = i40e->i40e_vsis[i].iv_number; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + u8seed = (char *)seed; CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) + sizeof (key.extended_hash_key)));
+ */ +static boolean_t +i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e_is_x722(i40e)) { + if (!i40e_config_rss_key_x722(i40e, hw)) + return (B_FALSE); } else { - uint_t i; - for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]); } @@ -2034,11 +2330,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) * family, with the X722 using a known 7-bit width. On the X710 controller, this * is programmed through its control registers where as on the X722 this is * configured through the admin queue. Also of note, the X722 allows the LUT to - * be set on a per-PF or VSI basis. At this time, as we only have a single VSI, - * we use the PF setting as it is the primary VSI. + * be set on a per-PF or VSI basis. At this time we use the PF setting. If we + * decide to use the per-VSI LUT in the future, then we will need to modify the + * i40e_add_vsi() function to set the RSS LUT bits in the queueing section. * * We populate the LUT in a round robin fashion with the rx queue indices from 0 - * to i40e_num_trqpairs - 1. + * to i40e_num_trqpairs_per_vsi - 1. */ static boolean_t i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) @@ -2068,15 +2365,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1; } - for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) - ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask; + for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) { + ((uint8_t *)hlut)[i] = + (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask; + } if (i40e_is_x722(i40e)) { enum i40e_status_code status; - status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE, - (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE); + + status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut, + I40E_HLUT_TABLE_SIZE); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set RSS LUT: %d", status); + i40e_error(i40e, "failed to set RSS LUT %d: %d", + status, hw->aq.asq_last_status); goto out; } } else { @@ -2152,6 +2454,7 @@ i40e_chip_start(i40e_t *i40e) i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_filter_control_settings filter; int rc; + uint8_t err; if (((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33)) || (hw->aq.fw_maj_ver < 4)) { @@ -2167,6 +2470,15 @@ i40e_chip_start(i40e_t *i40e) /* Determine hardware state */ i40e_get_hw_state(i40e, hw); + /* For now, we always disable Ethernet Flow Control. */ + hw->fc.requested_mode = I40E_FC_NONE; + rc = i40e_set_fc(hw, &err, B_TRUE); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Setting flow control failed, returned %d" + " with error: 0x%x", rc, err); + return (B_FALSE); + } + /* Initialize mac addresses. */ i40e_init_macaddrs(i40e, hw); @@ -2188,8 +2500,34 @@ i40e_chip_start(i40e_t *i40e) i40e_intr_chip_init(i40e); - if (!i40e_config_vsi(i40e, hw)) + rc = i40e_get_mac_seid(i40e); + if (rc == -1) { + i40e_error(i40e, "failed to obtain MAC Uplink SEID"); + return (B_FALSE); + } + i40e->i40e_mac_seid = (uint16_t)rc; + + /* + * Create a VEB in order to support multiple VSIs. Each VSI + * functions as a MAC group. This call sets the PF's MAC as + * the uplink port and the PF's default VSI as the default + * downlink port. 
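
Stepping back to the RSS LUT hunk above: the round-robin population it describes is easy to model outside the driver. A small sketch, assuming a 512-byte table (an assumption, not taken from this change), a hypothetical queue count of 8, and the 7-bit entry width mentioned for the X722:

	#include <stdio.h>

	#define	HLUT_TABLE_SIZE	512	/* assumed table size, in bytes */

	int
	main(void)
	{
		unsigned char hlut[HLUT_TABLE_SIZE];
		unsigned int qcount = 8;		/* hypothetical queues per VSI */
		unsigned int lut_mask = (1U << 7) - 1;	/* 7-bit entry width */

		/*
		 * Spread queue indices 0 .. qcount-1 across the table in a
		 * round-robin fashion, clamped to the advertised entry width.
		 */
		for (unsigned int i = 0; i < HLUT_TABLE_SIZE; i++)
			hlut[i] = (i % qcount) & lut_mask;

		printf("entry 0 -> queue %u, entry 9 -> queue %u\n",
		    (unsigned int)hlut[0], (unsigned int)hlut[9]);
		return (0);
	}
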
+ */ + rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e), + 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); + } + + if (!i40e_config_def_vsi(i40e, hw)) + return (B_FALSE); + + for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) { + if (!i40e_add_vsi(i40e, hw, i)) + return (B_FALSE); + } if (!i40e_config_rss(i40e, hw)) return (B_FALSE); @@ -2549,7 +2887,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq) * assigned to traffic class zero, because we don't actually use them. */ bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; + context.seid = I40E_DEF_VSI_SEID(i40e); context.pf_num = hw->pf_id; err = i40e_aq_get_vsi_params(hw, &context, NULL); if (err != I40E_SUCCESS) { @@ -2653,7 +2991,8 @@ i40e_setup_tx_rings(i40e_t *i40e) void i40e_stop(i40e_t *i40e, boolean_t free_allocations) { - int i; + uint_t i; + i40e_hw_t *hw = &i40e->i40e_hw_space; ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); @@ -2689,6 +3028,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) delay(50 * drv_usectohz(1000)); + /* + * We don't delete the default VSI because it replaces the VEB + * after VEB deletion (see the "Delete Element" section). + * Furthermore, since the default VSI is provided by the + * firmware, we never attempt to delete it. + */ + for (i = 1; i < i40e->i40e_num_rx_groups; i++) { + i40e_delete_vsi(i40e, i); + } + + if (i40e->i40e_veb_seid != 0) { + int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VEB %d: %d", rc, + hw->aq.asq_last_status); + } + + i40e->i40e_veb_seid = 0; + } + i40e_intr_chip_fini(i40e); for (i = 0; i < i40e->i40e_num_trqpairs; i++) { @@ -2718,7 +3078,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); } - i40e_stat_vsi_fini(i40e); + for (i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_stat_vsi_fini(i40e, i); + } i40e->i40e_link_speed = 0; i40e->i40e_link_duplex = 0; @@ -2783,7 +3145,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc) * Enable broadcast traffic; however, do not enable multicast traffic. * That's handle exclusively through MAC's mc_multicst routines. */ - err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE, + NULL); if (err != I40E_SUCCESS) { i40e_error(i40e, "failed to set default VSI: %d", err); rc = B_FALSE; diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c index 7a4f0faedd..e40c9f2c53 100644 --- a/usr/src/uts/common/io/i40e/i40e_stats.c +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -69,12 +69,7 @@ * --------------------- * * The hardware keeps statistics at each physical function/MAC (PF) and it keeps - * statistics on each virtual station interface (VSI). Currently we only use one - * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited - * number of statistics units available. While every PF is guaranteed to have a - * statistics unit, it is possible that we will run out for a given VSI. We'll - * have to figure out an appropriate strategy here when we end up supporting - * multiple VSIs. 
+ * statistics on each virtual station interface (VSI). * * The hardware keeps these statistics as 32-bit and 48-bit counters. We are * required to read them and then compute the differences between them. The @@ -100,10 +95,10 @@ * data. * * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the - * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in - * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All - * of this data is protected by the i40e_stat_lock, which should be taken last, - * when acquiring locks. + * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstats are in + * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the + * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the + * i40e_stat_lock, which should be taken last, when acquiring locks. */ static void @@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, } static void -i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init) { i40e_vsi_stats_t *ivs; i40e_vsi_kstats_t *ivk; - int id = i40e->i40e_vsi_stat_id; + uint16_t id = i40e->i40e_vsis[idx].iv_stats_id; - ASSERT(i40e->i40e_vsi_kstat != NULL); - ivs = &i40e->i40e_vsi_stat; - ivk = i40e->i40e_vsi_kstat->ks_data; + ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL); + ivs = &i40e->i40e_vsis[idx].iv_stats; + ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data; mutex_enter(&i40e->i40e_stat_lock); @@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) return (EACCES); i40e = ksp->ks_private; - i40e_stat_vsi_update(i40e, B_FALSE); + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) + i40e_stat_vsi_update(i40e, i, B_FALSE); + return (0); } void -i40e_stat_vsi_fini(i40e_t *i40e) +i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx) { - if (i40e->i40e_vsi_kstat != NULL) { - kstat_delete(i40e->i40e_vsi_kstat); - i40e->i40e_vsi_kstat = NULL; + if (i40e->i40e_vsis[idx].iv_kstats != NULL) { + kstat_delete(i40e->i40e_vsis[idx].iv_kstats); + i40e->i40e_vsis[idx].iv_kstats = NULL; } } boolean_t -i40e_stat_vsi_init(i40e_t *i40e) +i40e_stat_vsi_init(i40e_t *i40e, uint_t idx) { kstat_t *ksp; i40e_vsi_kstats_t *ivk; char buf[64]; + uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid; - (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id); ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); if (ksp == NULL) { - i40e_error(i40e, "Failed to create kstats for VSI %d", - i40e->i40e_vsi_id); + i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id); return (B_FALSE); } - i40e->i40e_vsi_kstat = ksp; + i40e->i40e_vsis[idx].iv_kstats = ksp; ivk = ksp->ks_data; ksp->ks_update = i40e_stat_vsi_kstat_update; ksp->ks_private = i40e; @@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e) kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", KSTAT_DATA_UINT64); - bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); - i40e_stat_vsi_update(i40e, B_TRUE); - kstat_install(i40e->i40e_vsi_kstat); + bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, idx, B_TRUE); + kstat_install(i40e->i40e_vsis[idx].iv_kstats); return (B_TRUE); } @@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e) void i40e_stats_fini(i40e_t *i40e) { - ASSERT(i40e->i40e_vsi_kstat == NULL); +#ifdef DEBUG + for (uint_t i = 0; i < 
i40e->i40e_num_rx_groups; i++) { + ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL); + } +#endif + if (i40e->i40e_pf_kstat != NULL) { kstat_delete(i40e->i40e_pf_kstat); i40e->i40e_pf_kstat = NULL; @@ -1230,6 +1232,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_recycled, "tx_recycled", KSTAT_DATA_UINT64); tsp->itxs_recycled.value.ui64 = 0; + kstat_named_init(&tsp->itxs_force_copy, "tx_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_force_copy.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tso_force_copy, "tx_tso_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_tso_force_copy.value.ui64 = 0; kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail", KSTAT_DATA_UINT64); @@ -1249,6 +1257,15 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", KSTAT_DATA_UINT64); tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck", + KSTAT_DATA_UINT64); + tsp->itxs_lso_nohck.value.ui64 = 0; + kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails", + KSTAT_DATA_UINT64); + tsp->itxs_bind_fails.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tx_short, "tx_short", + KSTAT_DATA_UINT64); + tsp->itxs_tx_short.value.ui64 = 0; kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", KSTAT_DATA_UINT64); tsp->itxs_err_notcb.value.ui64 = 0; diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h index 78aced0144..e7b64c2160 100644 --- a/usr/src/uts/common/io/i40e/i40e_sw.h +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -152,9 +152,10 @@ typedef enum i40e_itr_index { } i40e_itr_index_t; /* - * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + * The hardware claims to support LSO up to 256 KB, but due to the limitations + * imposed by the IP header for non-jumbo frames, we cap it at 64 KB. */ -#define I40E_LSO_MAXLEN (256 * 1024) +#define I40E_LSO_MAXLEN (64 * 1024) #define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ #define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ @@ -173,13 +174,22 @@ typedef enum i40e_itr_index { #define I40E_BUF_IPHDR_ALIGNMENT 2 /* - * The XL710 controller has a limit of eight buffers being allowed to be used - * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * The XL710 controller has a total of eight buffers available for the + * transmission of any single frame. This is defined in 8.4.1 - Transmit * Packet in System Memory. */ #define I40E_TX_MAX_COOKIE 8 /* + * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more + * cookies than a non-LSO frame. The key here to is to select a value such + * that once the HW has chunked up the LSO frame into MSS-sized segments that no + * single segment spans more than 8 cookies (see comments for + * I40E_TX_MAX_COOKIE) + */ +#define I40E_TX_LSO_MAX_COOKIE 32 + +/* * Sizing to determine the amount of available descriptors at which we'll * consider ourselves blocked. Also, when we have these available, we'll then * consider ourselves available to transmit to MAC again. Strictly speaking, the @@ -203,6 +213,12 @@ typedef enum i40e_itr_index { #define I40E_MAX_TX_DMA_THRESH INT32_MAX /* + * The max size of each individual tx buffer is 16KB - 1. 
+ * See table 8-17 + */ +#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull + +/* * Resource sizing counts. There are various aspects of hardware where we may * have some variable number of elements that we need to handle. Such as the * hardware capabilities and switch capacities. We cannot know a priori how many @@ -240,21 +256,6 @@ typedef enum i40e_itr_index { #define I40E_HMC_TX_TPH_DISABLE 0 /* - * Whenever we establish and create a VSI, we need to assign some number of - * queues that it's allowed to access from the PF. Because we only have a single - * VSI per PF at this time, we assign it all the queues. - * - * Many of the devices support what's called Data-center Bridging. Which is a - * feature that we don't have much use of at this time. However, we still need - * to fill in this information. We follow the guidance of the note in Table 7-80 - * which talks about bytes 62-77. It says that if we don't want to assign - * anything to traffic classes, we should set the field to zero. Effectively - * this means that everything in the system is assigned to traffic class zero. - */ -#define I40E_ASSIGN_ALL_QUEUES 0 -#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 - -/* * This defines the error mask that we care about from rx descriptors. Currently * we're only concerned with the general errors and oversize errors. */ @@ -268,12 +269,12 @@ typedef enum i40e_itr_index { #define I40E_DDI_PROP_LEN 64 /* - * We currently consolidate some overrides that we use in the code here. These - * will be gone in the fullness of time, but as we're bringing up the device, - * this is what we use. + * Place an artificial limit on the max number of groups. The X710 + * series supports up to 384 VSIs to be partitioned across PFs as the + * driver sees fit. But until we support more interrupts this seems + * like a good place to start. */ -#define I40E_GROUP_MAX 1 -#define I40E_TRQPAIR_MAX 1 +#define I40E_GROUP_MAX 32 #define I40E_GROUP_NOMSIX 1 #define I40E_TRQPAIR_NOMSIX 1 @@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block { typedef enum { I40E_TX_NONE, I40E_TX_COPY, - I40E_TX_DMA + I40E_TX_DMA, + I40E_TX_DESC, } i40e_tx_type_t; typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef struct i40e_tx_context_desc i40e_tx_context_desc_t; typedef union i40e_32byte_rx_desc i40e_rx_desc_t; +struct i40e_dma_bind_info { + caddr_t dbi_paddr; + size_t dbi_len; +}; + typedef struct i40e_tx_control_block { struct i40e_tx_control_block *tcb_next; mblk_t *tcb_mp; i40e_tx_type_t tcb_type; ddi_dma_handle_t tcb_dma_handle; + ddi_dma_handle_t tcb_lso_dma_handle; i40e_dma_buffer_t tcb_dma; + struct i40e_dma_bind_info *tcb_bind_info; + uint_t tcb_bind_ncookies; + boolean_t tcb_used_lso; } i40e_tx_control_block_t; /* @@ -517,6 +529,8 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_packets; /* Packets out on queue */ kstat_named_t itxs_descriptors; /* Descriptors issued */ kstat_named_t itxs_recycled; /* Descriptors reclaimed */ + kstat_named_t itxs_force_copy; /* non-TSO force copy */ + kstat_named_t itxs_tso_force_copy; /* TSO force copy */ /* * Various failure conditions. 
*/ @@ -526,6 +540,9 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ kstat_named_t itxs_hck_badl4; /* Bad L4 Paylaod */ + kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */ + kstat_named_t itxs_bind_fails; /* DMA bind failures */ + kstat_named_t itxs_tx_short; /* Tx chain too short */ kstat_named_t itxs_err_notcb; /* No tcb's available */ kstat_named_t itxs_err_nodescs; /* No tcb's available */ @@ -761,6 +778,25 @@ typedef struct i40e_func_rsrc { uint_t ifr_nmcastfilt_used; } i40e_func_rsrc_t; +typedef struct i40e_vsi { + uint16_t iv_seid; + uint16_t iv_number; + kstat_t *iv_kstats; + i40e_vsi_stats_t iv_stats; + uint16_t iv_stats_id; +} i40e_vsi_t; + +/* + * While irg_index and irg_grp_hdl aren't used anywhere, they are + * still useful for debugging. + */ +typedef struct i40e_rx_group { + uint32_t irg_index; /* index in i40e_rx_groups[] */ + uint16_t irg_vsi_seid; /* SEID of VSI for this group */ + mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */ + struct i40e *irg_i40e; /* ref to i40e_t */ +} i40e_rx_group_t; + /* * Main i40e per-instance state. */ @@ -789,11 +825,18 @@ typedef struct i40e { struct i40e_aq_get_phy_abilities_resp i40e_phy; void *i40e_aqbuf; +#define I40E_DEF_VSI_IDX 0 +#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX]) +#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid) + /* * Device state, switch information, and resources. */ - int i40e_vsi_id; - uint16_t i40e_vsi_num; + i40e_vsi_t i40e_vsis[I40E_GROUP_MAX]; + uint16_t i40e_mac_seid; /* SEID of physical MAC */ + uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */ + uint16_t i40e_vsi_avail; /* VSIs avail to this PF */ + uint16_t i40e_vsi_used; /* VSIs used by this PF */ struct i40e_device *i40e_device; i40e_func_rsrc_t i40e_resources; uint16_t i40e_switch_rsrc_alloc; @@ -814,12 +857,13 @@ typedef struct i40e { */ i40e_trqpair_t *i40e_trqpairs; boolean_t i40e_mr_enable; - int i40e_num_trqpairs; + uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */ + uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */ uint_t i40e_other_itr; - int i40e_num_rx_groups; + i40e_rx_group_t *i40e_rx_groups; + uint_t i40e_num_rx_groups; int i40e_num_rx_descs; - mac_group_handle_t i40e_rx_group_handle; uint32_t i40e_rx_ring_size; uint32_t i40e_rx_buf_size; boolean_t i40e_rx_hcksum_enable; @@ -832,6 +876,7 @@ typedef struct i40e { uint32_t i40e_tx_buf_size; uint32_t i40e_tx_block_thresh; boolean_t i40e_tx_hcksum_enable; + boolean_t i40e_tx_lso_enable; uint32_t i40e_tx_dma_min; uint_t i40e_tx_itr; @@ -855,6 +900,7 @@ typedef struct i40e { */ ddi_dma_attr_t i40e_static_dma_attr; ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_dma_attr_t i40e_txbind_lso_dma_attr; ddi_device_acc_attr_t i40e_desc_acc_attr; ddi_device_acc_attr_t i40e_buf_acc_attr; @@ -872,10 +918,7 @@ typedef struct i40e { */ kmutex_t i40e_stat_lock; kstat_t *i40e_pf_kstat; - kstat_t *i40e_vsi_kstat; i40e_pf_stats_t i40e_pf_stat; - i40e_vsi_stats_t i40e_vsi_stat; - uint16_t i40e_vsi_stat_id; /* * Misc. stats and counters that should maybe one day be kstats. 
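
To make the new per-group bookkeeping concrete, here is a stripped-down userland model of the i40e_vsis[] array and the default-VSI accessor macros introduced above. The field set is heavily reduced and the SEID values are invented purely for illustration:

	#include <stdio.h>
	#include <stdint.h>

	#define	GROUP_MAX	32	/* mirrors I40E_GROUP_MAX */

	typedef struct vsi {
		uint16_t	iv_seid;	/* switch element ID */
		uint16_t	iv_number;	/* absolute VSI number */
		uint16_t	iv_stats_id;	/* statistics block */
	} vsi_t;

	typedef struct softc {
		vsi_t	sc_vsis[GROUP_MAX];
	} softc_t;

	/* Index 0 is always the firmware-provided default VSI. */
	#define	DEF_VSI_IDX		0
	#define	DEF_VSI(sc)		((sc)->sc_vsis[DEF_VSI_IDX])
	#define	DEF_VSI_SEID(sc)	(DEF_VSI(sc).iv_seid)

	int
	main(void)
	{
		softc_t sc = { 0 };

		sc.sc_vsis[0].iv_seid = 390;	/* made-up SEIDs */
		sc.sc_vsis[1].iv_seid = 391;

		/* Extra groups are addressed by index, the default via macro. */
		printf("default VSI SEID %u, group 1 VSI SEID %u\n",
		    (unsigned int)DEF_VSI_SEID(&sc),
		    (unsigned int)sc.sc_vsis[1].iv_seid);
		return (0);
	}
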
@@ -975,8 +1018,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); */ extern boolean_t i40e_stats_init(i40e_t *); extern void i40e_stats_fini(i40e_t *); -extern boolean_t i40e_stat_vsi_init(i40e_t *); -extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t); +extern void i40e_stat_vsi_fini(i40e_t *, uint_t); extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); extern int i40e_m_stat(void *, uint_t, uint64_t *); diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c index 57620f03fa..caafa3e102 100644 --- a/usr/src/uts/common/io/i40e/i40e_transceiver.c +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -60,19 +60,19 @@ * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * - * Note, that for rx, we do something that might be unexpected. We always add + * Note, that for RX, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. * - * Both the rx and tx descriptor rings (which are what we use to communicate + * Both the RX and TX descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (4 bytes and 2 bytes respectively) times the total - * number of descriptors for an rx and tx ring. + * number of descriptors for an RX and TX ring. * - * While the rx and tx descriptors are allocated using DMA-based memory, the + * While the RX and TX descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different @@ -113,16 +113,16 @@ * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free - * list. Each list is sized equal to the number of descriptors in the rx ring. - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * list. Each list is sized equal to the number of descriptors in the RX ring. + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * - * During operation, we always maintain the invariant that each rx descriptor - * has an associated rx control block which lives in the working list. If we + * During operation, we always maintain the invariant that each RX descriptor + * has an associated RX control block which lives in the working list. 
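
The loaning rule that the next paragraph spells out reduces to one decision per received frame: hand the filled buffer up to MAC only when a replacement control block can be pulled from the free list, otherwise copy the frame into a new mblk. A schematic userland model of just that decision, with invented names:

	#include <stdio.h>

	/* Schematic stand-in for the per-ring free list depth. */
	static int free_rcbs = 1;

	static const char *
	receive_one_frame(void)
	{
		if (free_rcbs > 0) {
			/*
			 * A spare control block exists: swap it into the ring
			 * slot and loan the filled DMA buffer up to MAC.
			 */
			free_rcbs--;
			return ("loaned");
		}

		/*
		 * No spare block: keep the buffer bound to the ring and hand
		 * MAC a copy instead, preserving the ring/working-list
		 * invariant.
		 */
		return ("copied");
	}

	int
	main(void)
	{
		printf("first frame %s, second frame %s\n",
		    receive_one_frame(), receive_one_frame());
		return (0);
	}
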
If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort @@ -130,14 +130,14 @@ * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is - * called on the block, at which point we restore the rx control block to the + * called on the block, at which point we restore the RX control block to the * free list and are able to reuse the DMA memory again. While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * - * The following state machine describes the life time of a rx control block. In - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx + * The following state machine describes the life time of a RX control block. In + * the diagram we abbrviate the RX ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | @@ -160,11 +160,11 @@ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * - * Finally, note that every rx control block has a reference count on it. One + * Finally, note that every RX control block has a reference count on it. One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by - * one. Whenever we loan up the rx control block and associated buffer to MAC, + * one. Whenever we loan up the RX control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure @@ -192,10 +192,10 @@ * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * - * When we iterate over the rx descriptors and thus the received frames, we are + * When we iterate over the RX descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value @@ -249,31 +249,54 @@ * differently due to the fact that all data is originated by the operating * system and not by the device. * - * Like rx, there is both a descriptor ring that we use to communicate to the - * driver and which points to the memory used to transmit a frame. Similarly, - * there is a corresponding transmit control block. Each transmit control block - * has a region of DMA memory allocated to it; however, the way we use it - * varies. + * Like RX, there is both a descriptor ring that we use to communicate to the + * driver and which points to the memory used to transmit a frame. 
Similarly,
+ * there is a corresponding transmit control block; however, the correspondence
+ * between descriptors and control blocks is more complex and not necessarily
+ * 1-to-1.
 *
 * The driver is asked to process a single frame at a time. That message block
 * may be made up of multiple fragments linked together by the mblk_t`b_cont
 * member. The device has a hard limit of up to 8 buffers being allowed for use
- * for a single logical frame. For each fragment, we'll try and use an entry
- * from the tx descriptor ring and then we'll allocate a corresponding tx
- * control block. Depending on the size of the fragment, we may copy it around
- * or we might instead try to do DMA binding of the fragment.
- *
- * If we exceed the number of blocks that fit, we'll try to pull up the block
- * and then we'll do a DMA bind and send it out.
- *
- * If we don't have enough space in the ring or tx control blocks available,
+ * for a single non-LSO packet or LSO segment. The number of TX ring entries
+ * (and thus TX control blocks) used depends on the fragment sizes and DMA
+ * layout, as explained below.
+ *
+ * We alter our DMA strategy based on a threshold tied to the fragment size.
+ * This threshold is configurable via the tx_dma_threshold property. If the
+ * fragment is above the threshold, we DMA bind it -- consuming one TCB and
+ * potentially several data descriptors. The exact number of descriptors (equal
+ * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset
+ * into page, b_wptr offset into page, and the physical layout of the dblk's
+ * memory (contiguous or not). Essentially, we are at the mercy of the DMA
+ * engine and the dblk's memory allocation. Knowing the exact number of
+ * descriptors up front is a task best not taken on by the driver itself.
+ * Instead, we attempt to DMA bind the fragment and verify the descriptor
+ * layout meets hardware constraints. If the proposed DMA bind does not satisfy
+ * the hardware constraints, then we discard it and instead copy the entire
+ * fragment into the pre-allocated TCB buffer (or buffers if the fragment is
+ * larger than the TCB buffer).
+ *
+ * If the fragment is below or at the threshold, we copy it to the pre-allocated
+ * buffer of a TCB. We compress consecutive copy fragments into a single TCB to
+ * conserve resources. We are guaranteed that the TCB buffer is made up of only
+ * 1 DMA cookie and therefore consumes only one descriptor on the controller.
+ *
+ * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
+ * filtering, then the TX data descriptors must be preceded by a single TX
+ * context descriptor. Because there is no DMA transfer associated with the
+ * context descriptor, we allocate a control block with a special type which
+ * indicates to the TX ring recycle code that there are no associated DMA
+ * resources to unbind when the control block is free'd.
+ *
+ * If we don't have enough space in the ring or TX control blocks available,
 * then we'll return the unprocessed message block to MAC. This will induce flow
 * control and once we recycle enough entries, we'll once again enable sending
 * on the ring.
 *
 * We size the working list as equal to the number of descriptors in the ring.
 * We size the free list as equal to 1.5 times the number of descriptors in the
- * ring. We'll allocate a number of tx control block entries equal to the number
+ * ring. We'll allocate a number of TX control block entries equal to the number
By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the @@ -325,7 +348,7 @@ * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ - * ^ . tcb allocated | + * ^ . N tcbs allocated[1] | * | to send frame v * | or fragment on | * | wire, mblk from | @@ -335,20 +358,27 @@ * . * . Hardware indicates * entry transmitted. - * tcb recycled, mblk + * tcbs recycled, mblk * from MAC freed. * + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context + * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA + * bind case, N can be 1 context descriptor plus 1 data descriptor per + * b_cont in the mblk. In this case, the mblk is associated with the first + * data descriptor and freed as part of freeing that data descriptor. + * * ------------ * Blocking MAC * ------------ * - * Wen performing transmit, we can run out of descriptors and ring entries. When - * such a case happens, we return the mblk_t to MAC to indicate that we've been - * blocked. At that point in time, MAC becomes blocked and will not transmit - * anything out that specific ring until we notify MAC. To indicate that we're - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. + * When performing transmit, we can run out of descriptors and ring entries. + * When such a case happens, we return the mblk_t to MAC to indicate that we've + * been blocked. At that point in time, MAC becomes blocked and will not + * transmit anything out that specific ring until we notify MAC. To indicate + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member + * to B_TRUE. * - * When we recycle tx descriptors then we'll end up signaling MAC by calling + * When we recycle TX descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ @@ -367,13 +397,15 @@ /* * This structure is used to maintain information and flags related to - * transmitting a frame. The first member is the set of flags we need to or into - * the command word (generally checksumming related). The second member controls - * the word offsets which is required for IP and L4 checksumming. + * transmitting a frame. These fields are ultimately used to construct the + * TX data descriptor(s) and, if necessary, the TX context descriptor. */ typedef struct i40e_tx_context { - enum i40e_tx_desc_cmd_bits itc_cmdflags; - uint32_t itc_offsets; + enum i40e_tx_desc_cmd_bits itc_data_cmdflags; + uint32_t itc_data_offsets; + enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; + uint32_t itc_ctx_tsolen; + uint32_t itc_ctx_mss; } i40e_tx_context_t; /* @@ -395,14 +427,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our * management here. 
In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're - * binding a bunch of mblk_t fragments to go out the door. Note that the main - * difference here is that we're allowed a larger SGL length -- eight. + * The latter two sets of attributes, are what we use when we're binding a + * bunch of mblk_t fragments to go out the door. Note that the main difference + * here is that we're allowed a larger SGL length. For non-LSO TX, we + * restrict the SGL length to match the number of TX buffers available to the + * PF (8). For the LSO case we can go much larger, with the caveat that each + * MSS-sized chunk (segment) must not span more than 8 data descriptors and + * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. However, * because we could have multiple instances which have different FMA error @@ -429,7 +465,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ - 0x00000000FFFFFFFFull, /* dma counter max */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ @@ -440,6 +476,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DDI_DMA_FLAGERR /* DMA flags */ }; +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { + DMA_ATTR_V0, /* version number */ + 0x0000000000000000ull, /* low address */ + 0xFFFFFFFFFFFFFFFFull, /* high address */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ + I40E_DMA_ALIGNMENT, /* alignment */ + 0x00000FFF, /* burst sizes */ + 0x00000001, /* minimum transfer size */ + 0x00000000FFFFFFFFull, /* maximum transfer size */ + 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ + I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ + 0x00000001, /* granularity */ + DDI_DMA_FLAGERR /* DMA flags */ +}; + /* * Next, we have the attributes for these structures. The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes @@ -668,7 +719,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { - i40e_error(i40e, "failed to allocate rx work list for a ring " + i40e_error(i40e, "failed to allocate RX work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; @@ -677,7 +728,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry rx free list " + i40e_error(i40e, "failed to allocate a %d entry RX free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } @@ -765,7 +816,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) i40e_t *i40e = rxd->rxd_i40e; /* - * First allocate the rx descriptor ring. + * First allocate the RX descriptor ring. 
*/ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); @@ -773,7 +824,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " - "for rx descriptor ring"); + "for RX descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = @@ -799,7 +850,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate rx dma buffer"); + i40e_error(i40e, "failed to allocate RX dma buffer"); return (B_FALSE); } @@ -841,6 +892,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq) ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } + if (tcb->tcb_lso_dma_handle != NULL) { + ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); + tcb->tcb_lso_dma_handle = NULL; + } } fsz = sizeof (i40e_tx_control_block_t) * @@ -881,7 +936,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) (i40e->i40e_tx_ring_size >> 1); /* - * Allocate an additional tx descriptor for the writeback head. + * Allocate an additional TX descriptor for the writeback head. */ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); @@ -890,7 +945,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate DMA resources for tx " + i40e_error(i40e, "failed to allocate DMA resources for TX " "descriptor ring"); return (B_FALSE); } @@ -905,7 +960,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx work list " + i40e_error(i40e, "failed to allocate a %d entry TX work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } @@ -913,14 +968,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_SLEEP); if (itrq->itrq_tcb_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx free list " + i40e_error(i40e, "failed to allocate a %d entry TX free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* - * We allocate enough tx control blocks to cover the free list. + * We allocate enough TX control blocks to cover the free list. 
*/ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); @@ -948,18 +1003,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { - i40e_error(i40e, "failed to allocate DMA handle for tx " + i40e_error(i40e, "failed to allocate DMA handle for TX " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } + ret = ddi_dma_alloc_handle(i40e->i40e_dip, + &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, + &tcb->tcb_lso_dma_handle); + if (ret != DDI_SUCCESS) { + i40e_error(i40e, "failed to allocate DMA handle for TX " + "LSO data binding on ring %d: %d", itrq->itrq_index, + ret); + tcb->tcb_lso_dma_handle = NULL; + goto cleanup; + } + if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " - "DMA for tx data binding on ring %d", dmasz, + "DMA for TX data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } @@ -989,10 +1055,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* - * Clean up our rx data. We have to free DMA resources first and + * In some cases i40e_alloc_rx_data() may have failed + * and in that case there is no rxd to free. + */ + if (rxd == NULL) + continue; + + /* + * Clean up our RX data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on - * the rx data until after we've done the first pass of the + * the RX data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. @@ -1055,6 +1128,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, + sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, @@ -1063,9 +1138,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= + DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= + ~DDI_DMA_FLAGERR; } } @@ -1102,7 +1181,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd) /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count - * here, then we have to tear down all of the rx data. + * here, then we have to tear down all of the RX data. */ void i40e_rx_recycle(caddr_t arg) @@ -1768,17 +1847,18 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) * to properly program the hardware for checksum offload as well as the * generally required flags. 
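
For readers unfamiliar with the values mac_ether_offload_info() produces, the header lengths can be derived from the frame itself. The following standalone sketch handles only the untagged IPv4/TCP case and skips the VLAN, IPv6, and validation handling the real routine performs:

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	/*
	 * Derive L2/L3/L4 header lengths for an untagged IPv4/TCP frame.
	 * Returns 0 on success, -1 if the frame is not plain IPv4/TCP.
	 */
	static int
	offload_info(const uint8_t *pkt, size_t len, size_t *l2, size_t *l3,
	    size_t *l4)
	{
		uint16_t ethertype;

		if (len < 14 + 20 + 20)
			return (-1);

		ethertype = (uint16_t)((pkt[12] << 8) | pkt[13]);
		if (ethertype != 0x0800)		/* ETHERTYPE_IP */
			return (-1);
		*l2 = 14;				/* no 802.1Q tag here */

		*l3 = (size_t)(pkt[*l2] & 0x0f) * 4;	/* IHL in 32-bit words */
		if (pkt[*l2 + 9] != 6)			/* IPPROTO_TCP */
			return (-1);
		if (*l2 + *l3 + 20 > len)
			return (-1);

		/* TCP data offset is the upper nibble of byte 12. */
		*l4 = (size_t)((pkt[*l2 + *l3 + 12] >> 4) & 0x0f) * 4;
		return (0);
	}

	int
	main(void)
	{
		/* 14-byte Ethernet + 20-byte IPv4 + 20-byte TCP, no payload. */
		uint8_t pkt[54] = { 0 };
		size_t l2, l3, l4;

		pkt[12] = 0x08; pkt[13] = 0x00;		/* ethertype IPv4 */
		pkt[14] = 0x45;				/* version 4, IHL 5 */
		pkt[23] = 6;				/* protocol TCP */
		pkt[14 + 20 + 12] = 0x50;		/* data offset 5 */

		if (offload_info(pkt, sizeof (pkt), &l2, &l3, &l4) == 0)
			printf("l2hlen=%zu l3hlen=%zu l4hlen=%zu\n", l2, l3, l4);
		return (0);
	}
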
* - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or - * into the descriptor based on the checksum flags for this mblk_t and the + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to + * 'or' into the descriptor based on the checksum flags for this mblk_t and the * actual information we care about. + * + * If the mblk requires LSO then we'll also gather the information that will be + * used to construct the Transmit Context Descriptor. */ static int i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, - i40e_tx_context_t *tctx) + mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx) { - int ret; - uint32_t flags, start; - mac_ether_offload_info_t meo; + uint32_t chkflags, start, mss, lsoflags; i40e_txq_stat_t *txs = &itrq->itrq_txstat; bzero(tctx, sizeof (i40e_tx_context_t)); @@ -1786,37 +1866,34 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, if (i40e->i40e_tx_hcksum_enable != B_TRUE) return (0); - mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags); - if (flags == 0) - return (0); + mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); + mac_lso_get(mp, &mss, &lsoflags); - if ((ret = mac_ether_offload_info(mp, &meo)) != 0) { - txs->itxs_hck_meoifail.value.ui64++; - return (ret); - } + if (chkflags == 0 && lsoflags == 0) + return (0); /* * Have we been asked to checksum an IPv4 header. If so, verify that we * have sufficient information and then set the proper fields in the * command structure. */ - if (flags & HCK_IPV4_HDRCKSUM) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (chkflags & HCK_IPV4_HDRCKSUM) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto != ETHERTYPE_IP) { + if (meo->meoi_l3proto != ETHERTYPE_IP) { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } @@ -1826,57 +1903,77 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * onto seeing if we have enough information for the L4 checksum * offload. 
*/ - if (flags & HCK_PARTIALCKSUM) { - if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { + if (chkflags & HCK_PARTIALCKSUM) { + if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } - if (!(flags & HCK_IPV4_HDRCKSUM)) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (!(chkflags & HCK_IPV4_HDRCKSUM)) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto == ETHERTYPE_IP) { - tctx->itc_cmdflags |= + if (meo->meoi_l3proto == ETHERTYPE_IP) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; - } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { - tctx->itc_cmdflags |= + } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } - switch (meo.meoi_l4proto) { + switch (meo->meoi_l4proto) { case IPPROTO_TCP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } + if (lsoflags & HW_LSO) { + /* + * LSO requires that checksum offloads are enabled. If for + * some reason they're not we bail out with an error. 
+ */ + if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || + (chkflags & HCK_PARTIALCKSUM) == 0) { + txs->itxs_lso_nohck.value.ui64++; + return (-1); + } + + tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; + tctx->itc_ctx_mss = mss; + tctx->itc_ctx_tsolen = msgsize(mp) - + (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen); + } + return (0); } @@ -1925,7 +2022,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: - (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); + else if (tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_bind_info != NULL) { + kmem_free(tcb->tcb_bind_info, + tcb->tcb_bind_ncookies * + sizeof (struct i40e_dma_bind_info)); + } + tcb->tcb_bind_info = NULL; + tcb->tcb_bind_ncookies = 0; + tcb->tcb_used_lso = B_FALSE; + break; + case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ @@ -1935,8 +2045,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) } tcb->tcb_type = I40E_TX_NONE; - freemsg(tcb->tcb_mp); - tcb->tcb_mp = NULL; + if (tcb->tcb_mp != NULL) { + freemsg(tcb->tcb_mp); + tcb->tcb_mp = NULL; + } tcb->tcb_next = NULL; } @@ -1969,10 +2081,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; - VERIFY(tcb != NULL); - itrq->itrq_tcb_work_list[index] = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb != NULL) { + itrq->itrq_tcb_work_list[index] = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); @@ -1995,6 +2108,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; + uint_t desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); @@ -2042,11 +2156,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) tcbhead = tcb; /* - * We zero this out for sanity purposes. + * In the DMA bind case, there may not necessarily be a 1:1 + * mapping between tcb's and descriptors. If the tcb type + * indicates a DMA binding then check the number of DMA + * cookies to determine how many entries to clean in the + * descriptor ring. */ - bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); - toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); - count++; + if (tcb->tcb_type == I40E_TX_DMA) + desc_per_tcb = tcb->tcb_bind_ncookies; + else + desc_per_tcb = 1; + + for (i = 0; i < desc_per_tcb; i++) { + /* + * We zero this out for sanity purposes. 
+ */ + bzero(&itrq->itrq_desc_ring[toclean], + sizeof (i40e_tx_desc_t)); + toclean = i40e_next_desc(toclean, 1, + itrq->itrq_tx_ring_size); + count++; + } } itrq->itrq_desc_head = wbhead; @@ -2078,10 +2208,610 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } +static void +i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp, + const size_t off, const size_t len) +{ + const void *soff = mp->b_rptr + off; + void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + + ASSERT3U(len, >, 0); + ASSERT3P(soff, >=, mp->b_rptr); + ASSERT3P(soff, <=, mp->b_wptr); + ASSERT3U(len, <=, MBLKL(mp)); + ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); + ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len); + bcopy(soff, doff, len); + tcb->tcb_type = I40E_TX_COPY; + tcb->tcb_dma.dmab_len += len; + I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); +} + +static i40e_tx_control_block_t * +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, + size_t off, boolean_t use_lso) +{ + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie; + uint_t i = 0, ncookies = 0, dmaflags; + i40e_tx_control_block_t *tcb; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + return (NULL); + } + tcb->tcb_type = I40E_TX_DMA; + + if (use_lso == B_TRUE) + dma_handle = tcb->tcb_lso_dma_handle; + else + dma_handle = tcb->tcb_dma_handle; + + dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; + if (ddi_dma_addr_bind_handle(dma_handle, NULL, + (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags, + DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { + txs->itxs_bind_fails.value.ui64++; + goto bffail; + } + + tcb->tcb_bind_ncookies = ncookies; + tcb->tcb_used_lso = use_lso; + + tcb->tcb_bind_info = + kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), + KM_NOSLEEP); + if (tcb->tcb_bind_info == NULL) + goto bffail; + + while (i < ncookies) { + if (i > 0) + ddi_dma_nextcookie(dma_handle, &dma_cookie); + + tcb->tcb_bind_info[i].dbi_paddr = + (caddr_t)dma_cookie.dmac_laddress; + tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; + } + + return (tcb); + +bffail: + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + return (NULL); +} + +static void +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, + caddr_t buff, size_t len, boolean_t last_desc) +{ + i40e_tx_desc_t *txdesc; + int cmd; + + ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; + + /* + * The last data descriptor needs the EOP bit set, so that the HW knows + * that we're ready to send. Additionally, we set the RS (Report + * Status) bit, so that we are notified when the transmit engine has + * completed DMA'ing all of the data descriptors and data buffers + * associated with this frame. + */ + if (last_desc == B_TRUE) { + cmd |= I40E_TX_DESC_CMD_EOP; + cmd |= I40E_TX_DESC_CMD_RS; + } + + /* + * Per the X710 manual, section 8.4.2.1.1, the buffer size + * must be a value from 1 to 16K minus 1, inclusive. 
+ */ + ASSERT3U(len, >=, 1); + ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ); + + txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff); + txdesc->cmd_type_offset_bsz = + LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | + ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); +} + +/* + * Place 'tcb' on the tail of the list represented by 'head'/'tail'. + */ +static inline void +tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail, + i40e_tx_control_block_t *tcb) +{ + if (*head == NULL) { + *head = tcb; + *tail = *head; + } else { + ASSERT3P(*tail, !=, NULL); + ASSERT3P((*tail)->tcb_next, ==, NULL); + (*tail)->tcb_next = tcb; + *tail = tcb; + } +} + +/* + * This function takes a single packet, possibly consisting of + * multiple mblks, and creates a TCB chain to send to the controller. + * This TCB chain may span up to a maximum of 8 descriptors. A copy + * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or + * more, depending on several factors. For each fragment (invidual + * mblk making up the packet), we determine if its size dictates a + * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a + * count of descriptors used; when that count reaches the max we force + * all remaining fragments into a single TCB buffer. We have a + * guarantee that the TCB buffer is always larger than the MTU -- so + * there is always enough room. Consecutive fragments below the DMA + * threshold are copied into a single TCB. In the event of an error + * this function returns NULL but leaves 'mp' alone. + */ +static i40e_tx_control_block_t * +i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc) +{ + const mblk_t *nmp = mp; + uint_t needed_desc = 0; + boolean_t force_copy = B_FALSE; + i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + /* TCB buffer is always larger than MTU. */ + ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size); + + while (nmp != NULL) { + const size_t nmp_len = MBLKL(nmp); + + /* Ignore zero-length mblks. */ + if (nmp_len == 0) { + nmp = nmp->b_cont; + continue; + } + + if (nmp_len < i40e->i40e_tx_dma_min || force_copy) { + /* Compress consecutive copies into one TCB. */ + if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) { + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + nmp = nmp->b_cont; + continue; + } + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * TCB DMA buffer is guaranteed to be one + * cookie by i40e_alloc_dma_buffer(). + */ + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + needed_desc++; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else { + uint_t total_desc; + + tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + /* + * If the new total exceeds the max or we've + * reached the limit and there's data left, + * then give up binding and copy the rest into + * the pre-allocated TCB buffer. 
+ */ + total_desc = needed_desc + tcb->tcb_bind_ncookies; + if ((total_desc > I40E_TX_MAX_COOKIE) || + (total_desc == I40E_TX_MAX_COOKIE && + nmp->b_cont != NULL)) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + force_copy = B_TRUE; + txs->itxs_force_copy.value.ui64++; + continue; + } + + needed_desc += tcb->tcb_bind_ncookies; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } + + nmp = nmp->b_cont; + } + + ASSERT3P(nmp, ==, NULL); + ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + +/* + * Section 8.4.1 of the 700-series programming guide states that a + * segment may span up to 8 data descriptors; including both header + * and payload data. However, empirical evidence shows that the + * controller freezes the Tx queue when presented with a segment of 8 + * descriptors. Or, at least, when the first segment contains 8 + * descriptors. One explanation is that the controller counts the + * context descriptor against the first segment, even though the + * programming guide makes no mention of such a constraint. In any + * case, we limit TSO segments to 7 descriptors to prevent Tx queue + * freezes. We still allow non-TSO segments to utilize all 8 + * descriptors as they have not demonstrated the faulty behavior. + */ +uint_t i40e_lso_num_descs = 7; + +#define I40E_TCB_LEFT(tcb) \ + ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len) + +/* + * This function is similar in spirit to i40e_non_lso_chain(), but + * much more complicated in reality. Like the previous function, it + * takes a packet (an LSO packet) as input and returns a chain of + * TCBs. The complication comes with the fact that we are no longer + * trying to fit the entire packet into 8 descriptors, but rather we + * must fit each MSS-size segment of the LSO packet into 8 descriptors. + * Except it's really 7 descriptors, see i40e_lso_num_descs. + * + * Your first inclination might be to verify that a given segment + * spans no more than 7 mblks; but it's actually much more subtle than + * that. First, let's describe what the hardware expects, and then we + * can expound on the software side of things. + * + * For an LSO packet the hardware expects the following: + * + * o Each MSS-sized segment must span no more than 7 descriptors. + * + * o The header size does not count towards the segment size. + * + * o If header and payload share the first descriptor, then the + * controller will count the descriptor twice. + * + * The most important thing to keep in mind is that the hardware does + * not view the segments in terms of mblks, like we do. The hardware + * only sees descriptors. It will iterate each descriptor in turn, + * keeping a tally of bytes seen and descriptors visited. If the byte + * count hasn't reached MSS by the time the descriptor count reaches + * 7, then the controller freezes the queue and we are stuck. + * Furthermore, the hardware picks up its tally where it left off. So + * if it reached MSS in the middle of a descriptor, it will start + * tallying the next segment in the middle of that descriptor. 
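One way to internalize the tally described above is to run it in miniature. The checker below is illustrative only and ignores the header double-count wrinkle covered later; given the byte counts of the payload descriptors handed to the hardware, it reports whether every MSS-sized segment stays within the limit that i40e_lso_num_descs encodes.

/*
 * Illustration only: replay the controller's per-segment tally over the
 * payload descriptors and flag any MSS-sized segment that would need
 * more than maxdesc descriptors.
 */
static int
lso_segments_fit(const size_t *desc_lens, unsigned int ndesc, size_t mss,
    unsigned int maxdesc)
{
	size_t segsz = 0;
	unsigned int segdesc = 0;

	for (unsigned int i = 0; i < ndesc; i++) {
		segdesc++;
		segsz += desc_lens[i];

		if (segsz >= mss) {
			/*
			 * One or more segments finish inside this
			 * descriptor; the remainder carries over, and
			 * this descriptor counts as the first one of
			 * the next segment.
			 */
			segsz %= mss;
			segdesc = (segsz == 0) ? 0 : 1;
		} else if (segdesc >= maxdesc) {
			return (0);	/* the queue would wedge here */
		}
	}

	return (1);
}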
The
+ * hardware's view is entirely removed from the mblk chain or even the
+ * descriptor layout. Consider these facts:
+ *
+ * o The MSS will vary depending on MTU and other factors.
+ *
+ * o The dblk allocation will sit at various offsets within a
+ * memory page.
+ *
+ * o The page size itself could vary in the future (i.e. not
+ * always 4K).
+ *
+ * o Just because a dblk is virtually contiguous doesn't mean
+ * it's physically contiguous. The number of cookies
+ * (descriptors) required by a DMA bind of a single dblk is at
+ * the mercy of the page size and physical layout.
+ *
+ * o The descriptors will most often NOT start/end on an MSS
+ * boundary. Thus the hardware will often start counting the
+ * MSS mid descriptor and finish mid descriptor.
+ *
+ * The upshot of all this is that the driver must learn to think like
+ * the controller and verify that none of the constraints are broken.
+ * It does this by tallying up the segment just like the hardware
+ * would. This is handled by the two variables 'segsz' and 'segdesc'.
+ * After each attempt to bind a dblk, we check the constraints. If
+ * violated, we undo the DMA and force a copy until MSS is met. We
+ * have a guarantee that the TCB buffer is larger than MTU; thus
+ * ensuring we can always meet the MSS with a single copy buffer. We
+ * also copy consecutive non-DMA fragments into the same TCB buffer.
+ */
+static i40e_tx_control_block_t *
+i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp,
+ const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx,
+ uint_t *ndesc)
+{
+ size_t mp_len = MBLKL(mp);
+ /*
+ * The cpoff (copy offset) variable tracks the offset inside
+ * the current mp. There are cases where the entire mp is not
+ * fully copied in one go: such as the header copy followed by
+ * a non-DMA mblk, or a TCB buffer that only has enough space
+ * to copy part of the current mp.
+ */
+ size_t cpoff = 0;
+ /*
+ * The segsz and segdesc variables track the controller's view
+ * of the segment. The needed_desc variable tracks the total
+ * number of data descriptors used by the driver.
+ */
+ size_t segsz = 0;
+ uint_t segdesc = 0;
+ uint_t needed_desc = 0;
+ size_t hdrcopied = 0;
+ const size_t hdrlen =
+ meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
+ const size_t mss = tctx->itc_ctx_mss;
+ boolean_t force_copy = B_FALSE;
+ i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL;
+ i40e_t *i40e = itrq->itrq_i40e;
+ i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+ /*
+ * We always copy the header in order to avoid more
+ * complicated code dealing with various edge cases.
+ */
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto fail;
+ }
+
+ needed_desc++;
+ tcb_list_append(&tcbhead, &tcbtail, tcb);
+
+ while (hdrcopied < hdrlen) {
+ const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len);
+ i40e_tx_copy_fragment(tcb, mp, 0, tocopy);
+ hdrcopied += tocopy;
+ cpoff += tocopy;
+ if (tocopy == mp_len) {
+ /*
+ * This is a bit of defensive programming. We
+ * should never have a chain too short to
+ * satisfy the headers -- but just in case.
+ */
+ if ((mp = mp->b_cont) == NULL) {
+ txs->itxs_tx_short.value.ui64++;
+ goto fail;
+ }
+
+ while ((mp_len = MBLKL(mp)) == 0) {
+ if ((mp = mp->b_cont) == NULL) {
+ txs->itxs_tx_short.value.ui64++;
+ goto fail;
+ }
+ }
+ cpoff = 0;
+ }
+ }
+ ASSERT3U(hdrcopied, ==, hdrlen);
+
+ /*
+ * A single descriptor containing both header and data is
+ * counted twice by the controller.
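Stripped of the TCB bookkeeping, the header walk above reduces to copying hdrlen bytes out of a chain whose leading mblks may each hold only part of the headers. The hypothetical helper below shows that walk in isolation; mblk_t, MBLKL, MIN and bcopy are the same primitives the driver uses, while the helper itself and its callers are illustrative.

/*
 * Illustration only: flatten the first 'hdrlen' bytes of an mblk chain
 * into 'dst' and report where the payload begins.
 */
static const mblk_t *
copy_flat_header(const mblk_t *mp, size_t hdrlen, uint8_t *dst, size_t *offp)
{
	size_t copied = 0, off = 0;

	while (copied < hdrlen && mp != NULL) {
		size_t tocopy = MIN(hdrlen - copied, MBLKL(mp) - off);

		bcopy(mp->b_rptr + off, dst + copied, tocopy);
		copied += tocopy;
		off += tocopy;

		if (off == MBLKL(mp)) {
			/* This mblk (possibly zero-length) is consumed. */
			mp = mp->b_cont;
			off = 0;
		}
	}

	*offp = off;	/* offset of the payload within the returned mblk */
	return (mp);	/* NULL if the chain is shorter than hdrlen */
}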
+ */ + if (mp_len < i40e->i40e_tx_dma_min) { + segdesc = 2; + } else { + segdesc = 1; + } + + while (mp != NULL) { + mp_len = MBLKL(mp); +force_copy: + /* Ignore zero-length mblks. */ + if (mp_len == 0) { + mp = mp->b_cont; + cpoff = 0; + continue; + } + + /* + * We copy into the preallocated TCB buffer when the + * current fragment is less than the DMA threshold OR + * when the DMA bind can't meet the controller's + * segment descriptor limit. + */ + if (mp_len < i40e->i40e_tx_dma_min || force_copy) { + size_t tocopy; + + /* + * Our objective here is to compress + * consecutive copies into one TCB (until it + * is full). If there is no current TCB, or if + * it is a DMA TCB, then allocate a new one. + */ + if (tcb == NULL || + (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) { + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * The TCB DMA buffer is guaranteed to + * be one cookie by i40e_alloc_dma_buffer(). + */ + needed_desc++; + segdesc++; + ASSERT3U(segdesc, <=, i40e_lso_num_descs); + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else if (segdesc == 0) { + /* + * We are copying into an existing TCB + * but we just crossed the MSS + * boundary. Make sure to increment + * segdesc to track the descriptor + * count as the hardware would. + */ + segdesc++; + } + + tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff); + i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy); + cpoff += tocopy; + segsz += tocopy; + + /* We have consumed the current mp. */ + if (cpoff == mp_len) { + mp = mp->b_cont; + cpoff = 0; + } + + /* We have consumed the current TCB buffer. */ + if (I40E_TCB_LEFT(tcb) == 0) { + tcb = NULL; + } + + /* + * We have met MSS with this copy; restart the + * counters. + */ + if (segsz >= mss) { + segsz = segsz % mss; + segdesc = segsz == 0 ? 0 : 1; + force_copy = B_FALSE; + } + + /* + * We are at the controller's descriptor + * limit; we must copy into the current TCB + * until MSS is reached. The TCB buffer is + * always bigger than the MTU so we know it is + * big enough to meet the MSS. + */ + if (segdesc == i40e_lso_num_descs) { + force_copy = B_TRUE; + } + } else { + uint_t tsegdesc = segdesc; + size_t tsegsz = segsz; + + ASSERT(force_copy == B_FALSE); + ASSERT3U(tsegdesc, <, i40e_lso_num_descs); + + tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) { + struct i40e_dma_bind_info dbi = + tcb->tcb_bind_info[i]; + + tsegsz += dbi.dbi_len; + tsegdesc++; + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + + /* + * We've met the MSS with this portion + * of the DMA. + */ + if (tsegsz >= mss) { + tsegsz = tsegsz % mss; + tsegdesc = tsegsz == 0 ? 0 : 1; + } + + /* + * We've reached max descriptors but + * have not met the MSS. Undo the bind + * and instead copy. + */ + if (tsegdesc == i40e_lso_num_descs) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + I40E_TCB_LEFT(tcb) > 0 && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + /* + * Remember, we are still on + * the same mp. + */ + force_copy = B_TRUE; + txs->itxs_tso_force_copy.value.ui64++; + goto force_copy; + } + } + + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + ASSERT3U(tsegsz, <, mss); + + /* + * We've made if through the loop without + * breaking the segment descriptor contract + * with the controller -- replace the segment + * tracking values with the temporary ones. 
+ */ + segdesc = tsegdesc; + segsz = tsegsz; + needed_desc += tcb->tcb_bind_ncookies; + cpoff = 0; + tcb_list_append(&tcbhead, &tcbtail, tcb); + mp = mp->b_cont; + } + } + + ASSERT3P(mp, ==, NULL); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + /* * We've been asked to send a message block on the wire. We'll only have a * single chain. There will not be any b_next pointers; however, there may be - * multiple b_cont blocks. + * multiple b_cont blocks. The number of b_cont blocks may exceed the + * controller's Tx descriptor limit. * * We may do one of three things with any given mblk_t chain: * @@ -2096,12 +2826,14 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) mblk_t * i40e_ring_tx(void *arg, mblk_t *mp) { - const mblk_t *nmp; - size_t mpsize; - i40e_tx_control_block_t *tcb; - i40e_tx_desc_t *txdesc; + size_t msglen; + i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL; + i40e_tx_context_desc_t *ctxdesc; + mac_ether_offload_info_t meo; i40e_tx_context_t tctx; - int cmd, type; + int type; + uint_t needed_desc = 0; + boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; @@ -2119,107 +2851,137 @@ i40e_ring_tx(void *arg, mblk_t *mp) return (NULL); } + if (mac_ether_offload_info(mp, &meo) != 0) { + freemsg(mp); + itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++; + return (NULL); + } + /* * Figure out the relevant context about this frame that we might need - * for enabling checksum, lso, etc. This also fills in information that + * for enabling checksum, LSO, etc. This also fills in information that * we might set around the packet type, etc. */ - if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { + if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) { freemsg(mp); itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + use_lso = B_TRUE; + do_ctx_desc = B_TRUE; + } /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active - * recycling to cut back on stalls in the tx path. + * recycling to cut back on stalls in the TX path. */ - /* - * Do a quick size check to make sure it fits into what we think it - * should for this device. Note that longer term this will be false, - * particularly when we have the world of TSO. - */ - mpsize = 0; - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mpsize += MBLKL(nmp); + msglen = msgsize(mp); + + if (do_ctx_desc) { + /* + * If we're doing tunneling or LSO, then we'll need a TX + * context descriptor in addition to one or more TX data + * descriptors. Since there's no data DMA block or handle + * associated with the context descriptor, we create a special + * control block that behaves effectively like a NOP. + */ + if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_ctx->tcb_type = I40E_TX_DESC; + needed_desc++; } - /* - * First we allocate our tx control block and prepare the packet for - * transmit before we do a final check for descriptors. We do it this - * way to minimize the time under the tx lock. 
- */ - tcb = i40e_tcb_alloc(itrq); - if (tcb == NULL) { - txs->itxs_err_notcb.value.ui64++; - goto txfail; + if (!use_lso) { + tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc); + } else { + tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc); } - /* - * For transmitting a block, we're currently going to use just a - * single control block and bcopy all of the fragments into it. We - * should be more intelligent about doing DMA binding or otherwise, but - * for getting off the ground this will have to do. - */ - ASSERT(tcb->tcb_dma.dmab_len == 0); - ASSERT(tcb->tcb_dma.dmab_size >= mpsize); - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - size_t clen = MBLKL(nmp); - void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + if (tcbhead == NULL) + goto txfail; - bcopy(nmp->b_rptr, coff, clen); - tcb->tcb_dma.dmab_len += clen; - } - ASSERT(tcb->tcb_dma.dmab_len == mpsize); + tcbhead->tcb_mp = mp; /* - * While there's really no need to keep the mp here, but let's just do - * it to help with our own debugging for now. + * The second condition ensures that 'itrq_desc_tail' never + * equals 'itrq_desc_head'. This enforces the rule found in + * the second bullet point of section 8.4.3.1.5 of the XL710 + * PG, which declares the TAIL pointer in I40E_QTX_TAIL should + * never overlap with the head. This means that we only ever + * have 'itrq_tx_ring_size - 1' total available descriptors. */ - tcb->tcb_mp = mp; - tcb->tcb_type = I40E_TX_COPY; - I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); - mutex_enter(&itrq->itrq_tx_lock); - if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || + (itrq->itrq_desc_free - 1) < needed_desc) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } - /* - * Build up the descriptor and send it out. Thankfully at the moment - * we only need a single desc, because we're not doing anything fancy - * yet. - */ - ASSERT(itrq->itrq_desc_free > 0); - itrq->itrq_desc_free--; - txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; - itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; - itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, - itrq->itrq_tx_ring_size); + if (do_ctx_desc) { + /* + * If we're enabling any offloads for this frame, then we'll + * need to build up a transmit context descriptor, first. The + * context descriptor needs to be placed in the TX ring before + * the data descriptor(s). See section 8.4.2, table 8-16 + */ + uint_t tail = itrq->itrq_desc_tail; + itrq->itrq_desc_free--; + ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; + itrq->itrq_tcb_work_list[tail] = tcb_ctx; + itrq->itrq_desc_tail = i40e_next_desc(tail, 1, + itrq->itrq_tx_ring_size); + + /* QW0 */ + type = I40E_TX_DESC_DTYPE_CONTEXT; + ctxdesc->tunneling_params = 0; + ctxdesc->l2tag2 = 0; + + /* QW1 */ + ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) + ((uint64_t)tctx.itc_ctx_cmdflags << + I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((uint64_t)tctx.itc_ctx_tsolen << + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((uint64_t)tctx.itc_ctx_mss << + I40E_TXD_CTX_QW1_MSS_SHIFT)); + } + } - /* - * Note, we always set EOP and RS which indicates that this is the last - * data frame and that we should ask for it to be transmitted. We also - * must always set ICRC, because that is an internal bit that must be - * set to one for data descriptors. 
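Reduced to its essentials, the availability test above is the predicate below: keep one descriptor in reserve so the tail written to I40E_QTX_TAIL can never catch up with the head, and refuse to transmit once the free count drops under the blocking threshold. This sketch is illustrative rather than driver code.

/*
 * Illustration only: may 'want' descriptors be consumed when 'nfree'
 * remain in a ring that must always keep tail != head?
 */
static int
tx_ring_can_accept(unsigned int nfree, unsigned int want,
    unsigned int block_thresh)
{
	if (nfree < block_thresh)
		return (0);		/* ring already considered blocked */

	return ((nfree - 1) >= want);	/* reserve one slot so tail never meets head */
}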
The remaining bits in the command - * descriptor depend on checksumming and are determined based on the - * information set up in i40e_tx_context(). - */ - type = I40E_TX_DESC_DTYPE_DATA; - cmd = I40E_TX_DESC_CMD_EOP | - I40E_TX_DESC_CMD_RS | - I40E_TX_DESC_CMD_ICRC | - tctx.itc_cmdflags; - txdesc->buffer_addr = - CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address); - txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | - ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | - ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | - ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + tcb = tcbhead; + while (tcb != NULL) { + + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; + if (tcb->tcb_type == I40E_TX_COPY) { + boolean_t last_desc = (tcb->tcb_next == NULL); + + i40e_tx_set_data_desc(itrq, &tctx, + (caddr_t)tcb->tcb_dma.dmab_dma_address, + tcb->tcb_dma.dmab_len, last_desc); + } else { + boolean_t last_desc = B_FALSE; + ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA); + + for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) { + last_desc = (c == tcb->tcb_bind_ncookies - 1) && + (tcb->tcb_next == NULL); + + i40e_tx_set_data_desc(itrq, &tctx, + tcb->tcb_bind_info[c].dbi_paddr, + tcb->tcb_bind_info[c].dbi_len, + last_desc); + } + } + + tcb = tcb->tcb_next; + } /* * Now, finally, sync the DMA data and alert hardware. @@ -2228,6 +2990,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), itrq->itrq_desc_tail); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != DDI_FM_OK) { /* @@ -2239,9 +3002,9 @@ i40e_ring_tx(void *arg, mblk_t *mp) atomic_or_32(&i40e->i40e_state, I40E_ERROR); } - txs->itxs_bytes.value.ui64 += mpsize; + txs->itxs_bytes.value.ui64 += msglen; txs->itxs_packets.value.ui64++; - txs->itxs_descriptors.value.ui64++; + txs->itxs_descriptors.value.ui64 += needed_desc; mutex_exit(&itrq->itrq_tx_lock); @@ -2254,10 +3017,23 @@ txfail: * Make sure to reset their message block's, since we'll return them * back to MAC. */ - if (tcb != NULL) { + if (tcb_ctx != NULL) { + tcb_ctx->tcb_mp = NULL; + i40e_tcb_reset(tcb_ctx); + i40e_tcb_free(itrq, tcb_ctx); + } + + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + tcb->tcb_mp = NULL; i40e_tcb_reset(tcb); i40e_tcb_free(itrq, tcb); + tcb = next; } mutex_enter(&itrq->itrq_tx_lock); diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c index 1c8318b191..55c4159bc4 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -272,8 +273,7 @@ ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req) icmph->icmph_checksum = IP_CSUM(pmtu_mp, (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0); - (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0, - HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, " "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d", @@ -1560,8 +1560,7 @@ ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc) /* * Can RC mode in IB guarantee its checksum correctness? 
* - * (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, - * HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); */ /* diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c new file mode 100644 index 0000000000..eaa0c33f0f --- /dev/null +++ b/usr/src/uts/common/io/inotify.c @@ -0,0 +1,1555 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. + */ + +/* + * Support for the inotify facility, a Linux-borne facility for asynchronous + * notification of certain events on specified files or directories. Our + * implementation broadly leverages the file event monitoring facility, and + * would actually be quite straightforward were it not for a very serious + * blunder in the inotify interface: in addition to allowing for one to be + * notified on events on a particular file or directory, inotify also allows + * for one to be notified on certain events on files _within_ a watched + * directory -- even though those events have absolutely nothing to do with + * the directory itself. This leads to all sorts of madness because file + * operations are (of course) not undertaken on paths but rather on open + * files -- and the relationships between open files and the paths that resolve + * to those files are neither static nor isomorphic. We implement this + * concept by having _child watches_ when directories are watched with events + * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is + * first added, and we modify those child watches dynamically as files are + * created, deleted, moved into or moved out of the specified directory. This + * mechanism works well, absent hard links. Hard links, unfortunately, break + * this rather badly, and the user is warned that watches on directories that + * have multiple directory entries referring to the same file may behave + * unexpectedly. 
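For orientation, this is roughly how a consumer (normally the lx brand's inotify emulation) drives the device: open the pseudo-device, hand it an already-open file descriptor plus an event mask, and activate the returned watch descriptor, since watches are created inactive and queue nothing until INOTIFYIOC_ACTIVATE. The sketch below is hypothetical userland code; the ioctl names and structures are the ones handled by inotify_ioctl() later in this file, while the /dev/inotify path, the chosen mask, and the error handling are illustrative assumptions.

#include <sys/inotify.h>
#include <stropts.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Hypothetical userland sketch: create and activate a watch on 'path'.
 * Returns the inotify fd to read events from, or -1 on failure.
 */
static int
example_inotify_watch(const char *path, int *wdp)
{
	inotify_addwatch_t aw;
	int ifd, tfd, wd;

	if ((ifd = open("/dev/inotify", O_RDONLY)) < 0)
		return (-1);

	if ((tfd = open(path, O_RDONLY)) < 0) {
		(void) close(ifd);
		return (-1);
	}

	aw.inaw_fd = tfd;
	aw.inaw_mask = IN_CREATE | IN_DELETE | IN_MODIFY;

	/* The new watch descriptor is the ioctl return value. */
	if ((wd = ioctl(ifd, INOTIFYIOC_ADD_WATCH, &aw)) < 0 ||
	    ioctl(ifd, INOTIFYIOC_ACTIVATE, wd) < 0) {
		(void) close(tfd);
		(void) close(ifd);
		return (-1);
	}

	/*
	 * The driver holds the watched vnode itself, so tfd need not stay
	 * open for the watch to remain valid.
	 */
	*wdp = wd;
	return (ifd);	/* read(2) on this fd now yields inotify_event records */
}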
+ */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + +#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + ddi_periodic_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. 
+ */ +static kmutex_t inotify_lock; /* lock protecting state */ +static dev_info_t *inotify_devi; /* device info */ +static fem_t *inotify_femp; /* FEM pointer */ +static vmem_t *inotify_minor; /* minor number arena */ +static void *inotify_softstate; /* softstate pointer */ +static inotify_state_t *inotify_state; /* global list if state */ + +static void inotify_watch_event(inotify_watch_t *, uint64_t, char *); +static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *); +static void inotify_watch_delete(inotify_watch_t *, uint32_t); +static void inotify_watch_remove(inotify_state_t *state, + inotify_watch_t *watch); + +static int +inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset, + cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) { + inotify_watch_event(watch, flag & FWRITE ? + IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL); + } + + return (rval); +} + +static int +inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_create(vf, name, vap, excl, mode, + vpp, cr, flag, ct, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE, name); + } + + return (rval); +} + +static int +inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) { + inotify_watch_insert(watch, svp, tnm); + inotify_watch_event(watch, IN_CREATE, tnm); + } + + return (rval); +} + +static int +inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_mkdir(vf, name, vap, vpp, cr, + ct, flags, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name); + } + + return (rval); +} + +static int +inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_open(vf, mode, cr, ct)) == 0) + inotify_watch_event(watch, IN_OPEN, NULL); + + return (rval); +} + +static int +inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_read(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_ACCESS, NULL); + + return (rval); +} + +static int +inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags); + inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL); + + return (rval); +} + +int +inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE, nm); + + return (rval); +} + +int +inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t 
*ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). + * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. 
+ */ + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_CREATE: + case VE_TRUNCATE: + case VE_RESIZE: + inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL); + break; + case VE_LINK: + inotify_watch_event(watch, IN_ATTRIB, NULL); + break; + case VE_RENAME_SRC_DIR: + inotify_watch_event(watch, IN_MOVED_FROM, name); + break; + case VE_RENAME_DEST_DIR: + if (name == NULL) + name = dvp->v_path; + + inotify_watch_insert(watch, dvp, name); + inotify_watch_event(watch, IN_MOVED_TO, name); + break; + case VE_SUPPORT: + case VE_MOUNTEDOVER: + case VE_PRE_RENAME_SRC: + case VE_PRE_RENAME_DEST: + case VE_PRE_RENAME_DEST_DIR: + break; + } + + return (vnext_vnevent(vf, vnevent, dvp, name, ct)); +} + +const fs_operation_def_t inotify_vnodesrc_template[] = { + VOPNAME_CLOSE, { .femop_close = inotify_fop_close }, + VOPNAME_CREATE, { .femop_create = inotify_fop_create }, + VOPNAME_LINK, { .femop_link = inotify_fop_link }, + VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir }, + VOPNAME_OPEN, { .femop_open = inotify_fop_open }, + VOPNAME_READ, { .femop_read = inotify_fop_read }, + VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir }, + VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove }, + VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir }, + VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr }, + VOPNAME_WRITE, { .femop_write = inotify_fop_write }, + VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent }, + NULL, NULL +}; + +static int +inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + if (lhs->inw_wd < rhs->inw_wd) + return (-1); + + if (lhs->inw_wd > rhs->inw_wd) + return (1); + + return (0); +} + +static int +inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp; + + if (lvp < rvp) + return (-1); + + if (lvp > rvp) + return (1); + + return (0); +} + +static void +inotify_watch_hold(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 0); + watch->inw_refcnt++; + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_release(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 1); + + if (--watch->inw_refcnt == 1 && watch->inw_zombie) { + /* + * We're down to our last reference; kick anyone that might be + * waiting. + */ + cv_signal(&watch->inw_cv); + } + + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name) +{ + inotify_kevent_t *event, *tail; + inotify_state_t *state = watch->inw_state; + uint32_t wd = watch->inw_wd, cookie = 0, len; + boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE; + inotify_watch_t *source = watch; + + if (!(mask &= watch->inw_mask) || mask == IN_ISDIR) + return; + + if (watch->inw_parent != NULL) { + /* + * This is an event on the child; if this isn't a valid child + * event, return. Otherwise, we move our watch to be our + * parent (which we know is around because we have a hold on + * it) and continue. 
+ */ + if (!(mask & IN_CHILD_EVENTS)) + return; + + name = watch->inw_name; + watch = watch->inw_parent; + wd = watch->inw_wd; + } + + if (!removal) { + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || + watch->inw_fired || !watch->inw_active) { + mutex_exit(&state->ins_lock); + return; + } + } else { + if (!watch->inw_active) + return; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + } + + /* + * If this is an operation on a directory and it's a child event + * (event if it's not on a child), we specify IN_ISDIR. + */ + if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS)) + mask |= IN_ISDIR; + + if (mask & (IN_MOVED_FROM | IN_MOVED_TO)) + cookie = (uint32_t)curthread->t_did; + + if (state->ins_nevents >= state->ins_maxevents) { + /* + * We're at our maximum number of events -- turn our event + * into an IN_Q_OVERFLOW event, which will be coalesced if + * it's already the tail event. + */ + mask = IN_Q_OVERFLOW; + wd = (uint32_t)-1; + cookie = 0; + len = 0; + } + + if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd && + tail->ine_event.mask == mask && tail->ine_event.cookie == cookie && + ((tail->ine_event.len == 0 && len == 0) || + (name != NULL && tail->ine_event.len != 0 && + strcmp(tail->ine_event.name, name) == 0))) { + /* + * This is an implicitly coalesced event; we're done. + */ + if (!removal) + mutex_exit(&state->ins_lock); + return; + } + + if (name != NULL) { + /* + * We are in the context of a file event monitoring operation, + * so the name length is bounded by the kernel. + */ + len = strlen(name) + 1; + len = roundup(len, sizeof (struct inotify_event)); + } else { + len = 0; + } + + event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP); + event->ine_event.wd = wd; + event->ine_event.mask = (uint32_t)mask; + event->ine_event.cookie = cookie; + event->ine_event.len = len; + + if (name != NULL) + (void) strcpy(event->ine_event.name, name); + + if (tail != NULL) { + tail->ine_next = event; + } else { + VERIFY(state->ins_head == NULL); + state->ins_head = event; + cv_broadcast(&state->ins_cv); + } + + state->ins_tail = event; + state->ins_nevents++; + state->ins_size += sizeof (event->ine_event) + len; + + if (removal) + return; + + if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) { + /* + * If this is a one-shot, we need to remove the watch. (Note + * that this will recurse back into inotify_watch_event() to + * fire the IN_IGNORED event -- but with "removal" set.) + */ + watch->inw_fired = 1; + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); +} + +/* + * Destroy a watch. By the time we're in here, the watch must have exactly + * one reference. + */ +static void +inotify_watch_destroy(inotify_watch_t *watch) +{ + VERIFY(MUTEX_HELD(&watch->inw_lock)); + + if (watch->inw_name != NULL) + kmem_free(watch->inw_name, strlen(watch->inw_name) + 1); + + kmem_free(watch, sizeof (inotify_watch_t)); +} + +static int +inotify_fem_install(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * For vnodes that are devices (of type VCHR or VBLK), we silently + * refuse to actually install any event monitor. This is to avoid + * single-thread deadlock when both a special device vnode and its + * underlying real vnode are being watched: releasing the device + * vnode upon watch removal can induce an attribute update on the + * underlying vnode, which will bring us into inotify_watch_event() + * with our lock already held. 
While we could fail earlier and more + * explicitly in this case, we choose to keep with the Linux behavior + * on unwatchable entities and allow the watch but not generate any + * events for it. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release)); +} + +static int +inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * See inotify_fem_install(), above, for our rationale here. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_uninstall(vp, inotify_femp, watch)); +} + +/* + * Zombify a watch. By the time we come in here, it must be true that the + * watch has already been fem_uninstall()'d -- the only reference should be + * in the state's data structure. If we can get away with freeing it, we'll + * do that -- but if the reference count is greater than one due to an active + * vnode operation, we'll put this watch on the zombie list on the state + * structure. + */ +static void +inotify_watch_zombify(inotify_watch_t *watch) +{ + inotify_state_t *state = watch->inw_state; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(!watch->inw_zombie); + + watch->inw_zombie = 1; + + if (watch->inw_parent != NULL) { + inotify_watch_release(watch->inw_parent); + } else { + avl_remove(&state->ins_byvp, watch); + avl_remove(&state->ins_bywd, watch); + vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1); + watch->inw_wd = -1; + } + + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + /* + * There are no operations in flight and there is no way + * for anyone to discover this watch -- we can destroy it. + */ + inotify_watch_destroy(watch); + } else { + /* + * There are operations in flight; we will need to enqueue + * this for later destruction. + */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + + /* + * Copy the name. Note that when the name is user-specified, + * its length is bounded by the copyinstr() to be MAXPATHLEN + * (and regardless, we know by this point that it exists in + * our parent). + */ + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = inotify_fem_install(vp, watch); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. 
This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. + */ +static void +inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) +{ + inotify_watch_t *child; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(watch->inw_parent == NULL); + + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + /* + * If we have children, we're going to remove them all and set them + * all to be zombies. + */ + while ((child = avl_first(&watch->inw_children)) != NULL) { + VERIFY(child->inw_parent == watch); + avl_remove(&watch->inw_children, child); + + err = inotify_fem_uninstall(child->inw_vp, child); + VERIFY(err == 0); + + /* + * If this child watch has been orphaned, remove it from the + * state's list of orphans. + */ + if (child->inw_orphaned) { + list_remove(&state->ins_orphans, child); + crfree(child->inw_cred); + } + + VN_RELE(child->inw_vp); + + /* + * We're down (or should be down) to a single reference to + * this child watch; it's safe to zombify it. + */ + inotify_watch_zombify(child); + } + + inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- we know that the only reference + * can come from operations in flight. + */ + inotify_watch_zombify(watch); +} + +/* + * Delete a watch. Should only be called from VOP context. + */ +static void +inotify_watch_delete(inotify_watch_t *watch, uint32_t event) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent; + int err; + + if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie) { + mutex_exit(&state->ins_lock); + return; + } + + if ((parent = watch->inw_parent) == NULL) { + if (event == IN_DELETE_SELF) { + /* + * If we're here because we're being deleted and we + * are not a child watch, we need to delete the entire + * watch, children and all. + */ + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + return; + } else { + if (event == IN_DELETE_SELF && + !(parent->inw_mask & IN_EXCL_UNLINK)) { + /* + * This is a child watch for a file that is being + * removed and IN_EXCL_UNLINK has not been specified; + * indicate that it is orphaned and add it to the list + * of orphans. (This list will be checked by the + * cleaning cyclic to determine when the watch has + * become the only hold on the vnode, at which point + * the watch can be zombified.) Note that we check + * if the watch is orphaned before we orphan it: hard + * links make it possible for VE_REMOVE to be called + * multiple times on the same vnode. (!) + */ + if (!watch->inw_orphaned) { + watch->inw_orphaned = 1; + watch->inw_cred = CRED(); + crhold(watch->inw_cred); + list_insert_head(&state->ins_orphans, watch); + } + + mutex_exit(&state->ins_lock); + return; + } + + if (watch->inw_orphaned) { + /* + * If we're here, a file was orphaned and then later + * moved -- which almost certainly means that hard + * links are on the scene. We choose the orphan over + * the move because we don't want to spuriously + * drop events if we can avoid it. + */ + crfree(watch->inw_cred); + list_remove(&state->ins_orphans, watch); + } + } + + if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) { + /* + * This watch has already been deleted from the parent. 
+ */ + mutex_exit(&state->ins_lock); + return; + } + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- which won't actually delete + * it as we know that the reference count is greater than 1. + */ + inotify_watch_zombify(watch); + mutex_exit(&state->ins_lock); +} + +/* + * Insert a new child watch. Should only be called from VOP context when + * a child is created in a watched directory. + */ +static void +inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = vp }; + + if (!(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) { + mutex_exit(&state->ins_lock); + return; + } + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + return; + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); + VERIFY(watch != NULL); + + mutex_exit(&state->ins_lock); +} + + +static int +inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, + int32_t *wdp) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + uint32_t set; + + set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE; + + /* + * Lookup our vnode to determine if we already have a watch on it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * We don't have this watch; allocate a new one, provided that + * we have fewer than our limit. + */ + if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) { + mutex_exit(&state->ins_lock); + return (ENOSPC); + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, NULL, NULL, vp, set); + *wdp = watch->inw_wd; + mutex_exit(&state->ins_lock); + + return (0); + } + + VERIFY(!watch->inw_zombie); + + if (!(mask & IN_MASK_ADD)) { + /* + * Note that if we're resetting our event mask and we're + * transitioning from an event mask that includes child events + * to one that doesn't, there will be potentially some stale + * child watches. This is basically fine: they won't fire, + * and they will correctly be removed when the watch is + * removed. + */ + watch->inw_mask = 0; + } + + watch->inw_mask |= set; + + *wdp = watch->inw_wd; + + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + vnode_t *cvp; + int err; + + /* + * Verify that the specified child doesn't have a directory component + * within it. + */ + if (strchr(name, '/') != NULL) + return (EINVAL); + + /* + * Lookup the underlying file. Note that this will succeed even if + * we don't have permissions to actually read the file. + */ + if ((err = lookupnameat(name, + UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) { + return (err); + } + + /* + * Use our vnode to find our watch, and then add our child watch to it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * This is unexpected -- it means that we don't have the + * watch that we thought we had. + */ + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (ENXIO); + } + + /* + * Now lookup the child vnode in the watch; we'll only add it if it + * isn't already there. 
+ */ + cmp.inw_vp = cvp; + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (0); + } + + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); + VERIFY(watch != NULL); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_rm_watch(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + inotify_watch_remove(state, watch); + mutex_exit(&state->ins_lock); + + /* + * Because removing a watch will generate an IN_IGNORED event (and + * because inotify_watch_remove() won't alone induce a pollwakeup()), + * we need to explicitly issue a pollwakeup(). + */ + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); + + return (0); +} + +static int +inotify_activate(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + watch->inw_active = 1; + + mutex_exit(&state->ins_lock); + + return (0); +} + +/* + * Called periodically as a cyclic to process the orphans and zombies. + */ +static void +inotify_clean(void *arg) +{ + inotify_state_t *state = arg; + inotify_watch_t *watch, *parent, *next, **prev; + cred_t *savecred; + int err; + + mutex_enter(&state->ins_lock); + + for (watch = list_head(&state->ins_orphans); + watch != NULL; watch = next) { + next = list_next(&state->ins_orphans, watch); + + VERIFY(!watch->inw_zombie); + VERIFY((parent = watch->inw_parent) != NULL); + + if (watch->inw_vp->v_count > 1) + continue; + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + list_remove(&state->ins_orphans, watch); + + /* + * For purposes of releasing the vnode, we need to switch our + * cred to be the cred of the orphaning thread (which we held + * at the time this watch was orphaned). 
+ */ + savecred = curthread->t_cred; + curthread->t_cred = watch->inw_cred; + VN_RELE(watch->inw_vp); + crfree(watch->inw_cred); + curthread->t_cred = savecred; + + inotify_watch_zombify(watch); + } + + prev = &state->ins_zombies; + + while ((watch = *prev) != NULL) { + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + *prev = watch->inw_parent; + inotify_watch_destroy(watch); + continue; + } + + prev = &watch->inw_parent; + mutex_exit(&watch->inw_lock); + } + + mutex_exit(&state->ins_lock); +} + +/*ARGSUSED*/ +static int +inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + int instances = 0; + char c[64]; + + if (minor != INOTIFYMNRN_INOTIFY) + return (ENXIO); + + mutex_enter(&inotify_lock); + + for (state = inotify_state; state != NULL; state = state->ins_next) { + if (state->ins_cred == cred_p) + instances++; + } + + if (instances >= inotify_maxinstances) { + mutex_exit(&inotify_lock); + return (EMFILE); + } + + minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) { + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&inotify_lock); + return (NULL); + } + + state = ddi_get_soft_state(inotify_softstate, minor); + *devp = makedevice(major, minor); + + crhold(cred_p); + state->ins_cred = cred_p; + state->ins_next = inotify_state; + inotify_state = state; + + (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor); + state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + avl_create(&state->ins_bywd, + (int(*)(const void *, const void *))inotify_watch_cmpwd, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_bywd)); + + avl_create(&state->ins_byvp, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + + list_create(&state->ins_orphans, sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_orphan)); + + state->ins_maxwatches = inotify_maxwatches; + state->ins_maxevents = inotify_maxevents; + + mutex_exit(&inotify_lock); + + state->ins_cleaner = ddi_periodic_add(inotify_clean, + state, NANOSEC, DDI_IPL_0); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + inotify_state_t *state; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + int err = 0, nevents = 0; + size_t len; + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + while (state->ins_head == NULL) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->ins_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) { + mutex_exit(&state->ins_lock); + return (EINTR); + } + } + + /* + * We have events and we have our lock; return as many as we can. 
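For reference, a userland consumer walks the buffer returned by read(2) in the same variable-length fashion that this loop copies it out; because len is padded to a multiple of the structure size, each record's name is NUL-terminated. In the hypothetical snippet below, ifd and handle_event() are placeholders.

char buf[4096];
ssize_t n, off;

if ((n = read(ifd, buf, sizeof (buf))) > 0) {
	for (off = 0; off < n; ) {
		/* ev->len already includes the rounded-up name bytes. */
		struct inotify_event *ev =
		    (struct inotify_event *)(void *)(buf + off);

		handle_event(ev->wd, ev->mask, ev->cookie,
		    ev->len != 0 ? ev->name : NULL);
		off += sizeof (struct inotify_event) + ev->len;
	}
}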
+ */ + while ((event = state->ins_head) != NULL) { + len = sizeof (event->ine_event) + event->ine_event.len; + + if (uio->uio_resid < len) { + if (nevents == 0) + err = EINVAL; + break; + } + + nevents++; + + if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0) + break; + + VERIFY(state->ins_nevents > 0); + state->ins_nevents--; + + VERIFY(state->ins_size > 0); + state->ins_size -= len; + + if ((state->ins_head = event->ine_next) == NULL) { + VERIFY(event == state->ins_tail); + VERIFY(state->ins_nevents == 0); + state->ins_tail = NULL; + } + + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + mutex_exit(&state->ins_lock); + + return (err); +} + +static int +inotify_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + if (state->ins_head != NULL) { + *reventsp = events & (POLLRDNORM | POLLIN); + } else { + *reventsp = 0; + } + + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &state->ins_pollhd; + } + + mutex_exit(&state->ins_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + file_t *fp; + int rval; + + state = ddi_get_soft_state(inotify_softstate, minor); + + switch (cmd) { + case INOTIFYIOC_ADD_WATCH: { + inotify_addwatch_t addwatch; + file_t *fp; + + if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0) + return (EFAULT); + + if ((fp = getf(addwatch.inaw_fd)) == NULL) + return (EBADF); + + rval = inotify_add_watch(state, fp->f_vnode, + addwatch.inaw_mask, rv); + + releasef(addwatch.inaw_fd); + return (rval); + } + + case INOTIFYIOC_ADD_CHILD: { + inotify_addchild_t addchild; + char name[MAXPATHLEN]; + + if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0) + return (EFAULT); + + if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0) + return (EFAULT); + + if ((fp = getf(addchild.inac_fd)) == NULL) + return (EBADF); + + rval = inotify_add_child(state, fp->f_vnode, name); + + releasef(addchild.inac_fd); + return (rval); + } + + case INOTIFYIOC_RM_WATCH: + return (inotify_rm_watch(state, arg)); + + case INOTIFYIOC_ACTIVATE: + return (inotify_activate(state, arg)); + + case FIONREAD: { + int32_t size; + + mutex_enter(&state->ins_lock); + size = state->ins_size; + mutex_exit(&state->ins_lock); + + if (copyout(&size, (void *)arg, sizeof (size)) != 0) + return (EFAULT); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state, **sp; + inotify_watch_t *watch, *zombies; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + if (state->ins_pollhd.ph_list != NULL) { + pollwakeup(&state->ins_pollhd, POLLERR); + pollhead_clean(&state->ins_pollhd); + } + + mutex_enter(&state->ins_lock); + + /* + * First, destroy all of our watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. 
+ */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. + */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + if (state->ins_cleaner != NULL) { + ddi_periodic_delete(state->ins_cleaner); + state->ins_cleaner = NULL; + } + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + vmem_destroy(state->ins_wds); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + 
D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c index c10e23a8a6..cde57df235 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c @@ -996,17 +996,20 @@ static s32 ixgbe_clear_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq) * @vlan: VLAN id to write to VLAN filter * @vind: VMDq output index that maps queue to VLAN id in VFTA * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vlvf_bypass: boolean flag - unused * * Turn on/off specified VLAN in the VLAN filter table. 
**/ s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { u32 regindex; u32 bitindex; u32 bits; u32 vftabyte; + UNREFERENCED_1PARAMETER(vlvf_bypass); + DEBUGFUNC("ixgbe_set_vfta_82598"); if (vlan > 4095) diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h index d2241c70cd..c32672187a 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h @@ -40,7 +40,8 @@ s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw); s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw); void ixgbe_enable_relaxed_ordering_82598(struct ixgbe_hw *hw); s32 ixgbe_set_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq); -s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass); s32 ixgbe_read_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 *val); s32 ixgbe_write_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 val); s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c index 894d0b2ac9..c550982710 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c @@ -1057,33 +1057,38 @@ s32 ixgbe_clear_vfta(struct ixgbe_hw *hw) * ixgbe_set_vfta - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFTA - * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vfta, (hw, vlan, vind, - vlan_on), IXGBE_NOT_IMPLEMENTED); + vlan_on, vlvf_bypass), IXGBE_NOT_IMPLEMENTED); } /** * ixgbe_set_vlvf - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified bit in VLVF table. 
**/ s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, - bool *vfta_changed) + u32 *vfta_delta, u32 vfta, bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vlvf, (hw, vlan, vind, - vlan_on, vfta_changed), IXGBE_NOT_IMPLEMENTED); + vlan_on, vfta_delta, vfta, vlvf_bypass), + IXGBE_NOT_IMPLEMENTED); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h index 24d507039d..3bee89e45e 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h @@ -125,9 +125,10 @@ s32 ixgbe_enable_mc(struct ixgbe_hw *hw); s32 ixgbe_disable_mc(struct ixgbe_hw *hw); s32 ixgbe_clear_vfta(struct ixgbe_hw *hw); s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_fc_enable(struct ixgbe_hw *hw); s32 ixgbe_setup_fc(struct ixgbe_hw *hw); s32 ixgbe_set_fw_drv_ver(struct ixgbe_hw *hw, u8 maj, u8 min, u8 build, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c index f342eee637..656534862c 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c @@ -3810,68 +3810,65 @@ s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw) * return the VLVF index where this VLAN id should be placed * **/ -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan) +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass) { - u32 bits = 0; - u32 first_empty_slot = 0; - s32 regindex; + s32 regindex, first_empty_slot; + u32 bits; /* short cut the special case */ if (vlan == 0) return 0; - /* - * Search for the vlan id in the VLVF entries. Save off the first empty - * slot found along the way - */ - for (regindex = 1; regindex < IXGBE_VLVF_ENTRIES; regindex++) { + /* if vlvf_bypass is set we don't want to use an empty slot, we + * will simply bypass the VLVF if there are no entries present in the + * VLVF that contain our VLAN + */ + first_empty_slot = vlvf_bypass ? IXGBE_ERR_NO_SPACE : 0; + + /* add VLAN enable bit for comparison */ + vlan |= IXGBE_VLVF_VIEN; + + /* Search for the vlan id in the VLVF entries. Save off the first empty + * slot found along the way. + * + * pre-decrement loop covering (IXGBE_VLVF_ENTRIES - 1) .. 1 + */ + for (regindex = IXGBE_VLVF_ENTRIES; --regindex;) { bits = IXGBE_READ_REG(hw, IXGBE_VLVF(regindex)); - if (!bits && !(first_empty_slot)) + if (bits == vlan) + return regindex; + if (!first_empty_slot && !bits) first_empty_slot = regindex; - else if ((bits & 0x0FFF) == vlan) - break; } - /* - * If regindex is less than IXGBE_VLVF_ENTRIES, then we found the vlan - * in the VLVF. Else use the first empty VLVF register for this - * vlan id. - */ - if (regindex >= IXGBE_VLVF_ENTRIES) { - if (first_empty_slot) - regindex = first_empty_slot; - else { - ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, - "No space in VLVF.\n"); - regindex = IXGBE_ERR_NO_SPACE; - } - } + /* If we are here then we didn't find the VLAN. Return first empty + * slot we found during our search, else error. + */ + if (!first_empty_slot) + ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, "No space in VLVF.\n"); - return regindex; + return first_empty_slot ? 
first_empty_slot : IXGBE_ERR_NO_SPACE; } /** * ixgbe_set_vfta_generic - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { - s32 regindex; - u32 bitindex; - u32 vfta; - u32 targetbit; - s32 ret_val = IXGBE_SUCCESS; - bool vfta_changed = FALSE; + u32 regidx, vfta_delta, vfta; + s32 ret_val; DEBUGFUNC("ixgbe_set_vfta_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* @@ -3886,33 +3883,33 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * bits[11-5]: which register * bits[4-0]: which bit in the register */ - regindex = (vlan >> 5) & 0x7F; - bitindex = vlan & 0x1F; - targetbit = (1 << bitindex); - vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regindex)); - - if (vlan_on) { - if (!(vfta & targetbit)) { - vfta |= targetbit; - vfta_changed = TRUE; - } - } else { - if ((vfta & targetbit)) { - vfta &= ~targetbit; - vfta_changed = TRUE; - } - } + regidx = vlan / 32; + vfta_delta = 1 << (vlan % 32); + vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regidx)); + + /* + * vfta_delta represents the difference between the current value + * of vfta and the value we want in the register. Since the diff + * is an XOR mask we can just update the vfta using an XOR + */ + vfta_delta &= vlan_on ? ~vfta : vfta; + vfta ^= vfta_delta; /* Part 2 * Call ixgbe_set_vlvf_generic to set VLVFB and VLVF */ - ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, - &vfta_changed); - if (ret_val != IXGBE_SUCCESS) + ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, &vfta_delta, + vfta, vlvf_bypass); + if (ret_val != IXGBE_SUCCESS) { + if (vlvf_bypass) + goto vfta_update; return ret_val; + } - if (vfta_changed) - IXGBE_WRITE_REG(hw, IXGBE_VFTA(regindex), vfta); +vfta_update: + /* Update VFTA now that we are ready for traffic */ + if (vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(regidx), vfta); return IXGBE_SUCCESS; } @@ -3921,21 +3918,25 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * ixgbe_set_vlvf_generic - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified bit in VLVF table. 
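The VFTA here is a flat bitmap of all 4096 VLAN IDs spread across 128 32-bit registers, and vfta_delta is the XOR mask between the current and desired contents of the selected register, so the write can be skipped when nothing actually changes. A worked example of that arithmetic in plain C, with the current register value assumed and no hardware access:

#include <stdint.h>

/*
 * Return the new VFTA register value for (vlan, vlan_on) given the
 * current value; *regidx selects which of the 128 VFTA registers.
 */
static uint32_t
vfta_update(uint32_t vlan, int vlan_on, uint32_t vfta, uint32_t *regidx)
{
	uint32_t vfta_delta = 1u << (vlan % 32);	/* VLAN 100 -> bit 4 */

	*regidx = vlan / 32;			/* VLAN 100 -> register 3 */
	vfta_delta &= vlan_on ? ~vfta : vfta;	/* zero if no change is needed */
	return (vfta ^ vfta_delta);		/* e.g. 0x0 -> 0x10 for VLAN 100 */
}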
**/ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed) + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass) { - u32 vt; + u32 bits; + s32 vlvf_index; DEBUGFUNC("ixgbe_set_vlvf_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* If VT Mode is set @@ -3945,83 +3946,60 @@ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * Or !vlan_on * clear the pool bit and possibly the vind */ - vt = IXGBE_READ_REG(hw, IXGBE_VT_CTL); - if (vt & IXGBE_VT_CTL_VT_ENABLE) { - s32 vlvf_index; - u32 bits; - - vlvf_index = ixgbe_find_vlvf_slot(hw, vlan); - if (vlvf_index < 0) - return vlvf_index; - - if (vlan_on) { - /* set the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits |= (1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits |= (1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - } - } else { - /* clear the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits &= ~(1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits &= ~(1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - } - } + if (!(IXGBE_READ_REG(hw, IXGBE_VT_CTL) & IXGBE_VT_CTL_VT_ENABLE)) + return IXGBE_SUCCESS; - /* - * If there are still bits set in the VLVFB registers - * for the VLAN ID indicated we need to see if the - * caller is requesting that we clear the VFTA entry bit. - * If the caller has requested that we clear the VFTA - * entry bit but there are still pools/VFs using this VLAN - * ID entry then ignore the request. We're not worried - * about the case where we're turning the VFTA VLAN ID - * entry bit on, only when requested to turn it off as - * there may be multiple pools and/or VFs using the - * VLAN ID entry. In that case we cannot clear the - * VFTA bit until all pools/VFs using that VLAN ID have also - * been cleared. This will be indicated by "bits" being - * zero. + vlvf_index = ixgbe_find_vlvf_slot(hw, vlan, vlvf_bypass); + if (vlvf_index < 0) + return vlvf_index; + + bits = IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32)); + + /* set the pool bit */ + bits |= 1 << (vind % 32); + if (vlan_on) + goto vlvf_update; + + /* clear the pool bit */ + bits ^= 1 << (vind % 32); + + if (!bits && + !IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + 1 - vind / 32))) { + /* Clear VFTA first, then disable VLVF. Otherwise + * we run the risk of stray packets leaking into + * the PF via the default pool */ - if (bits) { - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), - (IXGBE_VLVF_VIEN | vlan)); - if ((!vlan_on) && (vfta_changed != NULL)) { - /* someone wants to clear the vfta entry - * but some pools/VFs are still using it. - * Ignore it. 
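Each VLVF slot carries a 64-bit pool-enable mask split across two consecutive 32-bit VLVFB registers, which is what the vlvf_index * 2 + vind / 32 expression above selects; the partner word at vlvf_index * 2 + 1 - vind / 32 is what the teardown path checks before disabling the whole slot. A worked example of the indexing, with illustrative values and no hardware access:

#include <stdint.h>

/* Locate the VLVFB word and bit for pool 'vind' within VLVF slot 'slot'. */
static void
vlvfb_locate(uint32_t slot, uint32_t vind, uint32_t *word, uint32_t *bit)
{
	*word = slot * 2 + vind / 32;	/* slot 5, pool 37 -> VLVFB register 11 */
	*bit = vind % 32;		/* -> bit 5 within that word */
	/* partner word: slot * 2 + 1 - vind / 32 -> VLVFB register 10 */
}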
*/ - *vfta_changed = FALSE; - } - } else - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + if (*vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(vlan / 32), vfta); + + /* disable VLVF and clear remaining bit from pool */ + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), 0); + + return IXGBE_SUCCESS; } + /* If there are still bits set in the VLVFB registers + * for the VLAN ID indicated we need to see if the + * caller is requesting that we clear the VFTA entry bit. + * If the caller has requested that we clear the VFTA + * entry bit but there are still pools/VFs using this VLAN + * ID entry then ignore the request. We're not worried + * about the case where we're turning the VFTA VLAN ID + * entry bit on, only when requested to turn it off as + * there may be multiple pools and/or VFs using the + * VLAN ID entry. In that case we cannot clear the + * VFTA bit until all pools/VFs using that VLAN ID have also + * been cleared. This will be indicated by "bits" being + * zero. + */ + *vfta_delta = 0; + +vlvf_update: + /* record pool change and enable VLAN ID if not already enabled */ + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), bits); + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), IXGBE_VLVF_VIEN | vlan); + return IXGBE_SUCCESS; } diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h index 069fc88c96..bd18e96f82 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h @@ -135,11 +135,12 @@ s32 ixgbe_clear_vmdq_generic(struct ixgbe_hw *hw, u32 rar, u32 vmdq); s32 ixgbe_insert_mac_addr_generic(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw); s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_clear_vfta_generic(struct ixgbe_hw *hw); -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan); +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass); s32 ixgbe_check_mac_link_generic(struct ixgbe_hw *hw, ixgbe_link_speed *speed, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h index 45e8a7d029..9231979ff7 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h @@ -3715,8 +3715,9 @@ struct ixgbe_mac_operations { s32 (*enable_mc)(struct ixgbe_hw *); s32 (*disable_mc)(struct ixgbe_hw *); s32 (*clear_vfta)(struct ixgbe_hw *); - s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool); - s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, bool *); + s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool, bool); + s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, u32 *, u32, + bool); s32 (*init_uta_tables)(struct ixgbe_hw *); void (*set_mac_anti_spoofing)(struct ixgbe_hw *, bool, int); void (*set_vlan_anti_spoofing)(struct ixgbe_hw *, bool, int); diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c index 2ce4d32a30..66d836eb8f 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c @@ -321,15 +321,16 @@ static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr) return vector; } -static void 
ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, - u32 *msg, u16 size) +static s32 ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, u32 *msg, + u32 *retmsg, u16 size) { struct ixgbe_mbx_info *mbx = &hw->mbx; - u32 retmsg[IXGBE_VFMAILBOX_SIZE]; s32 retval = mbx->ops.write_posted(hw, msg, size, 0); - if (!retval) - mbx->ops.read_posted(hw, retmsg, size, 0); + if (retval) + return retval; + + return mbx->ops.read_posted(hw, retmsg, size, 0); } /** @@ -415,29 +416,29 @@ s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, return mbx->ops.write_posted(hw, msgbuf, IXGBE_VFMAILBOX_SIZE, 0); } -/** +/* * ixgbe_set_vfta_vf - Set/Unset vlan filter table address * @hw: pointer to the HW structure * @vlan: 12 bit VLAN ID * @vind: unused by VF drivers * @vlan_on: if TRUE then set bit, else clear bit + * @vlvf_bypass: boolean flag indicating updating default pool is okay + * + * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass) { - struct ixgbe_mbx_info *mbx = &hw->mbx; u32 msgbuf[2]; s32 ret_val; - UNREFERENCED_1PARAMETER(vind); + UNREFERENCED_2PARAMETER(vind, vlvf_bypass); msgbuf[0] = IXGBE_VF_SET_VLAN; msgbuf[1] = vlan; /* Setting the 8 bit field MSG INFO to TRUE indicates "add" */ msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; - ret_val = mbx->ops.write_posted(hw, msgbuf, 2, 0); - if (!ret_val) - ret_val = mbx->ops.read_posted(hw, msgbuf, 1, 0); - + ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); if (!ret_val && (msgbuf[0] & IXGBE_VT_MSGTYPE_ACK)) return IXGBE_SUCCESS; @@ -628,7 +629,7 @@ void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[0] = IXGBE_VF_SET_LPE; msgbuf[1] = max_size; - ixgbevf_write_msg_read_ack(hw, msgbuf, 2); + ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h index edc801367d..e9b8dc34ae 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h @@ -132,7 +132,8 @@ s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr); s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr, bool clear); -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass); void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size); int ixgbevf_negotiate_api_version(struct ixgbe_hw *hw, int api); int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 2b8084801c..1be0c424e4 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2013 OSN Online Service Nuernberg GmbH. All rights reserved. 
@@ -57,8 +57,8 @@ static int ixgbe_alloc_rings(ixgbe_t *); static void ixgbe_free_rings(ixgbe_t *); static int ixgbe_alloc_rx_data(ixgbe_t *); static void ixgbe_free_rx_data(ixgbe_t *); -static void ixgbe_setup_rings(ixgbe_t *); -static void ixgbe_setup_rx(ixgbe_t *); +static int ixgbe_setup_rings(ixgbe_t *); +static int ixgbe_setup_rx(ixgbe_t *); static void ixgbe_setup_tx(ixgbe_t *); static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *); static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *); @@ -67,6 +67,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *); static void ixgbe_setup_vmdq_rss(ixgbe_t *); static void ixgbe_setup_rss_table(ixgbe_t *); static void ixgbe_init_unicst(ixgbe_t *); +static int ixgbe_init_vlan(ixgbe_t *); static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *); static void ixgbe_setup_multicst(ixgbe_t *); static void ixgbe_get_hw_state(ixgbe_t *); @@ -113,6 +114,8 @@ static void ixgbe_intr_other_work(ixgbe_t *, uint32_t); static void ixgbe_get_driver_control(struct ixgbe_hw *); static int ixgbe_addmac(void *, const uint8_t *); static int ixgbe_remmac(void *, const uint8_t *); +static int ixgbe_addvlan(mac_group_driver_t, uint16_t); +static int ixgbe_remvlan(mac_group_driver_t, uint16_t); static void ixgbe_release_driver_control(struct ixgbe_hw *); static int ixgbe_attach(dev_info_t *, ddi_attach_cmd_t); @@ -273,7 +276,7 @@ static adapter_info_t ixgbe_82599eb_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -304,7 +307,7 @@ static adapter_info_t ixgbe_X540_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -1149,6 +1152,8 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe) rx_group = &ixgbe->rx_groups[i]; rx_group->index = i; rx_group->ixgbe = ixgbe; + list_create(&rx_group->vlans, sizeof (ixgbe_vlan_t), + offsetof(ixgbe_vlan_t, ixvl_link)); } for (i = 0; i < ixgbe->num_tx_rings; i++) { @@ -1898,7 +1903,8 @@ ixgbe_start(ixgbe_t *ixgbe, boolean_t alloc_buffer) /* * Setup the rx/tx rings */ - ixgbe_setup_rings(ixgbe); + if (ixgbe_setup_rings(ixgbe) != IXGBE_SUCCESS) + goto start_failure; /* * ixgbe_start() will be called when resetting, however if reset @@ -1999,6 +2005,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -2012,7 +2019,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; @@ -2271,6 +2279,16 @@ ixgbe_free_rings(ixgbe_t *ixgbe) ixgbe->tx_rings = NULL; } + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + ixgbe_vlan_t *vlp; + ixgbe_rx_group_t *rx_group = &ixgbe->rx_groups[i]; + + while ((vlp = list_remove_head(&rx_group->vlans)) != NULL) + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + list_destroy(&rx_group->vlans); + } + if (ixgbe->rx_groups != NULL) { 
kmem_free(ixgbe->rx_groups, sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups); @@ -2325,7 +2343,7 @@ ixgbe_free_rx_data(ixgbe_t *ixgbe) /* * ixgbe_setup_rings - Setup rx/tx rings. */ -static void +static int ixgbe_setup_rings(ixgbe_t *ixgbe) { /* @@ -2335,9 +2353,12 @@ ixgbe_setup_rings(ixgbe_t *ixgbe) * 2. Initialize necessary registers for receive/transmit; * 3. Initialize software pointers/parameters for receive/transmit; */ - ixgbe_setup_rx(ixgbe); + if (ixgbe_setup_rx(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); ixgbe_setup_tx(ixgbe); + + return (IXGBE_SUCCESS); } static void @@ -2423,7 +2444,7 @@ ixgbe_setup_rx_ring(ixgbe_rx_ring_t *rx_ring) IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rx_ring->hw_index), reg_val); } -static void +static int ixgbe_setup_rx(ixgbe_t *ixgbe) { ixgbe_rx_ring_t *rx_ring; @@ -2517,6 +2538,15 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) } /* + * Initialize VLAN SW and HW state if VLAN filtering is + * enabled. + */ + if (ixgbe->vlft_enabled) { + if (ixgbe_init_vlan(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); + } + + /* * Enable the receive unit. This must be done after filter * control is set in FCTRL. On 82598, we disable the descriptor monitor. * 82598 is the only adapter which defines this RXCTRL option. @@ -2598,6 +2628,8 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg_val); } + + return (IXGBE_SUCCESS); } static void @@ -2829,7 +2861,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; - uint32_t vmdctl, i, vtctl; + uint32_t vmdctl, i, vtctl, vlnctl; /* * Setup the VMDq Control register, enable VMDq based on @@ -2864,10 +2896,20 @@ ixgbe_setup_vmdq(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ - vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -2887,7 +2929,7 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; uint32_t i, mrqc; - uint32_t vtctl, vmdctl; + uint32_t vtctl, vmdctl, vlnctl; /* * Initialize RETA/ERETA table @@ -2969,10 +3011,21 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -3142,6 +3195,53 @@ ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr) } /* + * Restore the HW state to match the SW state during restart. + */ +static int +ixgbe_init_vlan(ixgbe_t *ixgbe) +{ + /* + * The device is starting for the first time; there is nothing + * to do. 
+ */ + if (!ixgbe->vlft_init) { + ixgbe->vlft_init = B_TRUE; + return (IXGBE_SUCCESS); + } + + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + int ret; + boolean_t vlvf_bypass; + ixgbe_rx_group_t *rxg = &ixgbe->rx_groups[i]; + struct ixgbe_hw *hw = &ixgbe->hw; + + if (rxg->aupe) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, IXGBE_VMOLR(rxg->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rxg->index), vml2flt); + } + + vlvf_bypass = (rxg->index == ixgbe->rx_def_group); + for (ixgbe_vlan_t *vlp = list_head(&rxg->vlans); vlp != NULL; + vlp = list_next(&rxg->vlans, vlp)) { + ret = ixgbe_set_vfta(hw, vlp->ixvl_vid, rxg->index, + B_TRUE, vlvf_bypass); + + if (ret != IXGBE_SUCCESS) { + ixgbe_error(ixgbe, "Failed to program VFTA" + " for group %u, VID: %u, ret: %d.", + rxg->index, vlp->ixvl_vid, ret); + return (IXGBE_FAILURE); + } + } + } + + return (IXGBE_SUCCESS); +} + +/* * ixgbe_multicst_add - Add a multicst address. */ int @@ -6151,6 +6251,7 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { ixgbe_t *ixgbe = (ixgbe_t *)arg; + struct ixgbe_hw *hw = &ixgbe->hw; switch (rtype) { case MAC_RING_TYPE_RX: { @@ -6164,6 +6265,20 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_stop = NULL; infop->mgi_addmac = ixgbe_addmac; infop->mgi_remmac = ixgbe_remmac; + + if ((ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ || + ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ_RSS) && + (hw->mac.type == ixgbe_mac_82599EB || + hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X550EM_x)) { + infop->mgi_addvlan = ixgbe_addvlan; + infop->mgi_remvlan = ixgbe_remvlan; + } else { + infop->mgi_addvlan = NULL; + infop->mgi_remvlan = NULL; + } + infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups); break; @@ -6263,6 +6378,228 @@ ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh) return (0); } +static ixgbe_vlan_t * +ixgbe_find_vlan(ixgbe_rx_group_t *rx_group, uint16_t vid) +{ + for (ixgbe_vlan_t *vlp = list_head(&rx_group->vlans); vlp != NULL; + vlp = list_next(&rx_group->vlans, vlp)) { + if (vlp->ixvl_vid == vid) + return (vlp); + } + + return (NULL); +} + +/* + * Attempt to use a VLAN HW filter for this group. If the group is + * interested in untagged packets then set AUPE only. If the group is + * the default then only set the VFTA. Leave the VLVF slots open for + * reserved groups to guarantee their use of HW filtering. + */ +static int +ixgbe_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + ixgbe_vlan_t *vlp; + int ret; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + /* + * Let's be sure VLAN filtering is enabled. + */ + VERIFY3B(ixgbe->vlft_enabled, ==, B_TRUE); + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* + * VLAN filtering is enabled but we want to receive untagged + * traffic on this group -- set the AUPE bit on the group and + * leave the VLAN tables alone. + */ + if (vid == MAC_VLAN_UNTAGGED) { + /* + * We never enable AUPE on the default group; it is + * redundant. Untagged traffic which passes L2 + * filtering is delivered to the default group if no + * other group is interested. 
+ */ + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rx_group->index), + vml2flt); + rx_group->aupe = B_TRUE; + } + + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp != NULL) { + /* Only the default group supports multiple clients. */ + VERIFY3B(is_def_grp, ==, B_TRUE); + vlp->ixvl_refs++; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * The default group doesn't require a VLVF entry, only a VFTA + * entry. All traffic passing L2 filtering (MPSAR + VFTA) is + * delivered to the default group if no other group is + * interested. The fourth argument, vlvf_bypass, tells the + * ixgbe common code to avoid using a VLVF slot if one isn't + * already allocated to this VLAN. + * + * This logic is meant to reserve VLVF slots for use by + * reserved groups: guaranteeing their use of HW filtering. + */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, is_def_grp); + + if (ret == IXGBE_SUCCESS) { + vlp = kmem_zalloc(sizeof (ixgbe_vlan_t), KM_SLEEP); + vlp->ixvl_vid = vid; + vlp->ixvl_refs = 1; + list_insert_tail(&rx_group->vlans, vlp); + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * We should actually never return ENOSPC because we've set + * things up so that every reserved group is guaranteed to + * have a VLVF slot. + */ + if (ret == IXGBE_ERR_PARAM) + ret = EINVAL; + else if (ret == IXGBE_ERR_NO_SPACE) + ret = ENOSPC; + else + ret = EIO; + + mutex_exit(&ixgbe->gen_lock); + return (ret); +} + +/* + * Attempt to remove the VLAN HW filter associated with this group. If + * we are removing a HW filter for the default group then we know only + * the VFTA was set (VLVF is reserved for non-default/reserved + * groups). If the group wishes to stop receiving untagged traffic + * then clear the AUPE but leave the VLAN filters alone. + */ +static int +ixgbe_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + int ret; + ixgbe_vlan_t *vlp; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* See the AUPE comment in ixgbe_addvlan(). */ + if (vid == MAC_VLAN_UNTAGGED) { + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt &= ~IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, + IXGBE_VMOLR(rx_group->index), vml2flt); + rx_group->aupe = B_FALSE; + } + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp == NULL) + return (ENOENT); + + /* + * See the comment in ixgbe_addvlan() about is_def_grp and + * vlvf_bypass. + */ + if (vlp->ixvl_refs == 1) { + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_FALSE, + is_def_grp); + } else { + /* + * Only the default group can have multiple clients. + * If there is more than one client, leave the + * VFTA[vid] bit alone. + */ + VERIFY3B(is_def_grp, ==, B_TRUE); + VERIFY3U(vlp->ixvl_refs, >, 1); + vlp->ixvl_refs--; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + if (ret != IXGBE_SUCCESS) { + mutex_exit(&ixgbe->gen_lock); + /* IXGBE_ERR_PARAM should be the only possible error here. 
*/ + if (ret == IXGBE_ERR_PARAM) + return (EINVAL); + else + return (EIO); + } + + VERIFY3U(vlp->ixvl_refs, ==, 1); + vlp->ixvl_refs = 0; + list_remove(&rx_group->vlans, vlp); + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + /* + * Calling ixgbe_set_vfta() on a non-default group may have + * cleared the VFTA[vid] bit even though the default group + * still has clients using the vid. This happens because the + * ixgbe common code doesn't ref count the use of VLANs. Check + * for any use of vid on the default group and make sure the + * VFTA[vid] bit is set. This operation is idempotent: setting + * VFTA[vid] to true if already true won't hurt anything. + */ + if (!is_def_grp) { + ixgbe_rx_group_t *defgrp; + + defgrp = &ixgbe->rx_groups[ixgbe->rx_def_group]; + vlp = ixgbe_find_vlan(defgrp, vid); + if (vlp != NULL) { + /* This shouldn't fail, but if it does return EIO. */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, + B_TRUE); + if (ret != IXGBE_SUCCESS) + return (EIO); + } + } + + mutex_exit(&ixgbe->gen_lock); + return (0); +} + /* * Add a mac address. */ diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h index ca52b10c89..baa4766c0e 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h +++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h @@ -27,7 +27,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _IXGBE_SW_H @@ -91,6 +91,8 @@ extern "C" { #define MAX_NUM_UNICAST_ADDRESSES 0x80 #define MAX_NUM_MULTICAST_ADDRESSES 0x1000 +#define MAX_NUM_VLAN_FILTERS 0x40 + #define IXGBE_INTR_NONE 0 #define IXGBE_INTR_MSIX 1 #define IXGBE_INTR_MSI 2 @@ -387,6 +389,15 @@ typedef union ixgbe_ether_addr { } mac; } ixgbe_ether_addr_t; +/* + * The list of VLANs an Rx group will accept. + */ +typedef struct ixgbe_vlan { + list_node_t ixvl_link; + uint16_t ixvl_vid; /* The VLAN ID */ + uint_t ixvl_refs; /* Number of users of this VLAN */ +} ixgbe_vlan_t; + typedef enum { USE_NONE, USE_COPY, @@ -589,6 +600,7 @@ typedef struct ixgbe_rx_ring { struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ } ixgbe_rx_ring_t; + /* * Software Receive Ring Group */ @@ -596,6 +608,8 @@ typedef struct ixgbe_rx_group { uint32_t index; /* Group index */ mac_group_handle_t group_handle; /* call back group handle */ struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ + boolean_t aupe; /* AUPE bit */ + list_t vlans; /* list of VLANs to allow */ } ixgbe_rx_group_t; /* @@ -662,6 +676,7 @@ typedef struct ixgbe { */ ixgbe_rx_group_t *rx_groups; /* Array of rx groups */ uint32_t num_rx_groups; /* Number of rx groups in use */ + uint32_t rx_def_group; /* Default Rx group index */ /* * Transmit Rings @@ -715,6 +730,9 @@ typedef struct ixgbe { uint32_t mcast_count; struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES]; + boolean_t vlft_enabled; /* VLAN filtering enabled? */ + boolean_t vlft_init; /* VLAN filtering initialized? */ + ulong_t sys_page_size; boolean_t link_check_complete; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index a3cd9dfbb1..0a5eec209f 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,7 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/file.h> @@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks) cv_signal(&so->so_closing_cv); } } + +int +ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg) +{ + return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg)); +} + +void +ksocket_krecv_unblock(ksocket_t ks) +{ + so_krecv_unblock(KSTOSO(ks)); +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h index ac5251540f..516a68d358 100644 --- a/usr/src/uts/common/io/ksocket/ksocket_impl.h +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -22,11 +22,17 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #ifndef _INET_KSOCKET_KSOCKET_IMPL_H #define _INET_KSOCKET_KSOCKET_IMPL_H +/* + * Note that if this relationship ever changes, the logic in ksocket_krecv_set + * must be updated and we must maintain local state about this on whatever the + * new ksocket object is. + */ #define KSTOSO(ks) ((struct sonode *)(ks)) #define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c index 74e71ed7e8..759b524186 100644 --- a/usr/src/uts/common/io/ksyms.c +++ b/usr/src/uts/common/io/ksyms.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred) char *addr; void *hptr = NULL; ksyms_buflist_hdr_t hdr; + + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); + bzero(&hdr, sizeof (struct ksyms_buflist_hdr)); list_create(&hdr.blist, PAGESIZE, offsetof(ksyms_buflist_t, buflist_node)); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index bba41d7cf3..f258aad701 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2015 Garrett D'Amore <garrett@damore.org> */ @@ -158,7 +158,7 @@ * perimeter) across a call to any other layer from the mac layer. The call to * any other layer could be via mi_* entry points, classifier entry points into * the driver or via upcall pointers into layers above. The mac perimeter may - * be acquired or held only in the down direction, for e.g. when calling into + * be acquired or held only in the down direction, e.g. when calling into * a mi_* driver enty point to provide atomicity of the operation. * * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across @@ -207,7 +207,7 @@ * number whenever the ring's stop routine is invoked. * See comments in mac_rx_ring(); * - * R17 Similarly mi_stop is another synchronization point and the driver must + * R17. Similarly mi_stop is another synchronization point and the driver must * ensure that all upcalls are done and there won't be any future upcall * before returning from mi_stop. 
* @@ -227,7 +227,7 @@ * * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind] * - * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename] + * mac perim -> i_dls_devnet_lock [dls_devnet_rename] * * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac * client to driver. In the case of clients that explictly use the mac provided @@ -460,7 +460,7 @@ mac_init(void) mac_logging_interval = 20; mac_flow_log_enable = B_FALSE; mac_link_log_enable = B_FALSE; - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Register to be notified of noteworthy pools events */ mac_pool_event_reg.pec_func = mac_pool_event_cb; @@ -707,12 +707,45 @@ mac_callback_remove_wait(mac_cb_info_t *mcbi) } } +void +mac_callback_barrier(mac_cb_info_t *mcbi) +{ + ASSERT(MUTEX_HELD(mcbi->mcbi_lockp)); + ASSERT3U(mcbi->mcbi_barrier_cnt, <, UINT_MAX); + + if (mcbi->mcbi_walker_cnt == 0) { + return; + } + + mcbi->mcbi_barrier_cnt++; + do { + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + } while (mcbi->mcbi_walker_cnt > 0); + mcbi->mcbi_barrier_cnt--; + cv_broadcast(&mcbi->mcbi_cv); +} + +void +mac_callback_walker_enter(mac_cb_info_t *mcbi) +{ + mutex_enter(mcbi->mcbi_lockp); + /* + * Incoming walkers should give precedence to timely clean-up of + * deleted callback entries and requested barriers. + */ + while (mcbi->mcbi_del_cnt > 0 || mcbi->mcbi_barrier_cnt > 0) { + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + } + mcbi->mcbi_walker_cnt++; + mutex_exit(mcbi->mcbi_lockp); +} + /* * The last mac callback walker does the cleanup. Walk the list and unlik * all the logically deleted entries and construct a temporary list of * removed entries. Return the list of removed entries to the caller. */ -mac_cb_t * +static mac_cb_t * mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head) { mac_cb_t *p; @@ -741,7 +774,90 @@ mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head) return (rmlist); } -boolean_t +void +mac_callback_walker_exit(mac_cb_info_t *mcbi, mac_cb_t **headp, + boolean_t is_promisc) +{ + boolean_t do_wake = B_FALSE; + + mutex_enter(mcbi->mcbi_lockp); + + /* If walkers remain, nothing more can be done for now */ + if (--mcbi->mcbi_walker_cnt != 0) { + mutex_exit(mcbi->mcbi_lockp); + return; + } + + if (mcbi->mcbi_del_cnt != 0) { + mac_cb_t *rmlist; + + rmlist = mac_callback_walker_cleanup(mcbi, headp); + + if (!is_promisc) { + /* The "normal" non-promisc callback clean-up */ + mac_callback_free(rmlist); + } else { + mac_cb_t *mcb, *mcb_next; + + /* + * The promisc callbacks are in 2 lists, one off the + * 'mip' and another off the 'mcip' threaded by + * mpi_mi_link and mpi_mci_link respectively. There + * is, however, only a single shared total walker + * count, and an entry cannot be physically unlinked if + * a walker is active on either list. The last walker + * does this cleanup of logically deleted entries. + * + * With a list of callbacks deleted from above from + * mi_promisc_list (headp), remove the corresponding + * entry from mci_promisc_list (headp_pair) and free + * the structure. 
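The walker accounting introduced in this hunk amounts to a small reader/barrier protocol: walkers are counted in and out, a barrier caller parks until the walker count drains, and the last walker out reclaims logically-deleted entries before waking anyone waiting. A condensed sketch of that protocol; the structure and function names are illustrative and the two-list promisc cleanup is omitted:

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

typedef struct cb_sync {
	kmutex_t	cs_lock;
	kcondvar_t	cs_cv;
	uint_t		cs_walkers;	/* active readers */
	uint_t		cs_barriers;	/* threads waiting for quiescence */
	uint_t		cs_deleted;	/* logically-deleted entries pending */
} cb_sync_t;

static void
cb_walker_enter(cb_sync_t *cs)
{
	mutex_enter(&cs->cs_lock);
	/* Incoming readers give precedence to deletions and barriers. */
	while (cs->cs_deleted > 0 || cs->cs_barriers > 0)
		cv_wait(&cs->cs_cv, &cs->cs_lock);
	cs->cs_walkers++;
	mutex_exit(&cs->cs_lock);
}

static void
cb_walker_exit(cb_sync_t *cs)
{
	mutex_enter(&cs->cs_lock);
	if (--cs->cs_walkers == 0 &&
	    (cs->cs_deleted > 0 || cs->cs_barriers > 0)) {
		/* Last reader out: reclaim deleted entries, wake waiters. */
		cs->cs_deleted = 0;
		cv_broadcast(&cs->cs_cv);
	}
	mutex_exit(&cs->cs_lock);
}

static void
cb_barrier(cb_sync_t *cs)
{
	mutex_enter(&cs->cs_lock);
	cs->cs_barriers++;
	while (cs->cs_walkers > 0)
		cv_wait(&cs->cs_cv, &cs->cs_lock);
	cs->cs_barriers--;
	cv_broadcast(&cs->cs_cv);
	mutex_exit(&cs->cs_lock);
}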
+ */ + for (mcb = rmlist; mcb != NULL; mcb = mcb_next) { + mac_promisc_impl_t *mpip; + mac_client_impl_t *mcip; + + mcb_next = mcb->mcb_nextp; + mpip = (mac_promisc_impl_t *)mcb->mcb_objp; + mcip = mpip->mpi_mcip; + + ASSERT3P(&mcip->mci_mip->mi_promisc_cb_info, + ==, mcbi); + ASSERT3P(&mcip->mci_mip->mi_promisc_list, + ==, headp); + + VERIFY(mac_callback_remove(mcbi, + &mcip->mci_promisc_list, + &mpip->mpi_mci_link)); + mcb->mcb_flags = 0; + mcb->mcb_nextp = NULL; + kmem_cache_free(mac_promisc_impl_cache, mpip); + } + } + + /* + * Wake any walker threads that could be waiting in + * mac_callback_walker_enter() until deleted items have been + * cleaned from the list. + */ + do_wake = B_TRUE; + } + + if (mcbi->mcbi_barrier_cnt != 0) { + /* + * One or more threads are waiting for all walkers to exit the + * callback list. Notify them, now that the list is clear. + */ + do_wake = B_TRUE; + } + + if (do_wake) { + cv_broadcast(&mcbi->mcbi_cv); + } + mutex_exit(mcbi->mcbi_lockp); +} + +static boolean_t mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem) { mac_cb_t *mcb; @@ -755,7 +871,7 @@ mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem) return (B_FALSE); } -boolean_t +static boolean_t mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem) { boolean_t found; @@ -780,40 +896,6 @@ mac_callback_free(mac_cb_t *rmlist) } } -/* - * The promisc callbacks are in 2 lists, one off the 'mip' and another off the - * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there - * is only a single shared total walker count, and an entry can't be physically - * unlinked if a walker is active on either list. The last walker does this - * cleanup of logically deleted entries. - */ -void -i_mac_promisc_walker_cleanup(mac_impl_t *mip) -{ - mac_cb_t *rmlist; - mac_cb_t *mcb; - mac_cb_t *mcb_next; - mac_promisc_impl_t *mpip; - - /* - * Construct a temporary list of deleted callbacks by walking the - * the mi_promisc_list. Then for each entry in the temporary list, - * remove it from the mci_promisc_list and free the entry. - */ - rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info, - &mip->mi_promisc_list); - - for (mcb = rmlist; mcb != NULL; mcb = mcb_next) { - mcb_next = mcb->mcb_nextp; - mpip = (mac_promisc_impl_t *)mcb->mcb_objp; - VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info, - &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link)); - mcb->mcb_flags = 0; - mcb->mcb_nextp = NULL; - kmem_cache_free(mac_promisc_impl_cache, mpip); - } -} - void i_mac_notify(mac_impl_t *mip, mac_notify_type_t type) { @@ -1115,9 +1197,10 @@ mac_start(mac_handle_t mh) if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) { /* - * Start the default ring, since it will be needed - * to receive broadcast and multicast traffic for - * both primary and non-primary MAC clients. + * Start the default group which is responsible + * for receiving broadcast and multicast + * traffic for both primary and non-primary + * MAC clients. */ ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED); err = mac_start_group_and_rings(defgrp); @@ -1456,7 +1539,7 @@ mac_rx_group_unmark(mac_group_t *grp, uint_t flag) * used by the aggr driver to access and control the underlying HW Rx group * and rings. 
In this case, the aggr driver has exclusive control of the * underlying HW Rx group/rings, it calls the following functions to - * start/stop the HW Rx rings, disable/enable polling, add/remove mac' + * start/stop the HW Rx rings, disable/enable polling, add/remove MAC * addresses, or set up the Rx callback. */ /* ARGSUSED */ @@ -1501,8 +1584,9 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, ASSERT(B_FALSE); return (-1); } + /* - * The mac client did not reserve any RX group, return directly. + * The MAC client did not reserve an Rx group, return directly. * This is probably because the underlying MAC does not support * any groups. */ @@ -1511,7 +1595,7 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, if (grp == NULL) return (0); /* - * This group must be reserved by this mac client. + * This group must be reserved by this MAC client. */ ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && (mcip == MAC_GROUP_ONLY_CLIENT(grp))); @@ -1527,6 +1611,77 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, } /* + * Get the HW ring handles of the given group index. If the MAC + * doesn't have a group at this index, or any groups at all, then 0 is + * returned and hwgh is set to NULL. This is a private client API. The + * MAC perimeter must be held when calling this function. + * + * mh: A handle to the MAC that owns the group. + * + * idx: The index of the HW group to be read. + * + * hwgh: If non-NULL, contains a handle to the HW group on return. + * + * hwrh: An array of ring handles pointing to the HW rings in the + * group. The array must be large enough to hold a handle to each ring + * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP. + * + * rtype: Used to determine if we are fetching Rx or Tx rings. + * + * Returns the number of rings in the group. + */ +uint_t +mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh, mac_ring_type_t rtype) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_group_t *grp; + mac_ring_t *ring; + uint_t cnt = 0; + + /* + * The MAC perimeter must be held when accessing the + * mi_{rx,tx}_groups fields. + */ + ASSERT(MAC_PERIM_HELD(mh)); + ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX); + + if (rtype == MAC_RING_TYPE_RX) { + grp = mip->mi_rx_groups; + } else if (rtype == MAC_RING_TYPE_TX) { + grp = mip->mi_tx_groups; + } + + while (grp != NULL && grp->mrg_index != idx) + grp = grp->mrg_next; + + /* + * If the MAC doesn't have a group at this index or doesn't + * impelement RINGS capab, then set hwgh to NULL and return 0. + */ + if (hwgh != NULL) + *hwgh = NULL; + + if (grp == NULL) + return (0); + + ASSERT3U(idx, ==, grp->mrg_index); + + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) { + ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP); + hwrh[cnt] = (mac_ring_handle_t)ring; + } + + /* A group should always have at least one ring. */ + ASSERT3U(cnt, >, 0); + + if (hwgh != NULL) + *hwgh = (mac_group_handle_t)grp; + + return (cnt); +} + +/* * This function is called to get info about Tx/Rx rings. * * Return value: returns uint_t which will have various bits set @@ -1542,6 +1697,69 @@ mac_hwring_getinfo(mac_ring_handle_t rh) } /* + * Set the passthru callback on the hardware ring. 
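A short usage sketch for mac_hwrings_idx_get() as documented above; mh (the MAC handle), the group index, and the lack of error handling are illustrative. The MAC perimeter is held across the call and the handle array is sized for the worst case:

	mac_perim_handle_t	mph;
	mac_group_handle_t	hwgh;
	mac_ring_handle_t	hwrh[MAX_RINGS_PER_GROUP];
	uint_t			nrings;

	mac_perim_enter_by_mh(mh, &mph);
	nrings = mac_hwrings_idx_get(mh, 0, &hwgh, hwrh, MAC_RING_TYPE_RX);
	/* hwrh[0 .. nrings - 1] now name the rings of HW Rx group 0. */
	mac_perim_exit(mph);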
+ */ +void +mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1, + mac_resource_handle_t arg2) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER; + + hwring->mr_pt_fn = fn; + hwring->mr_pt_arg1 = arg1; + hwring->mr_pt_arg2 = arg2; +} + +/* + * Clear the passthru callback on the hardware ring. + */ +void +mac_hwring_clear_passthru(mac_ring_handle_t hwrh) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_NO_CLASSIFIER; + + hwring->mr_pt_fn = NULL; + hwring->mr_pt_arg1 = NULL; + hwring->mr_pt_arg2 = NULL; +} + +void +mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)func; + flent->fe_cb_arg1 = arg1; + flent->fe_cb_arg2 = NULL; + flent->fe_flags &= ~FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +void +mac_client_clear_flow_cb(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + flent->fe_flags |= FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +/* * Export ddi interrupt handles from the HW ring to the pseudo ring and * setup the RX callback of the mac client which exclusively controls * HW ring. @@ -1613,17 +1831,56 @@ mac_hwring_enable_intr(mac_ring_handle_t rh) return (intr->mi_enable(intr->mi_handle)); } +/* + * Start the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ int mac_hwring_start(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; + int rv = 0; + + if (rr_ring->mr_state != MR_INUSE) + rv = mac_start_ring(rr_ring); + + return (rv); +} + +/* + * Stop the HW ring pointed to by rh. Also see mac_hwring_start(). + */ +void +mac_hwring_stop(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + + if (rr_ring->mr_state != MR_FREE) + mac_stop_ring(rr_ring); +} + +/* + * Remove the quiesced flag from the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ +int +mac_hwring_activate(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; MAC_RING_UNMARK(rr_ring, MR_QUIESCE); return (0); } +/* + * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate(). + */ void -mac_hwring_stop(mac_ring_handle_t rh) +mac_hwring_quiesce(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; @@ -1730,6 +1987,68 @@ mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr) } /* + * Program the group's HW VLAN filter if it has such support. + * Otherwise, the group will implicitly accept tagged traffic and + * there is nothing to do. 
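The passthru and ring start/stop hooks above exist for MAC-on-MAC clients such as aggr that own a hardware Rx ring exclusively. A lifecycle sketch under that assumption; my_rx_fn and my_arg are illustrative placeholders for the client's mac_rx_t callback and its argument:

	/* Bind the ring's receive path directly to the client, then start it. */
	mac_hwring_set_passthru(hwrh, my_rx_fn, my_arg, NULL);
	(void) mac_hwring_start(hwrh);

	/* ... later, when releasing the ring ... */
	mac_hwring_stop(hwrh);
	mac_hwring_clear_passthru(hwrh);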
+ */ +int +mac_hwgroup_addvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_addvlan(group, vid)); +} + +int +mac_hwgroup_remvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_remvlan(group, vid)); +} + +/* + * Determine if a MAC has HW VLAN support. This is a private API + * consumed by aggr. In the future it might be nice to have a bitfield + * in mac_capab_rings_t to track which forms of HW filtering are + * supported by the MAC. + */ +boolean_t +mac_has_hw_vlan(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (MAC_GROUP_HW_VLAN(mip->mi_rx_groups)); +} + +/* + * Get the number of Rx HW groups on this MAC. + */ +uint_t +mac_get_num_rx_groups(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (mip->mi_rx_group_count); +} + +int +mac_set_promisc(mac_handle_t mh, boolean_t value) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (i_mac_promisc_set(mip, value)); +} + +/* * Set the RX group to be shared/reserved. Note that the group must be * started/stopped outside of this function. */ @@ -2416,7 +2735,6 @@ mac_disable(mac_handle_t mh) /* * Called when the MAC instance has a non empty flow table, to de-multiplex * incoming packets to the right flow. - * The MAC's rw lock is assumed held as a READER. */ /* ARGSUSED */ static mblk_t * @@ -2426,19 +2744,6 @@ mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) uint_t flags = FLOW_INBOUND; int err; - /* - * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN - * to mac_flow_lookup() so that the VLAN packets can be successfully - * passed to the non-VLAN aggregation flows. - * - * Note that there is possibly a race between this and - * mac_unicast_remove/add() and VLAN packets could be incorrectly - * classified to non-VLAN flows of non-aggregation mac clients. These - * VLAN packets will be then filtered out by the mac module. - */ - if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) - flags |= FLOW_IGNORE_VLAN; - err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); if (err != 0) { /* no registered receive function */ @@ -3772,9 +4077,27 @@ mac_start_group_and_rings(mac_group_t *group) for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) goto error; - ring->mr_classify_type = MAC_SW_CLASSIFIER; + + /* + * When aggr_set_port_sdu() is called, it will remove + * the port client's unicast address. This will cause + * MAC to stop the default group's rings on the port + * MAC. After it modifies the SDU, it will then re-add + * the unicast address. At which time, this function is + * called to start the default group's rings. Normally + * this function would set the classify type to + * MAC_SW_CLASSIFIER; but that will break aggr which + * relies on the passthru classify mode being set for + * correct delivery (see mac_rx_common()). To avoid + * that, we check for a passthru callback and set the + * classify type to MAC_PASSTHRU_CLASSIFIER; as it was + * before the rings were stopped. + */ + ring->mr_classify_type = (ring->mr_pt_fn != NULL) ? 
+ MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER; } return (0); @@ -4077,12 +4400,15 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* - * Driver must register group->mgi_addmac/remmac() for rx groups - * to support multiple MAC addresses. + * The driver must register some form of hardware MAC + * filter in order for Rx groups to support multiple + * MAC addresses. */ if (rtype == MAC_RING_TYPE_RX && - ((group_info.mgi_addmac == NULL) || - (group_info.mgi_remmac == NULL))) { + (group_info.mgi_addmac == NULL || + group_info.mgi_remmac == NULL)) { + DTRACE_PROBE1(mac__init__rings__no__mac__filter, + char *, mip->mi_name); err = EINVAL; goto bail; } @@ -4129,8 +4455,9 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* Update this group's status */ mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED); - } else + } else { group->mrg_rings = NULL; + } ASSERT(ring_left == 0); @@ -4320,6 +4647,38 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) } /* + * Associate the VLAN filter to the receive group. + */ +int +mac_group_addvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_addvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* + * Dissociate the VLAN from the receive group. + */ +int +mac_group_remvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_remvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* * Associate a MAC address with a receive group. * * The return value of this function should always be checked properly, because @@ -4335,8 +4694,8 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) int mac_group_addmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_addmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addmac, !=, NULL); return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr)); } @@ -4347,8 +4706,8 @@ mac_group_addmac(mac_group_t *group, const uint8_t *addr) int mac_group_remmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_remmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remmac, !=, NULL); return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr)); } @@ -4523,28 +4882,20 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) switch (ring->mr_type) { case MAC_RING_TYPE_RX: /* - * Setup SRS on top of the new ring if the group is - * reserved for someones exclusive use. + * Setup an SRS on top of the new ring if the group is + * reserved for someone's exclusive use. */ if (group->mrg_state == MAC_GROUP_STATE_RESERVED) { - mac_client_impl_t *mcip; + mac_client_impl_t *mcip = MAC_GROUP_ONLY_CLIENT(group); - mcip = MAC_GROUP_ONLY_CLIENT(group); - /* - * Even though this group is reserved we migth still - * have multiple clients, i.e a VLAN shares the - * group with the primary mac client. 
- */ - if (mcip != NULL) { - flent = mcip->mci_flent; - ASSERT(flent->fe_rx_srs_cnt > 0); - mac_rx_srs_group_setup(mcip, flent, SRST_LINK); - mac_fanout_setup(mcip, flent, - MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, - mcip, NULL, NULL); - } else { - ring->mr_classify_type = MAC_SW_CLASSIFIER; - } + VERIFY3P(mcip, !=, NULL); + flent = mcip->mci_flent; + VERIFY3S(flent->fe_rx_srs_cnt, >, 0); + mac_rx_srs_group_setup(mcip, flent, SRST_LINK); + mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), + mac_rx_deliver, mcip, NULL, NULL); + } else { + ring->mr_classify_type = MAC_SW_CLASSIFIER; } break; case MAC_RING_TYPE_TX: @@ -4570,7 +4921,7 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) mcip = mgcp->mgc_client; flent = mcip->mci_flent; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR); + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); tx = &mac_srs->srs_tx; mac_tx_client_quiesce((mac_client_handle_t)mcip); @@ -4714,7 +5065,7 @@ i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring, mcip = MAC_GROUP_ONLY_CLIENT(group); ASSERT(mcip != NULL); - ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR); + ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR || mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR); @@ -4922,12 +5273,12 @@ mac_free_macaddr(mac_address_t *map) mac_impl_t *mip = map->ma_mip; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - ASSERT(mip->mi_addresses != NULL); - - map = mac_find_macaddr(mip, map->ma_addr); + VERIFY3P(mip->mi_addresses, !=, NULL); - ASSERT(map != NULL); - ASSERT(map->ma_nusers == 0); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); + VERIFY3P(map, !=, NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_vlans, ==, NULL); if (map == mip->mi_addresses) { mip->mi_addresses = map->ma_next; @@ -4943,85 +5294,201 @@ mac_free_macaddr(mac_address_t *map) kmem_free(map, sizeof (mac_address_t)); } +static mac_vlan_t * +mac_find_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) { + if (mvp->mv_vid == vid) + return (mvp); + } + + return (NULL); +} + +static mac_vlan_t * +mac_add_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + /* + * We should never add the same {addr, VID} tuple more + * than once, but let's be sure. + */ + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) + VERIFY3U(mvp->mv_vid, !=, vid); + + /* Add the VLAN to the head of the VLAN list. */ + mvp = kmem_zalloc(sizeof (mac_vlan_t), KM_SLEEP); + mvp->mv_vid = vid; + mvp->mv_next = map->ma_vlans; + map->ma_vlans = mvp; + + return (mvp); +} + +static void +mac_rem_vlan(mac_address_t *map, mac_vlan_t *mvp) +{ + mac_vlan_t *pre; + + if (map->ma_vlans == mvp) { + map->ma_vlans = mvp->mv_next; + } else { + pre = map->ma_vlans; + while (pre->mv_next != mvp) { + pre = pre->mv_next; + + /* + * We've reached the end of the list without + * finding mvp. + */ + VERIFY3P(pre, !=, NULL); + } + pre->mv_next = mvp->mv_next; + } + + kmem_free(mvp, sizeof (mac_vlan_t)); +} + /* - * Add a MAC address reference for a client. If the desired MAC address - * exists, add a reference to it. Otherwise, add the new address by adding - * it to a reserved group or setting promiscuous mode. Won't try different - * group is the group is non-NULL, so the caller must explictly share - * default group when needed. 
- * - * Note, the primary MAC address is initialized at registration time, so - * to add it to default group only need to activate it if its reference - * count is still zero. Also, some drivers may not have advertised RINGS - * capability. + * Create a new mac_address_t if this is the first use of the address + * or add a VID to an existing address. In either case, the + * mac_address_t acts as a list of {addr, VID} tuples where each tuple + * shares the same addr. If group is non-NULL then attempt to program + * the MAC's HW filters for this group. Otherwise, if group is NULL, + * then the MAC has no rings and there is nothing to program. */ int -mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, - boolean_t use_hw) +mac_add_macaddr_vlan(mac_impl_t *mip, mac_group_t *group, uint8_t *addr, + uint16_t vid, boolean_t use_hw) { - mac_address_t *map; - int err = 0; - boolean_t allocated_map = B_FALSE; + mac_address_t *map; + mac_vlan_t *mvp; + int err = 0; + boolean_t allocated_map = B_FALSE; + boolean_t hw_mac = B_FALSE; + boolean_t hw_vlan = B_FALSE; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - map = mac_find_macaddr(mip, mac_addr); + map = mac_find_macaddr(mip, addr); /* - * If the new MAC address has not been added. Allocate a new one - * and set it up. + * If this is the first use of this MAC address then allocate + * and initialize a new structure. */ if (map == NULL) { map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP); map->ma_len = mip->mi_type->mt_addr_length; - bcopy(mac_addr, map->ma_addr, map->ma_len); + bcopy(addr, map->ma_addr, map->ma_len); map->ma_nusers = 0; map->ma_group = group; map->ma_mip = mip; + map->ma_untagged = B_FALSE; - /* add the new MAC address to the head of the address list */ + /* Add the new MAC address to the head of the address list. */ map->ma_next = mip->mi_addresses; mip->mi_addresses = map; allocated_map = B_TRUE; } - ASSERT(map->ma_group == NULL || map->ma_group == group); + VERIFY(map->ma_group == NULL || map->ma_group == group); if (map->ma_group == NULL) map->ma_group = group; + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_TRUE; + mvp = NULL; + } else { + mvp = mac_add_vlan(map, vid); + } + + /* + * Set the VLAN HW filter if: + * + * o the MAC's VLAN HW filtering is enabled, and + * o the address does not currently rely on promisc mode. + * + * This is called even when the client specifies an untagged + * address (VLAN_ID_NONE) because some MAC providers require + * setting additional bits to accept untagged traffic when + * VLAN HW filtering is enabled. + */ + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type != MAC_ADDRESS_TYPE_UNICAST_PROMISC) { + if ((err = mac_group_addvlan(group, vid)) != 0) + goto bail; + + hw_vlan = B_TRUE; + } + + VERIFY3S(map->ma_nusers, >=, 0); + map->ma_nusers++; + /* - * If the MAC address is already in use, simply account for the - * new client. + * If this MAC address already has a HW filter then simply + * increment the counter. */ - if (map->ma_nusers++ > 0) + if (map->ma_nusers > 1) return (0); /* + * All logic from here on out is executed during initial + * creation only. + */ + VERIFY3S(map->ma_nusers, ==, 1); + + /* * Activate this MAC address by adding it to the reserved group. 
*/ if (group != NULL) { - err = mac_group_addmac(group, (const uint8_t *)mac_addr); - if (err == 0) { - map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; - return (0); + err = mac_group_addmac(group, (const uint8_t *)addr); + + /* + * If the driver is out of filters then we can + * continue and use promisc mode. For any other error, + * assume the driver is in a state where we can't + * program the filters or use promisc mode; so we must + * bail. + */ + if (err != 0 && err != ENOSPC) { + map->ma_nusers--; + goto bail; } + + hw_mac = (err == 0); + } + + if (hw_mac) { + map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; + return (0); } /* * The MAC address addition failed. If the client requires a - * hardware classified MAC address, fail the operation. + * hardware classified MAC address, fail the operation. This + * feature is only used by sun4v vsw. */ - if (use_hw) { + if (use_hw && !hw_mac) { err = ENOSPC; + map->ma_nusers--; goto bail; } /* - * Try promiscuous mode. - * - * For drivers that don't advertise RINGS capability, do - * nothing for the primary address. + * If we reach this point then either the MAC doesn't have + * RINGS capability or we are out of MAC address HW filters. + * In any case we must put the MAC into promiscuous mode. + */ + VERIFY(group == NULL || !hw_mac); + + /* + * The one exception is the primary address. A non-RINGS + * driver filters the primary address by default; promisc mode + * is not needed. */ if ((group == NULL) && (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) { @@ -5030,53 +5497,76 @@ mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, } /* - * Enable promiscuous mode in order to receive traffic - * to the new MAC address. + * Enable promiscuous mode in order to receive traffic to the + * new MAC address. All existing HW filters still send their + * traffic to their respective group/SRSes. But with promisc + * enabled all unknown traffic is delivered to the default + * group where it is SW classified via mac_rx_classify(). */ if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) { map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC; return (0); } - /* - * Free the MAC address that could not be added. Don't free - * a pre-existing address, it could have been the entry - * for the primary MAC address which was pre-allocated by - * mac_init_macaddr(), and which must remain on the list. - */ bail: - map->ma_nusers--; + if (hw_vlan) { + int err2 = mac_group_remvlan(group, vid); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from group" + " %d on MAC %s: %d.", vid, group->mrg_index, + mip->mi_name, err2); + } + } + + if (mvp != NULL) + mac_rem_vlan(map, mvp); + if (allocated_map) mac_free_macaddr(map); + return (err); } -/* - * Remove a reference to a MAC address. This may cause to remove the MAC - * address from an associated group or to turn off promiscuous mode. - * The caller needs to handle the failure properly. 
- */ int -mac_remove_macaddr(mac_address_t *map) +mac_remove_macaddr_vlan(mac_address_t *map, uint16_t vid) { - mac_impl_t *mip = map->ma_mip; - int err = 0; + mac_vlan_t *mvp; + mac_impl_t *mip = map->ma_mip; + mac_group_t *group = map->ma_group; + int err = 0; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); - ASSERT(map == mac_find_macaddr(mip, map->ma_addr)); + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_FALSE; + mvp = NULL; + } else { + mvp = mac_find_vlan(map, vid); + VERIFY3P(mvp, !=, NULL); + } + + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED && + ((err = mac_group_remvlan(group, vid)) != 0)) + return (err); + + if (mvp != NULL) + mac_rem_vlan(map, mvp); /* * If it's not the last client using this MAC address, only update * the MAC clients count. */ - if (--map->ma_nusers > 0) + map->ma_nusers--; + if (map->ma_nusers > 0) return (0); /* - * The MAC address is no longer used by any MAC client, so remove - * it from its associated group, or turn off promiscuous mode - * if it was enabled for the MAC address. + * The MAC address is no longer used by any MAC client, so + * remove it from its associated group. Turn off promiscuous + * mode if this is the last address relying on it. */ switch (map->ma_type) { case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED: @@ -5084,18 +5574,44 @@ mac_remove_macaddr(mac_address_t *map) * Don't free the preset primary address for drivers that * don't advertise RINGS capability. */ - if (map->ma_group == NULL) + if (group == NULL) return (0); - err = mac_group_remmac(map->ma_group, map->ma_addr); - if (err == 0) - map->ma_group = NULL; + if ((err = mac_group_remmac(group, map->ma_addr)) != 0) { + if (vid == VLAN_ID_NONE) + map->ma_untagged = B_TRUE; + else + (void) mac_add_vlan(map, vid); + + /* + * If we fail to remove the MAC address HW + * filter but then also fail to re-add the + * VLAN HW filter then we are in a busted + * state and should just crash. + */ + if (MAC_GROUP_HW_VLAN(group)) { + int err2; + + err2 = mac_group_addvlan(group, vid); + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to readd VLAN" + " %u to group %d on MAC %s: %d.", + vid, group->mrg_index, mip->mi_name, + err2); + } + } + + return (err); + } + + map->ma_group = NULL; break; case MAC_ADDRESS_TYPE_UNICAST_PROMISC: err = i_mac_promisc_set(mip, B_FALSE); break; default: - ASSERT(B_FALSE); + panic("Unexpected ma_type 0x%x, file: %s, line %d", + map->ma_type, __FILE__, __LINE__); } if (err != 0) @@ -5252,8 +5768,9 @@ mac_fini_macaddr(mac_impl_t *mip) * If mi_addresses is initialized, there should be exactly one * entry left on the list with no users. */ - ASSERT(map->ma_nusers == 0); - ASSERT(map->ma_next == NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_next, ==, NULL); + VERIFY3P(map->ma_vlans, ==, NULL); kmem_free(map, sizeof (mac_address_t)); mip->mi_addresses = NULL; @@ -5815,7 +6332,7 @@ mac_stop_logusage(mac_logtype_t type) mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate); (void) untimeout(mac_logging_timer); - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Write log entries for each mac_impl in the list */ i_mac_log_info(&net_log_list, &lstate); @@ -5933,7 +6450,7 @@ mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring) } /* - * For a reserved group with multiple clients, return the primary client. + * For a non-default group with multiple clients, return the primary client. 
*/ static mac_client_impl_t * mac_get_grp_primary(mac_group_t *grp) @@ -6292,13 +6809,12 @@ mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip) break; } - VERIFY(mgcp == NULL); + ASSERT(mgcp == NULL); mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP); mgcp->mgc_client = mcip; mgcp->mgc_next = grp->mrg_clients; grp->mrg_clients = mgcp; - } void @@ -6319,8 +6835,27 @@ mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip) } /* - * mac_reserve_rx_group() - * + * Return true if any client on this group explicitly asked for HW + * rings (of type mask) or have a bound share. + */ +static boolean_t +i_mac_clients_hw(mac_group_t *grp, uint32_t mask) +{ + mac_grp_client_t *mgcip; + mac_client_impl_t *mcip; + mac_resource_props_t *mrp; + + for (mgcip = grp->mrg_clients; mgcip != NULL; mgcip = mgcip->mgc_next) { + mcip = mgcip->mgc_client; + mrp = MCIP_RESOURCE_PROPS(mcip); + if (mcip->mci_share != NULL || (mrp->mrp_mask & mask) != 0) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Finds an available group and exclusively reserves it for a client. * The group is chosen to suit the flow's resource controls (bandwidth and * fanout requirements) and the address type. @@ -6343,7 +6878,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) int need_rings = 0; mac_group_t *candidate_grp = NULL; mac_client_impl_t *gclient; - mac_resource_props_t *gmrp; mac_group_t *donorgrp = NULL; boolean_t rxhw = mrp->mrp_mask & MRP_RX_RINGS; boolean_t unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC; @@ -6354,18 +6888,20 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; /* - * Check if a group already has this mac address (case of VLANs) + * Check if a group already has this MAC address (case of VLANs) * unless we are moving this MAC client from one group to another. */ if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) { if (map->ma_group != NULL) return (map->ma_group); } + if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0) return (NULL); + /* - * If exclusive open, return NULL which will enable the - * caller to use the default group. + * If this client is requesting exclusive MAC access then + * return NULL to ensure the client uses the default group. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) return (NULL); @@ -6375,6 +6911,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) { mrp->mrp_nrxrings = 1; } + /* * For static grouping we allow only specifying rings=0 and * unspecified @@ -6383,6 +6920,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) { return (NULL); } + if (rxhw) { /* * We have explicitly asked for a group (with nrxrings, @@ -6444,25 +6982,19 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) * that didn't ask for an exclusive group, but got * one and it has enough rings (combined with what * the donor group can donate) for the new MAC - * client + * client. */ if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) { /* - * If the primary/donor group is not the default - * group, don't bother looking for a candidate group. - * If we don't have enough rings we will check - * if the primary group can be vacated. + * If the donor group is not the default + * group, don't bother looking for a candidate + * group. 
If we don't have enough rings we + * will check if the primary group can be + * vacated. */ if (candidate_grp == NULL && donorgrp == MAC_DEFAULT_RX_GROUP(mip)) { - ASSERT(!MAC_GROUP_NO_CLIENT(grp)); - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); - ASSERT(gclient != NULL); - gmrp = MCIP_RESOURCE_PROPS(gclient); - if (gclient->mci_share == 0 && - (gmrp->mrp_mask & MRP_RX_RINGS) == 0 && + if (!i_mac_clients_hw(grp, MRP_RX_RINGS) && (unspec || (grp->mrg_cur_count + donor_grp_rcnt >= need_rings))) { @@ -6528,6 +7060,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) */ mac_stop_group(grp); } + /* We didn't find an exclusive group for this MAC client */ if (i >= mip->mi_rx_group_count) { @@ -6535,12 +7068,12 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) return (NULL); /* - * If we found a candidate group then we switch the - * MAC client from the candidate_group to the default - * group and give the group to this MAC client. If - * we didn't find a candidate_group, check if the - * primary is in its own group and if it can make way - * for this MAC client. + * If we found a candidate group then move the + * existing MAC client from the candidate_group to the + * default group and give the candidate_group to the + * new MAC client. If we didn't find a candidate + * group, then check if the primary is in its own + * group and if it can make way for this MAC client. */ if (candidate_grp == NULL && donorgrp != MAC_DEFAULT_RX_GROUP(mip) && @@ -6551,15 +7084,15 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) boolean_t prim_grp = B_FALSE; /* - * Switch the MAC client from the candidate group - * to the default group.. If this group was the - * donor group, then after the switch we need - * to update the donor group too. + * Switch the existing MAC client from the + * candidate group to the default group. If + * the candidate group is the donor group, + * then after the switch we need to update the + * donor group too. */ grp = candidate_grp; - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + gclient = grp->mrg_clients->mgc_client; + VERIFY3P(gclient, !=, NULL); if (grp == mip->mi_rx_donor_grp) prim_grp = B_TRUE; if (mac_rx_switch_group(gclient, grp, @@ -6572,7 +7105,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) donorgrp = MAC_DEFAULT_RX_GROUP(mip); } - /* * Now give this group with the required rings * to this MAC client. @@ -6620,10 +7152,10 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) /* * mac_rx_release_group() * - * This is called when there are no clients left for the group. - * The group is stopped and marked MAC_GROUP_STATE_REGISTERED, - * and if it is a non default group, the shares are removed and - * all rings are assigned back to default group. + * Release the group when it has no remaining clients. The group is + * stopped and its shares are removed and all rings are assigned back + * to default group. This should never be called against the default + * group. 
*/ void mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) @@ -6632,6 +7164,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) mac_ring_t *ring; ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + ASSERT(MAC_GROUP_NO_CLIENT(group) == B_TRUE); if (mip->mi_rx_donor_grp == group) mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip); @@ -6683,56 +7216,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) } /* - * When we move the primary's mac address between groups, we need to also - * take all the clients sharing the same mac address along with it (VLANs) - * We remove the mac address for such clients from the group after quiescing - * them. When we add the mac address we restart the client. Note that - * the primary's mac address is removed from the group after all the - * other clients sharing the address are removed. Similarly, the primary's - * mac address is added before all the other client's mac address are - * added. While grp is the group where the clients reside, tgrp is - * the group where the addresses have to be added. - */ -static void -mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp, - mac_group_t *tgrp, uint8_t *maddr, boolean_t add) -{ - mac_impl_t *mip = mcip->mci_mip; - mac_grp_client_t *mgcp = grp->mrg_clients; - mac_client_impl_t *gmcip; - boolean_t prim; - - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - - /* - * If the clients are in a non-default group, we just have to - * walk the group's client list. If it is in the default group - * (which will be shared by other clients as well, we need to - * check if the unicast address matches mcip's unicast. - */ - while (mgcp != NULL) { - gmcip = mgcp->mgc_client; - if (gmcip != mcip && - (grp != MAC_DEFAULT_RX_GROUP(mip) || - mcip->mci_unicast == gmcip->mci_unicast)) { - if (!add) { - mac_rx_client_quiesce( - (mac_client_handle_t)gmcip); - (void) mac_remove_macaddr(mcip->mci_unicast); - } else { - (void) mac_add_macaddr(mip, tgrp, maddr, prim); - mac_rx_client_restart( - (mac_client_handle_t)gmcip); - } - } - mgcp = mgcp->mgc_next; - } -} - - -/* - * Move the MAC address from fgrp to tgrp. If this is the primary client, - * we need to take any VLANs etc. together too. + * Move the MAC address from fgrp to tgrp. */ static int mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, @@ -6741,56 +7225,86 @@ mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; uint8_t maddr[MAXMACADDRLEN]; int err = 0; - boolean_t prim; - boolean_t multiclnt = B_FALSE; + uint16_t vid; + mac_unicast_impl_t *muip; + boolean_t use_hw; mac_rx_client_quiesce((mac_client_handle_t)mcip); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len); - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - if (mcip->mci_unicast->ma_nusers > 1) { - mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE); - multiclnt = B_TRUE; - } - ASSERT(mcip->mci_unicast->ma_nusers == 1); - err = mac_remove_macaddr(mcip->mci_unicast); + /* + * Does the client require MAC address hardware classification? + */ + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + vid = i_mac_flow_vid(mcip->mci_flent); + + /* + * You can never move an address that is shared by multiple + * clients. mac_datapath_setup() ensures that clients sharing + * an address are placed on the default group.
This guarantees + * that a non-default group will only ever have one client and + * thus make full use of HW filters. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * If this isn't the primary MAC address then the + * mac_address_t has been freed by the last call to + * mac_remove_macaddr_vlan(). In any case, NULL the reference + * to avoid a dangling pointer. + */ + mcip->mci_unicast = NULL; + + /* + * We also have to NULL all the mui_map references -- sun4v + * strikes again! + */ + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = NULL; + rw_exit(&mcip->mci_rw_lock); + /* - * Program the H/W Classifier first, if this fails we need - * not proceed with the other stuff. + * Program the H/W Classifier first, if this fails we need not + * proceed with the other stuff. */ - if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) { + if ((err = mac_add_macaddr_vlan(mip, tgrp, maddr, vid, use_hw)) != 0) { + int err2; + /* Revert back the H/W Classifier */ - if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) { - /* - * This should not fail now since it worked earlier, - * should we panic? - */ - cmn_err(CE_WARN, - "mac_rx_switch_group: switching %p back" - " to group %p failed!!", (void *)mcip, - (void *)fgrp); + err2 = mac_add_macaddr_vlan(mip, fgrp, maddr, vid, use_hw); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to revert HW classification" + " on MAC %s, for client %s: %d.", mip->mi_name, + mcip->mci_name, err2); } + mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * Get a reference to the new mac_address_t and update the + * client's reference. Then restart the client and add the + * other clients of this MAC addr (if they exist). + */ mcip->mci_unicast = mac_find_macaddr(mip, maddr); + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = mcip->mci_unicast; + rw_exit(&mcip->mci_rw_lock); mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) - mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE); - return (err); + return (0); } /* @@ -6811,19 +7325,34 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; mac_grp_client_t *mgcp; - ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group); + VERIFY3P(fgrp, ==, mcip->mci_flent->fe_rx_ring_group); if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0) return (err); /* - * The group might be reserved, but SRSs may not be set up, e.g. - * primary and its vlans using a reserved group. + * If the group is marked as reserved and in use by a single + * client, then there is an SRS to teardown. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED && MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) { mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE); } + + /* + * If we are moving the client from a non-default group, then + * we know that any additional clients on this group share the + * same MAC address. Since we moved the MAC address filter, we + * need to move these clients too.
+ * + * If we are moving the client from the default group and its + * MAC address has VLAN clients, then we must move those + * clients as well. + * + * In both cases the idea is the same: we moved the MAC + * address filter to the tgrp, so we must move all clients + * using that MAC address to tgrp as well. + */ if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6834,20 +7363,21 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, gmcip->mci_flent->fe_rx_ring_group = tgrp; } mac_release_rx_group(mcip, fgrp); - ASSERT(MAC_GROUP_NO_CLIENT(fgrp)); + VERIFY3B(MAC_GROUP_NO_CLIENT(fgrp), ==, B_TRUE); mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED); } else { mac_group_remove_client(fgrp, mcip); mac_group_add_client(tgrp, mcip); mcip->mci_flent->fe_rx_ring_group = tgrp; + /* * If there are other clients (VLANs) sharing this address - * we should be here only for the primary. + * then move them too. */ - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { /* * We need to move all the clients that are using - * this h/w address. + * this MAC address. */ mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6861,20 +7391,24 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } } } + /* - * The default group will still take the multicast, - * broadcast traffic etc., so it won't go to + * The default group still handles multicast and + * broadcast traffic; it won't transition to * MAC_GROUP_STATE_REGISTERED. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED) mac_rx_group_unmark(fgrp, MR_CONDEMNED); mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED); } + next_state = mac_group_next_state(tgrp, &group_only_mcip, MAC_DEFAULT_RX_GROUP(mip), B_TRUE); mac_set_group_state(tgrp, next_state); + /* - * If the destination group is reserved, setup the SRSs etc. + * If the destination group is reserved, then setup the SRSes. + * Otherwise make sure to use SW classification. */ if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) { mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK); @@ -6885,6 +7419,7 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } else { mac_rx_switch_grp_to_sw(tgrp); } + return (0); } @@ -6915,6 +7450,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) boolean_t isprimary; isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; + /* * When we come here for a VLAN on the primary (dladm create-vlan), * we need to pair it along with the primary (to keep it consistent @@ -6996,8 +7532,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) if (grp->mrg_state == MAC_GROUP_STATE_RESERVED && candidate_grp == NULL) { gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); gmrp = MCIP_RESOURCE_PROPS(gclient); if (gclient->mci_share == 0 && (gmrp->mrp_mask & MRP_TX_RINGS) == 0 && @@ -7034,13 +7569,14 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) */ if (need_exclgrp && candidate_grp != NULL) { /* - * Switch the MAC client from the candidate group - * to the default group. + * Switch the MAC client from the candidate + * group to the default group. We know the + * candidate_grp came from a reserved group + * and thus only has one client. 
*/ grp = candidate_grp; gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); mac_tx_client_quiesce((mac_client_handle_t)gclient); mac_tx_switch_group(gclient, grp, defgrp); mac_tx_client_restart((mac_client_handle_t)gclient); @@ -7208,7 +7744,7 @@ mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, */ mac_group_remove_client(fgrp, mcip); mac_tx_dismantle_soft_rings(fgrp, flent); - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { gmcip = mgcp->mgc_client; @@ -7454,7 +7990,7 @@ mac_no_active(mac_handle_t mh) * changes and update the mac_resource_props_t for the VLAN's client. * We need to do this since we don't support setting these properties * on the primary's VLAN clients, but the VLAN clients have to - * follow the primary w.r.t the rings property; + * follow the primary w.r.t the rings property. */ void mac_set_prim_vlan_rings(mac_impl_t *mip, mac_resource_props_t *mrp) @@ -7603,13 +8139,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - ASSERT(mcip != NULL); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_RX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_RX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7624,12 +8157,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_TX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_TX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7899,10 +8430,10 @@ mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg) * Set effective rings property. This could be called from datapath_setup/ * datapath_teardown or set-linkprop. * If the group is reserved we just go ahead and set the effective rings. - * Additionally, for TX this could mean the default group has lost/gained + * Additionally, for TX this could mean the default group has lost/gained * some rings, so if the default group is reserved, we need to adjust the * effective rings for the default group clients. For RX, if we are working - * with the non-default group, we just need * to reset the effective props + * with the non-default group, we just need to reset the effective props * for the default group clients. */ void @@ -8032,6 +8563,7 @@ mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw) * the first non-primary. */ ASSERT(mip->mi_nactiveclients == 2); + /* * OK, now we have the primary that needs to be relocated. */ diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 1ff33c3578..3b674be1d0 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/types.h> @@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((src_mcip != NULL) && - (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) - break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 66bba78e91..50316bb81e 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -114,6 +114,7 @@ #include <sys/stream.h> #include <sys/strsun.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/modhash.h> #include <sys/mac_impl.h> @@ -865,9 +866,12 @@ mac_unicast_update_client_flow(mac_client_impl_t *mcip) mac_protect_update_mac_token(mcip); /* - * A MAC client could have one MAC address but multiple - * VLANs. In that case update the flow entries corresponding - * to all VLANs of the MAC client. + * When there are multiple VLANs sharing the same MAC address, + * each gets its own MAC client, except when running on sun4v + * vsw. In that case the mci_flent_list is used to place + * multiple VLAN flows on one MAC client. If we ever get rid + * of vsw then this code can go, but until then we need to + * update all flow entries. */ for (flent = mcip->mci_flent_list; flent != NULL; flent = flent->fe_client_next) { @@ -1025,7 +1029,7 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) return (0); } - if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) { + if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) { i_mac_perim_exit(mip); return (EBUSY); } @@ -1040,9 +1044,9 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) mac_capab_aggr_t aggr_cap; /* - * If the mac is an aggregation, other than the unicast + * If the MAC is an aggregation, other than the unicast * addresses programming, aggr must be informed about this - * primary unicst address change to change its mac address + * primary unicst address change to change its MAC address * policy to be user-specified. 
*/ ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED); @@ -1353,7 +1357,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_fn = mac_rx_def; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1374,7 +1378,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_state_flags |= MCIS_IS_AGGR_PORT; if (mip->mi_state_flags & MIS_IS_AGGR) - mcip->mci_state_flags |= MCIS_IS_AGGR; + mcip->mci_state_flags |= MCIS_IS_AGGR_CLIENT; if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) { datalink_id_t linkid; @@ -1433,6 +1437,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_flent = flent; FLOW_MARK(flent, FE_MC_NO_DATAPATH); flent->fe_mcip = mcip; + /* * Place initial creation reference on the flow. This reference * is released in the corresponding delete action viz. @@ -1539,7 +1544,8 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags) } /* - * Set the rx bypass receive callback. + * Set the Rx bypass receive callback and return B_TRUE. Return + * B_FALSE if it's not possible to enable bypass. */ boolean_t mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) @@ -1550,11 +1556,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * If the mac_client is a VLAN, we should not do DLS bypass and - * instead let the packets come up via mac_rx_deliver so the vlan - * header can be stripped. + * If the client has more than one VLAN then process packets + * through DLS. This should happen only when sun4v vsw is on + * the scene. */ - if (mcip->mci_nvids > 0) + if (mcip->mci_nvids > 1) return (B_FALSE); /* @@ -1608,8 +1614,8 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) i_mac_perim_exit(mip); /* - * If we're changing the rx function on the primary mac of a vnic, - * make sure any secondary macs on the vnic are updated as well. + * If we're changing the Rx function on the primary MAC of a VNIC, + * make sure any secondary addresses on the VNIC are updated as well. */ if (umip != NULL) { ASSERT((umip->mi_state_flags & MIS_IS_VNIC) != 0); @@ -1623,7 +1629,33 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_pkt_drop, NULL); + mac_rx_set(mch, mac_rx_def, NULL); +} + +void +mac_rx_barrier(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + i_mac_perim_enter(mip); + + /* If a RX callback is set, quiesce and restart that datapath */ + if (mcip->mci_rx_fn != mac_rx_def) { + mac_rx_client_quiesce(mch); + mac_rx_client_restart(mch); + } + + /* If any promisc callbacks are registered, perform a barrier there */ + if (mcip->mci_promisc_list != NULL || mip->mi_promisc_list != NULL) { + mac_cb_info_t *mcbi = &mip->mi_promisc_cb_info; + + mutex_enter(mcbi->mcbi_lockp); + mac_callback_barrier(mcbi); + mutex_exit(mcbi->mcbi_lockp); + } + + i_mac_perim_exit(mip); } void @@ -1787,6 +1819,14 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } /* Let check if we can give this an excl group */ } else if (group == defgrp) { + /* + * If multiple clients share an + * address then they must stay on the + * default group. 
+ */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (0); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); /* Couldn't give it a group, that's fine */ @@ -1809,6 +1849,16 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } if (group == defgrp && ((mrp->mrp_nrxrings > 0) || unspec)) { + /* + * We are requesting Rx rings. Try to reserve + * a non-default group. + * + * If multiple clients share an address then + * they must stay on the default group. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); if (ngrp == NULL) return (ENOSPC); @@ -2166,10 +2216,10 @@ mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr, flent_flags = FLOW_VNIC_MAC; /* - * For the first flow we use the mac client's name - mci_name, for - * subsequent ones we just create a name with the vid. This is + * For the first flow we use the MAC client's name - mci_name, for + * subsequent ones we just create a name with the VID. This is * so that we can add these flows to the same flow table. This is - * fine as the flow name (except for the one with the mac client's + * fine as the flow name (except for the one with the MAC client's * name) is not visible. When the first flow is removed, we just replace * its fdesc with another from the list, so we will still retain the * flent with the MAC client's flow name. @@ -2327,6 +2377,7 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, * The unicast MAC address must have been added successfully. */ ASSERT(mcip->mci_unicast != NULL); + /* * Push down the sub-flows that were defined on this link * hitherto. The flows are added to the active flow table @@ -2338,15 +2389,23 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, ASSERT(!no_unicast); /* - * A unicast flow already exists for that MAC client, - * this flow must be the same mac address but with - * different VID. It has been checked by mac_addr_in_use(). + * A unicast flow already exists for that MAC client + * so this flow must be the same MAC address but with + * a different VID. It has been checked by + * mac_addr_in_use(). * - * We will use the SRS etc. from the mci_flent. Note that - * We don't need to create kstat for this as except for - * the fdesc, everything will be used from in the 1st flent. + * We will use the SRS etc. from the initial + * mci_flent. We don't need to create a kstat for + * this, as except for the fdesc, everything will be + * used from the first flent. + * + * The only time we should see multiple flents on the + * same MAC client is on the sun4v vsw. If we removed + * that code we should be able to remove the entire + * notion of multiple flents on a MAC client (this + * doesn't affect sub/user flows because they have + * their own list unrelated to mci_flent_list). */ - if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) { err = EINVAL; goto bail; @@ -2406,7 +2465,17 @@ done_setup: if (flent->fe_rx_ring_group != NULL) mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); FLOW_UNMARK(flent, FE_INCIPIENT); - FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + + /* + * If this is an aggr port client, don't enable the flow's + * datapath at this stage. Otherwise, bcast traffic could + * arrive while the aggr port is in the process of + * initializing. Instead, the flow's datapath is started later + * when mac_client_set_flow_cb() is called. 
+ */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); return (0); bail: @@ -2475,8 +2544,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, boolean_t is_vnic_primary = (flags & MAC_UNICAST_VNIC_PRIMARY); - /* when VID is non-zero, the underlying MAC can not be VNIC */ - ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0))); + /* + * When the VID is non-zero the underlying MAC cannot be a + * VNIC. I.e., dladm create-vlan cannot take a VNIC as + * argument, only the primary MAC client. + */ + ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != VLAN_ID_NONE))); /* * Can't unicast add if the client asked only for minimal datapath @@ -2489,18 +2562,19 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, * Check for an attempted use of the current Port VLAN ID, if enabled. * No client may use it. */ - if (mip->mi_pvid != 0 && vid == mip->mi_pvid) + if (mip->mi_pvid != VLAN_ID_NONE && vid == mip->mi_pvid) return (EBUSY); /* * Check whether it's the primary client and flag it. */ - if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0) + if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && + vid == VLAN_ID_NONE) mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY; /* * is_vnic_primary is true when we come here as a VLAN VNIC - * which uses the primary mac client's address but with a non-zero + * which uses the primary MAC client's address but with a non-zero * VID. In this case the MAC address is not specified by an upper * MAC client. */ @@ -2552,7 +2626,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Create a handle for vid 0. */ - ASSERT(vid == 0); + ASSERT(vid == VLAN_ID_NONE); muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); muip->mui_vid = vid; *mah = (mac_unicast_handle_t)muip; @@ -2572,7 +2646,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, } /* - * If this is a VNIC/VLAN, disable softmac fast-path. + * If this is a VNIC/VLAN, disable softmac fast-path. This is + * only relevant to legacy devices which use softmac to + * interface with GLDv3. */ if (mcip->mci_state_flags & MCIS_IS_VNIC) { err = mac_fastpath_disable((mac_handle_t)mip); @@ -2620,9 +2696,11 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, (void) mac_client_set_resources(mch, mrp); } else if (mcip->mci_state_flags & MCIS_IS_VNIC) { /* - * This is a primary VLAN client, we don't support - * specifying rings property for this as it inherits the - * rings property from its MAC. + * This is a VLAN client sharing the address of the + * primary MAC client; i.e., one created via dladm + * create-vlan. We don't support specifying ring + * properties for this type of client as it inherits + * these from the primary MAC client. */ if (is_vnic_primary) { mac_resource_props_t *vmrp; @@ -2681,7 +2759,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Set the flags here so that if this is a passive client, we - * can return and set it when we call mac_client_datapath_setup + * can return and set it when we call mac_client_datapath_setup * when this becomes the active client. 
If we defer to using these * flags to mac_client_datapath_setup, then for a passive client, * we'd have to store the flags somewhere (probably fe_flags) @@ -2918,7 +2996,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -2984,14 +3062,14 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_enter(mip); if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) { /* - * Called made by the upper MAC client of a VNIC. + * Call made by the upper MAC client of a VNIC. * There's nothing much to do, the unicast address will * be removed by the VNIC driver when the VNIC is deleted, * but let's ensure that all our transmit is done before * the client does a mac_client_stop lest it trigger an * assert in the driver. */ - ASSERT(muip->mui_vid == 0); + ASSERT(muip->mui_vid == VLAN_ID_NONE); mac_tx_client_flush(mcip); @@ -3055,6 +3133,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_exit(mip); return (0); } + /* * Remove the VID from the list of client's VIDs. */ @@ -3081,7 +3160,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) * flows. */ flent = mac_client_get_flow(mcip, muip); - ASSERT(flent != NULL); + VERIFY3P(flent, !=, NULL); /* * The first one is disappearing, need to make sure @@ -3109,6 +3188,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) FLOW_FINAL_REFRELE(flent); ASSERT(!(mcip->mci_state_flags & MCIS_EXCLUSIVE)); + /* * Enable fastpath if this is a VNIC or a VLAN. */ @@ -3122,7 +3202,8 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) mui_vid = muip->mui_vid; mac_client_datapath_teardown(mch, muip, flent); - if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && mui_vid == 0) { + if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && + mui_vid == VLAN_ID_NONE) { mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; } else { i_mac_perim_exit(mip); @@ -3264,6 +3345,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mac_cb_info_t *mcbi; int rc; + if ((flags & MAC_PROMISC_FLAGS_NO_COPY) && + (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) { + return (EINVAL); + } + i_mac_perim_enter(mip); if ((rc = mac_start((mac_handle_t)mip)) != 0) { @@ -3310,6 +3396,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_strip_vlan_tag = ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0); + mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -3530,6 +3617,13 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); + /* + * There's a chance this primary client might be part + * of a bridge and the packet forwarded to a local + * receiver -- mark the packet accordingly. + */ + DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC; + MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); if (mp_chain == NULL) { cookie = 0; @@ -3943,33 +4037,63 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). 
*/ - static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, boolean_t loopback) { - mblk_t *mp_copy, *mp_next; + mblk_t *mp_next; + boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0; + + if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || + (mpip->mpi_do_fixups && local)) { + mblk_t *mp_copy; - if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { mp_copy = copymsg(mp); if (mp_copy == NULL) return; + /* + * The consumer has requested we emulate HW offloads + * for host-local packets. + */ + if (mpip->mpi_do_fixups && local) { + /* + * Remember that copymsg() doesn't copy + * b_next, so we are only passing a single + * packet to mac_hw_emul(). Also keep in mind + * that mp_copy will become an mblk chain if + * the argument is an LSO message. + */ + mac_hw_emul(&mp_copy, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp_copy == NULL) + return; + } + if (mpip->mpi_strip_vlan_tag) { mp_copy = mac_strip_vlan_tag_chain(mp_copy); if (mp_copy == NULL) return; } - mp_next = NULL; - } else { - mp_copy = mp; - mp_next = mp->b_next; + + /* + * There is code upstack that can't deal with message + * chains. + */ + for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { + mp_next = tmp->b_next; + tmp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); + } + + return; } - mp_copy->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); - if (mp_copy == mp) - mp->b_next = mp_next; + mp_next = mp->b_next; + mp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); + mp->b_next = mp_next; } /* @@ -4051,8 +4175,9 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) + is_mcast) { mac_promisc_dispatch_one(mpip, mp, is_sender); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4152,16 +4277,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. */ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -4180,8 +4304,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t *)mh; /* - * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, - * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. + * Some capabilities are restricted when there are more than one active + * clients on the MAC resource. The ones noted below are safe, + * independent of that count. 
*/ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4189,6 +4314,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: @@ -4340,7 +4466,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 81278cfdee..3697d888e7 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -604,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. */ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -611,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -638,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? */ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified @@ -1186,7 +1197,7 @@ mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs) mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **) kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP, KM_SLEEP); - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; tx->st_soft_rings = (mac_soft_ring_t **) @@ -1595,13 +1606,13 @@ mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp) /* * When the first sub-flow is added to a link, we disable polling on the - * link and also modify the entry point to mac_rx_srs_subflow_process. + * link and also modify the entry point to mac_rx_srs_subflow_process(). * (polling is disabled because with the subflow added, accounting * for polling needs additional logic, it is assumed that when a subflow is * added, we can take some hit as a result of disabling polling rather than * adding more complexity - if this becomes a perf. issue we need to * re-rvaluate this logic). When the last subflow is removed, we turn back - * polling and also reset the entry point to mac_rx_srs_process. + * polling and also reset the entry point to mac_rx_srs_process(). 
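The comment above describes how MAC toggles polling and the SRS entry point as subflows come and go: adding the first subflow disables polling and routes receive processing through mac_rx_srs_subflow_process(), and removing the last subflow restores polling and mac_rx_srs_process(). The sketch below is a simplified, illustrative userland model of that toggle, not the illumos implementation; the fake_srs type and the two entry functions are hypothetical stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, stripped-down stand-ins for the flow entry and SRS. */
typedef void (*rx_entry_fn_t)(const char *pkt);

struct fake_srs {
	bool		polling_enabled;
	rx_entry_fn_t	entry_point;
	unsigned	subflow_cnt;
};

static void rx_srs_process(const char *pkt)         { printf("fast path: %s\n", pkt); }
static void rx_srs_subflow_process(const char *pkt) { printf("subflow path: %s\n", pkt); }

/* First subflow added: disable polling, take the subflow entry point. */
static void
subflow_add(struct fake_srs *srs)
{
	if (srs->subflow_cnt++ == 0) {
		srs->polling_enabled = false;
		srs->entry_point = rx_srs_subflow_process;
	}
}

/* Last subflow removed: re-enable polling, restore the direct entry point. */
static void
subflow_remove(struct fake_srs *srs)
{
	if (--srs->subflow_cnt == 0) {
		srs->polling_enabled = true;
		srs->entry_point = rx_srs_process;
	}
}

int
main(void)
{
	struct fake_srs srs = { true, rx_srs_process, 0 };

	srs.entry_point("pkt0");
	subflow_add(&srs);
	srs.entry_point("pkt1");
	subflow_remove(&srs);
	srs.entry_point("pkt2");
	return (0);
}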
* * In the future if there are multiple SRS, we can simply * take one and give it to the flow rather than disabling polling and @@ -1646,7 +1657,7 @@ mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable) * Change the S/W classifier so that we can land in the * correct processing function with correct argument. * If all subflows have been removed we can revert to - * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process. + * mac_rx_srs_process(), else we need mac_rx_srs_subflow_process(). */ mutex_enter(&flent->fe_lock); flent->fe_cb_fn = (flow_fn_t)rx_func; @@ -1977,8 +1988,6 @@ no_softrings: } /* - * mac_fanout_setup: - * * Calls mac_srs_fanout_init() or modify() depending upon whether * the SRS is getting initialized or re-initialized. */ @@ -1991,14 +2000,14 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, int i, rx_srs_cnt; ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + /* - * This is an aggregation port. Fanout will be setup - * over the aggregation itself. + * Aggr ports do not have SRSes. This function should never be + * called on an aggr port. */ - if (mcip->mci_state_flags & MCIS_EXCLUSIVE) - return; - + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); mac_rx_srs = flent->fe_rx_srs[0]; + /* * Set up the fanout on the tx side only once, with the * first rx SRS. The CPU binding, fanout, and bandwidth @@ -2054,8 +2063,6 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * mac_srs_create: - * * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side * processing is created. @@ -2187,7 +2194,7 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, * find nothing plus we have an existing backlog * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll * the H/W for packets anymore (let the polling thread go to sleep). - * 5) Once the backlog is relived (packets are processed) we reenable + * 5) Once the backlog is relieved (packets are processed) we reenable * polling (by signalling the poll thread) only when the backlog * dips below sr_poll_thres. * 6) sr_hiwat is used exclusively when we are not polling capable @@ -2210,7 +2217,14 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, mac_srs->srs_state |= SRS_SOFTRING_QUEUE; } - mac_srs->srs_worker = thread_create(NULL, 0, + /* + * Create the srs_worker with twice the stack of a normal kernel thread + * to reduce the likelihood of stack overflows in receive-side + * processing. (The larger stacks are not the only precaution taken + * against stack overflows; see the use of the MAC_RX_SRS_TOODEEP + * macro for details.) + */ + mac_srs->srs_worker = thread_create(NULL, default_stksize << 1, mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri); if (is_tx_srs) { @@ -2258,8 +2272,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, /* * Some drivers require serialization and don't send * packet chains in interrupt context. For such - * drivers, we should always queue in soft ring - * so that we get a chance to switch into a polling + * drivers, we should always queue in the soft ring + * so that we get a chance to switch into polling * mode under backlog. 
*/ ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring); @@ -2357,6 +2371,10 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_rx_srs_group_setup(mcip, flent, link_type); mac_tx_srs_group_setup(mcip, flent, link_type); + /* Aggr ports don't have SRSes; thus there is no soft ring fanout. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + return; + pool_lock(); cpupart = mac_pset_find(mrp, &use_default); mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), @@ -2366,9 +2384,11 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there - * is a group associated with this MAC client, set up SRSs for individual - * h/w rings. + * Set up the Rx SRSes. If there is no group associated with the + * client, then only setup SW classification. If the client has + * exlusive (MAC_GROUP_STATE_RESERVED) use of the group, then create an + * SRS for each HW ring. If the client is sharing a group, then make + * sure to teardown the HW SRSes. */ void mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, @@ -2379,13 +2399,37 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_ring_t *ring; uint32_t fanout_type; mac_group_t *rx_group = flent->fe_rx_ring_group; + boolean_t no_unicast; + + /* + * If this is an an aggr port, then don't setup Rx SRS and Rx + * soft rings as they won't be used. However, we still need to + * start the rings to receive data on them. + */ + if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) { + if (rx_group == NULL) + return; + + for (ring = rx_group->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) + (void) mac_start_ring(ring); + } + + return; + } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); fanout_type = mac_find_fanout(flent, link_type); + no_unicast = (mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR) != 0; - /* Create the SRS for S/W classification if none exists */ + /* Create the SRS for SW classification if none exists */ if (flent->fe_rx_srs[0] == NULL) { ASSERT(flent->fe_rx_srs_cnt == 0); - /* Setup the Rx SRS */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, NULL); mutex_enter(&flent->fe_lock); @@ -2397,15 +2441,17 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, if (rx_group == NULL) return; + /* - * fanout for default SRS is done when default SRS are created - * above. As each ring is added to the group, we setup the - * SRS and fanout to it. + * If the group is marked RESERVED then setup an SRS and + * fanout for each HW ring. */ switch (rx_group->mrg_state) { case MAC_GROUP_STATE_RESERVED: for (ring = rx_group->mrg_rings; ring != NULL; ring = ring->mr_next) { + uint16_t vid = i_mac_flow_vid(mcip->mci_flent); + switch (ring->mr_state) { case MR_INUSE: case MR_FREE: @@ -2415,20 +2461,23 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (void) mac_start_ring(ring); /* - * Since the group is exclusively ours create - * an SRS for this ring to allow the - * individual SRS to dynamically poll the - * ring. Do this only if the client is not - * a VLAN MAC client, since for VLAN we do - * s/w classification for the VID check, and - * if it has a unicast address. + * If a client requires SW VLAN + * filtering or has no unicast address + * then we don't create any HW ring + * SRSes. 
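One way to read the new condition above: a per-HW-ring SRS is created only when the client has exclusive (RESERVED) use of the group, has a unicast address, and either carries no VLAN ID or the group can filter VLANs in hardware. The fragment below restates that predicate as a small standalone function with hypothetical flag and type names, purely to make the boolean logic easy to follow; it is not the MAC code itself.

#include <stdbool.h>
#include <stdint.h>

#define FAKE_VLAN_ID_NONE	0

/* Hypothetical condensed inputs to the per-ring SRS decision. */
struct ring_srs_inputs {
	bool		group_reserved;	/* client owns the Rx group exclusively */
	bool		group_hw_vlan;	/* group can filter VLAN IDs in hardware */
	bool		no_unicast;	/* client has no unicast address */
	uint16_t	vid;		/* client's VLAN ID, if any */
};

/*
 * Return true when an SRS should be created for each HW ring so the
 * rings can be polled directly; otherwise the client stays on the
 * software-classified SRS.
 */
bool
want_hw_ring_srs(const struct ring_srs_inputs *in)
{
	if (!in->group_reserved || in->no_unicast)
		return (false);
	if (in->vid != FAKE_VLAN_ID_NONE && !in->group_hw_vlan)
		return (false);	/* VID check must happen in software */
	return (true);
}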
*/ - if ((mcip->mci_state_flags & - MCIS_NO_UNICAST_ADDR) || - i_mac_flow_vid(mcip->mci_flent) != - VLAN_ID_NONE) { + if ((!MAC_GROUP_HW_VLAN(rx_group) && + vid != VLAN_ID_NONE) || no_unicast) break; - } + + /* + * When a client has exclusive use of + * a group, and that group's traffic + * is fully HW classified, we create + * an SRS for each HW ring in order to + * make use of dynamic polling of said + * HW rings. + */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, ring); @@ -2444,14 +2493,9 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, break; case MAC_GROUP_STATE_SHARED: /* - * Set all rings of this group to software classified. - * - * If the group is current RESERVED, the existing mac - * client (the only client on this group) is using - * this group exclusively. In that case we need to - * disable polling on the rings of the group (if it - * was enabled), and free the SRS associated with the - * rings. + * When a group is shared by multiple clients, we must + * use SW classifiction to ensure packets are + * delivered to the correct client. */ mac_rx_switch_grp_to_sw(rx_group); break; @@ -2468,46 +2512,49 @@ void mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) { - int cnt; - int ringcnt; - mac_ring_t *ring; - mac_group_t *grp; - /* - * If we are opened exclusively (like aggr does for aggr_ports), - * don't set up Tx SRS and Tx soft rings as they won't be used. - * The same thing has to be done for Rx side also. See bug: - * 6880080 + * If this is an exclusive client (e.g. an aggr port), then + * don't setup Tx SRS and Tx soft rings as they won't be used. + * However, we still need to start the rings to send data + * across them. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) { - /* - * If we have rings, start them here. - */ - if (flent->fe_tx_ring_group == NULL) - return; + mac_ring_t *ring; + mac_group_t *grp; + grp = (mac_group_t *)flent->fe_tx_ring_group; - ringcnt = grp->mrg_cur_count; - ring = grp->mrg_rings; - for (cnt = 0; cnt < ringcnt; cnt++) { - if (ring->mr_state != MR_INUSE) { + + if (grp == NULL) + return; + + for (ring = grp->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) (void) mac_start_ring(ring); - } - ring = ring->mr_next; } + return; } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); + if (flent->fe_tx_srs == NULL) { (void) mac_srs_create(mcip, flent, SRST_TX | link_type, NULL, mcip, NULL, NULL); } + mac_tx_srs_setup(mcip, flent); } /* - * Remove all the RX SRSs. If we want to remove only the SRSs associated - * with h/w rings, leave the S/W SRS alone. This is used when we want to - * move the MAC client from one group to another, so we need to teardown - * on the h/w SRSs. + * Teardown all the Rx SRSes. Unless hwonly is set, then only teardown + * the Rx HW SRSes and leave the SW SRS alone. The hwonly flag is set + * when we wish to move a MAC client from one group to another. In + * that case, we need to release the current HW SRSes but keep the SW + * SRS for continued traffic classifiction. 
*/ void mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) @@ -2525,8 +2572,16 @@ mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) flent->fe_rx_srs[i] = NULL; flent->fe_rx_srs_cnt--; } - ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1); - ASSERT(hwonly || flent->fe_rx_srs_cnt == 0); + + /* + * If we are only tearing down the HW SRSes then there must be + * one SRS left for SW classification. Otherwise we are tearing + * down both HW and SW and there should be no SRSes left. + */ + if (hwonly) + VERIFY3S(flent->fe_rx_srs_cnt, ==, 1); + else + VERIFY3S(flent->fe_rx_srs_cnt, ==, 0); } /* @@ -2828,6 +2883,7 @@ mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip, * even if this is the only client in the default group, we will * leave group as shared). */ + int mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) @@ -2838,7 +2894,8 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_group_t *default_rgroup; mac_group_t *default_tgroup; int err; - uint8_t *mac_addr; + uint8_t *mac_addr; + uint16_t vid; mac_group_state_t next_state; mac_client_impl_t *group_only_mcip; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); @@ -2850,6 +2907,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t no_unicast; boolean_t isprimary = flent->fe_type & FLOW_PRIMARY_MAC; mac_client_impl_t *reloc_pmcip = NULL; + boolean_t use_hw; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -2881,15 +2939,19 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)); /* - * By default we have given the primary all the rings - * i.e. the default group. Let's see if the primary - * needs to be relocated so that the addition of this - * client doesn't impact the primary's performance, - * i.e. if the primary is in the default group and - * we add this client, the primary will lose polling. - * We do this only for NICs supporting dynamic ring - * grouping and only when this is the first client - * after the primary (i.e. nactiveclients is 2) + * All the rings initially belong to the default group + * under dynamic grouping. The primary client uses the + * default group when it is the only client. The + * default group is also used as the destination for + * all multicast and broadcast traffic of all clients. + * Therefore, the primary client loses its ability to + * poll the softrings on addition of a second client. + * To avoid a performance penalty, MAC will move the + * primary client to a dedicated group when it can. + * + * When using static grouping, the primary client + * begins life on a non-default group. There is + * no moving needed upon addition of a second client. */ if (!isprimary && mip->mi_nactiveclients == 2 && (group_only_mcip = mac_primary_client_handle(mip)) != @@ -2897,6 +2959,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, reloc_pmcip = mac_check_primary_relocation( group_only_mcip, rxhw); } + /* * Check to see if we can get an exclusive group for * this mac address or if there already exists a @@ -2910,6 +2973,26 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } else if (rgroup == NULL) { rgroup = default_rgroup; } + + /* + * If we are adding a second client to a + * non-default group then we need to move the + * existing client to the default group and + * add the new client to the default group as + * well. 
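The block comment above ends by noting that a second client landing on a reserved non-default group forces both clients back onto the default group. As a rough illustration of that placement policy (hypothetical types and names, not the MAC implementation), the decision can be reduced to the helper below.

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical condensed view of an Rx group, for illustration only. */
struct rx_group {
	bool	is_default;	/* the catch-all default group */
	bool	reserved;	/* exactly one client, exclusive use */
	int	nclients;
};

/*
 * Pick the group a new client should join: if the candidate group is a
 * reserved non-default group that already has a client, everyone falls
 * back to the default group, which is always shared.
 */
struct rx_group *
choose_rx_group(struct rx_group *candidate, struct rx_group *defgrp)
{
	if (candidate == NULL)
		return (defgrp);
	if (!candidate->is_default && candidate->reserved &&
	    candidate->nclients > 0)
		return (defgrp);
	return (candidate);
}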
+ */ + if (rgroup != default_rgroup && + rgroup->mrg_state == MAC_GROUP_STATE_RESERVED) { + group_only_mcip = MAC_GROUP_ONLY_CLIENT(rgroup); + err = mac_rx_switch_group(group_only_mcip, rgroup, + default_rgroup); + + if (err != 0) + goto setup_failed; + + rgroup = default_rgroup; + } + /* * Check to see if we can get an exclusive group for * this mac client. If no groups are available, use @@ -2941,14 +3024,17 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, rgroup->mrg_cur_count); } } + flent->fe_rx_ring_group = rgroup; /* - * Add the client to the group. This could cause - * either this group to move to the shared state or - * cause the default group to move to the shared state. - * The actions on this group are done here, while the - * actions on the default group are postponed to - * the end of this function. + * Add the client to the group and update the + * group's state. If rgroup != default_group + * then the rgroup should only ever have one + * client and be in the RESERVED state. But no + * matter what, the default_rgroup will enter + * the SHARED state since it has to receive + * all broadcast and multicast traffic. This + * case is handled later in the function. */ mac_group_add_client(rgroup, mcip); next_state = mac_group_next_state(rgroup, @@ -2973,28 +3059,37 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, &group_only_mcip, default_tgroup, B_FALSE); tgroup->mrg_state = next_state; } - /* - * Setup the Rx and Tx SRSes. If we got a pristine group - * exclusively above, mac_srs_group_setup would simply create - * the required SRSes. If we ended up sharing a previously - * reserved group, mac_srs_group_setup would also dismantle the - * SRSes of the previously exclusive group - */ - mac_srs_group_setup(mcip, flent, link_type); /* We are setting up minimal datapath only */ - if (no_unicast) + if (no_unicast) { + mac_srs_group_setup(mcip, flent, link_type); break; - /* Program the S/W Classifer */ + } + + /* Program software classification. */ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) goto setup_failed; - /* Program the H/W Classifier */ - if ((err = mac_add_macaddr(mip, rgroup, mac_addr, - (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0) + /* Program hardware classification. */ + vid = i_mac_flow_vid(flent); + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid, use_hw); + + if (err != 0) goto setup_failed; + mcip->mci_unicast = mac_find_macaddr(mip, mac_addr); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); + + /* + * Setup the Rx and Tx SRSes. If the client has a + * reserved group, then mac_srs_group_setup() creates + * the required SRSes for the HW rings. If we have a + * shared group, mac_srs_group_setup() dismantles the + * HW SRSes of the previously exclusive group. + */ + mac_srs_group_setup(mcip, flent, link_type); + /* (Re)init the v6 token & local addr used by link protection */ mac_protect_update_mac_token(mcip); break; @@ -3038,17 +3133,23 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, ASSERT(default_rgroup->mrg_state == MAC_GROUP_STATE_SHARED); } + /* - * If we get an exclusive group for a VLAN MAC client we - * need to take the s/w path to make the additional check for - * the vid. Disable polling and set it to s/w classification. - * Similarly for clients that don't have a unicast address. 
+ * A VLAN MAC client on a reserved group still + * requires SW classification if the MAC doesn't + * provide VLAN HW filtering. + * + * Clients with no unicast address also require SW + * classification. */ if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED && - (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) { + ((!MAC_GROUP_HW_VLAN(rgroup) && vid != VLAN_ID_NONE) || + no_unicast)) { mac_rx_switch_grp_to_sw(rgroup); } + } + mac_set_rings_effective(mcip); return (0); @@ -3074,6 +3175,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t check_default_group = B_FALSE; mac_group_state_t next_state; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + uint16_t vid; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -3086,16 +3188,24 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, case SRST_LINK: /* Stop sending packets */ mac_tx_client_block(mcip); + group = flent->fe_rx_ring_group; + vid = i_mac_flow_vid(flent); - /* Stop the packets coming from the H/W */ + /* + * Stop the packet flow from the hardware by disabling + * any hardware filters assigned to this client. + */ if (mcip->mci_unicast != NULL) { int err; - err = mac_remove_macaddr(mcip->mci_unicast); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { - cmn_err(CE_WARN, "%s: failed to remove a MAC" - " address because of error 0x%x", + cmn_err(CE_WARN, "%s: failed to remove a MAC HW" + " filters because of error 0x%x", mip->mi_name, err); } + mcip->mci_unicast = NULL; } @@ -3103,12 +3213,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); mac_flow_wait(flent, FLOW_DRIVER_UPCALL); - /* Now quiesce and destroy all SRS and soft rings */ + /* Quiesce and destroy all the SRSes. */ mac_rx_srs_group_teardown(flent, B_FALSE); mac_tx_srs_group_teardown(mcip, flent, SRST_LINK); - ASSERT((mcip->mci_flent == flent) && - (flent->fe_next == NULL)); + ASSERT3P(mcip->mci_flent, ==, flent); + ASSERT3P(flent->fe_next, ==, NULL); /* * Release our hold on the group as well. We need @@ -3116,17 +3226,17 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * left who can use it exclusively. Also, if we * were the last client, release the group. */ - group = flent->fe_rx_ring_group; default_group = MAC_DEFAULT_RX_GROUP(mip); if (group != NULL) { mac_group_remove_client(group, mcip); next_state = mac_group_next_state(group, &grp_only_mcip, default_group, B_TRUE); + if (next_state == MAC_GROUP_STATE_RESERVED) { /* * Only one client left on this RX group. */ - ASSERT(grp_only_mcip != NULL); + VERIFY3P(grp_only_mcip, !=, NULL); mac_set_group_state(group, MAC_GROUP_STATE_RESERVED); group_only_flent = grp_only_mcip->mci_flent; @@ -3151,7 +3261,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * to see if the primary client can get * exclusive access to the default group. 
*/ - ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + VERIFY3P(group, !=, MAC_DEFAULT_RX_GROUP(mip)); if (mrp->mrp_mask & MRP_RX_RINGS) { MAC_RX_GRP_RELEASED(mip); if (mip->mi_rx_group_type == @@ -3165,7 +3275,8 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, MAC_GROUP_STATE_REGISTERED); check_default_group = B_TRUE; } else { - ASSERT(next_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(next_state, ==, + MAC_GROUP_STATE_SHARED); mac_set_group_state(group, MAC_GROUP_STATE_SHARED); mac_rx_group_unmark(group, MR_CONDEMNED); @@ -3254,12 +3365,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, */ if (check_default_group) { default_group = MAC_DEFAULT_RX_GROUP(mip); - ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(default_group->mrg_state, ==, MAC_GROUP_STATE_SHARED); next_state = mac_group_next_state(default_group, &grp_only_mcip, default_group, B_TRUE); if (next_state == MAC_GROUP_STATE_RESERVED) { - ASSERT(grp_only_mcip != NULL && - mip->mi_nactiveclients == 1); + VERIFY3P(grp_only_mcip, !=, NULL); + VERIFY3U(mip->mi_nactiveclients, ==, 1); mac_set_group_state(default_group, MAC_GROUP_STATE_RESERVED); mac_rx_srs_group_setup(grp_only_mcip, @@ -3385,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_drop_chain(mac_srs->srs_first, "SRS free"); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); @@ -3783,7 +3894,7 @@ mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) * is also stored in st_soft_rings[] array. That entry should * be removed. */ - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring); @@ -3812,7 +3923,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent) boolean_t is_aggr; uint_t ring_info = 0; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0; + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) != 0; grp = flent->fe_tx_ring_group; if (grp == NULL) { ring = (mac_ring_t *)mip->mi_default_tx_ring; @@ -3956,8 +4067,8 @@ mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart) } /* - * Walk through the list of mac clients for the MAC. - * For each active mac client, recompute the number of soft rings + * Walk through the list of MAC clients for the MAC. + * For each active MAC client, recompute the number of soft rings * associated with every client, only if current speed is different * from the speed that was previously used for soft ring computation. * If the cable is disconnected whlie the NIC is started, we would get @@ -3980,6 +4091,10 @@ mac_fanout_recompute(mac_impl_t *mip) for (mcip = mip->mi_clients_list; mcip != NULL; mcip = mcip->mci_client_next) { + /* Aggr port clients don't have SRSes. 
*/ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + continue; + if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 || !MCIP_DATAPATH_SETUP(mcip)) continue; @@ -3992,6 +4107,7 @@ mac_fanout_recompute(mac_impl_t *mip) mac_set_pool_effective(use_default, cpupart, mrp, emrp); pool_unlock(); } + i_mac_perim_exit(mip); } diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index aa4985fe4c..62612122d6 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/strsun.h> @@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index da83dc643e..ee493bbca1 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. */ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. @@ -209,7 +209,7 @@ typedef struct slaac_addr { } slaac_addr_t; static void start_txn_cleanup_timer(mac_client_impl_t *); -static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t); +static boolean_t dynamic_method_set(mac_protect_t *, uint32_t); #define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++ @@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end) if (get_dhcpv4_info(ipha, end, &dh4) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV4_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4)) return (B_FALSE); if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 || @@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end) if (get_dhcpv6_info(ip6h, end, &dh6) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV6_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6)) return (B_FALSE); /* @@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end, { struct nd_opt_hdr *opt; int len, optlen; + mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect; + + if (!dynamic_method_set(protect, MPT_DYN_SLAAC)) + return; if (ip6h->ip6_hlim != 255) { DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim); @@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, if (*addr == INADDR_ANY) return (B_TRUE); + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i]; @@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } } - return (protect->mp_ipaddrcnt == 0 ? 
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE); + + if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) { + return (check_dhcpv4_dyn_ip(mcip, *addr)); + } + + return (B_FALSE); } static boolean_t ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, in6_addr_t *addr) { + boolean_t slaac_enabled, dhcpv6_enabled; uint_t i; /* @@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr))) return (B_TRUE); - + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i]; @@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } - if (protect->mp_ipaddrcnt == 0) { - return (check_slaac_ip(mcip, addr) || - check_dhcpv6_dyn_ip(mcip, addr)); - } else { - return (B_FALSE); - } + slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC); + if (slaac_enabled && check_slaac_ip(mcip, addr)) + return (B_TRUE); + + dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6); + if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr)) + return (B_TRUE); + + return (B_FALSE); } /* @@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen) bcmp(dcid->dc_id, cid, cidlen) == 0) return (B_TRUE); } + + DTRACE_PROBE3(missing__cid, mac_protect_t *, p, + uchar_t *, cid, uint_t, cidlen); return (B_FALSE); } @@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p, bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) { return (B_FALSE); } + + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0) cidlen = optlen; @@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p, mtype == DHCPV6_MSG_RECONFIGURE) return (B_TRUE); + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL, DHCPV6_OPT_CLIENTID, &cidlen); if (d6o == NULL || (uchar_t *)d6o + cidlen > end) @@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect, return (0); fail: - /* increment dhcpnospoof stat here */ freemsg(nmp); return (err); } @@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp) if ((err = validate_cids(p)) != 0) return (err); + if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE && + p->mp_allcids != MPT_RESET) { + return (EINVAL); + } + return (0); } @@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr) cp->mp_cidcnt = 0; } } + if (np->mp_allcids == MPT_RESET) { + cp->mp_allcids = MPT_FALSE; + } else if (np->mp_allcids != 0) { + cp->mp_allcids = MPT_TRUE; + } + if (np->mp_dynamic == MPT_RESET) { + cp->mp_dynamic = 0; + } else if (np->mp_dynamic != 0) { + cp->mp_dynamic = np->mp_dynamic; + } } void @@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip) } static boolean_t -allowed_ips_set(mac_resource_props_t *mrp, uint32_t af) +dynamic_method_set(mac_protect_t *mpt, uint32_t method) +{ + if (mpt->mp_dynamic != 0) { + return ((mpt->mp_dynamic & method) != 0); + } else { + return (mpt->mp_ipaddrcnt == 0); + } +} + +boolean_t +mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6, + in6_addr_t *v6addr) { - int i; + mac_perim_handle_t 
perim; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; - for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) { - if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af) - return (B_TRUE); + mac_perim_enter_by_mh(mh, &perim); + + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + mac_protect_t *p; + boolean_t allowed; + + ASSERT(mrp != NULL); + + p = &mrp->mrp_protect; + + /* If mac protection/ipnospoof isn't enabled, return true */ + if ((mrp->mrp_mask & MRP_PROTECT) == 0 || + (p->mp_types & MPT_IPNOSPOOF) == 0) { + allowed = B_TRUE; + goto done; } - return (B_FALSE); + + if (isv6) { + allowed = ipnospoof_check_v6(mcip, p, v6addr); + } else { + in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr)); + allowed = ipnospoof_check_v4(mcip, p, v4addr); + } + +done: + mac_perim_exit(perim); + return (allowed); } mac_protect_t * diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 07201afdec..cb1a76aef6 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -56,6 +57,7 @@ #include <sys/sdt.h> #include <sys/pattr.h> #include <sys/strsun.h> +#include <sys/vlan.h> /* * MAC Provider Interface. @@ -351,6 +353,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); mac_transceiver_init(mip); @@ -697,7 +702,6 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) mac_ring_t *mr = (mac_ring_t *)mrh; mac_soft_ring_set_t *mac_srs; mblk_t *bp = mp_chain; - boolean_t hw_classified = B_FALSE; /* * If there are any promiscuous mode callbacks defined for @@ -709,7 +713,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) if (mr != NULL) { /* * If the SRS teardown has started, just return. The 'mr' - * continues to be valid until the driver unregisters the mac. + * continues to be valid until the driver unregisters the MAC. * Hardware classified packets will not make their way up * beyond this point once the teardown has started. The driver * is never passed a pointer to a flow entry or SRS or any @@ -722,11 +726,25 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) freemsgchain(mp_chain); return; } - if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { - hw_classified = B_TRUE; + + /* + * The ring is in passthru mode; pass the chain up to + * the pseudo ring. + */ + if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, + B_FALSE); + MR_REFRELE(mr); + return; } - mutex_exit(&mr->mr_lock); + + /* + * The passthru callback should only be set when in + * MAC_PASSTHRU_CLASSIFIER mode. + */ + ASSERT3P(mr->mr_pt_fn, ==, NULL); /* * We check if an SRS is controlling this ring. @@ -734,19 +752,24 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * routine otherwise we need to go through mac_rx_classify * to reach the right place. 
*/ - if (hw_classified) { + if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { + MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + ASSERT3P(mr->mr_srs, !=, NULL); mac_srs = mr->mr_srs; + /* - * This is supposed to be the fast path. - * All packets received though here were steered by - * the hardware classifier, and share the same - * MAC header info. + * This is the fast path. All packets received + * on this ring are hardware classified and + * share the same MAC header info. */ mac_srs->srs_rx.sr_lower_proc(mh, (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); MR_REFRELE(mr); return; } + + mutex_exit(&mr->mr_lock); /* We'll fall through to software classification */ } else { flow_entry_t *flent; @@ -1472,7 +1495,8 @@ mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm) pr->pr_flags |= MAC_PROP_INFO_PERM; } -void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, +void +mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff, uint32_t *end, uint32_t *value, uint32_t *flags_ptr) { uint32_t flags; @@ -1497,8 +1521,9 @@ void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, *flags_ptr = flags; } -void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, - uint32_t end, uint32_t value, uint32_t flags) +void +mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end, + uint32_t value, uint32_t flags) { ASSERT(DB_TYPE(mp) == M_DATA); @@ -1510,6 +1535,31 @@ void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, } void +mac_hcksum_clone(const mblk_t *src, mblk_t *dst) +{ + ASSERT3U(DB_TYPE(src), ==, M_DATA); + ASSERT3U(DB_TYPE(dst), ==, M_DATA); + + /* + * Do these assignments unconditionally, rather than only when + * flags is non-zero. This protects a situation where zeroed + * hcksum data does not make the jump onto an mblk_t with + * stale data in those fields. It's important to copy all + * possible flags (HCK_* as well as HW_*) and not just the + * checksum specific flags. Dropping flags during a clone + * could result in dropped packets. If the caller has good + * reason to drop those flags then it should do it manually, + * after the clone. + */ + DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); + DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); + DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); + DB_CKSUMEND(dst) = DB_CKSUMEND(src); + DB_CKSUM16(dst) = DB_CKSUM16(src); + DB_LSOMSS(dst) = DB_LSOMSS(src); +} + +void mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) { ASSERT(DB_TYPE(mp) == M_DATA); diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index d046930873..0e62f828a9 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -300,9 +300,8 @@ * * Otherwise, all fanout is performed by software. MAC divides incoming frames * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and - * everything else. Note, VLAN tagged traffic is considered other, regardless of - * the interior EtherType. Regardless of the type of fanout, these three - * categories or buckets are always used. + * everything else. Regardless of the type of fanout, these three categories + * or buckets are always used. 
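The paragraph above describes the three software fanout buckets: IPv4 TCP, IPv4 UDP, and everything else. Below is a minimal, self-contained sketch of that classification step over a raw Ethernet/IPv4 frame. It is an illustration with its own toy constants, not the SRS fanout code, and it ignores VLAN tags, IPv6, and fragments for brevity.

#include <stdint.h>
#include <stddef.h>

enum fanout_bucket { BUCKET_V4_TCP, BUCKET_V4_UDP, BUCKET_OTH };

#define TOY_ETHERTYPE_IPV4	0x0800
#define TOY_PROTO_TCP		6
#define TOY_PROTO_UDP		17

/*
 * Classify a frame into one of the three buckets. Assumes an untagged
 * Ethernet header followed by IPv4; anything else lands in OTH.
 */
enum fanout_bucket
classify_frame(const uint8_t *frame, size_t len)
{
	const size_t eth_hlen = 14;

	if (len < eth_hlen + 20)
		return (BUCKET_OTH);

	/* EtherType is bytes 12-13, network byte order. */
	uint16_t ethertype = (uint16_t)((frame[12] << 8) | frame[13]);
	if (ethertype != TOY_ETHERTYPE_IPV4)
		return (BUCKET_OTH);

	uint8_t proto = frame[eth_hlen + 9];	/* IPv4 protocol field */
	switch (proto) {
	case TOY_PROTO_TCP:
		return (BUCKET_V4_TCP);
	case TOY_PROTO_UDP:
		return (BUCKET_V4_UDP);
	default:
		return (BUCKET_OTH);
	}
}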
* * The difference between protocol level fanout and full software ring protocol * fanout is the number of software rings that end up getting created. The @@ -969,6 +968,7 @@ #include <sys/types.h> #include <sys/callb.h> +#include <sys/pattr.h> #include <sys/sdt.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -1328,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + mac_drop_chain(mp_chain, "SRS Tx max queue"); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1347,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ - mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ + mac_drop_pkt((chain), (s)); \ /* increment freed stats */ \ - mac_srs->srs_tx.st_stat.mts_sdrops++; \ - cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_stat.mts_sdrops++; \ + (cookie) = (mac_tx_cookie_t)(srs); \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -1367,11 +1367,11 @@ int mac_srs_worker_wakeup_ticks = 0; * can occur in situ (in the interrupt thread) or if it should be left to a * worker thread. Note that the constant used to make this determination is * not entirely made-up, and is a result of some emprical validation. That - * said, the constant is left as a static variable to allow it to be + * said, the constant is left as a global variable to allow it to be * dynamically tuned in the field if and as needed. */ -static uintptr_t mac_rx_srs_stack_needed = 10240; -static uint_t mac_rx_srs_stack_toodeep; +uintptr_t mac_rx_srs_stack_needed = 14336; +uint_t mac_rx_srs_stack_toodeep; #ifndef STACK_GROWTH_DOWN #error Downward stack growth assumed. @@ -1379,7 +1379,7 @@ static uint_t mac_rx_srs_stack_toodeep; #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \ - ++mac_rx_srs_stack_toodeep) + (++mac_rx_srs_stack_toodeep || (mac_rx_srs_stack_toodeep = 1))) /* @@ -1475,16 +1475,15 @@ enum pkt_type { #define PORTS_SIZE 4 /* - * mac_rx_srs_proto_fanout - * - * This routine delivers packets destined to an SRS into one of the + * This routine delivers packets destined for an SRS into one of the * protocol soft rings. * - * Given a chain of packets we need to split it up into multiple sub chains - * destined into TCP, UDP or OTH soft ring. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. */ static void mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) @@ -1523,9 +1522,9 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. 
SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the + * Some clients, such as non-ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the * MCIS_RX_BYPASS_DISABLE flag. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && @@ -1537,10 +1536,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) bzero(sz, MAX_SR_TYPES * sizeof (size_t)); /* - * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for - * performance reasons), we need to separate out v4_tcp, v4_udp - * and the rest goes in other. + * We have a chain from SRS that we need to split across the + * soft rings. The squeues for the TCP and IPv4 SAPs use their + * own soft rings to allow polling from the squeue. The rest of + * the packets are delivered on the OTH soft ring which cannot + * be polled. */ while (head != NULL) { mp = head; @@ -1568,9 +1568,14 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -1635,7 +1640,6 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * performance and may bypass DLS. All other cases go through * the 'OTH' type path without DLS bypass. */ - ipha = (ipha_t *)(mp->b_rptr + hdrsize); if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) type = OTH; @@ -1647,11 +1651,13 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) } ASSERT(type == UNDEF); + /* - * We look for at least 4 bytes past the IP header to get - * the port information. If we get an IP fragment, we don't - * have the port information, and we use just the protocol - * information. + * Determine the type from the IP protocol value. If + * classified as TCP or UDP, then update the read + * pointer to the beginning of the IP header. + * Otherwise leave the message as is for further + * processing by DLS. */ switch (ipha->ipha_protocol) { case IPPROTO_TCP: @@ -1695,11 +1701,10 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) int fanout_unaligned = 0; /* - * mac_rx_srs_long_fanout - * - * The fanout routine for VLANs, and for anything else that isn't performing - * explicit dls bypass. Returns -1 on an error (drop the packet due to a - * malformed packet), 0 on success, with values written in *indx and *type. + * The fanout routine for any clients with DLS bypass disabled or for + * traffic classified as "other". Returns -1 on an error (drop the + * packet due to a malformed packet), 0 on success, with values + * written in *indx and *type. */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, @@ -1865,16 +1870,15 @@ src_dst_based_fanout: } /* - * mac_rx_srs_fanout - * - * This routine delivers packets destined to an SRS into a soft ring member + * This routine delivers packets destined for an SRS into a soft ring member * of the set. 
* - * Given a chain of packets we need to split it up into multiple sub chains - * destined for one of the TCP, UDP or OTH soft rings. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. * * Note: * Since we know what is the maximum fanout possible, we create a 2D array @@ -1935,10 +1939,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the - * MCIS_RX_BYPASS_DISABLE flag. + * Some clients, such as non Ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the + * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by + * sun4v vsw currently. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); @@ -1960,7 +1965,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) /* * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for + * Since squeues for TCP & IPv4 SAP poll their soft rings (for * performance reasons), we need to separate out v4_tcp, v4_udp * and the rest goes in other. */ @@ -1990,9 +1995,14 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -2032,7 +2042,6 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) continue; } - /* * If we are using the default Rx ring where H/W or S/W * classification has not happened, we need to verify if @@ -2621,7 +2630,6 @@ again: mac_srs->srs_state |= (SRS_PROC|proc_type); - /* * mcip is NULL for broadcast and multicast flows. The promisc * callbacks for broadcast and multicast packets are delivered from @@ -2641,10 +2649,8 @@ again: } /* - * Check if SRS itself is doing the processing - * This direct path does not apply when subflows are present. In this - * case, packets need to be dispatched to a soft ring according to the - * flow's bandwidth and other resources contraints. + * Check if SRS itself is doing the processing. This direct + * path applies only when subflows are present. 
*/ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { mac_direct_rx_t proc; @@ -2888,7 +2894,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_pkt_drop(NULL, NULL, head, B_FALSE); + mac_drop_chain(head, "Rx no bandwidth"); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3270,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * mac_rx_srs_process - * - * Receive side routine called from the interrupt path. + * MAC SRS receive side routine. If the data is coming from the + * network (i.e. from a NIC) then this is called in interrupt context. + * If the data is coming from a local sender (e.g. mac_tx_send() or + * bridge_forward()) then this is not called in interrupt context. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3332,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Rx no bandwidth"); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3454,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no desc"); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3517,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx SRS hiwat"); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3890,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no bandwidth"); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4336,6 +4346,14 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, obytes += (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); + /* + * Mark all packets as local so that a + * receiver can determine if a packet arrived + * from a local source or from the network. + * This allows some consumers to avoid + * unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; CHECK_VID_AND_ADD_TAG(mp); MAC_TX(mip, ring, mp, src_mcip); @@ -4368,7 +4386,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; - mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4378,49 +4395,25 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, CHECK_VID_AND_ADD_TAG(mp); /* + * Mark all packets as local so that a receiver can + * determine if a packet arrived from a local source + * or from the network. This allows some consumers to + * avoid unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; + + /* * Find the destination. 
*/ dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { - size_t hdrsize; - int err = 0; - - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - struct ether_vlan_header *evhp = - (struct ether_vlan_header *)mp->b_rptr; - - if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) - hdrsize = sizeof (*evhp); - else - hdrsize = sizeof (struct ether_header); - } else { - mac_header_info_t mhi; - - err = mac_header_info((mac_handle_t)mip, - mp, &mhi); - if (err == 0) - hdrsize = mhi.mhi_hdrsize; - } - /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. - * Make sure the packet size is within the - * allowed size. If not drop the packet and - * move to next packet. */ - if (err != 0 || - (pkt_size - hdrsize) > mip->mi_sdu_max) { - oerrors++; - DTRACE_PROBE2(loopback__drop, size_t, pkt_size, - mblk_t *, mp); - freemsg(mp); - mp = next; - FLOW_REFRELE(dst_flow_ent); - continue; - } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4438,6 +4431,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. */ boolean_t do_switch; + mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4453,19 +4447,19 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. */ - if (mip->mi_promisc_list != NULL) + if (mip->mi_promisc_list != NULL) { mac_promisc_dispatch(mip, mp, src_mcip); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - if ((mp1 = mac_fix_cksum(mp)) != NULL) { - (dst_flow_ent->fe_cb_fn)( - dst_flow_ent->fe_cb_arg1, - dst_flow_ent->fe_cb_arg2, - mp1, do_switch); - } + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp, do_switch); + } FLOW_REFRELE(dst_flow_ent); } else { @@ -4656,6 +4650,9 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, * the packet to the promiscuous listeners of the * client, since they expect to see the whole * frame including the VLAN headers. + * + * The MCIS_STRIP_DISABLE is only issued when sun4v + * vsw is in play. */ mp_chain = mac_strip_vlan_tag_chain(mp_chain); } @@ -4664,13 +4661,11 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, } /* - * mac_rx_soft_ring_process - * - * process a chain for a given soft ring. The number of packets queued - * in the SRS and its associated soft rings (including this one) is - * very small (tracked by srs_poll_pkt_cnt), then allow the entering - * thread (interrupt or poll thread) to do inline processing. This - * helps keep the latency down under low load. + * Process a chain for a given soft ring. If the number of packets + * queued in the SRS and its associated soft rings (including this + * one) is very small (tracked by srs_poll_pkt_cnt) then allow the + * entering thread (interrupt or poll thread) to process the chain + * inline. This is meant to reduce latency under low load. * * The proc and arg for each mblk is already stored in the mblk in * appropriate places. @@ -4729,13 +4724,13 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement - * srs_size and count so it the SRS can have a - * accurate idea of what is the real data - * queued between SRS and its soft rings. 
We - * decrement the counters only when the packet - * gets processed by both SRS and the soft ring. + * If we have an SRS performing bandwidth + * control then we need to decrement the size + * and count so the SRS has an accurate count + * of the data queued between the SRS and its + * soft rings. We decrement the counters only + * when the packet is processed by both the + * SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -4751,8 +4746,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, if ((ringp->s_ring_first == NULL) || (ringp->s_ring_state & S_RING_BLANK)) { /* - * We processed inline our packet and - * nothing new has arrived or our + * We processed a single packet inline + * and nothing new has arrived or our * receiver doesn't want to receive * any packets. We are done. */ @@ -4821,7 +4816,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Tx softring no desc"); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4865,8 +4860,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_pkt_drop(NULL, NULL, - mp_chain, B_FALSE); + mac_drop_chain(mp_chain, + "Tx softring max queue"); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index dc8cfdd145..4655631dc1 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -207,7 +207,7 @@ mac_soft_ring_create(int id, clock_t wait, uint16_t type, ringp->s_ring_tx_hiwat = (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ? mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat; - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[ @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + mac_drop_chain(softring->s_ring_first, "softring free"); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); @@ -339,15 +339,14 @@ mac_soft_ring_fire(void *arg) } /* - * mac_rx_soft_ring_drain + * Drain the soft ring pointed to by ringp. * - * Called when worker thread model (ST_RING_WORKER_ONLY) of processing - * incoming packets is used. s_ring_first contain the queued packets. - * s_ring_rx_func contains the upper level (client) routine where the - * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the - * cookie meant for the client. + * o s_ring_first: pointer to the queued packet chain. + * + * o s_ring_rx_func: pointer to to the client's Rx routine. + * + * o s_ring_rx_{arg1,arg2}: opaque values specific to the client. 
*/ -/* ARGSUSED */ static void mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) { @@ -392,13 +391,12 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) (*proc)(arg1, arg2, mp, NULL); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement its - * srs_size so it can have a accurate idea of - * what is the real data queued between SRS and - * its soft rings. We decrement the size for a - * packet only when it gets processed by both - * SRS and the soft ring. + * If we have an SRS performing bandwidth control, then + * we need to decrement the size and count so the SRS + * has an accurate measure of the data queued between + * the SRS and its soft rings. We decrement the + * counters only when the packet is processed by both + * the SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -414,12 +412,10 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) } /* - * mac_soft_ring_worker - * * The soft ring worker routine to process any queued packets. In - * normal case, the worker thread is bound to a CPU. It the soft - * ring is dealing with TCP packets, then the worker thread will - * be bound to the same CPU as the TCP squeue. + * normal case, the worker thread is bound to a CPU. If the soft ring + * handles TCP packets then the worker thread is bound to the same CPU + * as the TCP squeue. */ static void mac_soft_ring_worker(mac_soft_ring_t *ringp) @@ -604,7 +600,7 @@ mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1) mac_soft_ring_t *softring = arg; mac_soft_ring_set_t *srs; - ASSERT(rx_func != NULL); + VERIFY3P(rx_func, !=, NULL); mutex_enter(&softring->s_ring_lock); softring->s_ring_rx_func = rx_func; diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..2244218f20 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; @@ -1003,6 +1004,7 @@ void mac_ring_stat_create(mac_ring_t *ring) { mac_impl_t *mip = ring->mr_mip; + mac_group_t *grp = (mac_group_t *)ring->mr_gh; char statname[MAXNAMELEN]; char modname[MAXNAMELEN]; @@ -1014,8 +1016,8 @@ mac_ring_stat_create(mac_ring_t *ring) switch (ring->mr_type) { case MAC_RING_TYPE_RX: - (void) snprintf(statname, sizeof (statname), "mac_rx_ring%d", - ring->mr_index); + (void) snprintf(statname, sizeof (statname), + "mac_rx_ring_%d_%d", grp->mrg_index, ring->mr_index); i_mac_rx_ring_stat_create(ring, modname, statname); break; diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index e83af37f16..334d1d034b 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* @@ -47,6 +48,74 @@ #include <inet/sadb.h> #include <inet/ipsecesp.h> #include <inet/ipsecah.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> + +/* + * The next two functions are used for dropping packets or chains of + * packets, respectively. We could use one function for both but + * separating the use cases allows us to specify intent and prevent + * dropping more data than intended. + * + * The purpose of these functions is to aid the debugging effort, + * especially in production. Rather than use freemsg()/freemsgchain(), + * it's preferable to use these functions when dropping a packet in + * the MAC layer. These functions should only be used during + * unexpected conditions. That is, any time a packet is dropped + * outside of the regular, successful datapath. Consolidating all + * drops on these functions allows the user to trace one location and + * determine why the packet was dropped based on the msg. It also + * allows the user to inspect the packet before it is freed. Finally, + * it allows the user to avoid tracing freemsg()/freemsgchain() thus + * keeping the hot path running as efficiently as possible. + * + * NOTE: At this time not all MAC drops are aggregated on these + * functions; but that is the plan. This comment should be erased once + * completed. + */ + +/*PRINTFLIKE2*/ +void +mac_drop_pkt(mblk_t *mp, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + ASSERT3P(mp->b_next, ==, NULL); + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); +} + +/*PRINTFLIKE2*/ +void +mac_drop_chain(mblk_t *chain, const char *fmt, ...) 
+{ + va_list adx; + char msg[128]; + char *msgp = msg; + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + /* + * We could use freemsgchain() for the actual freeing but + * since we are already walking the chain to fire the dtrace + * probe we might as well free the msg here too. + */ + for (mblk_t *mp = chain, *next; mp != NULL; ) { + next = mp->b_next; + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); + mp = next; + } +} /* * Copy an mblk, preserving its hardware checksum flags. @@ -55,15 +124,12 @@ static mblk_t * mac_copymsg_cksum(mblk_t *mp) { mblk_t *mp1; - uint32_t start, stuff, end, value, flags; mp1 = copymsg(mp); if (mp1 == NULL) return (NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, - flags, KM_NOSLEEP); + mac_hcksum_clone(mp, mp1); return (mp1); } @@ -91,224 +157,1135 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback traffic - * between MAC clients. - * The function handles a NULL mblk chain passed as argument. + * Perform software checksum on a single message, if needed. The + * emulation performed is determined by an intersection of the mblk's + * flags and the emul flags requested. The emul flags are documented + * in mac.h. */ -mblk_t * -mac_fix_cksum(mblk_t *mp_chain) +static mblk_t * +mac_sw_cksum(mblk_t *mp, mac_emul_t emul) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; + mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; + uint16_t len; + uint32_t offset; + uint16_t etype; + struct ether_header *ehp; + ipha_t *ipha; + uint8_t proto; + const char *err = ""; - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + /* + * This function should only be called from mac_hw_emul() + * which handles mblk chains and the shared ref case. + */ + ASSERT3P(mp->b_next, ==, NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, - &flags); - if (flags == 0) - continue; + mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because + * we don't want to mask-out the HW_LOCAL_MAC flag. + */ + flags = DB_CKSUMFLAGS(mp); + + /* Why call this if checksum emulation isn't needed? */ + ASSERT3U(flags & (HCK_FLAGS), !=, 0); + + /* + * Ethernet, and optionally VLAN header. mac_hw_emul() has + * already verified we have enough data to read the L2 header. + */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); + } else { + etype = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); + } + + /* + * If this packet isn't IPv4, then leave it alone. We still + * need to add IPv6 support and we don't want to affect non-IP + * traffic like ARP. + */ + if (etype != ETHERTYPE_IP) + return (mp); + + ASSERT3U(MBLKL(mp), >=, offset); + + /* + * If the first mblk of this packet contains only the ethernet + * header, skip past it for now. Packets with their data + * contained in only a single mblk can then use the fastpaths + * tuned to that possibility. 
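+ *
+ * Purely as an illustration (a hypothetical layout, not code from
+ * this function), the mblk chain before and after skipping a
+ * header-only first mblk might look like:
+ *
+ *   before:  mp -> [ether hdr only] --b_cont--> [IP|TCP|payload ...]
+ *   after:   skipped_hdr = mp;  mp = mp->b_cont;  offset = 0;
+ *
+ * The checksum work then operates on the data-bearing mblk, and the
+ * saved header is reattached (with its checksum flags cloned) before
+ * returning.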
+ */ + if (MBLKL(mp) == offset) { + offset -= MBLKL(mp); + /* This is guaranteed by mac_hw_emul(). */ + ASSERT3P(mp->b_cont, !=, NULL); + skipped_hdr = mp; + mp = mp->b_cont; + } + + /* + * Both full and partial checksum rely on finding the IP + * header in the current mblk. Our native TCP stack honors + * this assumption but it's prudent to guard our future + * clients that might not honor this contract. + */ + ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t)); + if (MBLKL(mp) < (offset + sizeof (ipha_t))) { + err = "mblk doesn't contain IP header"; + goto bail; + } + + /* + * We are about to modify the header mblk; make sure we are + * modifying our own copy. The code that follows assumes that + * the IP/ULP headers exist in this mblk (and drops the + * message if they don't). + */ + if (DB_REF(mp) > 1) { + mblk_t *tmp = copyb(mp); + + if (tmp == NULL) { + err = "copyb failed"; + goto bail; + } + + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + skipped_hdr->b_cont = tmp; + } + + tmp->b_cont = mp->b_cont; + freeb(mp); + mp = tmp; + } + + ipha = (ipha_t *)(mp->b_rptr + offset); + + /* + * This code assumes a "simple" IP header (20 bytes, no + * options). IPv4 options are mostly a historic artifact. The + * one slight exception is Router Alert, but we don't expect + * such a packet to land here. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); + if (ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION) { + err = "not simple IP header"; + goto bail; + } + + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) { + err = "mblk doesn't contain TCP header"; + goto bail; + } + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (udpha_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (udpha_t))) { + err = "mblk doesn't contain UDP header"; + goto bail; + } + break; + + default: + err = "unexpected protocol"; + goto bail; + } + + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + + /* Get a pointer to the ULP checksum. */ + switch (proto) { + case IPPROTO_TCP: + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + + case IPPROTO_UDP: + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + } + + /* Pseudo-header checksum. */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet + * needs to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. 
+ * Out of paranoia, and for the sake of correctness, + * we won't calulate the IP header checksum if it's + * already populated. While unlikely, it's possible to + * write code that might end up calling mac_sw_cksum() + * twice on the same mblk (performing both LSO and + * checksum emualtion in a single mblk chain loop -- + * the LSO emulation inserts a new chain into the + * existing chain and then the loop iterates back over + * the new segments and emulates the checksum a second + * time). Normally this wouldn't be a problem, because + * the HCK_*_OK flags are supposed to indicate that we + * don't need to do peform the work. But + * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the + * same value; so we cannot use these flags to + * determine if the IP header checksum has already + * been calculated or not. Luckily, if IP requests + * HCK_IPV4_HDRCKSUM, then the IP header checksum will + * be zero. So this test works just as well as + * checking the flag. However, in the future, we + * should fix the HCK_* flags. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; + if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) && + ipha->ipha_hdr_checksum == 0) { + ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; } + } + + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + ASSERT3S(end, >, start); + cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial); + *up = cksum != 0 ? cksum : ~cksum; + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + + mac_hcksum_set(mp, start, stuff, end, value, flags); + + /* Don't forget to reattach the header. */ + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * Ethernet, and optionally VLAN header. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that + * both the first mblk _and_ the first data bearing + * mblk possess the HCKSUM information. Consumers like + * IP will end up discarding the ether_header mblk, so + * for now, it is important that the data be available + * in both places. */ - /* LINTED: improper alignment cast */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; + } - ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); - /* LINTED: improper alignment cast */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - sap = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); + return (mp); + +bail: + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + mp = skipped_hdr; + } + + mac_drop_pkt(mp, err); + return (NULL); +} + +/* + * Build a single data segment from an LSO packet. The mblk chain + * returned, seg_head, represents the data segment and is always + * exactly seg_len bytes long. 
The lso_mp and offset input/output + * parameters track our position in the LSO packet. This function + * exists solely as a helper to mac_sw_lso(). + * + * Case A + * + * The current lso_mp is larger than the requested seg_len. The + * beginning of seg_head may start at the beginning of lso_mp or + * offset into it. In either case, a single mblk is returned, and + * *offset is updated to reflect our new position in the current + * lso_mp. + * + * +----------------------------+ + * | in *lso_mp / out *lso_mp | + * +----------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = 0 out *offset = seg_len + * + * |------ seg_len ----| + * + * + * +------------------------------+ + * | in *lso_mp / out *lso_mp | + * +------------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = N out *offset = N + seg_len + * + * |------ seg_len ----| + * + * + * + * Case B + * + * The requested seg_len consumes exactly the rest of the lso_mp. + * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. + * The seg_head may start at the beginning of the lso_mp or at some + * offset into it. In either case we return a single mblk, reset + * *offset to zero, and walk to the next lso_mp. + * + * +------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = 0 + * + * |------ seg_len ----| + * + * + * + * +----------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +----------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = N + * + * |------ seg_len ----| + * + * + * Case C + * + * The requested seg_len is greater than the current lso_mp. In + * this case we must consume LSO mblks until we have enough data to + * satisfy either case (A) or (B) above. We will return multiple + * mblks linked via b_cont, offset will be set based on the cases + * above, and lso_mp will walk forward at least one mblk, but maybe + * more. + * + * N.B. This digram is not exhaustive. The seg_head may start on + * the beginning of an lso_mp. The seg_tail may end exactly on the + * boundary of an lso_mp. And there may be two (in this case the + * middle block wouldn't exist), three, or more mblks in the + * seg_head chain. This is meant as one example of what might + * happen. The main thing to remember is that the seg_tail mblk + * must be one of case (A) or (B) above. 
+ * + * +------------------+ +----------------+ +------------------+ + * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | + * +------------------+ +----------------+ +------------------+ + * ^ ^ ^ ^ ^ ^ + * | | | | | | + * | | | | | | + * | | | | | | + * | | | | | | + * +------------+ +----------------+ +------------+ + * | seg_head |--->| |--->| seg_tail | + * +------------+ +----------------+ +------------+ + * ^ ^ + * | | + * in *offset = N out *offset = MBLKL(seg_tail) + * + * |------------------- seg_len -------------------| + * + */ +static mblk_t * +build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) +{ + mblk_t *seg_head, *seg_tail, *seg_mp; + + ASSERT3P(*lso_mp, !=, NULL); + ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); + + seg_mp = dupb(*lso_mp); + if (seg_mp == NULL) + return (NULL); + + seg_head = seg_mp; + seg_tail = seg_mp; + + /* Continue where we left off from in the lso_mp. */ + seg_mp->b_rptr += *offset; + +last_mblk: + /* Case (A) */ + if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { + *offset += seg_len; + seg_mp->b_wptr = seg_mp->b_rptr + seg_len; + return (seg_head); + } + + /* Case (B) */ + if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + return (seg_head); + } + + /* Case (C) */ + ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); + + /* + * The current LSO mblk doesn't have enough data to satisfy + * seg_len -- continue peeling off LSO mblks to build the new + * segment message. If allocation fails we free the previously + * allocated segment mblks and return NULL. + */ + while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { + ASSERT3U(MBLKL(seg_mp), <=, seg_len); + seg_len -= MBLKL(seg_mp); + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + seg_mp = dupb(*lso_mp); + + if (seg_mp == NULL) { + freemsgchain(seg_head); + return (NULL); + } + + seg_tail->b_cont = seg_mp; + seg_tail = seg_mp; + } + + /* + * We've walked enough LSO mblks that we can now satisfy the + * remaining seg_len. At this point we need to jump back to + * determine if we have arrived at case (A) or (B). + */ + + /* Just to be paranoid that we didn't underflow. */ + ASSERT3U(seg_len, <, IP_MAXPACKET); + ASSERT3U(seg_len, >, 0); + goto last_mblk; +} + +/* + * Perform software segmentation of a single LSO message. Take an LSO + * message as input and return head/tail pointers as output. This + * function should not be invoked directly but instead through + * mac_hw_emul(). + * + * The resulting chain is comprised of multiple (nsegs) MSS sized + * segments. Each segment will consist of two or more mblks joined by + * b_cont: a header and one or more data mblks. The header mblk is + * allocated anew for each message. The first segment's header is used + * as a template for the rest with adjustments made for things such as + * ID, sequence, length, TCP flags, etc. The data mblks reference into + * the existing LSO mblk (passed in as omp) by way of dupb(). Their + * b_rptr/b_wptr values are adjusted to reference only the fraction of + * the LSO message they are responsible for. At the successful + * completion of this function the original mblk (omp) is freed, + * leaving the newely created segment chain as the only remaining + * reference to the data. 
+ */ +static void +mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, + uint_t *count) +{ + uint32_t ocsum_flags, ocsum_start, ocsum_stuff; + uint32_t mss; + uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; + uint32_t oleft; + uint_t nsegs, seg; + int len; + + struct ether_vlan_header *oevh; + const ipha_t *oiph; + const tcph_t *otcph; + ipha_t *niph; + tcph_t *ntcph; + uint16_t ip_id; + uint32_t tcp_seq, tcp_sum, otcp_sum; + + uint32_t offset; + mblk_t *odatamp; + mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; + mblk_t *tmptail; + + ASSERT3P(head, !=, NULL); + ASSERT3P(tail, !=, NULL); + ASSERT3P(count, !=, NULL); + ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); + + /* Assume we are dealing with a single LSO message. */ + ASSERT3P(omp->b_next, ==, NULL); + + /* + * XXX: This is a hack to deal with mac_add_vlan_tag(). + * + * When VLANs are in play, mac_add_vlan_tag() creates a new + * mblk with just the ether_vlan_header and tacks it onto the + * front of 'omp'. This breaks the assumptions made below; + * namely that the TCP/IP headers are in the first mblk. In + * this case, since we already have to pay the cost of LSO + * emulation, we simply pull up everything. While this might + * seem irksome, keep in mind this will only apply in a couple + * of scenarios: a) an LSO-capable VLAN client sending to a + * non-LSO-capable client over the "MAC/bridge loopback" + * datapath or b) an LSO-capable VLAN client is sending to a + * client that, for whatever reason, doesn't have DLS-bypass + * enabled. Finally, we have to check for both a tagged and + * untagged sized mblk depending on if the mblk came via + * mac_promisc_dispatch() or mac_rx_deliver(). + * + * In the future, two things should be done: + * + * 1. This function should make use of some yet to be + * implemented "mblk helpers". These helper functions would + * perform all the b_cont walking for us and guarantee safe + * access to the mblk data. + * + * 2. We should add some slop to the mblks so that + * mac_add_vlan_tag() can just edit the first mblk instead + * of allocating on the hot path. + */ + if (MBLKL(omp) == sizeof (struct ether_vlan_header) || + MBLKL(omp) == sizeof (struct ether_header)) { + mblk_t *tmp = msgpullup(omp, -1); + + if (tmp == NULL) { + mac_drop_pkt(omp, "failed to pull up"); + goto fail; + } + + mac_hcksum_clone(omp, tmp); + freemsg(omp); + omp = tmp; + } + + mss = DB_LSOMSS(omp); + ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + + sizeof (struct ether_vlan_header)); + opktlen = msgsize(omp); + + /* + * First, get references to the IP and TCP headers and + * determine the total TCP length (header + data). + * + * Thanks to mac_hw_emul() we know that the first mblk must + * contain (at minimum) the full L2 header. However, this + * function assumes more than that. It assumes the L2/L3/L4 + * headers are all contained in the first mblk of a message + * (i.e., no b_cont walking for headers). While this is a + * current reality (our native TCP stack and viona both + * enforce this) things may become more nuanced in the future + * (e.g. when introducing encap support or adding new + * clients). For now we guard against this case by dropping + * the packet. 
+ */ + oevh = (struct ether_vlan_header *)omp->b_rptr; + if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) + oehlen = sizeof (struct ether_vlan_header); + else + oehlen = sizeof (struct ether_header); + + ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { + mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); + goto fail; + } + + oiph = (ipha_t *)(omp->b_rptr + oehlen); + oiphlen = IPH_HDR_LENGTH(oiph); + otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); + otcphlen = TCP_HDR_LENGTH(otcph); + + /* + * Currently we only support LSO for TCP/IPv4. + */ + if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { + mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", + IPH_HDR_VERSION(oiph)); + goto fail; + } + + if (oiph->ipha_protocol != IPPROTO_TCP) { + mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", + oiph->ipha_protocol); + goto fail; + } + + if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { + mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); + goto fail; + } + + ohdrslen = oehlen + oiphlen + otcphlen; + if ((len = MBLKL(omp)) < ohdrslen) { + mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, + ohdrslen); + goto fail; + } + + /* + * Either we have data in the first mblk or it's just the + * header. In either case, we need to set rptr to the start of + * the TCP data. + */ + if (len > ohdrslen) { + odatamp = omp; + offset = ohdrslen; + } else { + ASSERT3U(len, ==, ohdrslen); + odatamp = omp->b_cont; + offset = 0; + } + + /* Make sure we still have enough data. */ + ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); + + /* + * If a MAC negotiated LSO then it must negotioate both + * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or + * HCKSUM_INET_PARTIAL; because both the IP and TCP headers + * change during LSO segmentation (only the 3 fields of the + * pseudo header checksum don't change: src, dst, proto). Thus + * we would expect these flags (HCK_IPV4_HDRCKSUM | + * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this + * function to emulate those checksums in software. However, + * that assumes a world where we only expose LSO if the + * underlying hardware exposes LSO. Moving forward the plan is + * to assume LSO in the upper layers and have MAC perform + * software LSO when the underlying provider doesn't support + * it. In such a world, if the provider doesn't support LSO + * but does support hardware checksum offload, then we could + * simply perform the segmentation and allow the hardware to + * calculate the checksums. To the hardware it's just another + * chain of non-LSO packets. + */ + ASSERT3S(DB_TYPE(omp), ==, M_DATA); + ocsum_flags = DB_CKSUMFLAGS(omp); + ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); + ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); + + /* + * If hardware only provides partial checksum then software + * must supply the pseudo-header checksum. In the case of LSO + * we leave the TCP length at zero to be filled in by + * hardware. This function must handle two scenarios. + * + * 1. Being called by a MAC client on the Rx path to segment + * an LSO packet and calculate the checksum. + * + * 2. Being called by a MAC provider to segment an LSO packet. + * In this case the LSO segmentation is performed in + * software (by this routine) but the MAC provider should + * still calculate the TCP/IP checksums in hardware. 
+ * + * To elaborate on the second case: we cannot have the + * scenario where IP sends LSO packets but the underlying HW + * doesn't support checksum offload -- because in that case + * TCP/IP would calculate the checksum in software (for the + * LSO packet) but then MAC would segment the packet and have + * to redo all the checksum work. So IP should never do LSO + * if HW doesn't support both IP and TCP checksum. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + ocsum_start = (uint32_t)DB_CKSUMSTART(omp); + ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); + } + + odatalen = opktlen - ohdrslen; + + /* + * Subtract one to account for the case where the data length + * is evenly divisble by the MSS. Add one to account for the + * fact that the division will always result in one less + * segment than needed. + */ + nsegs = ((odatalen - 1) / mss) + 1; + if (nsegs < 2) { + mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); + goto fail; + } + + DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, + __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, + nsegs); + + seg_chain = NULL; + tmptail = seg_chain; + oleft = odatalen; + + for (uint_t i = 0; i < nsegs; i++) { + boolean_t last_seg = ((i + 1) == nsegs); + uint32_t seg_len; + + /* + * If we fail to allocate, then drop the partially + * allocated chain as well as the LSO packet. Let the + * sender deal with the fallout. + */ + if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "failed to alloc segment header"); + goto fail; + } + ASSERT3P(nhdrmp->b_cont, ==, NULL); + + if (seg_chain == NULL) { + seg_chain = nhdrmp; } else { - sap = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); + ASSERT3P(tmptail, !=, NULL); + tmptail->b_next = nhdrmp; } - if (MBLKL(mp) <= offset) { - offset -= MBLKL(mp); - if (mp->b_cont == NULL) { - /* corrupted packet, skip it */ - if (prev != NULL) - prev->b_next = mp->b_next; - else - new_chain = mp->b_next; - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - continue; - } - mp = mp->b_cont; + tmptail = nhdrmp; + + /* + * Calculate this segment's lengh. It's either the MSS + * or whatever remains for the last segment. + */ + seg_len = last_seg ? oleft : mss; + ASSERT3U(seg_len, <=, mss); + ndatamp = build_data_seg(&odatamp, &offset, seg_len); + + if (ndatamp == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "LSO failed to segment data"); + goto fail; } - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = NULL; + /* Attach data mblk to header mblk. */ + nhdrmp->b_cont = ndatamp; + DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; + ASSERT3U(seg_len, <=, oleft); + oleft -= seg_len; + } - /* - * In order to compute the full and header - * checksums, we need to find and parse - * the IP and/or ULP headers. - */ + /* We should have consumed entire LSO msg. */ + ASSERT3S(oleft, ==, 0); + ASSERT3P(odatamp, ==, NULL); + + /* + * All seg data mblks are referenced by the header mblks, null + * out this pointer to catch any bad derefs. + */ + ndatamp = NULL; + + /* + * Set headers and checksum for first segment. 
+ */ + nhdrmp = seg_chain; + bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ip_id = ntohs(niph->ipha_ident); + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + tcp_seq = BE32_TO_U32(ntcph->th_seq); + tcp_seq += mss; + + /* + * The first segment shouldn't: + * + * o indicate end of data transmission (FIN), + * o indicate immediate handling of the data (PUSH). + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + /* + * If the underlying HW provides partial checksum, then make + * sure to correct the pseudo header checksum before calling + * mac_sw_cksum(). The native TCP stack doesn't include the + * length field in the pseudo header when LSO is in play -- so + * we need to calculate it here. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = BE16_TO_U16(ntcph->th_sum); + otcp_sum = tcp_sum; + tcp_sum += mss + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + /* + * We may have freed the nhdrmp argument during + * checksum emulation, make sure that seg_chain + * references a valid mblk. + */ + seg_chain = nhdrmp; + } + ASSERT3P(nhdrmp, !=, NULL); + + seg = 1; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, + uint_t, seg); + seg++; + + /* There better be at least 2 segs. */ + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + + /* + * Now adjust the headers of the middle segments. For each + * header we need to adjust the following. + * + * o IP ID + * o IP length + * o TCP sequence + * o TCP flags + * o cksum flags + * o cksum values (if MAC_HWCKSUM_EMUL is set) + */ + for (; seg < nsegs; seg++) { + /* + * We use seg_chain as a reference to the first seg + * header mblk -- this first header is a template for + * the rest of the segments. This copy will include + * the now updated checksum values from the first + * header. We must reset these checksum values to + * their original to make sure we produce the correct + * value. + */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + tcp_seq += mss; + /* + * Just like the first segment, the middle segments + * shouldn't have these flags set. + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * IP header. 
+ * First and middle segs have same + * pseudo-header checksum. */ - if (sap != ETHERTYPE_IP) - continue; + U16_TO_BE16(tcp_sum, ntcph->th_sum); + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + } - ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); - /* LINTED: improper alignment cast */ - ipha = (ipha_t *)(mp->b_rptr + offset); - - if (flags & HCK_FULLCKSUM) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * Pointer to checksum field in ULP header. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - switch (proto) { - case IPPROTO_TCP: - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - default: - cmn_err(CE_WARN, "mac_fix_cksum: " - "unexpected protocol: %d", proto); - continue; - } - - /* - * Pseudo-header checksum. - */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet needs - * to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - /* - * Flag the packet so that it appears - * that the checksum has already been - * verified by the hardware. - */ - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + /* We may have freed the original nhdrmp. */ + prev_nhdrmp->b_next = nhdrmp; + } - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), + uint_t, mss, uint_t, seg); - } + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + } + + /* Make sure we are on the last segment. */ + ASSERT3U(seg, ==, nsegs); + ASSERT3P(nhdrmp->b_next, ==, NULL); + + /* + * Now we set the last segment header. The difference being + * that FIN/PSH/RST flags are allowed. 
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + len = msgsize(nhdrmp->b_cont); + ASSERT3S(len, >, 0); + niph->ipha_length = htons(oiphlen + otcphlen + len); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = otcp_sum; + tcp_sum += len + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + /* This should be the last mblk. */ + ASSERT3P(nhdrmp->b_next, ==, NULL); + nhdrmp = mac_sw_cksum(nhdrmp, emul); + prev_nhdrmp->b_next = nhdrmp; + } + + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, + uint_t, seg); + + /* + * Free the reference to the original LSO message as it is + * being replaced by seg_cahin. + */ + freemsg(omp); + *head = seg_chain; + *tail = nhdrmp; + *count = nsegs; + return; + +fail: + *head = NULL; + *tail = NULL; + *count = 0; +} + +#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) + +/* + * Emulate various hardware offload features in software. Take a chain + * of packets as input and emulate the hardware features specified in + * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' + * pointer given as input, and its tail pointer is written to + * '*otail'. The number of packets in the new chain is written to + * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus + * may be NULL. The 'mp_chain' argument may point to a NULL chain; in + * which case 'mp_chain' will simply stay a NULL chain. + * + * While unlikely, it is technically possible that this function could + * receive a non-NULL chain as input and return a NULL chain as output + * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be + * zero). This could happen if all the packets in the chain are + * dropped or if we fail to allocate new mblks. In this case, there is + * nothing for the caller to free. In any event, the caller shouldn't + * assume that '*mp_chain' is non-NULL on return. + * + * This function was written with two main use cases in mind. + * + * 1. A way for MAC clients to emulate hardware offloads when they + * can't directly handle LSO packets or packets without fully + * calculated checksums. + * + * 2. A way for MAC providers (drivers) to offer LSO even when the + * underlying HW can't or won't supply LSO offload. + * + * At the time of this writing no provider is making use of this + * function. However, the plan for the future is to always assume LSO + * is available and then add SW LSO emulation to all providers that + * don't support it in HW. 
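+ *
+ * As a rough sketch only (a hypothetical caller, not code from this
+ * file), a client that cannot cope with LSO packets or partially
+ * checksummed packets might do:
+ *
+ *	mblk_t *tail;
+ *	uint_t cnt;
+ *
+ *	mac_hw_emul(&chain, &tail, &cnt,
+ *	    MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+ *	if (chain == NULL)
+ *		return;		-- whole chain dropped or alloc failed
+ *	deliver(chain);		-- hand the fixed-up chain onward
+ *
+ * where deliver() stands in for whatever receive routine the client
+ * actually uses.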
+ */ +void +mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) +{ + mblk_t *head = NULL, *tail = NULL; + uint_t count = 0; + + ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); + ASSERT3P(mp_chain, !=, NULL); + + for (mblk_t *mp = *mp_chain; mp != NULL; ) { + mblk_t *tmp, *next, *tmphead, *tmptail; + struct ether_header *ehp; + uint32_t flags; + uint_t len = MBLKL(mp), l2len; + + /* Perform LSO/cksum one message at a time. */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * For our sanity the first mblk should contain at + * least the full L2 header. + */ + if (len < sizeof (struct ether_header)) { + mac_drop_pkt(mp, "packet too short (A): %u", len); + mp = next; + continue; } - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) + l2len = sizeof (struct ether_vlan_header); + else + l2len = sizeof (struct ether_header); - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; + /* + * If the first mblk is solely the L2 header, then + * there better be more data. + */ + if (len < l2len || (len == l2len && mp->b_cont == NULL)) { + mac_drop_pkt(mp, "packet too short (C): %u", len); + mp = next; + continue; + } - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) + * because we don't want to mask-out the LSO flag. + */ + flags = DB_CKSUMFLAGS(mp); + + if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { + uint_t tmpcount = 0; /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. + * LSO fix-up handles checksum emulation + * inline (if requested). It also frees mp. */ - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; + mac_sw_lso(mp, emul, &tmphead, &tmptail, + &tmpcount); + if (tmphead == NULL) { + /* mac_sw_lso() freed the mp. */ + mp = next; + continue; + } + count += tmpcount; + } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { + tmp = mac_sw_cksum(mp, emul); + if (tmp == NULL) { + /* mac_sw_cksum() freed the mp. */ + mp = next; + continue; + } + tmphead = tmp; + tmptail = tmp; + count++; + } else { + /* There is nothing to emulate. */ + tmp = mp; + tmphead = tmp; + tmptail = tmp; + count++; + } + + /* + * The tmp mblk chain is either the start of the new + * chain or added to the tail of the new chain. + */ + if (head == NULL) { + head = tmphead; + tail = tmptail; + } else { + /* Attach the new mblk to the end of the new chain. 
*/ + tail->b_next = tmphead; + tail = tmptail; } - (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, - value, flags, KM_NOSLEEP); + mp = next; } - return (new_chain); + *mp_chain = head; + + if (otail != NULL) + *otail = tail; + + if (ocount != NULL) + *ocount = count; } /* @@ -320,7 +1297,6 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) mblk_t *hmp; struct ether_vlan_header *evhp; struct ether_header *ehp; - uint32_t start, stuff, end, value, flags; ASSERT(pri != 0 || vid != 0); @@ -350,9 +1326,7 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) * Free the original message if it's now empty. Link the * rest of messages to the header message. */ - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, - KM_NOSLEEP); + mac_hcksum_clone(mp, hmp); if (MBLKL(mp) == 0) { hmp->b_cont = mp->b_cont; freeb(mp); @@ -456,16 +1430,9 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp, boolean_t loopback) { - mblk_t *mp1 = mp; - - while (mp1 != NULL) { - mp1->b_prev = NULL; - mp1->b_queue = NULL; - mp1 = mp1->b_next; - } freemsgchain(mp); } diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index 950fab1272..fcea4a8f03 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -225,10 +225,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred) case M_NULL: case M_ZERO: case M_FULL: + /* standard devices */ + break; + case M_MEM: case M_KMEM: case M_ALLKMEM: - /* standard devices */ + /* + * These devices should never be visible in a zone, but if they + * somehow do get created we refuse to allow the zone to use + * them. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); break; default: diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". # mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, 
/* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) + * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. 
+ * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); + if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs to be mutex'ed - an SMP can call us while we're still + * running! 
+ */ + mutex_enter(&pdev->low_mutex); + ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled ); + mutex_exit(&pdev->low_mutex); + + if( !ne && handled ) + return DDI_INTR_CLAIMED; + if (ne) + nfp_log( NFP_DBG1, "nfp_isr: failed"); + else + nfp_log( NFP_DBG3, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + +static u_int nfp_soft_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + int rd, wr; + + nfp_log( NFP_DBG3, "nfp_soft_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + rd= wr= 0; + + mutex_enter(&pdev->high_mutex); + if(pdev->high_read) { + pdev->high_read= 0; + mutex_exit(&pdev->high_mutex); + rd= 1; + } + if(pdev->high_write) { + pdev->high_write= 0; + wr= 1; + } + mutex_exit(&pdev->high_mutex); + + if(rd) { + nfp_log( NFP_DBG3, "nfp_soft_isr: read done"); + nfp_read_complete_final(pdev, pdev->rd_ok); + } + if(wr) { + nfp_log( NFP_DBG3, "nfp_soft_isr: write done"); + nfp_write_complete_final(pdev, pdev->wr_ok); + } + if( rd || wr ) + return DDI_INTR_CLAIMED; + + nfp_log( NFP_DBG2, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + + +/*-------------------------*/ +/* nfp_read */ +/*-------------------------*/ + +void nfp_read_complete(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete: entering"); + + if(pdev->high_intr) { + nfp_log(NFP_DBG2, "nfp_read_complete: high_intr"); + mutex_enter(&pdev->high_mutex); + nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered"); + if(pdev->high_read) + nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!"); + pdev->high_read= 1; + pdev->rd_ok= ok; + nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex"); + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_read_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_read_complete: exiting"); +} + +static void nfp_read_complete_final(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete_final: entering"); + if(pdev->rdtimeout) + (void) untimeout(pdev->rdtimeout); + if(!pdev->rd_outstanding) { + nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding"); + } + nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok); + mutex_enter(&pdev->isr_mutex); + pdev->rd_outstanding= 0; + pdev->rd_ready= 1; + pdev->rd_ok= ok; + cv_broadcast(&pdev->rd_cv); + mutex_exit(&pdev->isr_mutex); + pollwakeup (&pdev->pollhead, POLLRDNORM); + nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting"); +} + +static void nfp_rdtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." ); + return; + } + pdev->rdtimeout= 0; + nfp_read_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_read: entered" ); + if (ddi_get_soft_state(state_head, getminor(dev)) != NULL) { + nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev"); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_read: about to physio." 
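+/*
+ * Read path summary: a read(2) is funnelled through physio(), which
+ * locks down the user buffer and hands a struct buf to nfp_strategy().
+ * Data only becomes available after the caller has armed a transfer
+ * with NFDEV_IOCTL_ENSUREREADING and the card has signalled completion,
+ * at which point nfp_read_complete_final() sets rd_ready and wakes any
+ * poll()ers with POLLRDNORM.
+ */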
); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_write */ +/*-------------------------*/ + +void nfp_write_complete( nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_write_complete: entering"); + + if(pdev->high_intr) { + mutex_enter(&pdev->high_mutex); + if(pdev->high_write) + nfp_log(NFP_DBG1, "nfp_write_complete: high_write allread set!"); + pdev->high_write= 1; + pdev->wr_ok= ok; + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_write_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_write_complete: exiting"); +} + +static void nfp_write_complete_final( nfp_dev *pdev, int ok) { + struct buf *local_wr_bp; + nfp_log( NFP_DBG2,"nfp_write_complete_final: entering"); + if(pdev->wrtimeout) + (void) untimeout(pdev->wrtimeout); + + if (!pdev->wr_bp) { + nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." ); + return; + } + + bp_mapout(pdev->wr_bp); + pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount; + /* Make sure we set wr_ready before calling biodone to avoid a race */ + pdev->wr_ready = 1; + bioerror(pdev->wr_bp, ok ? 0 : ENXIO); + local_wr_bp = pdev->wr_bp; + pdev->wr_bp = 0; + biodone(local_wr_bp); + nfp_log( NFP_DBG2, "nfp_write_complete_final: isr_mutex extited"); + pollwakeup (&pdev->pollhead, POLLWRNORM); + + nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving"); +} + +static void nfp_wrtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." ); + return; + } + pdev->wrtimeout= 0; + nfp_write_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_write: entered." ); + if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) { + nfp_log( NFP_DBG1, "nfp_chread: unable to get nfp_dev."); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_write: about to physio." ); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_strategy */ +/*-------------------------*/ + +#define NFP_STRAT_ERR(thebp,err,txt) \ + nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \ + (thebp)->b_resid = (thebp)->b_bcount; \ + bioerror ((thebp), err); \ + biodone ((thebp)); + +static int nfp_strategy(struct buf *bp) { + register struct nfp_dev *pdev; + nfp_err ne; + + nfp_log( NFP_DBG2, "nfp_strategy: entered." 
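+/*
+ * nfp_strategy() covers both directions:
+ * reads:  with ifvers >= NFDEV_IF_PCI_PUSH the card has already pushed
+ *         the reply into read_buf, so the DMA handle is synced, the
+ *         byte count is taken from the pushed header (offset 4) and the
+ *         payload (offset 8) is copied into the caller's buf; older
+ *         interface versions fall back to cmddev->read_block().
+ * writes: the payload is handed to cmddev->write_block() and wrtimeout
+ *         is armed; on success biodone() is deferred until the card
+ *         acknowledges in nfp_write_complete_final().
+ */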
); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin (bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) { + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_ioctl: entered." 
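+/*
+ * ioctl summary:
+ *   NFDEV_IOCTL_ENQUIRY        copy out bus/slot numbers decoded from
+ *                              the "reg" property
+ *   NFDEV_IOCTL_ENSUREREADING  arm a device-to-host transfer of up to
+ *                              8192 bytes and start the read timeout
+ *   NFDEV_IOCTL_PCI_IFVERS     switch interface version; refused while
+ *                              a read is outstanding
+ *   NFDEV_IOCTL_STATS          copy out the nfdev_stats_str counters
+ *
+ * A typical consumer loops roughly as follows (sketch only; the length
+ * handling and error checking are illustrative):
+ *
+ *   ioctl(fd, NFDEV_IOCTL_ENSUREREADING, &len);
+ *   poll(&pfd, 1, -1);          wait for POLLRDNORM
+ *   read(fd, buf, len);
+ */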
); + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev."); + return (ENXIO); + } + + switch (cmd) { + case NFDEV_IOCTL_ENQUIRY: + { + long *outp; + int outlen; + nfdev_enquiry_str enq_data; + + enq_data.busno = (unsigned int)-1; + enq_data.slotno = (unsigned char)-1; + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." ); + if( outlen > 0 ) { + enq_data.busno = ((*outp)>>16) & 0xff; + enq_data.slotno = ((*outp)>>11) & 0x1f; + nfp_log( NFP_DBG2, "busno %d, slotno %d.", + enq_data.busno, enq_data.slotno ); + } + } else + nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." ); + + if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + case NFDEV_IOCTL_ENSUREREADING: + { + unsigned int addr, len; + nfp_err ret; + if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." 
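+/*
+ * Open/close semantics: opens are exclusive (the busy flag is taken
+ * under busy_mutex and further opens get EBUSY), the interface version
+ * is reset to NFDEV_IF_STANDARD on every open, and nfp_close() waits up
+ * to NFP_TIMEOUT_SEC for any outstanding read before calling the
+ * command-level close.
+ */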
); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
); + return (error); +} + +/*-------------------------*/ +/* nfp_release */ +/*-------------------------*/ + +static int nfp_release_dev( dev_info_t *dip ) { + nfp_dev *pdev; + int instance, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_release_dev: entering" ); + + instance = ddi_get_instance(dip); + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if (pdev) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing device" ); + + nfp_free_pci_push(pdev); + + if( pdev->cmddev ) { + nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" ); + ret = pdev->cmddev->destroy(pdev->common.cmdctx); + if (ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed "); + return nfp_oserr( ret ); + } + } + + if(pdev->high_iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" ); + ddi_remove_softintr(pdev->soft_int_id); + ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + mutex_destroy( &pdev->high_mutex ); + } else if(pdev->iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" ); + ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + } + if(pdev->low_iblock_cookie) { + ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie); + mutex_destroy( &pdev->low_mutex); + } + + for(i=0;i<6;i++) { + if( pdev->common.extra[i] ) { + nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i ); + ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]); + } + } + + ddi_remove_minor_node(dip, NULL); + + if (pdev->conf_handle) + pci_config_teardown( &pdev->conf_handle ); + + ddi_soft_state_free(state_head, instance); + } + nfp_log( NFP_DBG2, "nfp_release: finished" ); + + return DDI_SUCCESS; +} + + +/*-------------------------*/ +/* nfp_attach */ +/*-------------------------*/ + +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + int instance; + nfp_dev *pdev = NULL; + int intres; + uint16_t device, vendor, sub_device, sub_vendor; + long *outp; + nfpcmd_dev const *cmddev; + int index, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_attach: entered." ); + + if (cmd != DDI_ATTACH) { + nfp_log( NFP_DBG1, "nfp_attach: bad command." ); + goto bailout; + } + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(state_head, instance) != 0) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." ); + goto bailout; + } + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_attach: cannot find dev."); + return ENODEV; + } + pdev->dip = dip; + + /* map in pci config registers */ + if (pci_config_setup(dip, &pdev->conf_handle)) { + nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." ); + goto bailout; + } + + /* find out what we have got */ + vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID ); + device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID ); + sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID ); + sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID ); + + index= 0; + while( (cmddev = nfp_drvlist[index++]) != NULL ) { + if( cmddev->vendorid == vendor && + cmddev->deviceid == device && + cmddev->sub_vendorid == sub_vendor && + cmddev->sub_deviceid == sub_device ) + break; + } + if( !cmddev ) { + nfp_log( NFP_DBG1, "nfp_attach: unknonw device." 
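+/*
+ * Attach sequence: allocate soft state, map PCI config space, match the
+ * vendor/device/subsystem IDs against nfp_drvlist[], map the BARs the
+ * matched command driver asks for, create the minor node, call
+ * cmddev->create(), and finally install either a high-level interrupt
+ * (hardware ISR plus a soft interrupt for completions) or a normal
+ * low-level interrupt, depending on ddi_intr_hilevel().
+ */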
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." ); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." 
); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." ); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." ); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ + +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, 
DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
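+/*
+ * Doorbell handshake: the card raises NFAST_INT_DEVICE_{READ,WRITE}_{OK,
+ * FAILED} bits in the 21285 doorbell register; the ISR acknowledges by
+ * writing the same bits back, completes the matching host transfer via
+ * nfp_read_complete()/nfp_write_complete(), clears anything unexpected,
+ * and re-reads the doorbell until it is empty (0xffff is treated as
+ * invalid and also ends the loop).
+ */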
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + 
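+/*
+ * Post a read request to the card: write an NFPCI_JOB_CONTROL header and
+ * the requested length to NFPCI_JOBS_RD_CONTROL, read NFPCI_JOBS_RD_LENGTH
+ * back to confirm the length was accepted, then ring the
+ * NFAST_INT_HOST_READ_REQUEST doorbell.  Unlike the 21555 driver, the
+ * gen-1 interface has no PCI push mode, so a non-zero push address is
+ * rejected.
+ */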
nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; + + nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); + + if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return -NFP_EINVAL; + } + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR ); + return NFP_ENXIO; + } + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }; + TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST ); + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + + +const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI", + PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, + PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21285_create, + i21285_destroy, + i21285_open, + i21285_close, + i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate, + i21285_ensure_reading, + 0, /* no debug */ +}; + diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_I21285_H +#define NFP_I21285_H + +#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif +#ifndef PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif + +#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 + +#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 + +#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x80 +#define MEMSIZE 0x100000 + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@ +/* + +i21555.c: nCipher PCI HSM intel 21555 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include 
"nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, ®32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + /* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ + +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: 
null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE16_IO(&tmp16,doorbell); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell ); + } + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + } + nfp_log( NFP_DBG3, "i21555_isr: exiting"); + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21555_write( const char *block, int len, void *ctx) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned short tmp16; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_write: NULL cdev"); + return NFP_ENODEV; + } + + cdev->stats.write_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_write: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return 
NFP_EIO; + } + TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; + + nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.read_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count; + cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += len; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[3]; + unsigned short tmp16; + unsigned int tmp32; + nfp_err ne; + int hdr_len; + + nfp_log( NFP_DBG2, "i21555_ensure_reading: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.ensure_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + if(addr) { + nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr); + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH); + TO_LE32_MEM(&hdr[1], len); + TO_LE32_MEM(&hdr[2], addr); + hdr_len= 12; + } else { + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + hdr_len= 8; + } + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written"); + return NFP_EIO; + } + TO_LE16_IO( &tmp16, 
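+/*
+ * The 21555 secondary doorbell registers are 16 bits wide, whereas the
+ * gen-1 (21285) code writes NFAST_INT_HOST_* as full 32-bit values; the
+ * ">> 16" here and in i21555_write() suggests the host-request bits are
+ * defined in the upper half of that 32-bit word in nfpci.h.
+ */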
NFAST_INT_HOST_READ_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.ensure_fail--; + cdev->stats.ensure++; + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif + +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 + +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF + +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" + +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. 
+ */ +typedef struct nfdev_enquiry_str { + unsigned int busno; /**< Which bus is the PCI device on. */ + unsigned char slotno; /**< Which slot is the PCI device in. */ + unsigned char reserved[3]; /**< for consistant struct alignment */ +} nfdev_enquiry_str; + +/** + * Result of the STATS ioctl. + */ +typedef struct nfdev_stats_str { + unsigned long isr; /**< Count interrupts. */ + unsigned long isr_read; /**< Count read interrupts. */ + unsigned long isr_write; /**< Count write interrupts. */ + unsigned long write_fail; /**< Count write failures. */ + unsigned long write_block; /**< Count blocks written. */ + unsigned long write_byte; /**< Count bytes written. */ + unsigned long read_fail; /**< Count read failures. */ + unsigned long read_block; /**< Count blocks read. */ + unsigned long read_byte; /**< Count bytes read. */ + unsigned long ensure_fail; /**< Count read request failures. */ + unsigned long ensure; /**< Count read requests. */ +} nfdev_stats_str; + +/** + * Input to the CONTROL ioctl. + */ +typedef struct nfdev_control_str { + unsigned control; /**< Control flags. */ +} nfdev_control_str; + +/** Control bit indicating host supports MOI control */ +#define NFDEV_CONTROL_HOST_MOI 0x0001 + +/** Index of control bits indicating desired mode + * + * Desired mode follows the M_ModuleMode enumeration. + */ +#define NFDEV_CONTROL_MODE_SHIFT 1 + +/** Detect a backwards-compatible control value + * + * Returns true if the request control value "makes no difference", i.e. + * and the failure of an attempt to set it is therefore uninteresting. + */ +#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1) + +/** + * Result of the STATUS ioctl. + */ +typedef struct nfdev_status_str { + unsigned status; /**< Status flags. */ + char error[8]; /**< Error string. */ +} nfdev_status_str; + +/** Monitor firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_MONITOR_MOI 0x0001 + +/** Application firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_APPLICATION_MOI 0x0002 + +/** Application firmware running and supports error reporting */ +#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004 + +/** HSM failed + * + * Consult error[] for additional information. + */ +#define NFDEV_STATUS_FAILED 0x0008 + +/** Standard PCI interface. */ +#define NFDEV_IF_STANDARD 0x01 + +/** PCI interface with results pushed from device + * via DMA. + */ +#define NFDEV_IF_PCI_PUSH 0x02 + +/* platform independant base ioctl numbers */ + +/** Enquiry ioctl. + * \return nfdev_enquiry_str describing the attached device. */ +#define NFDEV_IOCTL_NUM_ENQUIRY 0x01 +/** Channel Update ioctl. + * \deprecated */ +#define NFDEV_IOCTL_NUM_CHUPDATE 0x02 +/** Ensure Reading ioctl. + * Signal a read request to the device. + * \param (unsigned int) Length of data to be read. + */ +#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03 +/** Device Count ioctl. + * Not implemented for on all platforms. + * \return (int) the number of attached devices. */ +#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04 +/** Internal Debug ioctl. + * Not implemented in release drivers. */ +#define NFDEV_IOCTL_NUM_DEBUG 0x05 +/** PCI Interface Version ioctl. + * \param (int) Maximum PCI interface version + * supported by the user of the device. */ +#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06 +/** Statistics ioctl. + * \return nfdev_enquiry_str describing the attached device. 
*/ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. + +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" + +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif + +#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t; +#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw +#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl +#define PCI_CONFIG_GET16 pci_config_getw +#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t; +#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16 +#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32 +#define PCI_CONFIG_GET16 pci_config_get16 +#endif + +#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t; +#define EXTRA_CB_FLAGS 0 +#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap) +#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t; +#define EXTRA_CB_FLAGS D_64BIT +#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap) +#endif + +typedef struct nfp_dev { + int rd_ok; + int wr_ok; + + int ifvers; + + /* for PCI push read interface */ + unsigned char *read_buf; + ddi_dma_handle_t read_dma_handle; + ddi_dma_cookie_t read_dma_cookie; + + ddi_acc_handle_t acchandle; + + int rd_dma_ok; + + nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; + + struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; + + struct pollhead pollhead; + dev_info_t *dip; + + ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */ + ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */ + kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr; + ddi_softintr_t soft_int_id; + int high_read; + int high_write; + + ddi_iblock_cookie_t iblock_cookie; /* for mutex */ + kmutex_t isr_mutex; + + kmutex_t busy_mutex; + int busy; + + ddi_acc_handle_t conf_handle; + + nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; + +extern struct nfp_dev *nfp_dev_list[]; + +#endif /* NFP_H */ diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@ +/* + +nfp_cmd.h: nCipher PCI HSM command driver decalrations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFPCMD_H +#define NFPCMD_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* read and write called with userspace buffer */ + +typedef struct nfpcmd_dev { + const char *name; + unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid; + unsigned int bar_sizes[6]; /* includes IO bit */ + unsigned int flags; + nfp_err (*create)(struct nfp_cdev *pdev); + nfp_err (*destroy)(void * ctx); + nfp_err (*open)(void * ctx); + nfp_err (*close)(void * ctx); + nfp_err (*isr)(void *ctx, int *handled); + nfp_err (*write_block)( const char *ublock, int len, void *ctx ); + nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount); + nfp_err (*channel_update)( char *data, int len, void *ctx); + nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx ); + nfp_err (*debug)( int cmd, void *ctx); +} nfpcmd_dev; + +#define NFP_CMD_FLG_NEED_IOBUF 0x1 + +/* list of all supported 
drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 
0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@ +/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H + +#include "nfdev-common.h" + +struct nfp_dev; + +/* common device structure */ + +typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; + + int busno; + int slotno; + + void *cmdctx; + + char *iobuf; + + struct nfp_dev* dev; + + struct nfdev_stats_str stats; + +} nfp_cdev; + +/* callbacks from command drivers -------------------------------------- */ + +void nfp_read_complete( struct nfp_dev *pdev, int ok); +void nfp_write_complete( struct nfp_dev *pdev, int ok); + +#define NFP_READ_MAX (8 * 1024) +#define NFP_READBUF_SIZE (NFP_READ_MAX + 8) +#define NFP_TIMEOUT_SEC 10 + +#define NFP_DRVNAME "nCipher nFast PCI driver" + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* + * nfp_ifervs.c - common pci interface versioning + * + * uses: + * + * int pdev->ifvers + * device interface version + * + * int nfp_ifvers + * interface version limit + * + * int nfp_alloc_pci_push( nfp_dev *pdev ) + * allocates resources needed for PCI Push, + * if not already allocated, and return True if successful + * + * void nfp_free_pci_push( nfp_dev *pdev ) { + * frees any resources allocated to PCI Push + */ + +void nfp_set_ifvers( nfp_dev *pdev, int vers ) { + if( nfp_ifvers != 0 && vers > nfp_ifvers ) { + nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers); + return; + } + if( vers >= NFDEV_IF_PCI_PUSH ) { + if(!nfp_alloc_pci_push(pdev)) { + nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers); + return; + } + } else { + nfp_free_pci_push(pdev); + } + pdev->ifvers= vers; + nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers); +} diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@ +/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_OSIF_H +#define NFP_OSIF_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* general typedefs ----------------------------------------------- */ + +typedef volatile unsigned int reg32; +typedef volatile unsigned short reg16; +typedef volatile unsigned char reg8; + +/* sempaphores, mutexs and events --------------------------------- */ + +#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial); +extern void nfp_sema_destroy( nfp_sema *sema ); +extern void nfp_sema_post( nfp_sema *sema ); +extern void nfp_sema_wait( 
nfp_sema *sema ); +extern int nfp_sema_wait_sig( nfp_sema *sema ); + +extern nfp_err nfp_mutex_init( nfp_mutex *mutex ); +extern void nfp_mutex_destroy( nfp_mutex *mutex ); +extern void nfp_mutex_enter( nfp_mutex *mutex ); +extern void nfp_mutex_exit( nfp_mutex *mutex ); + +extern nfp_err nfp_event_init( nfp_event *event ); +extern void nfp_event_destroy( nfp_event *event ); +extern void nfp_event_set( nfp_event *event ); +extern void nfp_event_clear( nfp_event *event ); +extern void nfp_event_wait( nfp_event *event ); +extern void nfp_event_wait_sig( nfp_event *event ); + +#endif + +/* timeouts ------------------------------------------------------ */ + +extern void nfp_sleep( int ms ); + +/* memory handling ----------------------------------------------- */ + +#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 + +extern void *nfp_kmalloc( int size, int flags ); +extern void *nfp_krealloc( void *ptr, int size, int flags ); +extern void nfp_kfree( void * ); + +/* config space access ------------------------------------------------ */ + +/* return Little Endian 32 bit config register */ +extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); + +/* io space access ------------------------------------------------ */ + +extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ); +extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ); +extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ); +extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); + +/* user and device memory space access ---------------------------- */ + +/* NB these 2 functions are not guarenteed to be re-entrant for a given device */ +extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len); +extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); + +extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len ); +extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); + +extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len ); +extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); + +/* debug ------------------------------------------------------------ */ + +#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 + +#ifdef STRANGE_VARARGS +extern void nfp_log(); +#else +extern void nfp_log( int severity, const char *format, ...); +#endif + +extern int nfp_debug; + +#endif diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +* +* 1998.06.09 IH Started +* +* The interface presented by nFast PCI devices consists of: +* +* A region of shared RAM used for data transfer & control information +* A doorbell interrupt register, so both sides can give each other interrupts +* A number of DMA channels for transferring data +*/ + +#ifndef NFPCI_H +#define NFPCI_H + +/* Sizes of some regions */ +#define NFPCI_RAM_MINSIZE 0x00100000 +/* This is the minimum size of shared RAM. 
In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */ +#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */ +#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ + +/* Offsets within shared memory space. + The following main regions are: + jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ + +#define NFPCI_OFFSET_JOBS 0x00000000 +#define NFPCI_OFFSET_JOBS_WR 0x00000000 +#define NFPCI_OFFSET_JOBS_RD 0x00010000 +#define NFPCI_OFFSET_KERN 0x00020000 +#define NFPCI_OFFSET_KERN_WR 0x00020000 +#define NFPCI_OFFSET_KERN_RD 0x00030000 + +/* Interrupts, defined by bit position in doorbell register */ + +/* Interrupts from device to host */ +#define NFAST_INT_DEVICE_WRITE_OK 0x00000001 +#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002 +#define NFAST_INT_DEVICE_READ_OK 0x00000004 +#define NFAST_INT_DEVICE_READ_FAILED 0x00000008 +#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010 +#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020 +#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040 +#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 + +/* Interrupts from host to device */ +#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000 +#define NFAST_INT_HOST_READ_REQUEST 0x00020000 +#define NFAST_INT_HOST_DEBUG 0x00040000 +#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000 +#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 + +/* Ordinary job submission ------------------------ */ + +/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined + by the following (byte) address offsets... */ + +#define NFPCI_OFFSET_CONTROL 0x0 +#define NFPCI_OFFSET_LENGTH 0x4 +#define NFPCI_OFFSET_DATA 0x8 +#define NFPCI_OFFSET_PUSH_ADDR 0x8 + +#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) + +#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) + +/* Kernel inferface job submission ---------------- */ + +#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) + +#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) + +#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{ + UINT32 controlword; + UINT32 length; /* length of data to follow */ + union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + + +#define NFPCI_JOB_CONTROL 0x00000001 +#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002 +/* + The 'Control' word is analogous to the SCSI read/write 
address; + 1 = standard push/pull IO + 2 = push/push IO + + To submit a block of job data, the host: + - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data + - copies the data to NFPCI_JOBS_WR_DATA + - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register + - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back + + To read a block of jobs back, the host: + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at + NFPCI_JOBS_RD_LENGTH to its actual length. + + Optionally the host can request the PCI read data to be pushed to host PCI mapped ram: + - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max + size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8 + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of + the buffer + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The + module will set NFPCI_OFFSET_LENGTH to the actual length. +*/ + +#define NFPCI_SCRATCH_CONTROL 0 + +#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0) +#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1 +#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) + +#define NFPCI_SCRATCH_STATUS 1 + +#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0) +#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1) +#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2) +#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) + +#define NFPCI_SCRATCH_ERROR_LO 2 +#define NFPCI_SCRATCH_ERROR_HI 3 + +#endif diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" + +/* config space access ---------------------------------- */ + +nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) { + unsigned int tmp32; + if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; + +/* pci_config_get32() does byte swapping, so put back to LE */ + tmp32 = pci_config_get32( pdev->dev->conf_handle, offset ); + TO_LE32_IO(res, tmp32); + + return NFP_SUCCESS; +} + +/* user space memory access ---------------------------------- */ + +nfp_err 
nfp_copy_from_user( char *kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} + +nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} + +nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */ + return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */ + return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR); + else + /* LINTED: alignment */ + DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR); + return NFP_SUCCESS; +} + +nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR ); + else + /* LINTED: alignment */ + DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR ); + return NFP_SUCCESS; +} + +/* pci io space access --------------------------------------- */ + +unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} + +unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} + +void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} + +void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} + +/* logging ---------------------------------------------------- */ + +void nfp_log( int level, const char *fmt, ...) +{ + auto char buf[256]; + va_list ap; + + switch (level) { + case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/ + case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/ + case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/ + case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/ + default: + va_start(ap, fmt); + (void) vsnprintf(buf, 256, fmt, ap); + va_end(ap); + cmn_err(CE_CONT, "!" 
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} + +struct errstr { + int oserr; + nfp_err nferr; +}; + + +static struct errstr errtab[] = { + { EFAULT, NFP_EFAULT }, + { ENOMEM, NFP_ENOMEM }, + { EINVAL, NFP_EINVAL }, + { EIO, NFP_EIO }, + { ENXIO, NFP_ENXIO }, + { ENODEV, NFP_ENODEV }, + { EINVAL, NFP_EUNKNOWN }, + { 0, 0 } +}; + +nfp_err nfp_error( int oserr ) +{ + struct errstr *perr; + if(!oserr) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + } + return NFP_EUNKNOWN; +} + +int nfp_oserr( nfp_err nferr ) +{ + struct errstr *perr; + if(nferr == NFP_SUCCESS) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + } + return EIO; +} diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..2ad3f4f591 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay Devices + * + * Overlay devices provide a means for creating overlay networks, a means of + * multiplexing multiple logical, isolated, and discrete layer two and layer + * three networks on top of one physical network. + * + * In general, these overlay devices encapsulate the logic to answer two + * different questions: + * + * 1) How should I transform a packet to put it on the wire? + * 2) Where should I send a transformed packet? + * + * Each overlay device is presented to the user as a GLDv3 device. While the + * link itself cannot have an IP interface created on top of it, it allows for + * additional GLDv3 devices, such as a VNIC, to be created on top of it which + * can be plumbed up with IP interfaces. + * + * + * -------------------- + * General Architecture + * -------------------- + * + * The logical overlay device that a user sees in dladm(1M) is a combination of + * two different components that work together. The first component is this + * kernel module, which is responsible for answering question one -- how should + * I transform a packet to put it on the wire. + * + * The second component is what we call the virtual ARP daemon, or varpd. It is + * a userland component that is responsible for answering the second question -- + * Where should I send a transformed packet. Instances of the kernel overlay + * GLDv3 device ask varpd the question of where should a packet go. + * + * The split was done for a few reasons. Importantly, we wanted to keep the act + * of generating encapsulated packets in the kernel so as to ensure that the + * general data path was fast and also kept simple. On the flip side, while the + * question of where should something go may be simple, it may often be + * complicated and need to interface with several different external or + * distributed systems. In those cases, it's simpler to allow for the full + * flexibility of userland to be brought to bear to solve that problem and in + * general, the path isn't very common. 
+ * + * The following is what makes up the logical overlay device that a user would + * create with dladm(1M). + * + * Kernel Userland + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . +--------+ +--------+ +--------+ . . . + * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . + * . +--------+ +--------+ +--------+ . . . + * . | | | . . . + * . | | | . . . + * . +------------+-----------+ . . . + * . | . . /dev/overlay . + * . +--------------+ . . . +------------+ . + * . | | . . . | | . + * . | Overlay |======*=================| Virtual | . + * . | GLDv3 Device |========================| ARP Daemon | . + * . | | . . | | . + * . +--------------+ . . +------------+ . + * . | . . | . + * . | . . | . + * . +----------------+ . . +--------+ . + * . | Overlay | . . | varpd | . + * . | Encapsulation | . . | Lookup | . + * . | Plugin | . . | Plugin | . + * . +----------------+ . . +--------+ . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * + * + * This image shows the two different components and where they live. + * Importantly, it also shows that both the kernel overlay device and the + * userland varpd both support plugins. The plugins actually implement the + * things that users care about and the APIs have been designed to try to + * minimize the amount of things that a module writer needs to worry about it. + * + * IDENTIFIERS + * + * Every overlay device is defined by a unique identifier which is the overlay + * identifier. Its purpose is similar to that of a VLAN identifier, it's a + * unique number that is used to differentiate between different entries on the + * wire. + * + * ENCAPSULATION + * + * An overlay encapsulation plugin is a kernel miscellaneous module whose + * purpose is to contain knowledge about how to transform packets to put them + * onto the wire and to take them off. An example of an encapsulation plugin is + * vxlan. It's also how support for things like nvgre or geneve would be brought + * into the system. + * + * Each encapsulation plugins defines a series of operation vectors and + * properties. For the full details on everything they should provide, please + * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible + * for telling the system what information is required to send a packet. For + * example, vxlan is defined to send everything over a UDP packet and therefore + * requires a port and an IP address, while nvgre on the other hand is its own + * IP type and therefore just requires an IP address. In addition, it also + * provides information about the kind of socket that should be created. This is + * used by the kernel multiplexor, more of that in the Kernel Components + * section. + * + * LOOKUPS + * + * The kernel communicates requests for lookups over the character device + * /dev/overlay. varpd is responsible for listening for requests on that device + * and answering them. The character device is specific to the target path and + * varpd. + * + * Much as the kernel overlay module handles the bulk of the scaffolding but + * leaves the important work to the encapsulation plugin, varpd provides a + * similar role and leaves the full brunt of lookups to a userland dynamic + * shared object which implements the logic of lookups. + * + * Each lookup plugin defines a series of operation vectors and properties. For + * the full details on everything that they should provide, please read + * lib/varpd/libvarpd/libvarpd_provider.h. 
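Conceptually, a dynamic lookup reduces to a function that maps an inner MAC address to an underlay destination. The following is a minimal userland sketch of that shape only; every example_* name is illustrative, and the real provider contract is the one documented in lib/varpd/libvarpd/libvarpd_provider.h.

#include <stdint.h>
#include <netinet/in.h>

/*
 * Illustrative only: an underlay destination is some combination of an
 * Ethernet address, an IPv6 (possibly v4-mapped) address, and a port,
 * depending on what the encapsulation plugin said it requires.
 */
typedef struct example_dest {
	uint8_t		ed_mac[6];	/* OVERLAY_PLUGIN_D_ETHERNET */
	struct in6_addr	ed_ip;		/* OVERLAY_PLUGIN_D_IP */
	uint16_t	ed_port;	/* OVERLAY_PLUGIN_D_PORT */
} example_dest_t;

/*
 * Given the MAC address of the inner frame, fill in where on the underlay
 * network the encapsulated packet should be sent.  Returns 0 on success,
 * non-zero if the packet should be dropped.
 */
typedef int (*example_lookup_f)(const uint8_t mac[6], example_dest_t *dest);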
Essentially, they are given a MAC + * address and asked to give an address on the physical network that it should + * be sent to. In addition, they handle questions related to how to handle + * things like broadcast and multicast traffic, etc. + * + * ---------- + * Properties + * ---------- + * + * A device from a dladm perspective has a unique set of properties that are + * combined from three different sources: + * + * 1) Generic properties that every overlay device has + * 2) Properties that are specific to the encapsulation plugin + * 3) Properties that are specific to the lookup plugin + * + * All of these are exposed in a single set of properties in dladm. Note that + * these are not necessarily traditional link properties. However, if something + * is both a traditional GLDv3 link property, say the MTU of a device, and a + * specific property here, than the driver ensures that all existing GLDv3 + * specific means of manipulating it are used and wraps up its private property + * interfaces to ensure that works. + * + * Properties in the second and third category are prefixed with the name of + * their module. For example, the vxlan encapsulation module has a property + * called the 'listen_ip'. This property would show up in dladm as + * 'vxlan/listen_ip'. This allows different plugins to both use similar names + * for similar properties and to also have independent name spaces so that + * overlapping names do not conflict with anything else. + * + * While the kernel combines both sets one and two into a single coherent view, + * it does not do anything with respect to the properties that are owned by the + * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in + * charge of bridging these two worlds into one magical experience for the user. + * It carries the burden of knowing about both overlay specific and varpd + * specific properties. Importantly, we want to maintain this distinction. We + * don't want to treat the kernel as an arbitrary key/value store for varpd and + * we want the kernel to own its own data and not have to ask userland for + * information that it owns. + * + * Every property in the system has the following attributes: + * + * o A name + * o A type + * o A size + * o Permissions + * o Default value + * o Valid value ranges + * o A value + * + * Everything except for the value is obtained by callers through the propinfo + * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, + * currently 256 bytes. + * + * The following are the supported types of properties: + * + * OVERLAY_PROP_T_INT + * + * A signed integer, its length is 8 bytes, corresponding to a + * int64_t. + * + * OVERLAY_PROP_T_UINT + * + * An unsigned integer, its length is 8 bytes, corresponding to a + * uint64_t. + * + * OVERLAY_PROP_T_IP + * + * A struct in6_addr, it has a fixed size. + * + * OVERLAY_PROP_T_STRING + * + * A null-terminated character string encoded in either ASCII or + * UTF-8. Note that the size of the string includes the null + * terminator. + * + * The next thing that we apply to a property is its permission. The permissions + * are put together by the bitwise or of the following flags and values. + * + * OVERLAY_PROP_PERM_REQ + * + * This indicates a required property. A property that is required + * must be set by a consumer before the device can be created. If a + * required property has a default property, this constraint is + * loosened because the default property defines the value. 
+ * + * OVERLAY_PROP_PERM_READ + * + * This indicates that a property can be read. All properties will + * have this value set. + * + * OVERLAY_PROP_PERM_WRITE + * + * This indicates that a property can be written to and thus + * updated by userland. Properties that are only intended to + * display information will not have OVERLAY_PROP_PERM_WRITE set. + * + * In addition, a few additional values are defined as a convenience to + * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of + * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, + * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, + * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a + * property should generally be a constant across its lifetime. + * + * A property may optionally have a default value. If it does have a default + * value, and that property is not set to be a different value, then the default + * value is inherited automatically. It also means that if the default value is + * acceptable, there is no need to set the value for a required property. For + * example, the vxlan module has the vxlan/listen_port property which is + * required, but has a default value of 4789 (the IANA assigned port). Because + * of that default value, there is no need for it to be set. + * + * Finally, a property may declare a list of valid values. These valid values + * are used for display purposes; they are not enforced by the broader system, + * but merely allow a means for the information to be communicated to the user + * through dladm(1M). Like a default value, this is optional. + * + * The general scaffolding does not do very much with respect to the getting and + * setting of properties. That is really owned by the individual plugins + * themselves. + * + * ----------------------------- + * Destinations and Plugin Types + * ----------------------------- + * + * Both encapsulation and lookup plugins define the kinds of destinations that + * they know how to support. There are three different pieces of information + * that can be used to address a destination currently, all of which are + * summarized in the type overlay_point_t. Any combination of these is + * supported. + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * An Ethernet MAC address is required. + * + * OVERLAY_PLUGIN_D_IP + * + * An IP address is required. All IP addresses used by the overlay + * system are transmitted as IPv6 addresses. IPv4 addresses can be + * represented by using IPv4-mapped IPv6 addresses. + * + * OVERLAY_PLUGIN_D_PORT + * + * A TCP/UDP port is required. + * + * A kernel encapsulation plugin declares which of these it requires; it's + * a static set. On the other hand, a userland lookup plugin can be built to + * support all of these or any combination thereof. It gets passed the required + * destination type, based on the kernel encapsulation method, and then it makes + * the determination as to whether or not it supports it. For example, the + * direct plugin can support either an IP or both an IP and a port; it simply + * doesn't display the direct/dest_port property in the cases where a port is + * not required to support this. + * + * The user lookup plugins have two different modes of operation which + * determine how they interact with the broader system and how lookups are + * performed. These types are: + * + * OVERLAY_TARGET_POINT + * + * A point to point plugin has a single static definition for where + * to send all traffic.
Every packet in the system always gets sent + * to the exact same destination which is programmed into the + * kernel when the general device is activated. + * + * OVERLAY_TARGET_DYNAMIC + * + * A dynamic plugin does not have a single static definition. + * Instead, for each destination, the kernel makes an asynchronous + * request to varpd to determine where the packet should be routed, + * and if a specific destination is found, then that destination is + * cached in the overlay device's target cache. + * + * This distinction, while important for the general overlay device's operation, + * is not important to the encapsulation plugins. They don't need to know about + * any of these pieces. It's just a concern for varpd, the userland plugin, and + * the general overlay scaffolding. + * + * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not + * maintain a target cache, and instead just keeps track of the destination and + * always sends encapsulated packets to that address. When the target type is of + * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such + * destinations. These destinations are kept around in an instance of a + * reference hash that is specific to the given overlay device. Entries in the + * cache can be invalidated and replaced by varpd and its lookup plugins. + * + * ---------------------------------- + * Kernel Components and Architecture + * ---------------------------------- + * + * There are multiple pieces inside the kernel that work together, there is the + * general overlay_dev_t structure, which is the logical GLDv3 device, but it + * itself has references to things like an instance of an encapsulation plugin, + * a pointer to a mux and a target cache. It can roughly be summarized in the + * following image: + * + * +------------------+ + * | global | + * | overlay list | + * | overlay_dev_list | + * +------------------+ + * | + * | +-----------------------+ +---------------+ + * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... 
+ * | overlay_dev_t | | overlay_dev_t | + * | | +---------------+ + * | | + * | mac_handle_t -----+---> GLDv3 handle to MAC + * | datalink_id_t -----+---> Datalink ID used by DLS + * | overlay_dev_flag_t ---+---> Device state + * | uint_t -----+---> Curent device MTU + * | uint_t -----+---> In-progress RX operations + * | uint_t -----+---> In-progress TX operations + * | char[] -----+---> FMA degraded message + * | void * -----+---> plugin private data + * | overlay_target_t * ---+---------------------+ + * | overlay_plugin_t * ---+---------+ | + * +-----------------------+ | | + * ^ | | + * +--------------------+ | | | + * | Kernel Socket | | | | + * | Multiplexor | | | | + * | overlay_mux_t | | | | + * | | | | | + * | avl_tree_t -+--+ | | + * | uint_t -+--> socket family | | + * | uint_t -+--> socket type | | + * | uint_t -+--> socket protocol | | + * | ksocket_t -+--> I/O socket | | + * | struct sockaddr * -+--> ksocket address | | + * | overlay_plugin_t --+--------+ | | + * +--------------------+ | | | + * | | | + * +-------------------------+ | | | + * | Encap Plugin |<--+-----------+ | + * | overlay_plugin_t | | + * | | | + * | char * ---+--> plugin name | + * | overlay_plugin_ops_t * -+--> plugin downcalls | + * | char ** (props) ---+--> property list | + * | uint_t ---+--> id length | + * | overlay_plugin_flags_t -+--> plugin flags | + * | overlay_plugin_dest_t --+--> destination type v + * +-------------------------+ +-------------------------+ + * | Target Cache | + * | overlay_target_t | + * | | + * cache mode <--+- overlay_target_mode_t | + * dest type <--+- overlay_plugin_dest_t | + * cache flags <--+- overlay_target_flag_t | + * varpd id <--+- uint64_t | + * outstanding varpd reqs. <--+- uint_t | + * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | + * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | + * | +-------------------------+ + * +-----------------------+ + * | + * v + * +-------------------------------+ +------------------------+ + * | Target Entry |-->| Target Entry |--> ... + * | overlay_target_entry_t | | overlay_target_entry_t | + * | | +------------------------+ + * | | + * | overlay_target_entry_flags_t -+--> Entry flags + * | uint8_t[ETHERADDRL] ---+--> Target MAC address + * | overlay_target_point_t ---+--> Target underlay address + * | mblk_t * ---+--> outstanding mblk head + * | mblk_t * ---+--> outstanding mblk tail + * | size_t ---+--> outstanding mblk size + * +-------------------------------+ + * + * The primary entries that we care about are the overlay_dev_t, which + * correspond to each overlay device that is created with dladm(1M). Globally, + * these devices are maintained in a simple list_t which is protected with a + * lock. Hence, these include important information such as the mac_handle_t + * and a datalink_id_t which is used to interact with the broader MAC and DLS + * ecosystem. We also maintain additional information such as the current state, + * outstanding operations, the mtu, and importantly, the plugin's private data. + * This is the instance of an encapsulation plugin that gets created as part of + * creating an overlay device. Another aspect of this is that the overlay_dev_t + * also includes information with respect to FMA. For more information, see the + * FMA section. + * + * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin + * is the encapsulation plugin. This allows the device to make downcalls into it + * based on doing things like getting and setting properties. 
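The relationships called out in the diagram can be summarized as a simplified struct. The example_* names and field layout below are illustrative only, with kernel-specific types reduced to opaque pointers; the authoritative definitions live in the overlay headers under uts/common/sys.

#include <stdint.h>

struct example_plugin;	/* encapsulation plugin, shared between devices */
struct example_mux;	/* ksocket multiplexor, shared between devices */
struct example_target;	/* per-device target cache */

/* Illustrative reduction of the members called out in the diagram above. */
typedef struct example_overlay_dev {
	void			*eod_mac_handle;	/* GLDv3 handle to MAC */
	uint32_t		eod_linkid;		/* datalink ID used by DLS */
	uint32_t		eod_flags;		/* device state */
	uint32_t		eod_mtu;		/* current device MTU */
	uint32_t		eod_rx_count;		/* in-progress RX operations */
	uint32_t		eod_tx_count;		/* in-progress TX operations */
	char			eod_fmamsg[64];		/* FMA degraded message */
	void			*eod_plugin_private;	/* plugin private data */
	struct example_target	*eod_target;		/* target cache */
	struct example_plugin	*eod_plugin;		/* encapsulation plugin */
	struct example_mux	*eod_mux;		/* shared socket mux */
} example_overlay_dev_t;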
Otherwise, the + * plugin itself is a fairly straightforward entity. They are maintained in a + * (not pictured above) list. The plugins themselves mostly maintain things like + * the static list of properties, what kind of destination they require, and the + * operations vector. A given module may contain more if necessary. + * + * The next piece of the puzzle is the mux, or a multiplexor. The mux itself + * maintains a ksocket and it is through the mux that we send and receive + * message blocks. The mux represents a socket type and address, as well as a + * plugin. Multiple overlay_dev_t devices may then share the same mux. For + * example, consider the case where you have different instances of vxlan all on + * the same underlay network. These would all logically share the same IP + * address and port that packets are sent and received on; however, what differs + * is the decapsulation ID. + * + * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike + * a socket, we enable a direct callback on the ksocket. This means that + * whenever a message block chain is received, rather than getting a callback in + * some other context and kicking it back out to a taskq, the data comes straight + * into the callback function overlay_mux_recv(). + * + * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx + * function) to transmit. It receives encapsulated packets, decapsulates them to + * determine the overlay identifier, looks up the given device that matches that + * identifier, and then causes the broader MAC world to receive the packet with + * a call to mac_rx(). + * + * Today, we don't do too much that's special with the ksocket; however, as + * hardware is gaining understanding for these encapsulation protocols, we'll + * probably want to think of better ways to get those capabilities passed down + * and potentially better ways to program receive filters so they get directly + * to us. Though, that's all fantasy future land. + * + * The next part of the puzzle is the target cache. The purpose of the target + * cache is to cache where we should send a packet on the underlay network, + * given its mac address. The target cache operates in two modes depending on + * whether the lookup module was declared to OVERLAY_TARGET_POINT or + * OVERLAY_TARGET_DYNAMIC. + * + * In the case where the target cache has been programmed to be + * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t + * which has the destination to which we send everything, no matter the destination + * mac address. + * + * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things + * are much more interesting and as a result, more complicated. We primarily + * store lists of overlay_target_entry_t's which are stored in both an avl tree + * and a refhash_t. The primary lookup path uses the refhash_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we + * can get a consistent iteration order for things like dladm show-overlay -t. + * The key that we use for the reference hashtable is based on the mac address + * in the cache and currently we just do a simple CRC32 to transform it into a + * hash. + * + * Each entry maintains a set of flags to indicate the current status of the + * request.
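As noted above, the refhash key is just a CRC32 of the cached MAC address. Purely to illustrate that idea (the kernel uses its own CRC32 routine, not this one), a small userland sketch:

#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC-32 (IEEE 802.3 polynomial, reflected); illustrative only. */
static uint32_t
example_crc32(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xffffffffU;

	for (size_t i = 0; i < len; i++) {
		crc ^= buf[i];
		for (int b = 0; b < 8; b++)
			crc = (crc >> 1) ^ (0xedb88320U & -(crc & 1));
	}
	return (crc ^ 0xffffffffU);
}

int
main(void)
{
	/* Hypothetical overlay MAC, matching the examples used above. */
	uint8_t mac[6] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x00 };

	(void) printf("cache key: 0x%08x\n", example_crc32(mac, sizeof (mac)));
	return (0);
}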
The flags may indicate one of three states: that the current cache entry + * is valid, that the current cache entry has been directed to drop all output, + * and that the current cache entry is invalid and may be being looked up. In + * the case where it's valid, we just take the destination address and run with + * it. + * + * If it's invalid and a lookup has not been made, then we start the process + * that prepares a query that will make its way up to varpd. The cache entry + * maintains a message block chain of outstanding message blocks and a + * size. These lists are populated only when we don't know the answer as to + * where these should be sent. The size entry is used to cap the amount of + * outstanding data that we don't know the answer to. If we exceed a cap on the + * amount of outstanding data (currently 1 Mb), then we'll drop any additional + * packets. Once we get an answer indicating a valid destination, we transmit + * any outstanding data to that place. The full story on how we look that up + * is discussed in the section on the Target Cache Life Cycle. + * + * ------------------------ + * FMA and Degraded Devices + * ------------------------ + * + * Every kernel overlay device keeps track of its FMA state. Today in FMA we + * cannot represent partitions between resources nor can we represent that a + * given minor node of a pseudo device has failed -- if we degrade the overlay + * device, then the entire dev_info_t is degraded. However, we still want to be + * able to indicate to administrators that things may go wrong. + * + * To this end, we've added a notion of a degraded state to every overlay + * device. This state is primarily dictated by userland and it can happen for + * various reasons. Generally, when a userland lookup plugin has been + * partitioned, or something has gone wrong such that there is no longer any + * userland lookup module for a device, we'll mark it degraded. + * + * As long as any of our minor instances is degraded, we'll fire off the + * FMA event to note that. Once the last degraded instance is no longer + * degraded, then we'll end up telling FMA that we're all clean. + * + * To help administrators get a better sense of which of the various minor + * devices is wrong, we store the odd_fmamsg[] character array. This character + * array can be fetched by doing a dladm show-overlay -f. + * + * Note that it's important that we do not update the link status of the + * devices. We want to remain up as much as possible. Changing the link in a + * degraded state may end up making things worse. We may still actually + * have information in the target cache and if we mark the link down, that'll + * result in not being able to use it. The reason being that this'll mark all + * the downstream VNICs down, which will go to IP, and from there we end up + * dealing with sadness. + * + * ----------------------- + * Target Cache Life Cycle + * ----------------------- + * + * This section only applies when we have a lookup plugin of + * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type + * OVERLAY_TARGET_POINT. + * + * While we got into the target cache in the general architecture section, it's + * worth going into more details as to how this actually works and showing some + * examples and state machines. Recall that a target cache entry basically has + * the following state transition diagram: + * + * Initial state + * . . . . . . first access . . . varpd lookup enqueued + * . . . + * . . . + * +-------+ .
+----------+ . + * | No |------*---->| Invalid |-------*----+ + * | Entry | | Entry | | + * +-------+ +----------+ | + * varpd ^ ^ varpd | + * invalidate | | drop | + * . . . * * . . v + * +-------+ | | +---------+ + * | Entry |--->-----+ +----<----| Entry | + * | Valid |<----------*---------<----| Pending |->-+ varpd + * +-------+ . +---------+ * . . drop, but + * . varpd ^ | other queued + * . success | | entries + * +-----+ + * + * When the table is first created, it is empty. As we attempt to lookup entries + * and we find there is no entry at all, we'll create a new table entry for it. + * At that point the entry is technically in an invalid state, that means that + * we have no valid data from varpd. In that case, we'll go ahead and queue the + * packet into the entry's pending chain, and queue a varpd lookup, setting the + * OVERLAY_ENTRY_F_PENDING flag in the progress. + * + * If additional mblk_t's come in for this entry, we end up appending them to + * the tail of the chain, if and only if, we don't exceed the threshold for the + * amount of space they can take up. An entry remains pending until we get a + * varpd reply. If varpd replies with a valid results, we move to the valid + * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one + * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. + * + * Once an entry is valid, it stays valid until user land tells us to invalidate + * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOE and + * OVERLAY_TARG_CACHE_SET respectively. + * + * If the lookup fails with a call to drop the packet, then the next state is + * determined by the state of the queue. If the set of outstanding entries is + * empty, then we just transition back to the invalid state. If instead, the + * set of outstanding entries is not empty, then we'll queue another entry and + * stay in the same state, repeating this until the number of requests is + * drained. + * + * The following images describes the flow of a given lookup and where the + * overlay_target_entry_t is at any given time. + * + * +-------------------+ + * | Invalid Entry | An entry starts off as an invalid entry + * | de:ad:be:ef:00:00 | and only exists in the target cache. + * +-------------------+ + * + * ~~~~ + * + * +---------------------+ + * | Global list_t | A mblk_t comes in for an entry. We + * | overlay_target_list | append it to the overlay_target_list. + * +---------------------+ + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry |--->... + * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +--------------------------+ + * | /dev/overlay minor state | User land said that it would look up an + * | overlay_target_hdl_t | entry for us. We remove it from the + * +--------------------------+ global list and add it to the handle's + * | outstanding list. + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry | + * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +-------------------+ + * | Valid Entry | varpd returned an answer with + * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache + * | 10.169.23.42:4789 | entry is now populated with a + * +-------------------+ destination and marked as valid + * + * + * The lookup mechanism is performed via a series of operations on the character + * psuedo-device /dev/overlay. 
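The state transitions pictured above boil down to a small amount of flag manipulation. A minimal sketch follows; the OVERLAY_ENTRY_F_* names come from the text above, but the example_* values and helper are illustrative and ignore the re-queueing that happens when a drop reply arrives while more mblk_t's are still outstanding.

#include <stdint.h>

/* Flag values are illustrative; only the names appear in the text above. */
#define	EXAMPLE_ENTRY_F_PENDING	0x01	/* varpd lookup outstanding */
#define	EXAMPLE_ENTRY_F_VALID	0x02	/* destination known, send away */
#define	EXAMPLE_ENTRY_F_DROP	0x04	/* varpd said to drop this MAC */

/*
 * Sketch of the transition applied when a varpd reply arrives for a pending
 * entry: the pending bit is cleared and exactly one of the valid or drop
 * bits is set, matching the diagram above.
 */
static uint32_t
example_entry_resolve(uint32_t flags, int varpd_said_drop)
{
	flags &= ~EXAMPLE_ENTRY_F_PENDING;
	flags |= varpd_said_drop ? EXAMPLE_ENTRY_F_DROP : EXAMPLE_ENTRY_F_VALID;
	return (flags);
}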
The only thing that uses this device is the + * userland daemon varpd. /dev/overlay is a cloneable device, each open of it + * granting a new minor number which maintains its own state. We maintain this + * state so that way if an outstanding lookup was queued to something that + * crashed or closed its handle without responding, we can know about this and + * thus handle it appropriately. + * + * When a lookup is first created it's added to our global list of outstanding + * lookups. To service requests, userland is required to perform an ioctl to ask + * for a request. We will block it in the kernel a set amount of time waiting + * for a request. When we give a request to a given minor instance of the + * device, we remove it from the global list and append the request to the + * device's list of outstanding entries, for the reasons we discussed above. + * When a lookup comes in, we give user land a smaller amount of information + * specific to that packet, the overlay_targ_lookup_t. It includes a request id + * to identify this, and then the overlay id, the varpd id, the header and + * packet size, the source and destination mac address, the SAP, and any + * potential VLAN header. + * + * At that point, it stays in that outstanding list until one of two ioctls are + * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, + * userland may also perform other operations. For example, it may use + * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth + * analysis of what to do beyond what we gave it initially. This is useful for + * providing proxy arp and the like. Finally, there are two other ioctls that + * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the + * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which + * causes us to encapsulate and send out the packet they've given us. + * + * + * Finally, through the target cache, several ioctls are provided to allow for + * interrogation and management of the cache. They allow for individual entries + * to be retrieved, set, or have the entire table flushed. For the full set of + * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. + * + * ------------------ + * Sample Packet Flow + * ------------------ + * + * There's a lot of pieces here, hopefully an example of how this all fits + * together will help clarify and elucidate what's going on. We're going to + * first track an outgoing packet, eg. one that is sent from an IP interface on + * a VNIC on top of an overlay device, and then we'll look at what it means to + * respond to that. + * + * + * +----------------+ +--------------+ +------------------+ + * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | + * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | + * +----------------+ | VNIC device | | overlay_m_tx() | + * +--------------+ +------------------+ + * | + * . lookup . cache | + * . drop . miss v + * +---------+ . +--------+ . +------------------+ + * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | + * | mblk_t | | lookup | | in the target | + * +---------+ | queued | | cache | + * ^ +--------+ +------------------+ + * on send | | | cache + * error . . * *. . lookup * . . 
hit + * | | success v + * | | +------------------+ + * +-----------------+ +--------------->| call plugin | + * | Send out | | ovpo_encap() to | + * | overlay_mux_t's |<----------------------------------| get encap mblk_t | + * | ksocket | +------------------+ + * +-----------------+ + * + * The receive end point looks a little different and looks more like: + * + * +------------------+ +----------------+ +-----------+ + * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ + * | the physical | | IP stack | | to | * . . direct + * | device | +----------------+ | ksocket | | callback + * +------------------+ +-----------+ | + * . overlay id | + * . not found v + * +-----------+ . +-----------------+ +--------------------+ + * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | + * | mblk_t | | ovpo_decap() to | +--------------------+ + * +-----------+ | decap mblk_t | + * +-----------------+ + * | + * * . . overlay id + * v found + * +--------+ +----------------+ + * | adjust |----->| call mac_rx | + * | mblk_t | | on original | + * +--------+ | decaped packet | + * +----------------+ + * + * ------------------ + * Netstack Awareness + * ------------------ + * + * In the above image we note that this enters a netstack. Today the only + * netstack that can be is the global zone as the overlay driver itself is not + * exactly netstack aware. What this really means is that varpd cannot run in a + * non-global zone and an overlay device cannot belong to a non-global zone. + * Non-global zones can still have a VNIC assigned to them that's been created + * over the overlay device the same way they would if it had been created over + * an etherstub or a physical device. + * + * The majority of the work to make it netstack aware is straightforward and the + * biggest thing is to create a netstack module that allows us to hook into + * netstack (and thus zone) creation and destruction. From there, we need to + * amend the target cache lookup routines that we discussed earlier to not have + * a global outstanding list and a global list of handles, but rather, one per + * netstack. + * + * For the mux, we'll need to open the ksocket in the context of the zone, we + * can likely do this with a properly composed credential, but we'll need to do + * some more work on that path. Finally, we'll want to make sure the dld ioctls + * are aware of the zoneid of the caller and we use that appropriately and store + * it in the overlay_dev_t. + * + * ----------- + * GLDv3 Notes + * ----------- + * + * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more + * relevant and other parts are much less relevant for us. For example, the + * GLDv3 is used to toggle the device being put into and out of promiscuous + * mode, to program MAC addresses for unicast and multicast hardware filters. + * Today, an overlay device doesn't have a notion of promiscuous mode nor does + * it have a notion of unicast and multicast addresses programmed into the + * device. Instead, for the purposes of the hardware filter, we don't do + * anything and just always accept new addresses being added and removed. + * + * If the GLDv3 start function has not been called, then we will not use this + * device for I/O purposes. Any calls to transmit or receive should be dropped, + * though the GLDv3 guarantees us that transmit will not be called without + * calling start. Similarly, once stop is called, then no packets can be dealt + * with. 
+ * + * Today we don't support the stat interfaces, though there's no good reason + * that we shouldn't assemble some of the stats based on what we have in the + * future. + * + * When it comes to link properties, many of the traditional link properties do + * not apply, and many others are handled for us by MAC. For example, we don't need to + * implement anything for overlay_m_getprop() to deal with returning the MTU, as + * MAC never calls into us for that. As such, there isn't much of anything to + * support in terms of properties. + * + * Today, we don't support any notion of hardware capabilities. However, if + * future NIC hardware or other changes to the system cause it to make sense for + * us to emulate logical groups, then we should do that. That said, we still do + * implement a capab function so that we can identify ourselves as an overlay + * device to the broader MAC framework. This is done mostly so that a device + * created on top of us can have fanout rings, as we don't try to lie about a + * speed for our device. + * + * The other question is what should be done for a device's MTU and margin. We + * set our minimum supported MTU to the minimum value that an IP network may + * be set to, 576 -- which mimics what an etherstub does. On the flip side, we + * have our upper bound set to 8900. This value comes from the fact that a lot + * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 + * bytes for encapsulation overhead, which isn't exactly the most accurate number, but it'll be good + * enough for now. Because of that, our default MTU on these devices is 1400, as + * the default MTU elsewhere is usually 1500, or whatever the underlying + * device is set to; picking a fixed value is simpler than asking the netstack what + * all of the IP interfaces are set to. It also calls into question how PMTU and PMTU + * discovery should work here. The challenge, especially for + * OVERLAY_TARG_DYNAMIC, is that the MTU to any of the destinations will vary, and it's + * not clear that, if you have a single bad entry, the overall MTU should be + * lowered. Instead, we should figure out a better way of determining these + * kinds of PMTU errors and appropriately alerting the administrator via FMA. + * + * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether + * or not the underlying encapsulation device supports VLAN tags. If it does, + * then we'll set the margin to allow for it; otherwise, we will not.
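+ *
+ * Finally, to tie the userland half of the lookup protocol described above
+ * together, the loop that a daemon such as varpd runs against /dev/overlay
+ * looks roughly like the sketch below. This is only an illustration, not the
+ * actual varpd code: the real ioctl command values and structure layouts live
+ * in uts/common/sys/overlay_target.h, and the OVERLAY_TARG_LOOKUP command
+ * name, the resolve() helper, and the resp structure used here are stand-ins,
+ * not actual interfaces.
+ *
+ *	fd = open("/dev/overlay", O_RDWR);
+ *	for (;;) {
+ *		overlay_targ_lookup_t otl;
+ *
+ *		if (ioctl(fd, OVERLAY_TARG_LOOKUP, &otl) != 0)
+ *			continue;
+ *
+ *		if (resolve(&otl, &resp) == 0)
+ *			(void) ioctl(fd, OVERLAY_TARG_RESPOND, &resp);
+ *		else
+ *			(void) ioctl(fd, OVERLAY_TARG_DROP, &otl);
+ *	}
+ *
+ * The request id carried in the overlay_targ_lookup_t is what lets the kernel
+ * match the eventual OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP back to the
+ * pending entry sitting on the handle's outstanding list.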
+ */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; + mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t *odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + 
mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { + mutex_exit(&odd->odd_lock); + return (EAGAIN); + } + mutex_exit(&odd->odd_lock); + + ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, + &family, &prot, (struct sockaddr *)&storage, &slen); + if (ret != 0) + return (ret); + + mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, + (struct sockaddr *)&storage, slen, &ret); + if (mux == NULL) + return (ret); + + overlay_mux_add_dev(mux, odd); + odd->odd_mux = mux; + mutex_enter(&odd->odd_lock); + ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); + odd->odd_flags |= OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); + + return (0); +} + +static void +overlay_m_stop(void *arg) +{ + overlay_dev_t *odd = arg; + + /* + * The MAC Perimeter is held here, so we don't have to worry about + * synchronizing this with respect to metadata operations. + */ + mutex_enter(&odd->odd_lock); + VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); + VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + overlay_mux_close(odd->odd_mux); + odd->odd_mux = NULL; + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + odd->odd_flags &= ~OVERLAY_F_MDDROP; + VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); + mutex_exit(&odd->odd_lock); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_promisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement.
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ASSERT(ep->b_cont == mp || ep == mp); + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. + */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < 
buflen; i++) { + if (name[i] == '\0') + break; + } + + if (i == 0 || i == buflen) + return (B_FALSE); + actlen = i; + if (strchr(name, '/') != NULL) + return (B_FALSE); + if (u8_validate((char *)name, actlen, NULL, + U8_VALIDATE_ENTIRE, &err) < 0) + return (B_FALSE); + return (B_TRUE); +} + +/* ARGSUSED */ +static int +overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int err; + uint64_t maxid; + overlay_dev_t *odd, *o; + mac_register_t *mac; + overlay_ioc_create_t *oicp = karg; + + if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) + return (EINVAL); + + odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); + odd->odd_linkid = oicp->oic_linkid; + odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); + if (odd->odd_plugin == NULL) { + kmem_free(odd, sizeof (overlay_dev_t)); + return (ENOENT); + } + err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, + &odd->odd_pvoid); + if (err != 0) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + /* + * Make sure that our virtual network id is valid for the given plugin + * that we're working with. + */ + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; + if (oicp->oic_vnetid > maxid) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_vid = oicp->oic_vnetid; + + mac = mac_alloc(MAC_VERSION); + if (mac == NULL) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + mac->m_driver = odd; + mac->m_dip = overlay_dip; + mac->m_dst_addr = NULL; + mac->m_callbacks = &overlay_m_callbacks; + mac->m_pdata = NULL; + mac->m_pdata_size = 0; + + mac->m_priv_props = NULL; + + /* Let mac handle this itself. */ + mac->m_instance = (uint_t)-1; + + /* + * There is no real source address that should be used here, but saying + * that we're not ethernet is going to cause its own problems. At the + * end of the day, this is fine. + */ + mac->m_src_addr = overlay_macaddr; + + /* + * Start with the default MTU as the max SDU. If the MTU is changed, the + * SDU will be changed to reflect that. + */ + mac->m_min_sdu = 1; + mac->m_max_sdu = OVERLAY_MTU_DEF; + mac->m_multicast_sdu = 0; + + /* + * The underlying device doesn't matter; instead, this comes from the + * encapsulation protocol and whether or not it allows VLAN tags. + */ + if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { + mac->m_margin = VLAN_TAGSZ; + } else { + mac->m_margin = 0; + } + + /* + * Today, we have no MAC virtualization. It may make sense in the future + * to emulate some subset of this, but it doesn't today.
+ */ + mac->m_v12n = MAC_VIRT_NONE; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (o->odd_linkid == oicp->oic_linkid) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + + if (o->odd_vid == oicp->oic_vnetid && + o->odd_plugin == odd->odd_plugin) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + } + + err = mac_register(mac, &odd->odd_mh); + mac_free(mac); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + (void) mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + 
if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ENXIO); + } + + ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); + odd->odd_flags |= OVERLAY_F_ACTIVATED; + + /* + * Now that we've activated ourselves, we should indicate to the world + * that we're up. Note that we may not be able to perform lookups at + * this time, but our notion of being 'up' isn't dependent on that + * ability. + */ + mac_link_update(odd->odd_mh, LINK_STATE_UP); + mutex_exit(&odd->odd_lock); + + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + overlay_ioc_delete_t *oidp = karg; + overlay_dev_t *odd; + datalink_id_t tid; + int ret; + + odd = overlay_hold_by_dlid(oidp->oid_linkid); + if (odd == NULL) { + return (ENOENT); + } + + mutex_enter(&odd->odd_lock); + /* If we're not the only hold, we're busy */ + if (odd->odd_ref != 1) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + if (odd->odd_flags & OVERLAY_F_IN_MUX) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + /* + * To remove this, we need to first remove it from dls and then remove + * it from mac. The act of removing it from mac will check if there are + * devices on top of this, eg. vnics. If there are, then that will fail + * and we'll have to go through and recreate the dls entry. Only after + * mac_unregister has succeeded, then we'll go through and actually free + * everything and drop the dev lock. 
+ */ + ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); + if (ret != 0) { + overlay_hold_rele(odd); + return (ret); + } + + ASSERT(oidp->oid_linkid == tid); + ret = mac_disable(odd->odd_mh); + if (ret != 0) { + (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + overlay_hold_rele(odd); + return (ret); + } + + overlay_target_quiesce(odd->odd_target); + + mutex_enter(&overlay_dev_lock); + list_remove(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + cv_destroy(&odd->odd_iowait); + mutex_destroy(&odd->odd_lock); + overlay_target_free(odd); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_nprops_t *on = karg; + + odd = overlay_hold_by_dlid(on->oipn_linkid); + if (odd == NULL) + return (ENOENT); + on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) +{ + overlay_prop_handle_t phdl = arg; + overlay_prop_set_range_str(phdl, opp->ovp_name); + return (0); +} + +static int +overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) +{ + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], name) == 0) { + *id = i; + return (0); + } + } + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { + *id = i + OVERLAY_DEV_NPROPS; + return (0); + } + } + + return (ENOENT); +} + +static void +overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) +{ + uint32_t def; + mac_propval_range_t range; + uint_t perm; + + ASSERT(MAC_PERIM_HELD(odd->odd_mh)); + + bzero(&range, sizeof (mac_propval_range_t)); + range.mpr_count = 1; + if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. 
+ */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_prop_t *oip = karg; + uint_t propid, mtu; + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + oip->oip_size = OVERLAY_PROP_SIZEMAX; + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_getprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oip_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + /* + * The MTU is always set and retrieved through MAC, to allow for + * MAC to do whatever it wants, 
as really that property belongs + * to MAC. This is important for things where vnics have a hold on + * the MTU. + */ + mac_sdu_get(odd->odd_mh, NULL, &mtu); + bcopy(&mtu, oip->oip_value, sizeof (uint_t)); + oip->oip_size = sizeof (uint_t); + break; + case OVERLAY_DEV_P_VNETID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint64_t); + break; + case OVERLAY_DEV_P_ENCAP: + oip->oip_size = strlcpy((char *)oip->oip_value, + odd->odd_plugin->ovp_name, oip->oip_size); + break; + case OVERLAY_DEV_P_VARPDID: + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + const uint64_t val = odd->odd_target->ott_id; + bcopy(&val, oip->oip_value, sizeof (uint64_t)); + oip->oip_size = sizeof (uint64_t); + } else { + oip->oip_size = 0; + } + mutex_exit(&odd->odd_lock); + break; + default: + ret = ENOENT; + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); +} + +static void +overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. + */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_MDDROP; + mutex_exit(&odd->odd_lock); +} + +/* ARGSUSED */ +static int +overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + overlay_ioc_prop_t *oip = karg; + uint_t propid = UINT_MAX; + mac_perim_handle_t mph; + uint64_t maxid, *vidp; + + if (oip->oip_size > OVERLAY_PROP_SIZEMAX) + return (EINVAL); + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mac_perim_exit(mph); + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); + } else if (oip->oip_id < -1) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return
(EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", + oip->oip_value, oip->oip_size); + break; + case OVERLAY_DEV_P_VNETID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vidp = (uint64_t *)oip->oip_value; + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - + 1ULL; + if (*vidp >= maxid) { + ret = EINVAL; + break; + } + overlay_setprop_vnetid(odd, *vidp); + break; + case OVERLAY_DEV_P_ENCAP: + case OVERLAY_DEV_P_VARPDID: + ret = EPERM; + break; + default: + ret = ENOENT; + } + + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); +} + +/* ARGSUSED */ +static int +overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_status_t *os = karg; + + odd = overlay_hold_by_dlid(os->ois_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { + os->ois_status = OVERLAY_I_DEGRADED; + if (odd->odd_fmamsg != NULL) { + (void) strlcpy(os->ois_message, odd->odd_fmamsg, + OVERLAY_STATUS_BUFLEN); + } else { + os->ois_message[0] = '\0'; + } + + } else { + os->ois_status = OVERLAY_I_OK; + os->ois_message[0] = '\0'; + } + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + + return (0); +} + +static dld_ioc_info_t overlay_ioc_list[] = { + { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), + overlay_i_create, secpolicy_dl_config }, + { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), + overlay_i_activate, secpolicy_dl_config }, + { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), + overlay_i_delete, secpolicy_dl_config }, + { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return 
(error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. 
+# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsulation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..0701d08e8b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device FMA operations.
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..58e9f2665d --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ksynch.h> +#include <sys/ksocket.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/pattr.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> + +#include <sys/overlay_impl.h> + +#include <sys/sdt.h> + +#define OVERLAY_FREEMSG(mp, reason) \ + DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason) + +static list_t overlay_mux_list; +static kmutex_t overlay_mux_lock; + +void +overlay_mux_init(void) +{ + list_create(&overlay_mux_list, sizeof (overlay_mux_t), + offsetof(overlay_mux_t, omux_lnode)); + mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_mux_fini(void) +{ + mutex_destroy(&overlay_mux_lock); + list_destroy(&overlay_mux_list); +} + +static int +overlay_mux_comparator(const void *a, const void *b) +{ + const overlay_dev_t *odl, *odr; + odl = a; + odr = b; + if (odl->odd_vid > odr->odd_vid) + return (1); + else if (odl->odd_vid < odr->odd_vid) + return (-1); + else + return (0); +} + +/* + * This is the central receive data path. We need to decode the packet, if we + * can, and then deliver it to the appropriate overlay. 
+ */ +/* ARGSUSED */ +static boolean_t +overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, + void *arg) +{ + mblk_t *mp, *nmp, *fmp; + overlay_mux_t *mux = arg; + + /* + * We may have received a chain of messages. Each message in the + * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. + * If we aren't getting that, we should probably drop that for the + * moment. + */ + for (mp = mpchain; mp != NULL; mp = nmp) { + struct T_unitdata_ind *tudi; + ovep_encap_info_t infop; + overlay_dev_t od, *odd; + int ret; + + nmp = mp->b_next; + mp->b_next = NULL; + + if (DB_TYPE(mp) != M_PROTO) { + OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); + freemsg(mp); + continue; + } + + if (mp->b_cont == NULL) { + OVERLAY_FREEMSG(mp, "missing a b_cont"); + freemsg(mp); + continue; + } + + tudi = (struct T_unitdata_ind *)mp->b_rptr; + if (tudi->PRIM_type != T_UNITDATA_IND) { + OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); + freemsg(mp); + continue; + } + + /* + * In the future, we'll care about the source information + * for purposes of telling varpd about oob invalidation. But for + * now, just drop that block. + */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + + /* + * Until we have VXLAN-or-other-decap HW acceleration support + * (e.g. we support NICs that reach into VXLAN-encapsulated + * packets and check the inside-VXLAN IP packets' checksums, + * or do LSO with VXLAN), we should clear any HW-accelerated- + * performed bits. + * + * We do this, even in cases of HW_LOCAL_MAC, because we + * absolutely have NO context about the inner packet. + * It could've arrived off an external NIC and been forwarded + * to the overlay network, which means no context. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Decap and deliver. + */ + bzero(&infop, sizeof (ovep_encap_info_t)); + ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); + if (ret != 0) { + OVERLAY_FREEMSG(mp, "decap failed"); + freemsg(mp); + continue; + } + if (MBLKL(mp) > infop.ovdi_hdr_size) { + mp->b_rptr += infop.ovdi_hdr_size; + } else { + while (infop.ovdi_hdr_size != 0) { + size_t rem, blkl; + + if (mp == NULL) + break; + + blkl = MBLKL(mp); + rem = MIN(infop.ovdi_hdr_size, blkl); + infop.ovdi_hdr_size -= rem; + mp->b_rptr += rem; + if (rem == blkl) { + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + OVERLAY_FREEMSG(mp, + "freed a fmp block"); + freemsg(fmp); + } + } + if (mp == NULL) { + OVERLAY_FREEMSG(mp, "freed it all..."); + continue; + } + } + + + od.odd_vid = infop.ovdi_id; + mutex_enter(&mux->omux_lock); + odd = avl_find(&mux->omux_devices, &od, NULL); + if (odd == NULL) { + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "no matching vid"); + freemsg(mp); + continue; + } + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "dev dropped"); + freemsg(mp); + continue; + } + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + } + + return (B_TRUE); +} + +/* + * Register a given device with a socket backend. If no such device socket + * exists, create a new one.
+ */ +overlay_mux_t * +overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, + struct sockaddr *addr, socklen_t len, int *errp) +{ + int err; + overlay_mux_t *mux; + ksocket_t ksock; + + if (errp == NULL) + errp = &err; + + mutex_enter(&overlay_mux_lock); + for (mux = list_head(&overlay_mux_list); mux != NULL; + mux = list_next(&overlay_mux_list, mux)) { + if (domain == mux->omux_domain && + family == mux->omux_family && + protocol == mux->omux_protocol && + len == mux->omux_alen && + bcmp(addr, mux->omux_addr, len) == 0) { + + if (opp != mux->omux_plugin) { + *errp = EEXIST; + return (NULL); + } + + mutex_enter(&mux->omux_lock); + mux->omux_count++; + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + *errp = 0; + return (mux); + } + } + + /* + * Today we aren't zone-aware and only exist in the global zone. When we + * allow for things to exist in the non-global zone, we'll want to use a + * credential that's actually specific to the zone. + */ + *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, + kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + return (NULL); + } + + *errp = ksocket_bind(ksock, addr, len, kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + /* + * Ask our lower layer to optionally toggle anything they need on this + * socket. Because a socket is owned by a single type of plugin, we can + * then ask it to perform any additional socket set up it'd like to do. + */ + if (opp->ovp_ops->ovpo_sockopt != NULL && + (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); + list_link_init(&mux->omux_lnode); + mux->omux_ksock = ksock; + mux->omux_plugin = opp; + mux->omux_domain = domain; + mux->omux_family = family; + mux->omux_protocol = protocol; + mux->omux_addr = kmem_alloc(len, KM_SLEEP); + bcopy(addr, mux->omux_addr, len); + mux->omux_alen = len; + mux->omux_count = 1; + avl_create(&mux->omux_devices, overlay_mux_comparator, + sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); + mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); + + + /* Once this is called, we need to expect to rx data */ + *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); + if (*errp != 0) { + ksocket_close(ksock, kcred); + mutex_destroy(&mux->omux_lock); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, len); + kmem_free(mux, sizeof (overlay_mux_t)); + return (NULL); + } + + list_insert_tail(&overlay_mux_list, mux); + mutex_exit(&overlay_mux_lock); + + *errp = 0; + return (mux); +} + +void +overlay_mux_close(overlay_mux_t *mux) +{ + mutex_enter(&overlay_mux_lock); + mutex_enter(&mux->omux_lock); + mux->omux_count--; + if (mux->omux_count != 0) { + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + return; + } + list_remove(&overlay_mux_list, mux); + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + + ksocket_close(mux->omux_ksock, kcred); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, mux->omux_alen); + kmem_free(mux, sizeof (overlay_mux_t)); +} + +void +overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_add(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +void +overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_remove(&mux->omux_devices, odd); 
+ mutex_exit(&mux->omux_lock); +} + +int +overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) +{ + int ret; + + /* + * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, + * that isn't actually supported by UDP at this time. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); + if (ret != 0) + freemsg(mp); + + return (ret); +} diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c new file mode 100644 index 0000000000..348ddb92a2 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_plugin.c @@ -0,0 +1,281 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. 
+ */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + opp->ovp_active = 0; + opp->ovp_name = ovrp->ovep_name; + opp->ovp_ops = ovrp->ovep_ops; + opp->ovp_props = ovrp->ovep_props; + opp->ovp_id_size = ovrp->ovep_id_size; + opp->ovp_flags = ovrp->ovep_flags; + opp->ovp_dest = ovrp->ovep_dest; + + opp->ovp_nprops = 0; + if (ovrp->ovep_props != NULL) { + while (ovrp->ovep_props[opp->ovp_nprops] != NULL) { + if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >= + OVERLAY_PROP_NAMELEN) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EINVAL); + } + opp->ovp_nprops++; + } + } + + mutex_enter(&overlay_plugin_lock); + for (ipp = list_head(&overlay_plugin_list); ipp != NULL; + ipp = list_next(&overlay_plugin_list, ipp)) { + if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EEXIST); + } + } + list_insert_tail(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + return (0); +} + +int +overlay_plugin_unregister(const char *name) +{ + overlay_plugin_t *opp; + + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if 
(strcmp(opp->ovp_name, name) == 0) + break; + } + + if (opp == NULL) { + mutex_exit(&overlay_plugin_lock); + return (ENOENT); + } + + mutex_enter(&opp->ovp_mutex); + if (opp->ovp_active > 0) { + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (EBUSY); + } + mutex_exit(&opp->ovp_mutex); + + list_remove(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + kmem_cache_free(overlay_plugin_cache, opp); + return (0); +} + +overlay_plugin_t * +overlay_plugin_lookup(const char *name) +{ + overlay_plugin_t *opp; + boolean_t trymodload = B_FALSE; + + for (;;) { + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(name, opp->ovp_name) == 0) { + mutex_enter(&opp->ovp_mutex); + opp->ovp_active++; + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (opp); + } + } + mutex_exit(&overlay_plugin_lock); + + if (trymodload == B_TRUE) + return (NULL); + + /* + * If we didn't find it, it may still exist, but just not have + * been a loaded module. In that case, we'll do one attempt to + * load it. + */ + if (modload(OVERLAY_MODDIR, (char *)name) == -1) + return (NULL); + trymodload = B_TRUE; + } + +} + +void +overlay_plugin_rele(overlay_plugin_t *opp) +{ + mutex_enter(&opp->ovp_mutex); + ASSERT(opp->ovp_active > 0); + opp->ovp_active--; + mutex_exit(&opp->ovp_mutex); +} + +void +overlay_plugin_walk(overlay_plugin_walk_f func, void *arg) +{ + overlay_plugin_t *opp; + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (func(opp, arg) != 0) { + mutex_exit(&overlay_plugin_lock); + return; + } + } + mutex_exit(&overlay_plugin_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c new file mode 100644 index 0000000000..ba1ea2a629 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_prop.c @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. 
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..f4147b56d1 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1651 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device target cache management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> +#include <sys/vlan.h> +#include <sys/crc32.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/overlay_impl.h> +#include <sys/sdt.h> + +/* + * This is total straw man, but at least it's a prime number. Here we're + * going to have to go through and do a lot of evaluation and understanding as + * to how these target caches should grow and shrink, as well as, memory + * pressure and evictions. This just gives us a starting point that'll be 'good + * enough', until it's not. + */ +#define OVERLAY_HSIZE 823 + +/* + * We use this data structure to keep track of what requests have been actively + * allocated to a given instance so we know what to put back on the pending + * list. + */ +typedef struct overlay_target_hdl { + minor_t oth_minor; /* RO */ + zoneid_t oth_zoneid; /* RO */ + int oth_oflags; /* RO */ + list_node_t oth_link; /* overlay_target_lock */ + kmutex_t oth_lock; + list_t oth_outstanding; /* oth_lock */ +} overlay_target_hdl_t; + +typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); +typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); +typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); + +typedef struct overaly_target_ioctl { + int oti_cmd; /* ioctl id */ + boolean_t oti_write; /* ioctl requires FWRITE */ + boolean_t oti_ncopyout; /* copyout data? */ + overlay_target_copyin_f oti_copyin; /* copyin func */ + overlay_target_ioctl_f oti_func; /* function to call */ + overlay_target_copyout_f oti_copyout; /* copyin func */ + size_t oti_size; /* size of user level structure */ +} overlay_target_ioctl_t; + +static kmem_cache_t *overlay_target_cache; +static kmem_cache_t *overlay_entry_cache; +static id_space_t *overlay_thdl_idspace; +static void *overlay_thdl_state; + +/* + * When we support overlay devices in the NGZ, then all of these need to become + * zone aware, by plugging into the netstack engine and becoming per-netstack + * data. + */ +static list_t overlay_thdl_list; +static kmutex_t overlay_target_lock; +static kcondvar_t overlay_target_condvar; +static list_t overlay_target_list; +static boolean_t overlay_target_excl; + +/* + * Outstanding data per hash table entry. 
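
The 823 above is nothing more than a prime bucket count; the hash and comparison callbacks defined a little further down feed refhash a CRC-32 of the destination MAC plus a plain bytewise equality check. The same bucketing idea in stand-alone form (FNV-1a stands in here for the kernel's CRC32() macro; all names are illustrative):

#include <stdint.h>
#include <string.h>

#define	MAC_HSIZE	823	/* prime bucket count, as above */
#define	MACLEN		6

/* FNV-1a, standing in for the CRC32() macro the driver uses. */
static uint32_t
mac_hash(const uint8_t mac[MACLEN])
{
	uint32_t h = 2166136261u;

	for (int i = 0; i < MACLEN; i++) {
		h ^= mac[i];
		h *= 16777619u;
	}
	return (h);
}

/* Bucket selection: any well-mixed hash modulo a prime table size. */
static uint32_t
mac_bucket(const uint8_t mac[MACLEN])
{
	return (mac_hash(mac) % MAC_HSIZE);
}

/* Equality-only comparison, mirroring overlay_mac_cmp(). */
static int
mac_cmp(const uint8_t *a, const uint8_t *b)
{
	return (memcmp(a, b, MACLEN));
}
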
+ */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + +/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we 
should just remove it from the tree, but then + * delete the entries when we remove them from the hash table + * (which happens through the refhash dtor). + */ + while ((ote = avl_first(ap)) != NULL) + avl_remove(ap, ote); + + avl_destroy(ap); + for (ote = refhash_first(rp); ote != NULL; + ote = refhash_next(rp, ote)) { + refhash_remove(rp, ote); + } + refhash_destroy(rp); + } + + ASSERT(odd->odd_target->ott_ocount == 0); + kmem_cache_free(overlay_target_cache, odd->odd_target); +} + +int +overlay_target_busy() +{ + int ret; + + mutex_enter(&overlay_target_lock); + ret = !list_is_empty(&overlay_thdl_list); + mutex_exit(&overlay_target_lock); + + return (ret); +} + +static void +overlay_target_queue(overlay_target_entry_t *entry) +{ + mutex_enter(&overlay_target_lock); + mutex_enter(&entry->ote_ott->ott_lock); + if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&entry->ote_ott->ott_lock); + mutex_exit(&overlay_target_lock); + return; + } + entry->ote_ott->ott_ocount++; + mutex_exit(&entry->ote_ott->ott_lock); + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&overlay_target_lock); +} + +void +overlay_target_quiesce(overlay_target_t *ott) +{ + if (ott == NULL) + return; + mutex_enter(&ott->ott_lock); + ott->ott_flags |= OVERLAY_T_TEARDOWN; + while (ott->ott_ocount != 0) + cv_wait(&ott->ott_cond, &ott->ott_lock); + mutex_exit(&ott->ott_lock); +} + +/* + * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at + * this time, say for NVGRE, we drop all packets that mcuh this. + */ +int +overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, + socklen_t *slenp) +{ + int ret; + struct sockaddr_in6 *v6; + overlay_target_t *ott; + mac_header_info_t mhi; + overlay_target_entry_t *entry; + + ASSERT(odd->odd_target != NULL); + + /* + * At this point, the overlay device is in a mux which means that it's + * been activated. At this point, parts of the target, such as the mode + * and the destination are now read-only and we don't have to worry + * about synchronization for them. + */ + ott = odd->odd_target; + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (OVERLAY_TARGET_DROP); + + v6 = (struct sockaddr_in6 *)sock; + bzero(v6, sizeof (struct sockaddr_in6)); + v6->sin6_family = AF_INET6; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + mutex_enter(&ott->ott_lock); + bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(ott->ott_u.ott_point.otp_port); + mutex_exit(&ott->ott_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (OVERLAY_TARGET_OK); + } + + ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); + + /* + * Note we only want the MAC address here, therefore we won't bother + * using mac_vlan_header_info(). If any caller needs the vlan info at + * this point, this should change to a call to mac_vlan_header_info(). 
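
For the point-to-point case above, the lookup reduces to materializing a sockaddr_in6 from the one configured destination. That conversion on its own, as a small user-level sketch (the wrapper name is illustrative):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/*
 * Build a sockaddr_in6 from an IPv6 address and a host-order UDP port, the
 * way the OVERLAY_TARGET_POINT branch does with otp_ip and otp_port.
 */
static socklen_t
point_to_sockaddr(const struct in6_addr *ip, uint16_t port,
    struct sockaddr_in6 *v6)
{
	(void) memset(v6, 0, sizeof (*v6));
	v6->sin6_family = AF_INET6;
	v6->sin6_addr = *ip;
	v6->sin6_port = htons(port);
	return (sizeof (struct sockaddr_in6));
}
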
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & 
OVERLAY_PLUGIN_D_PORT) { + if (ota->ota_point.otp_port == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + } + + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); + ott->ott_flags = 0; + ott->ott_ocount = 0; + ott->ott_mode = ota->ota_mode; + ott->ott_dest = ota->ota_provides; + ott->ott_id = ota->ota_id; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + bcopy(&ota->ota_point, &ott->ott_u.ott_point, + sizeof (overlay_target_point_t)); + } else { + ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + overlay_mac_hash, overlay_mac_cmp, + overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); + } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (EEXIST); + } + + odd->odd_flags |= OVERLAY_F_VARPD; + odd->odd_target = ott; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + + + return (0); +} + + +/* ARGSUSED */ +static int +overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_degrade_t *otd = arg; + + odd = overlay_hold_by_dlid(otd->otd_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_degrade(odd, otd->otd_buf); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_restore(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_restore(odd); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_VARPD; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + return (0); + +} + +static int +overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_lookup_t *otl = arg; + overlay_target_entry_t *entry; + clock_t ret, timeout; + mac_header_info_t mhi; + + timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC); +again: + mutex_enter(&overlay_target_lock); + while (list_is_empty(&overlay_target_list)) { + ret = cv_timedwait(&overlay_target_condvar, + &overlay_target_lock, timeout); + if (ret == -1) { + mutex_exit(&overlay_target_lock); + return (ETIME); + } + } + entry = list_remove_head(&overlay_target_list); + mutex_exit(&overlay_target_lock); + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + ASSERT(entry->ote_chead == NULL); + mutex_exit(&entry->ote_lock); + goto again; + } + ASSERT(entry->ote_chead != NULL); + + /* + * If we have a bogon that doesn't have a valid mac header, drop it and + * try again. 
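
While a destination is still unresolved, frames addressed to it are parked on the entry under the overlay_ent_size byte budget shown earlier, and anything beyond that budget is dropped rather than queued. That bounding policy in isolation, as a stand-alone sketch with illustrative names:

#include <stddef.h>
#include <stdint.h>

typedef struct pkt {
	struct pkt *p_next;
	size_t	p_len;
} pkt_t;

typedef struct pending {
	pkt_t	*pd_head;
	pkt_t	*pd_tail;
	size_t	pd_bytes;
	size_t	pd_limit;	/* e.g. 128 KiB, like overlay_ent_size */
} pending_t;

/* Append a packet unless it would exceed the per-entry byte budget. */
static int
pending_append(pending_t *pd, pkt_t *p)
{
	if (pd->pd_bytes + p->p_len > pd->pd_limit)
		return (-1);		/* caller drops the packet */

	p->p_next = NULL;
	if (pd->pd_tail != NULL)
		pd->pd_tail->p_next = p;
	else
		pd->pd_head = p;
	pd->pd_tail = p;
	pd->pd_bytes += p->p_len;
	return (0);
}
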
+ */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. 
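
The lookup/respond pairing above is meant to be driven from user space by a varpd-style daemon: pull a pending lookup, resolve the destination MAC, and answer (or drop) using the same request id. A rough sketch of that loop follows; the /dev/overlay path, the <sys/overlay_target.h> header, and the resolve_mac() helper are assumptions for illustration, and error handling is elided:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/overlay_target.h>		/* assumed header for these ioctls */

/* Hypothetical resolver: map a MAC to an IP/port overlay_target_point_t. */
extern int resolve_mac(const uint8_t *mac, overlay_target_point_t *pointp);

static void
serve_lookups(void)
{
	/* This driver requires writable opens to also be exclusive. */
	int fd = open("/dev/overlay", O_RDWR | O_EXCL);

	for (;;) {
		overlay_targ_lookup_t otl;
		overlay_targ_resp_t otr;

		/* Blocks for up to about a second; fails with ETIME when idle. */
		if (ioctl(fd, OVERLAY_TARG_LOOKUP, &otl) != 0)
			continue;

		(void) memset(&otr, 0, sizeof (otr));
		otr.otr_reqid = otl.otl_reqid;

		if (resolve_mac(otl.otl_dstaddr, &otr.otr_answer) == 0) {
			(void) ioctl(fd, OVERLAY_TARG_RESPOND, &otr);
		} else {
			/* No mapping: have the kernel drop the queued frames. */
			(void) ioctl(fd, OVERLAY_TARG_DROP, &otr);
		}
	}
}
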
+ */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + 
mutex_enter(&entry->ote_lock); + mutex_exit(&thdl->oth_lock); + mp = entry->ote_chead; + /* Protect against a rogue varpd */ + if (mp == NULL) { + mutex_exit(&entry->ote_lock); + return (EINVAL); + } + mlen = MIN(msgsize(mp), pkt->otp_size); + pkt->otp_size = mlen; + boff = 0; + while (mlen > 0) { + size_t wlen = MIN(MBLKL(mp), mlen); + if (ddi_copyout(mp->b_rptr, + (void *)((uintptr_t)pkt->otp_buf + boff), + wlen, 0) != 0) { + mutex_exit(&entry->ote_lock); + return (EFAULT); + } + mlen -= wlen; + boff += wlen; + mp = mp->b_cont; + } + mutex_exit(&entry->ote_lock); + return (0); +} + +static int +overlay_target_inject(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mutex_enter(&odd->odd_lock); + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + return (0); +} + +static int +overlay_target_resend(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mp = overlay_m_tx(odd, mp); + freemsgchain(mp); + + return (0); +} + +typedef struct overlay_targ_list_int { + boolean_t otli_count; + uint32_t otli_cur; + uint32_t otli_nents; + uint32_t otli_ents[]; +} overlay_targ_list_int_t; + +static int +overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_list_t n; + overlay_targ_list_int_t *otl; + + if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + /* + */ + if (n.otl_nents >= INT32_MAX / sizeof (uint32_t)) + return (EINVAL); + *bsize = sizeof (overlay_targ_list_int_t) + + sizeof (uint32_t) * n.otl_nents; + otl = kmem_zalloc(*bsize, KM_SLEEP); + otl->otli_cur = 0; + otl->otli_nents = n.otl_nents; + if 
(otl->otli_nents != 0) { + otl->otli_count = B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + 
mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if 
(base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we now it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order and the first node will be the mac 00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff. 
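
A large dynamic cache generally will not fit in one OVERLAY_TARG_CACHE_ITER call, so the marker records where the previous call stopped and each subsequent call resumes at the first entry at or after that MAC. The same cursor-style iteration over a plain sorted table, as a stand-alone sketch (illustrative names):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define	MACLEN	6

/*
 * Copy up to 'want' MACs from a sorted table into 'out', starting at the
 * first entry >= *cursor. The cursor is advanced so the next call resumes
 * where this one stopped; a real consumer would also carry a "done" flag,
 * as the driver does with otcm_done. Returns the number of entries written.
 */
static size_t
cache_iter(const uint8_t (*table)[MACLEN], size_t nents,
    uint8_t cursor[MACLEN], uint8_t (*out)[MACLEN], size_t want)
{
	size_t i = 0, written = 0;

	/* Find the resume point: the first entry not below the cursor. */
	while (i < nents && memcmp(table[i], cursor, MACLEN) < 0)
		i++;

	for (; i < nents && written < want; i++, written++)
		(void) memcpy(out[written], table[i], MACLEN);

	if (i < nents)
		(void) memcpy(cursor, table[i], MACLEN);	/* resume here */

	return (written);
}
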
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + 
overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return (EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + 
list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..92144b3985 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,394 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine on + * what address we should be listening on. While we do not have a default + * address to listen upon, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Should we enable UDP source port hashing for fanout. + */ +boolean_t vxlan_fanout = B_TRUE; + +/* + * This represents the size in bytes that we want to allocate when allocating a + * vxlan header block. This is intended such that lower levels can try and use + * the message block that we allocate for the IP and UPD header. The hope is + * that even if this is tunneled, that this is enough space. + * + * The vxlan_noalloc_min value represents the minimum amount of space we need to + * consider not allocating a message block and just passing it down the stack in + * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet + * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. 
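
Putting the header description above into code: the encapsulation is two 32-bit words in network byte order, the mandatory I bit in the first and the 24-bit VNI in the upper bits of the second. A stand-alone pack/parse sketch (the names are illustrative, not the driver's):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* htonl/ntohl */

#define	VXLAN_SKETCH_HDRLEN	8
#define	VXLAN_SKETCH_IFLAG	0x08000000u	/* the mandatory I bit */

/* Write the two 32-bit words in network byte order; vni must be < 2^24. */
static void
vxlan_pack(uint8_t hdr[VXLAN_SKETCH_HDRLEN], uint32_t vni)
{
	uint32_t w;

	w = htonl(VXLAN_SKETCH_IFLAG);
	(void) memcpy(hdr, &w, 4);
	w = htonl(vni << 8);		/* the VNI lives in the top 24 bits */
	(void) memcpy(hdr + 4, &w, 4);
}

/* Return 0 and fill *vnip if the I bit is set; -1 for a malformed header. */
static int
vxlan_parse(const uint8_t hdr[VXLAN_SKETCH_HDRLEN], uint32_t *vnip)
{
	uint32_t flags, id;

	(void) memcpy(&flags, hdr, 4);
	(void) memcpy(&id, hdr + 4, 4);
	if ((ntohl(flags) & VXLAN_SKETCH_IFLAG) == 0)
		return (-1);
	*vnip = ntohl(id) >> 8;
	return (0);
}
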
+ */ +uint_t vxlan_alloc_size = 128; +uint_t vxlan_noalloc_min = 54; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * provider's can use. + */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { + /* + * This allocation could get hot. We may want to have a good + * way to cache and handle this allocation the same way that IP + * does with keeping around a message block per entry, or + * basically treating this as an immutable message block in the + * system. Basically freemsg() will be a nop, but we'll do the + * right thing with respect to the rest of the chain. 
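
The test just above decides whether the frame's own message block can take the VXLAN header in place (the buffer is exclusively owned and has at least vxlan_noalloc_min bytes of headroom, enough for the outer Ethernet, IP, and UDP headers the lower layers will also prepend) or whether a separate header block has to be allocated and chained in front. The same decision on a flat user-level buffer, as a sketch; unlike the driver it copies the payload instead of chaining, and all names are illustrative:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct frame {
	uint8_t	*f_base;	/* start of the allocation */
	uint8_t	*f_data;	/* first byte of the frame */
	size_t	f_len;
	int	f_refs;		/* shared buffers must not be rewritten */
} frame_t;

/*
 * Prepend 'hlen' header bytes. Reuse existing headroom when the buffer is
 * exclusively owned and has room; otherwise allocate a fresh, larger buffer.
 * Returns 0 on success, -1 on allocation failure.
 */
static int
prepend_header(frame_t *f, const uint8_t *hdr, size_t hlen)
{
	if (f->f_refs == 1 && (size_t)(f->f_data - f->f_base) >= hlen) {
		f->f_data -= hlen;		/* fast path: reuse headroom */
	} else {
		uint8_t *nb = malloc(hlen + f->f_len);

		if (nb == NULL)
			return (-1);
		(void) memcpy(nb + hlen, f->f_data, f->f_len);
		/* A real implementation would release the old buffer here. */
		f->f_base = f->f_data = nb;
		f->f_refs = 1;
	}
	(void) memcpy(f->f_data, hdr, hlen);
	f->f_len += hlen;
	return (0);
}
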
+ */ + ob = allocb(vxlan_alloc_size, 0); + if (ob == NULL) + return (ENOMEM); + + ob->b_wptr = DB_LIM(ob); + ob->b_rptr = ob->b_wptr; + ob->b_cont = mp; + } else { + ob = mp; + } + ob->b_rptr -= VXLAN_HDR_LEN; + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = ntohl(VXLAN_F_VDI); + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + 
vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, + vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + "VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp == NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index 4ea5cd9778..b06e750888 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019, Joyent, Inc. */ #include <sys/sysmacros.h> @@ -684,6 +684,7 @@ pcie_init_pfd(dev_info_t *dip) pfd_p->pe_bus_p = bus_p; pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -840,6 +841,7 @@ pcie_rc_init_pfd(dev_info_t *dip, pf_data_t *pfd_p) { pfd_p->pe_bus_p = PCIE_DIP2DOWNBUS(dip); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -921,7 +923,7 @@ pcie_rc_init_bus(dev_info_t *dip) bus_p->bus_aer_off = (uint16_t)-1; /* Needed only for handle lookup */ - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); ndi_set_bus_private(dip, B_FALSE, DEVI_PORT_TYPE_PCI, bus_p); @@ -938,6 +940,180 @@ pcie_rc_fini_bus(dev_info_t *dip) } /* + * We need to capture the supported, maximum, and current device speed and + * width. The way that this has been done has changed over time. + * + * Prior to PCIe Gen 3, there were only current and supported speed fields. + * These were found in the link status and link capabilities registers of the + * PCI express capability. With the change to PCIe Gen 3, the information in the + * link capabilities changed to the maximum value. The supported speeds vector + * was moved to the link capabilities 2 register. + * + * Now, a device may not implement some of these registers. To determine whether + * or not it's here, we have to do the following. First, we need to check the + * revision of the PCI express capability. The link capabilities 2 register did + * not exist prior to version 2 of this register. 
+ */ +static void +pcie_capture_speeds(pcie_bus_t *bus_p, pcie_req_id_t bdf, dev_info_t *rcdip) +{ + uint16_t vers, status; + uint32_t val, cap, cap2; + + if (!PCIE_IS_PCIE(bus_p)) + return; + + vers = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + PCIE_PCIECAP); + if (vers == PCI_EINVAL16) + return; + vers &= PCIE_PCIECAP_VER_MASK; + + /* + * Verify the capability's version. + */ + switch (vers) { + case PCIE_PCIECAP_VER_1_0: + cap2 = 0; + break; + case PCIE_PCIECAP_VER_2_0: + cap2 = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKCAP2); + if (cap2 == PCI_EINVAL32) + cap2 = 0; + break; + default: + /* Don't try and handle an unknown version */ + return; + } + + status = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKSTS); + cap = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + PCIE_LINKCAP); + if (status == PCI_EINVAL16 || cap == PCI_EINVAL32) + return; + + switch (status & PCIE_LINKSTS_SPEED_MASK) { + case PCIE_LINKSTS_SPEED_2_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKSTS_SPEED_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKSTS_SPEED_8: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_8; + break; + case PCIE_LINKSTS_SPEED_16: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_16; + break; + default: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + + switch (status & PCIE_LINKSTS_NEG_WIDTH_MASK) { + case PCIE_LINKSTS_NEG_WIDTH_X1: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKSTS_NEG_WIDTH_X2: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKSTS_NEG_WIDTH_X4: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKSTS_NEG_WIDTH_X8: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKSTS_NEG_WIDTH_X12: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKSTS_NEG_WIDTH_X16: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKSTS_NEG_WIDTH_X32: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + switch (cap & PCIE_LINKCAP_MAX_WIDTH_MASK) { + case PCIE_LINKCAP_MAX_WIDTH_X1: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKCAP_MAX_WIDTH_X2: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKCAP_MAX_WIDTH_X4: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKCAP_MAX_WIDTH_X8: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKCAP_MAX_WIDTH_X12: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKCAP_MAX_WIDTH_X16: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKCAP_MAX_WIDTH_X32: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_max_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + /* + * If we have the Link Capabilities 2, then we can get the supported + * speeds from it and treat the bits in Link Capabilities 1 as the + * maximum. If we don't, then we need to follow the Implementation Note + * in the standard under Link Capabilities 2. Effectively, this means + * that if the value of 10b is set in Link Capabilities register, that + * it supports both 2.5 and 5 GT/s speeds. 
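For reference, the logic the comment describes condenses to a small amount of arithmetic on the raw registers. The following user-level sketch uses stand-in macro names (not the driver's PCIE_* definitions) and assumes the field encodings from the PCIe specification: bit 1 of the Link Capabilities 2 supported-speeds vector is 2.5 GT/s, bit 2 is 5.0 GT/s, and the 4-bit Max Link Speed field of Link Capabilities encodes 2.5 GT/s as 1 and 5.0 GT/s as 2.

#include <stdint.h>

/* Stand-in field definitions for this sketch only (not the driver's headers). */
#define	SKETCH_LINKCAP2_SPEED_2_5	(1u << 1)
#define	SKETCH_LINKCAP2_SPEED_5		(1u << 2)
#define	SKETCH_LINKCAP_MAX_SPEED_MASK	0xfu

#define	SKETCH_SPEED_2_5	0x1u
#define	SKETCH_SPEED_5		0x2u

/*
 * Derive the supported-speed bitmask.  With no Link Capabilities 2 register
 * (it reads as zero), apply the Implementation Note: a Max Link Speed of 10b
 * implies both 2.5 and 5.0 GT/s, while 01b implies 2.5 GT/s only.
 */
static uint32_t
sketch_supported_speeds(uint32_t linkcap, uint32_t linkcap2)
{
	uint32_t sup = 0;

	if (linkcap2 != 0) {
		if (linkcap2 & SKETCH_LINKCAP2_SPEED_2_5)
			sup |= SKETCH_SPEED_2_5;
		if (linkcap2 & SKETCH_LINKCAP2_SPEED_5)
			sup |= SKETCH_SPEED_5;
		return (sup);
	}

	switch (linkcap & SKETCH_LINKCAP_MAX_SPEED_MASK) {
	case 2:
		sup = SKETCH_SPEED_2_5 | SKETCH_SPEED_5;
		break;
	case 1:
		sup = SKETCH_SPEED_2_5;
		break;
	default:
		break;
	}
	return (sup);
}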
+ */ + if (cap2 != 0) { + if (cap2 & PCIE_LINKCAP2_SPEED_2_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_2_5; + if (cap2 & PCIE_LINKCAP2_SPEED_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_5; + if (cap2 & PCIE_LINKCAP2_SPEED_8) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_8; + if (cap2 & PCIE_LINKCAP2_SPEED_16) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_16; + + switch (cap & PCIE_LINKCAP_MAX_SPEED_MASK) { + case PCIE_LINKCAP_MAX_SPEED_2_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKCAP_MAX_SPEED_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKCAP_MAX_SPEED_8: + bus_p->bus_max_speed = PCIE_LINK_SPEED_8; + break; + case PCIE_LINKCAP_MAX_SPEED_16: + bus_p->bus_max_speed = PCIE_LINK_SPEED_16; + break; + default: + bus_p->bus_max_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + } else { + if (cap & PCIE_LINKCAP_MAX_SPEED_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5 | + PCIE_LINK_SPEED_5; + } + + if (cap & PCIE_LINKCAP_MAX_SPEED_2_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5; + } + } +} + +/* * partially init pcie_bus_t for device (dip,bdf) for accessing pci * config space * @@ -1134,6 +1310,10 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags) } } + /* + * Save and record speed information about the device. + */ + caps_done: /* save RP dip and RP bdf */ if (PCIE_IS_RP(bus_p)) { @@ -1170,7 +1350,7 @@ caps_done: } bus_p->bus_soft_state = PCI_SOFT_STATE_CLOSED; - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); bus_p->bus_mps = 0; ndi_set_bus_private(dip, B_TRUE, DEVI_PORT_TYPE_PCI, (void *)bus_p); @@ -1226,6 +1406,8 @@ initial_done: pcie_init_plat(dip); + pcie_capture_speeds(bus_p, bdf, rcdip); + final_done: PCIE_DBG("Add %s(dip 0x%p, bdf 0x%x, secbus 0x%x)\n", @@ -1318,14 +1500,15 @@ pcie_get_rc_dip(dev_info_t *dip) return (rcdip); } -static boolean_t +boolean_t pcie_is_pci_device(dev_info_t *dip) { dev_info_t *pdip; char *device_type; pdip = ddi_get_parent(dip); - ASSERT(pdip); + if (pdip == NULL) + return (B_FALSE); if (ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, DDI_PROP_DONTPASS, "device_type", &device_type) != DDI_PROP_SUCCESS) diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c index a8c02caa9c..6a335db3e2 100644 --- a/usr/src/uts/common/io/pciex/pcie_fault.c +++ b/usr/src/uts/common/io/pciex/pcie_fault.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ #include <sys/sysmacros.h> @@ -919,10 +920,18 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) } /* - * Read vendor/device ID and check with cached data, if it doesn't match - * could very well be a device that isn't responding anymore. Just - * stop. Save the basic info in the error q for post mortem debugging - * purposes. + * If this is a device used for PCI passthrough into a virtual machine, + * don't let any error it caused panic the system. + */ + if (bus_p->bus_fm_flags & PF_FM_IS_PASSTHRU) + pfd_p->pe_severity_mask |= PF_ERR_PANIC; + + /* + * Read vendor/device ID and check with cached data; if it doesn't + * match, it could very well mean that the device is no longer + * responding. In this case, we return PF_SCAN_BAD_RESPONSE; should + * the caller choose to panic in this case, we will have the basic + * info in the error queue for the purposes of postmortem debugging. 
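Later in this change, pcie_fault.c gains pf_set_passthru() for toggling the PF_FM_IS_PASSTHRU flag that drives the severity masking above. A hedged sketch of how a passthrough driver might bracket guest ownership of a device with it; the function names and flow here are illustrative assumptions, and only pf_set_passthru() itself comes from this change.

/*
 * Hypothetical passthrough driver hooks (names assumed for illustration).
 */
static int
pt_assign_device(dev_info_t *dip)
{
	/*
	 * While the guest owns the device, mask PF_ERR_PANIC so errors it
	 * provokes are logged and contained rather than panicking the host.
	 */
	pf_set_passthru(dip, B_TRUE);
	return (0);
}

static void
pt_unassign_device(dev_info_t *dip)
{
	/* Restore normal fatal-error handling once the device is returned. */
	pf_set_passthru(dip, B_FALSE);
}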
*/ if (PCIE_GET(32, bus_p, PCI_CONF_VENID) != bus_p->bus_dev_ven_id) { char buf[FM_MAX_CLASS]; @@ -933,12 +942,12 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0, NULL); /* - * For IOV/Hotplug purposes skip gathering info fo this device, + * For IOV/Hotplug purposes skip gathering info for this device, * but populate affected info and severity. Clear out any data * that maybe been saved in the last fabric scan. */ pf_reset_pfd(pfd_p); - pfd_p->pe_severity_flags = PF_ERR_PANIC_BAD_RESPONSE; + pfd_p->pe_severity_flags = PF_ERR_BAD_RESPONSE; PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_SELF; /* Add the snapshot to the error q */ @@ -950,6 +959,7 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) pf_pci_regs_gather(pfd_p, bus_p); pf_pci_regs_clear(pfd_p, bus_p); + if (PCIE_IS_RP(bus_p)) pf_pci_find_rp_fault(pfd_p, bus_p); @@ -984,6 +994,22 @@ done: } /* + * Set the passthru flag on a device bus_p. Called by passthru drivers to + * indicate when a device is or is no longer under passthru control. + */ +void +pf_set_passthru(dev_info_t *dip, boolean_t is_passthru) +{ + pcie_bus_t *bus_p = PCIE_DIP2BUS(dip); + + if (is_passthru) { + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_PASSTHRU); + } else { + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_PASSTHRU); + } +} + +/* * Called during postattach to initialize a device's error handling * capabilities. If the devices has already been hardened, then there isn't * much needed. Otherwise initialize the device's default FMA capabilities. @@ -1026,7 +1052,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); cap &= (DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); - bus_p->bus_fm_flags |= PF_FM_IS_NH; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_NH); if (cmd == DDI_ATTACH) { ddi_fm_init(dip, &cap, &ibc); @@ -1041,7 +1067,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) /* If ddi_fm_init fails for any reason RETURN */ if (!fmhdl) { - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); return; } @@ -1051,7 +1077,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) ddi_fm_handler_register(dip, pf_dummy_cb, NULL); } - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); } /* undo FMA lock, called at predetach */ @@ -1068,7 +1094,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) return; /* no other code should set the flag to false */ - bus_p->bus_fm_flags &= ~PF_FM_READY; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_READY); /* * Grab the mutex to make sure device isn't in the middle of @@ -1082,7 +1108,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) /* undo non-hardened drivers */ if (bus_p->bus_fm_flags & PF_FM_IS_NH) { if (cmd == DDI_DETACH) { - bus_p->bus_fm_flags &= ~PF_FM_IS_NH; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_NH); pci_ereport_teardown(dip); /* * ddi_fini itself calls ddi_handler_unregister, @@ -1379,7 +1405,7 @@ pf_analyse_error(ddi_fm_error_t *derr, pf_impl_t *impl) sts_flags = 0; /* skip analysing error when no error info is gathered */ - if (pfd_p->pe_severity_flags == PF_ERR_PANIC_BAD_RESPONSE) + if (pfd_p->pe_severity_flags == PF_ERR_BAD_RESPONSE) goto done; switch (PCIE_PFD2BUS(pfd_p)->bus_dev_type) { @@ -1457,6 +1483,8 @@ done: /* Have pciev_eh adjust the severity */ pfd_p->pe_severity_flags = pciev_eh(pfd_p, impl); + pfd_p->pe_severity_flags &= ~pfd_p->pe_severity_mask; + 
error_flags |= pfd_p->pe_severity_flags; } @@ -3060,6 +3088,7 @@ pf_reset_pfd(pf_data_t *pfd_p) pcie_bus_t *bus_p = PCIE_PFD2BUS(pfd_p); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; /* pe_lock and pe_valid were reset in pf_send_ereport */ diff --git a/usr/src/uts/common/io/pciex/pciev.c b/usr/src/uts/common/io/pciex/pciev.c index 18794318dd..da68026dcf 100644 --- a/usr/src/uts/common/io/pciex/pciev.c +++ b/usr/src/uts/common/io/pciex/pciev.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2017, Joyent, Inc. + */ + #include <sys/types.h> #include <sys/ddi.h> #include <sys/dditypes.h> @@ -302,7 +306,7 @@ pciev_eh(pf_data_t *pfd_p, pf_impl_t *impl) pcie_faulty_all = B_TRUE; } else if (severity & (PF_ERR_NO_PANIC | PF_ERR_MATCHED_DEVICE | - PF_ERR_PANIC | PF_ERR_PANIC_BAD_RESPONSE)) { + PF_ERR_PANIC | PF_ERR_BAD_RESPONSE)) { uint16_t affected_flag, dev_affected_flags; uint_t is_panic = 0, is_aff_dev_found = 0; diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c index 665c9eff6c..9aaf58fb7b 100644 --- a/usr/src/uts/common/io/physmem.c +++ b/usr/src/uts/common/io/physmem.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp) int ret; static int msg_printed = 0; + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EACCES); + if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) { return (EINVAL); } diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. 
+# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c index f83b0abf39..0ae06f88cc 100644 --- a/usr/src/uts/common/io/pseudonex.c +++ b/usr/src/uts/common/io/pseudonex.c @@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t); static int pseudonex_open(dev_t *, int, int, cred_t *); static int pseudonex_close(dev_t, int, int, cred_t *); static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int, + ddi_iblock_cookie_t *); static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); @@ -90,6 +92,8 @@ static void *pseudonex_state; typedef struct pseudonex_state { dev_info_t *pnx_devi; + int pnx_fmcap; + ddi_iblock_cookie_t pnx_fm_ibc; } pseudonex_state_t; static struct bus_ops pseudonex_bus_ops = { @@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = { NULL, /* bus_intr_ctl */ NULL, /* bus_config */ NULL, /* bus_unconfig */ - NULL, /* bus_fm_init */ + pseudonex_fm_init, /* bus_fm_init */ NULL, /* bus_fm_fini */ NULL, /* bus_fm_access_enter */ NULL, /* bus_fm_access_exit */ @@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pnx_state = ddi_get_soft_state(pseudonex_state, instance); pnx_state->pnx_devi = devi; + pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE; + ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc); + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, DDI_NT_NEXUS, 0) != DDI_SUCCESS) { ddi_remove_minor_node(devi, NULL); @@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) if (cmd == DDI_SUSPEND) return (DDI_SUCCESS); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ddi_fm_fini(devi); ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(pseudonex_state, instance); return (DDI_SUCCESS); @@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child) } static int +pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, + ddi_iblock_cookie_t *ibc) +{ + pseudonex_state_t *pnx_state; + + pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip)); + ASSERT(pnx_state != NULL); + ASSERT(ibc != NULL); + *ibc = pnx_state->pnx_fm_ibc; + return (pnx_state->pnx_fmcap & cap); +} + +static int pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, void *arg, void *result) { diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index bc8c17bedd..54bcee88bc 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
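The PTMPTSOPENCB ioctl handled below hands back a callback/argument pair so an in-kernel caller can later ask whether the pts side of the terminal has been opened. A sketch of one way such a caller, layered over ptm via LDI, might fetch and invoke it; apart from PTMPTSOPENCB and ptmptsopencb_t, the details here (I_STR framing, handle management) are assumptions.

/*
 * Sketch of an in-kernel consumer.  "lh" is an LDI handle onto the ptm
 * device held by the caller.
 */
static boolean_t
pts_side_is_open(ldi_handle_t lh)
{
	ptmptsopencb_t ppocb;
	struct strioctl istr;
	int rval;

	bzero(&istr, sizeof (istr));
	istr.ic_cmd = PTMPTSOPENCB;
	istr.ic_timout = 0;
	istr.ic_len = sizeof (ppocb);
	istr.ic_dp = (char *)&ppocb;

	if (ldi_ioctl(lh, I_STR, (intptr_t)&istr, FKIOCTL, kcred,
	    &rval) != 0)
		return (B_FALSE);

	/* Invoke the callback that ptm handed back to us. */
	return (ppocb.ppocb_func(ppocb.ppocb_arg));
}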
*/ @@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg Binary files differnew file mode 100644 index 0000000000..b932ffaa7c --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg Binary files differnew file mode 100644 index 0000000000..9421ecc0db --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png Binary files differnew file mode 100644 index 0000000000..4b8a66761a --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png Binary files differnew file mode 100644 index 0000000000..3254fbdc3b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg Binary files differnew file mode 100644 index 0000000000..7bb0dbf21b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin Binary files differnew file mode 100644 index 0000000000..43014fd8ea --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin Binary files differnew file mode 100644 index 0000000000..9524eb4a63 --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin diff --git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h index 2350cb4117..656d2a915f 100644 --- a/usr/src/uts/common/io/qede/qede_list.h +++ 
b/usr/src/uts/common/io/qede/qede_list.h @@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list, #define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE #endif /* !_QEDE_LIST_H */ - diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h index 43584f95f0..0ee38b4338 100644 --- a/usr/src/uts/common/io/qede/qede_version.h +++ b/usr/src/uts/common/io/qede/qede_version.h @@ -42,4 +42,3 @@ #define REVVERSION 25 #endif /* !_QEDE_VERSION_H */ - diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c index d79b86362c..a50bbcceec 100644 --- a/usr/src/uts/common/io/random.c +++ b/usr/src/uts/common/io/random.c @@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp) if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0) return (error); + if (crgetzone(credp) != global_zone) + continue; + switch (devno) { case DEVRANDOM: if ((error = random_add_entropy(buf, bytes, 0)) != 0) diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c index b49d5b735a..d9d40c83fd 100644 --- a/usr/src/uts/common/io/rsm/rsm.c +++ b/usr/src/uts/common/io/rsm/rsm.c @@ -22,8 +22,8 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. */ diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c index d615f8dd62..0e4fb433cf 100644 --- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c +++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c @@ -82,6 +82,7 @@ */ #include <sys/note.h> +#include <sys/debug.h> #include <sys/scsi/scsi.h> #include <sys/pci.h> #include <sys/disp.h> @@ -10552,8 +10553,7 @@ ahci_em_led_task(void *arg) mutex_enter(&led->aelta_ctl->ahcictl_mutex); if (ret) { - led->aelta_ctl->ahcictl_em_state[led->aelta_port] = - led->aelta_state; + led->aelta_ctl->ahcictl_em_state[led->aelta_port] = state; led->aelta_ret = 0; } else { led->aelta_ret = EIO; @@ -10763,6 +10763,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg) } task->aelta_ctl = ahci_ctlp; + task->aelta_port = set.aiems_port; task->aelta_port = (uint8_t)set.aiems_port; task->aelta_op = set.aiems_op; task->aelta_state = set.aiems_leds; @@ -10839,22 +10840,19 @@ static void ahci_em_quiesce(ahci_ctl_t *ahci_ctlp) { ASSERT(ahci_ctlp->ahcictl_em_flags & AHCI_EM_PRESENT); + VERIFY(mutex_owned(&ahci_ctlp->ahcictl_mutex)); - mutex_enter(&ahci_ctlp->ahcictl_mutex); ahci_ctlp->ahcictl_em_flags |= AHCI_EM_QUIESCE; - mutex_exit(&ahci_ctlp->ahcictl_mutex); - ddi_taskq_wait(ahci_ctlp->ahcictl_em_taskq); } static void ahci_em_suspend(ahci_ctl_t *ahci_ctlp) { - ahci_em_quiesce(ahci_ctlp); + VERIFY(mutex_owned(&ahci_ctlp->ahcictl_mutex)); - mutex_enter(&ahci_ctlp->ahcictl_mutex); + ahci_em_quiesce(ahci_ctlp); ahci_ctlp->ahcictl_em_flags &= ~AHCI_EM_READY; - mutex_exit(&ahci_ctlp->ahcictl_mutex); } static void @@ -10875,7 +10873,10 @@ ahci_em_fini(ahci_ctl_t *ahci_ctlp) return; } + mutex_enter(&ahci_ctlp->ahcictl_mutex); ahci_em_quiesce(ahci_ctlp); + mutex_exit(&ahci_ctlp->ahcictl_mutex); + ddi_taskq_destroy(ahci_ctlp->ahcictl_em_taskq); ahci_ctlp->ahcictl_em_taskq = NULL; } diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c index e5aa96f469..05298d8b05 100644 --- 
a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c @@ -72,6 +72,7 @@ #include <sys/file.h> #include <sys/policy.h> #include <sys/model.h> +#include <sys/refhash.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dr.h> @@ -99,7 +100,6 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c new file mode 100644 index 0000000000..a7ef2b69d7 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c @@ -0,0 +1,565 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static int smrt_attach(dev_info_t *, ddi_attach_cmd_t); +static int smrt_detach(dev_info_t *, ddi_detach_cmd_t); +static int smrt_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static void smrt_cleanup(smrt_t *); +static int smrt_command_comparator(const void *, const void *); + +/* + * Controller soft state. Each entry is an object of type "smrt_t". + */ +void *smrt_state; + +/* + * DMA attributes template. Each controller will make a copy of this template + * with appropriate customisations; e.g., the Scatter/Gather List Length. + */ +static ddi_dma_attr_t smrt_dma_attr_template = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + /* + * There is some suggestion that at least some, possibly older, Smart + * Array controllers cannot tolerate a DMA segment that straddles a 4GB + * boundary. + */ + .dma_attr_seg = 0xFFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * Device memory access attributes for device control registers. 
+ */ +ddi_device_acc_attr_t smrt_dev_attributes = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + +/* + * Character/Block Operations Structure + */ +static struct cb_ops smrt_cb_ops = { + .cb_rev = CB_REV, + .cb_flag = D_NEW | D_MP, + + .cb_open = scsi_hba_open, + .cb_close = scsi_hba_close, + + .cb_ioctl = smrt_ioctl, + + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = NULL, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +/* + * Device Operations Structure + */ +static struct dev_ops smrt_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + + .devo_attach = smrt_attach, + .devo_detach = smrt_detach, + + .devo_cb_ops = &smrt_cb_ops, + + .devo_getinfo = nodev, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_reset = nodev, + .devo_bus_ops = NULL, + .devo_power = nodev, + .devo_quiesce = nodev +}; + +/* + * Linkage structures + */ +static struct modldrv smrt_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "HP Smart Array", + .drv_dev_ops = &smrt_dev_ops +}; + +static struct modlinkage smrt_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &smrt_modldrv, NULL } +}; + + +int +_init() +{ + int r; + + VERIFY0(ddi_soft_state_init(&smrt_state, sizeof (smrt_t), 0)); + + if ((r = scsi_hba_init(&smrt_modlinkage)) != 0) { + goto fail; + } + + if ((r = mod_install(&smrt_modlinkage)) != 0) { + scsi_hba_fini(&smrt_modlinkage); + goto fail; + } + + return (r); + +fail: + ddi_soft_state_fini(&smrt_state); + return (r); +} + +int +_fini() +{ + int r; + + if ((r = mod_remove(&smrt_modlinkage)) == 0) { + scsi_hba_fini(&smrt_modlinkage); + ddi_soft_state_fini(&smrt_state); + } + + return (r); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&smrt_modlinkage, modinfop)); +} + +static int +smrt_iport_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + const char *addr; + dev_info_t *pdip; + int instance; + smrt_t *smrt; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Note, we cannot get to our parent via the tran's tran_hba_private + * member. This pointer is reset to NULL when the scsi_hba_tran_t + * structure is duplicated. 
+ */ + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + pdip = ddi_get_parent(dip); + instance = ddi_get_instance(pdip); + smrt = ddi_get_soft_state(smrt_state, instance); + VERIFY(smrt != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + if (smrt_logvol_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_virt_iport = dip; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + if (smrt_phys_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_phys_iport = dip; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_iport_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + const char *addr; + scsi_hba_tran_t *tran; + smrt_t *smrt; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + tran = ddi_get_driver_private(dip); + VERIFY(tran != NULL); + smrt = tran->tran_hba_private; + VERIFY(smrt != NULL); + + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + smrt_logvol_hba_teardown(smrt, dip); + smrt->smrt_virt_iport = NULL; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + smrt_phys_hba_teardown(smrt, dip); + smrt->smrt_phys_iport = NULL; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + uint32_t instance; + smrt_t *smrt; + boolean_t check_for_interrupts = B_FALSE; + int r; + char taskq_name[64]; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_attach(dip, cmd)); + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + /* + * Allocate the per-controller soft state object and get + * a pointer to it. + */ + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(smrt_state, instance) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not allocate soft state"); + return (DDI_FAILURE); + } + if ((smrt = ddi_get_soft_state(smrt_state, instance)) == NULL) { + dev_err(dip, CE_WARN, "could not get soft state"); + ddi_soft_state_free(smrt_state, instance); + return (DDI_FAILURE); + } + + /* + * Initialise per-controller state object. + */ + smrt->smrt_dip = dip; + smrt->smrt_instance = instance; + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + list_create(&smrt->smrt_commands, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link)); + list_create(&smrt->smrt_finishq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_finish)); + list_create(&smrt->smrt_abortq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_abort)); + list_create(&smrt->smrt_volumes, sizeof (smrt_volume_t), + offsetof(smrt_volume_t, smlv_link)); + list_create(&smrt->smrt_physicals, sizeof (smrt_physical_t), + offsetof(smrt_physical_t, smpt_link)); + list_create(&smrt->smrt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_ctlr)); + avl_create(&smrt->smrt_inflight, smrt_command_comparator, + sizeof (smrt_command_t), offsetof(smrt_command_t, + smcm_node)); + cv_init(&smrt->smrt_cv_finishq, NULL, CV_DRIVER, NULL); + + smrt->smrt_init_level |= SMRT_INITLEVEL_BASIC; + + /* + * Perform basic device setup, including identifying the board, mapping + * the I2O registers and the Configuration Table. + */ + if (smrt_device_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "device setup failed"); + goto fail; + } + + /* + * Select a Transport Method (e.g. Simple or Performant) and update + * the Configuration Table. This function also waits for the + * controller to become ready. 
+ */ + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "controller initialisation failed"); + goto fail; + } + + /* + * Each controller may have a different Scatter/Gather Element count. + * Configure a per-controller set of DMA attributes with the + * appropriate S/G size. + */ + VERIFY(smrt->smrt_sg_cnt > 0); + smrt->smrt_dma_attr = smrt_dma_attr_template; + smrt->smrt_dma_attr.dma_attr_sgllen = smrt->smrt_sg_cnt; + + /* + * Now that we have selected a Transport Method, we can configure + * the appropriate interrupt handlers. + */ + if (smrt_interrupts_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler setup failed"); + goto fail; + } + + /* + * Now that we have the correct interrupt priority, we can initialise + * the mutex. This must be done before the interrupt handler is + * enabled. + */ + mutex_init(&smrt->smrt_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(smrt->smrt_interrupt_pri)); + smrt->smrt_init_level |= SMRT_INITLEVEL_MUTEX; + + /* + * From this point forward, the controller is able to accept commands + * and (at least by polling) return command submissions. Setting this + * flag allows the rest of the driver to interact with the device. + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING; + + if (smrt_interrupts_enable(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler could not be enabled"); + goto fail; + } + + if (smrt_ctrl_hba_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "SCSI framework setup failed"); + goto fail; + } + + /* + * Set the appropriate Interrupt Mask Register bits to start + * command completion interrupts from the controller. + */ + smrt_intr_set(smrt, B_TRUE); + check_for_interrupts = B_TRUE; + + /* + * Register the maintenance routine for periodic execution: + */ + smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt, + SMRT_PERIODIC_RATE * NANOSEC, DDI_IPL_0); + smrt->smrt_init_level |= SMRT_INITLEVEL_PERIODIC; + + (void) snprintf(taskq_name, sizeof (taskq_name), "smrt_discover_%u", + instance); + smrt->smrt_discover_taskq = ddi_taskq_create(smrt->smrt_dip, taskq_name, + 1, TASKQ_DEFAULTPRI, 0); + if (smrt->smrt_discover_taskq == NULL) { + dev_err(dip, CE_WARN, "failed to create discovery task queue"); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_TASKQ; + + if ((r = smrt_event_init(smrt)) != 0) { + dev_err(dip, CE_WARN, "could not initialize event subsystem " + "(%d)", r); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_ASYNC_EVENT; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_VIRT) != DDI_SUCCESS) + goto fail; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_PHYS) != DDI_SUCCESS) + goto fail; + + /* + * Announce the attachment of this controller. + */ + ddi_report_dev(dip); + + return (DDI_SUCCESS); + +fail: + if (check_for_interrupts) { + if (smrt->smrt_stats.smrts_claimed_interrupts == 0) { + dev_err(dip, CE_WARN, "controller did not interrupt " + "during attach"); + } + } + smrt_cleanup(smrt); + return (DDI_FAILURE); +} + +static int +smrt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + scsi_hba_tran_t *tran = (scsi_hba_tran_t *)ddi_get_driver_private(dip); + smrt_t *smrt = (smrt_t *)tran->tran_hba_private; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_detach(dip, cmd)); + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* + * First, check to make sure that all SCSI framework targets have + * detached. 
+ */ + mutex_enter(&smrt->smrt_mutex); + if (!list_is_empty(&smrt->smrt_targets)) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach; targets still " + "using HBA"); + return (DDI_FAILURE); + } + + if (smrt->smrt_virt_iport != NULL || smrt->smrt_phys_iport != NULL) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach: iports still " + "attached"); + return (DDI_FAILURE); + } + + /* + * Prevent new targets from attaching now: + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_DETACHING; + mutex_exit(&smrt->smrt_mutex); + + /* + * Clean up all remaining resources. + */ + smrt_cleanup(smrt); + + return (DDI_SUCCESS); +} + +static int +smrt_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rval) +{ + int inst = MINOR2INST(getminor(dev)); + int status; + + if (secpolicy_sys_config(credp, B_FALSE) != 0) { + return (EPERM); + } + + /* + * Ensure that we have a soft state object for this instance. + */ + if (ddi_get_soft_state(smrt_state, inst) == NULL) { + return (ENXIO); + } + + switch (cmd) { + default: + status = scsi_hba_ioctl(dev, cmd, arg, mode, credp, rval); + break; + } + + return (status); +} + +static void +smrt_cleanup(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_ASYNC_EVENT) { + smrt_event_fini(smrt); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_ASYNC_EVENT; + } + + smrt_interrupts_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_TASKQ) { + ddi_taskq_destroy(smrt->smrt_discover_taskq); + smrt->smrt_discover_taskq = NULL; + smrt->smrt_init_level &= ~SMRT_INITLEVEL_TASKQ; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_PERIODIC) { + ddi_periodic_delete(smrt->smrt_periodic); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_PERIODIC; + } + + smrt_ctrl_hba_teardown(smrt); + + smrt_ctlr_teardown(smrt); + + smrt_device_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_BASIC) { + smrt_logvol_teardown(smrt); + smrt_phys_teardown(smrt); + + cv_destroy(&smrt->smrt_cv_finishq); + + VERIFY(list_is_empty(&smrt->smrt_commands)); + list_destroy(&smrt->smrt_commands); + list_destroy(&smrt->smrt_finishq); + list_destroy(&smrt->smrt_abortq); + + VERIFY(list_is_empty(&smrt->smrt_volumes)); + list_destroy(&smrt->smrt_volumes); + + VERIFY(list_is_empty(&smrt->smrt_physicals)); + list_destroy(&smrt->smrt_physicals); + + VERIFY(list_is_empty(&smrt->smrt_targets)); + list_destroy(&smrt->smrt_targets); + + VERIFY(avl_is_empty(&smrt->smrt_inflight)); + avl_destroy(&smrt->smrt_inflight); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_BASIC; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_MUTEX) { + mutex_destroy(&smrt->smrt_mutex); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_MUTEX; + } + + VERIFY0(smrt->smrt_init_level); + + ddi_soft_state_free(smrt_state, ddi_get_instance(smrt->smrt_dip)); +} + +/* + * Comparator for the "smrt_inflight" AVL tree in a "smrt_t". This AVL tree + * allows a tag ID to be mapped back to the relevant "smrt_command_t". 
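The comparator that follows orders inflight commands purely by tag, so the smrt_lookup_inflight() routine used elsewhere in this change can be little more than an avl_find(9F) against a stack key. A minimal sketch of that assumed shape:

/*
 * Minimal sketch (assumed shape, not taken from this change) of a lookup
 * keyed purely on the tag, relying on the comparator below.
 */
static smrt_command_t *
smrt_lookup_inflight_sketch(smrt_t *smrt, uint32_t tag)
{
	smrt_command_t search;

	VERIFY(MUTEX_HELD(&smrt->smrt_mutex));

	bzero(&search, sizeof (search));
	search.smcm_tag = tag;

	return (avl_find(&smrt->smrt_inflight, &search, NULL));
}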
+ */ +static int +smrt_command_comparator(const void *lp, const void *rp) +{ + const smrt_command_t *l = lp; + const smrt_command_t *r = rp; + + if (l->smcm_tag > r->smcm_tag) { + return (1); + } else if (l->smcm_tag < r->smcm_tag) { + return (-1); + } else { + return (0); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf new file mode 100644 index 0000000000..758ecd0779 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +scsi-no-quiesce=1; diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c new file mode 100644 index 0000000000..b4cdd5607e --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c @@ -0,0 +1,2023 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * Discovery, Resets, Periodics, and Events + * ---------------------------------------- + * + * Discovery is the act of figuring out what logical and physical volumes exist + * under the controller. Discovery happens in response to the following events: + * + * o iports for virtual and physical devices being attached + * o Controller event notifications indicating potential topology changes + * o After a reset of the controller, before we can perform I/O again + * + * Because we have to perform discovery after a reset, which can happen during + * panic(), that also means that discovery may be run in panic context. We + * also need to emphasize the need for discovery to happen after a controller + * reset. Once a reset is initiated, we cannot be certain about the addresses + * of any of the existing targets until the reset has completed. The driver + * performs I/Os to addresses that the controller provides. The controller + * specification says that these addresses may change after a controller reset. + * + * Unfortunately, all of this combined means that making sure we can correctly + * run discovery is somewhat complicated. In non-panic contexts, discovery is + * always run from a taskq. We'll kick off the discovery in the taskq if + * nothing is pending at that time. The state is managed by bits in the + * smrt_status member of the smrt_t. There are four bits at this time: + * + * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has + * requested that a discovery be performed. + * If no flags are set when this is set, + * then we will kick off discovery. All + * discovery requests are initiated via the + * smrt_discover_request() function. 
+ * + * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us + * running a discovery. It is removed when + * discovery finishes. + * + * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of + * circumstances, which will be described + * in a subsequent section. This indicates + * that the periodic must kick off the + * discovery process. + * + * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a + * controller reset occurred and we need to + * have a successful discovery to finish + * the act of resetting and allowing I/O to + * continue. + * + * In general, a request to discover kicks off the taskq to discover entries, if + * it hasn't already been requested or started. This also allows us to coalesce + * multiple requests, if needed. Note that if a request comes in when a + * discovery is ongoing, we do not kick off discovery again. Instead, we set + * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the + * initial pass has completed. + * + * When a discovery starts, the first thing it does is clear the + * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any + * additional requests for discovery that come in after this has started likely + * indicate that we've missed something. As such, when the discovery process + * finishes, if it sees the REQUESTED flag, then it will need to set the + * PERIODIC flag. The PERIODIC flag is used to indicate that we should run + * discovery again, but not kick if off immediately. Instead, it should be + * driven by the normal periodic behavior. + * + * If for some reason the act of discovery fails, or we fail to dispatch + * discovery due to a transient error, then we will flag PERIODIC so that the + * periodic tick will try and run things again. + * + * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set + * after a reset occurs. The reset thread will be blocked on this. + * Importantly, none of the code in the discovery path can ask for a controller + * reset at this time. If at the end of a discovery, this flag is set, then we + * will signal the reset thread that it should check on its status by + * broadcasting on the smrt_cv_finishq. At that point, the reset thread will + * continue. + * + * Panic Context + * ------------- + * + * All of this talk of threads and taskqs is well and good, but as an HBA + * driver, we have a serious responsibility to try and deal with panic sanely. + * In panic context, we will directly call the discovery functions and not poll + * for them to occur. + * + * However, because our discovery relies on the target maps, which aren't safe + * for panic context at this time, we have to take a different approach. We + * leverage the fact that we have a generation number stored with every + * discovery. If we try to do an I/O to a device where the generation doesn't + * match, then we know that it disappeared and should not be used. We also + * sanity check the model, serial numbers, and WWNs to make sure that these are + * the same devices. If they are, then we'll end up updating the address + * structures. + * + * Now, it is possible that when we were panicking, we had a thread that was in + * the process of running a discovery or even resetting the system. Once we're + * in panic, those threads aren't running, so if they didn't end up producing a + * new view of the world that the SCSI framework is using, then it shouldn't + * really matter, as we won't have updated the list of devices. 
Importantly, + * once we're in that context, we're not going to be attaching or detaching + * targets. If we get a request for one of these targets which has disappeared, + * we're going to have to end up giving up. + * + * Request Attributes + * ------------------ + * + * The CISS specification allows for three different kinds of attributes that + * describe how requests are queued to the controller. These are: + * + * HEAD OF QUEUE The request should go to the head of the + * controller queue. This is used for resets and + * aborts to ensure that they're not blocked behind + * additional I/O. + * + * SIMPLE This queues the request for normal processing. + * Commands queued this way are not special with + * respect to one another. We use this for all I/O + * and discovery commands. + * + * ORDERED This attribute is used to indicate that commands + * should be submitted and processed in some order. + * This is used primarily for the event + * notification bits so we can ensure that at the + * return of a cancellation of the event + * notification, that any outstanding request has + * been honored. + */ + +static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *); +static void smrt_discover(void *); + +/* + * The maximum number of seconds to wait for the controller to come online. + */ +unsigned smrt_ciss_init_time = 90; + +/* + * A tunable that determines the number of events per tick that we'll process + * via asynchronous event notification. If this rate is very high, then we will + * not submit the event and it will be picked up at the next tick of the + * periodic. + */ +uint_t smrt_event_intervention_threshold = 1000; + +/* + * Converts a LUN Address to a BMIC Identifier. The BMIC Identifier is used + * when performing various physical commands and generally should stay the same + * for a given device across inserts and removals; however, not across + * controller resets. These are calculated based on what the CISS specification + * calls the 'Level 2' target and bus, which don't have a real meaning in the + * SAS world otherwise. + */ +uint16_t +smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr) +{ + uint16_t id; + + id = (paddr->Target[1].PeripDev.Bus - 1) << 8; + id += paddr->Target[1].PeripDev.Dev; + + return (id); +} + +void +smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus, + unsigned target) +{ + lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR : + PERIPHERIAL_DEV_ADDR; + + lun->PhysDev.TargetId = target; + lun->PhysDev.Bus = bus; + + bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target)); +} + +/* + * According to the CISS Specification, the controller is always addressed in + * Mask Perhiperhal mode with a bus and target ID of zero. This is used by + * commands that need to write to the controller itself, which is generally + * discovery and other commands. 
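To make the discovery flag protocol described in the block comment above concrete, here is a sketch of how a request could be raised under smrt_mutex. The real smrt_discover_request() is only declared, not shown, in this hunk, so treat the shape below as an assumption rather than the actual implementation.

/*
 * Assumed shape of a discovery request, following the flag protocol
 * described in the block comment above.
 */
static void
smrt_discover_request_sketch(smrt_t *smrt)
{
	VERIFY(MUTEX_HELD(&smrt->smrt_mutex));

	if (smrt->smrt_status & (SMRT_CTLR_DISCOVERY_REQUESTED |
	    SMRT_CTLR_DISCOVERY_RUNNING)) {
		/* Coalesce with the pending or in-progress discovery. */
		smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED;
		return;
	}

	smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED;
	if (ddi_taskq_dispatch(smrt->smrt_discover_taskq, smrt_discover,
	    smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
		/*
		 * Transient dispatch failure: leave it for the periodic
		 * to retry on its next tick.
		 */
		smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
	}
}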
+ */ +void +smrt_write_controller_lun_addr(LUNAddr_t *lun) +{ + smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0); +} + +void +smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs) +{ + switch (type) { + case CISS_MSG_ABORT: + case CISS_MSG_RESET: + case CISS_MSG_NOP: + break; + + default: + panic("unknown message type"); + } + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs); + smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN; + smcm->smcm_va_cmd->Request.CDB[0] = type; +} + +void +smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag) +{ + smrt_tag_t cisstag; + + /* + * When aborting a particular command, the request is addressed + * to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort a single command. + */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK; + + /* + * The CISS Specification says that the tag value for a task-level + * abort should be in the CDB in bytes 4-11. + */ + bzero(&cisstag, sizeof (cisstag)); + cisstag.tag_value = tag; + bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4], + sizeof (cisstag)); +} + +void +smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr) +{ + /* + * When aborting all tasks for a particular Logical Volume, + * the command is addressed not to the controller but to + * the Volume itself. + */ + smcm->smcm_va_cmd->Header.LUN = *addr; + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort all commands for a particular Logical Volume. 
+ */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET; +} + +void +smrt_write_message_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = 0; + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_READ; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT; + senr.senr_flags = BE_32(0); + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_cancel_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT); + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_WRITE; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL; + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_reset_ctlr(smrt_command_t *smcm) +{ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_RESET, 0); + + smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR; +} + +void +smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs) +{ + /* + * No-op messages are always sent to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs); +} + +/* + * This routine is executed regularly by ddi_periodic_add(9F). It checks the + * health of the controller and looks for submitted commands that have timed + * out. + */ +void +smrt_periodic(void *arg) +{ + smrt_t *smrt = arg; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Before we even check if the controller is running to process + * everything else, we must first check if we had a request to kick off + * discovery. We do this before the check if the controller is running, + * as this may be required to finish a discovery. + */ + if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 && + (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 && + (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) { + if (ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_stats.smrts_discovery_tq_errors++; + } else { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC; + } + } + + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + /* + * The device is currently not active, e.g. due to an + * in-progress controller reset. + */ + mutex_exit(&smrt->smrt_mutex); + return; + } + + /* + * Check on the health of the controller firmware. Note that if the + * controller has locked up, this routine will panic the system. + */ + smrt_lockup_check(smrt); + + /* + * Reset the event notification threshold counter. 
+ */ + smrt->smrt_event_count = 0; + + /* + * Check inflight commands to see if they have timed out. + */ + for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight); + smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) { + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * Polled commands are timed out by the polling + * routine. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * This command has been aborted; either it will + * complete or the controller will be reset. + */ + continue; + } + + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * Already on the abort queue. + */ + continue; + } + + if (smcm->smcm_expiry == 0) { + /* + * This command has no expiry time. + */ + continue; + } + + if (gethrtime() > smcm->smcm_expiry) { + list_insert_tail(&smrt->smrt_abortq, smcm); + smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT; + } + } + + /* + * Process the abort queue. + */ + (void) smrt_process_abortq(smrt); + + /* + * Check if we have an outstanding event intervention request. Note, + * the command in question should always be in a state such that it is + * usable by the system here. The command is always prepared again by + * the normal event notification path, even if a reset has occurred. + * The reset will be processed before we'd ever consider running an + * event again. Note, if we fail to submit this, then we leave this for + * the next occurrence of the periodic. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_stats.smrts_events_intervened++; + + if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + } + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_retrieve(smrt_t *smrt) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_retrieve_simple(smrt); + return (DDI_SUCCESS); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +/* + * Grab a new tag number for this command. We aim to avoid reusing tag numbers + * as much as possible, so as to avoid spurious double completion from the + * controller. + */ +static void +smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Loop until we find a tag that is not in use. The tag space is + * very large (~30 bits) and the maximum number of inflight commands + * is comparatively small (~1024 in current controllers). + */ + for (;;) { + uint32_t new_tag = smrt->smrt_next_tag; + + if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) { + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + } + + if (smrt_lookup_inflight(smrt, new_tag) != NULL) { + /* + * This tag is already used on an inflight command. + * Choose another. + */ + continue; + } + + /* + * Set the tag for the command and also write it into the + * appropriate part of the request block. + */ + smcm->smcm_tag = new_tag; + smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag; + return; + } +} + +/* + * Submit a command to the controller. + */ +int +smrt_submit(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT); + + /* + * Anything that asks us to ignore the running state of the controller + * must be wired up to poll for completion. 
+ */ + if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + } + + /* + * If the controller is currently being reset, do not allow command + * submission. However, if this is one of the commands needed to finish + * reset, as indicated on the command structure, allow it. + */ + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) && + !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) { + return (EIO); + } + + /* + * Do not allow submission of more concurrent commands than the + * controller supports. + */ + if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) { + return (EAGAIN); + } + + /* + * Synchronise the Command Block DMA resources to ensure that the + * device has a consistent view before we pass it the command. + */ + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure"); + return (EIO); + } + + /* + * Ensure that this command is not re-used without issuing a new + * tag number and performing any appropriate cleanup. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED)); + smcm->smcm_status |= SMRT_CMD_STATUS_USED; + + /* + * Assign a tag that is not currently in use + */ + smrt_set_new_tag(smrt, smcm); + + /* + * Insert this command into the inflight AVL. + */ + avl_index_t where; + if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) { + dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x", + smcm->smcm_tag); + } + avl_insert(&smrt->smrt_inflight, smcm, where); + if (smrt->smrt_stats.smrts_max_inflight < + avl_numnodes(&smrt->smrt_inflight)) { + smrt->smrt_stats.smrts_max_inflight = + avl_numnodes(&smrt->smrt_inflight); + } + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + + smcm->smcm_time_submit = gethrtime(); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_submit_simple(smrt, smcm); + return (0); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static void +smrt_process_finishq_sync(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure"); + } +} + +static void +smrt_process_finishq_one(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE)); + smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE; + + switch (smcm->smcm_type) { + case SMRT_CMDTYPE_INTERNAL: + cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq); + return; + + case SMRT_CMDTYPE_SCSA: + smrt_hba_complete(smcm); + return; + + case SMRT_CMDTYPE_EVENT: + smrt_event_complete(smcm); + return; + + case SMRT_CMDTYPE_ABORTQ: + /* + * Abort messages sent as part of abort queue processing + * do not require any completion activity. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return; + + case SMRT_CMDTYPE_PREINIT: + dev_err(smrt->smrt_dip, CE_PANIC, "preinit command " + "completed after initialisation"); + return; + } + + panic("unknown command type"); +} + +/* + * Process commands in the completion queue. 
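smrt_submit() is the single entry point for handing a prepared command to the controller; internal commands typically pair it with smrt_poll_for(), which appears further below. The following is a minimal sketch of that pairing around the no-op message, with an illustrative 30-second timeout and simplified error handling; it is not part of this change, and the helper name is hypothetical.

	/*
	 * Hypothetical helper, for illustration only: send a polled NOP to
	 * the controller and wait for it to complete.
	 */
	static int
	smrt_example_nop(smrt_t *smrt)
	{
		smrt_command_t *smcm;
		int r;

		if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
		    KM_NOSLEEP)) == NULL)
			return (ENOMEM);

		smrt_write_message_nop(smcm, 30);
		smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
		smcm->smcm_expiry = gethrtime() + 30 * NANOSEC;

		mutex_enter(&smrt->smrt_mutex);
		if ((r = smrt_submit(smrt, smcm)) == 0)
			r = smrt_poll_for(smrt, smcm);
		mutex_exit(&smrt->smrt_mutex);

		/*
		 * A timed-out command is still inflight and must not be
		 * freed; the real driver abandons it instead.
		 */
		if (r != ETIMEDOUT)
			smrt_command_free(smcm);

		return (r);
	}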
+ */ +void +smrt_process_finishq(smrt_t *smrt) +{ + smrt_command_t *smcm; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) { + /* + * Synchronise the Command Block before we read from it or + * free it, to ensure that any writes from the controller are + * visible. + */ + smrt_process_finishq_sync(smcm); + + /* + * Check if this command was in line to be aborted. + */ + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * This command was in line, but the controller + * subsequently completed the command before we + * were able to do so. + */ + list_remove(&smrt->smrt_abortq, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT; + } + + /* + * Check if this command has been abandoned by the original + * submitter. If it has, free it now to avoid a leak. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * This command will be picked up and processed + * by "smrt_poll_for()" once the CV is triggered + * at the end of processing. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + continue; + } + + smrt_process_finishq_one(smcm); + } + + cv_broadcast(&smrt->smrt_cv_finishq); +} + +/* + * Process commands in the abort queue. + */ +void +smrt_process_abortq(smrt_t *smrt) +{ + smrt_command_t *smcm; + smrt_command_t *abort_smcm = NULL; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (list_is_empty(&smrt->smrt_abortq)) { + goto out; + } + +another: + mutex_exit(&smrt->smrt_mutex); + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send abort messages. We will + * try again the next time around. + */ + mutex_enter(&smrt->smrt_mutex); + goto out; + } + mutex_enter(&smrt->smrt_mutex); + + while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) { + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently inflight, so + * no abort is needed. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message has already been sent for + * this command. + */ + continue; + } + + /* + * Send an abort message for the command. + */ + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + if (smrt_submit(smrt, abort_smcm) != 0) { + /* + * The command could not be submitted to the + * controller. Put it back in the abort queue + * and give up for now. + */ + list_insert_head(&smrt->smrt_abortq, smcm); + goto out; + } + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * The abort message was sent. Release it and + * allocate another command. + */ + abort_smcm = NULL; + goto another; + } + +out: + cv_broadcast(&smrt->smrt_cv_finishq); + if (abort_smcm != NULL) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + mutex_enter(&smrt->smrt_mutex); + } +} + +int +smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + + while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) { + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. 
Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (ddi_in_panic()) { + /* + * When the system is panicking, there are no + * interrupts or other threads. Drive the polling loop + * on our own, but with a small delay to avoid + * aggravating the controller while we're trying to + * dump. + */ + (void) smrt_retrieve(smrt); + smrt_process_finishq(smrt); + drv_usecwait(100); + continue; + } + + /* + * Wait for command completion to return through the regular + * interrupt handling path. + */ + if (smcm->smcm_expiry == 0) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } else { + /* + * Wait only until the expiry time for this command. + */ + (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq, + &smrt->smrt_mutex, smcm->smcm_expiry); + } + } + + /* + * Fire the completion callback for this command. The callback + * is responsible for freeing the command, so it may not be + * referenced again once this call returns. + */ + smrt_process_finishq_one(smcm); + + return (0); +} + +void +smrt_intr_set(smrt_t *smrt, boolean_t enabled) +{ + /* + * Read the Interrupt Mask Register. + */ + uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + if (enabled) { + imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } else { + imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } + smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); +} + +/* + * Signal to the controller that we have updated the Configuration Table by + * writing to the Inbound Doorbell Register. The controller will, after some + * number of seconds, acknowledge this by clearing the bit. + * + * If successful, return DDI_SUCCESS. If the controller takes too long to + * acknowledge, return DDI_FAILURE. + */ +int +smrt_cfgtbl_flush(smrt_t *smrt) +{ + /* + * Read the current value of the Inbound Doorbell Register. + */ + uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + /* + * Signal the Configuration Table change to the controller. + */ + idr |= CISS_IDR_BIT_CFGTBL_CHANGE; + smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr); + + /* + * Wait for the controller to acknowledge the change. + */ + for (unsigned i = 0; i < smrt_ciss_init_time; i++) { + idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) { + return (DDI_SUCCESS); + } + + /* + * Wait for one second before trying again. + */ + delay(drv_usectohz(1000000)); + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller " + "configuration completed"); + return (DDI_FAILURE); +} + +int +smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the "Supported Transport Methods" field in + * the Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportSupport); + + /* + * Check that the desired transport method is supported by the + * controller: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller does not support " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
+ "simple" : "performant"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_cfgtbl_transport_set(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest, + xport); +} + +int +smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the TransportActive field in the + * Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportActive); + + /* + * Check that the desired transport method is now active: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ? + "simple" : "performant"); + return (DDI_FAILURE); + } + + /* + * Ensure that the controller is now ready to accept commands. + */ + if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to " + "accept commands"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +uint32_t +smrt_ctlr_get_maxsgelements(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements)); +} + +uint32_t +smrt_ctlr_get_cmdsoutmax(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax)); +} + +static uint32_t +smrt_ctlr_get_hostdrvsup(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HostDrvrSupport)); +} + +int +smrt_ctlr_init(smrt_t *smrt) +{ + uint8_t signature[4] = { 'C', 'I', 'S', 'S' }; + int e; + + if ((e = smrt_ctlr_wait_for_state(smrt, + SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) { + return (e); + } + + /* + * The configuration table contains an ASCII signature ("CISS") which + * should be checked as we initialise the controller. + * See: "9.1 Configuration Table" in CISS Specification. + */ + for (unsigned i = 0; i < 4; i++) { + if (ddi_get8(smrt->smrt_ct_handle, + &smrt->smrt_ct->Signature[i]) != signature[i]) { + dev_err(smrt->smrt_dip, CE_WARN, "invalid signature " + "detected"); + return (DDI_FAILURE); + } + } + + /* + * Initialise an appropriate Transport Method. For now, this driver + * only supports the "Simple" method. + */ + if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) { + return (e); + } + + /* + * Save some common feature support bitfields. + */ + smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt); + smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->BusTypes); + + /* + * Read initial controller heartbeat value and mark the current + * reading time. + */ + smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + smrt->smrt_last_heartbeat_time = gethrtime(); + + /* + * Determine the firmware version of the controller so that we can + * select which type of interrupts to use. 
+ */ + if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT, + &smrt->smrt_versions)) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "could not identify " + "controller (%d)", e); + return (DDI_FAILURE); + } + + dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s", + smrt->smrt_versions.smrtv_firmware_rev); + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown(smrt_t *smrt) +{ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_ctlr_teardown_simple(smrt); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + return; + } + + panic("unknown controller mode"); +} + +int +smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state) +{ + unsigned wait_usec = 100 * 1000; + unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec; + + VERIFY(state == SMRT_WAIT_STATE_READY || + state == SMRT_WAIT_STATE_UNREADY); + + /* + * Read from the Scratchpad Register until the expected ready signature + * is detected. This behaviour is not described in the CISS + * specification. + * + * If the device is not in the desired state immediately, sleep for a + * tenth of a second and try again. If the device has not become ready + * in 300 seconds, give up. + */ + for (unsigned i = 0; i < wait_count; i++) { + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + + switch (state) { + case SMRT_WAIT_STATE_READY: + if (spr == CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + + case SMRT_WAIT_STATE_UNREADY: + if (spr != CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + } + + if (ddi_in_panic()) { + /* + * There is no sleep for the panicking, so we + * must spin wait: + */ + drv_usecwait(wait_usec); + } else { + /* + * Wait for a tenth of a second and try again. + */ + delay(drv_usectohz(wait_usec)); + } + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller " + "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ? + "ready": "unready"); + return (DDI_FAILURE); +} + +void +smrt_lockup_check(smrt_t *smrt) +{ + /* + * Read the current controller heartbeat value. + */ + uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Check to see if the value is the same as last time we looked: + */ + if (heartbeat != smrt->smrt_last_heartbeat) { + /* + * The heartbeat value has changed, which suggests that the + * firmware in the controller has not yet come to a complete + * stop. Record the new value, as well as the current time. + */ + smrt->smrt_last_heartbeat = heartbeat; + smrt->smrt_last_heartbeat_time = gethrtime(); + return; + } + + /* + * The controller _might_ have been able to signal to us that it + * has locked up. This is a truly unfathomable state of affairs: + * If the firmware can tell it has flown off the rails, why not + * simply reset the controller? + */ + uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS); + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + if ((odr & CISS_ODR_BIT_LOCKUP) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "reported a critical fault (odr %08x spr %08x)", + odr, spr); + } + + if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "stopped responding (odr %08x spr %08x)", + odr, spr); + } +} + +/* + * Probe the controller with the IDENTIFY CONTROLLER request.
This is a BMIC + * command, so it must be submitted to the controller and we must poll for its + * completion. This functionality is only presently used during controller + * initialisation, so it uses the special pre-initialisation path for command + * allocation and submission. + */ +static int +smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout, + smrt_identify_controller_t *resp) +{ + smrt_command_t *smcm; + smrt_identify_controller_req_t smicr; + int r; + size_t sz; + + /* + * Allocate a command with a data buffer; the controller will fill it + * with identification information. There is some suggestion in the + * firmware-level specification that the buffer length should be a + * multiple of 512 bytes for some controllers, so we round up. + */ + sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t); + if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) { + return (ENOMEM); + } + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr); + smcm->smcm_va_cmd->Request.Timeout = timeout; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY CONTROLLER request CDB. Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&smicr, sizeof (smicr)); + smicr.smicr_opcode = CISS_SCMD_BMIC_READ; + smicr.smicr_lun = 0; + smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER; + bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smicr))); + + /* + * Send the command to the device and poll for its completion. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * This command timed out, but the driver is not presently + * initialised to the point where we can try to abort it. + * The command was created with the PREINIT type, so it + * does not appear in the global command tracking list. + * In order to avoid problems with DMA from the controller, + * we have to leak the command allocation. + */ + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to identify + * it. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify " + "controller error: status 0x%x", + ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (resp != NULL) { + /* + * Copy the identify response out for the caller. + */ + bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp)); + } + + r = 0; + +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +/* + * The firmware versions in an IDENTIFY CONTROLLER response generally take + * the form of a four byte ASCII string containing a dotted decimal version + * number; e.g., "8.00". + * + * This function sanitises the firmware version, replacing unexpected + * values with a question mark. + */ +static void +smrt_copy_firmware_version(uint8_t *src, char *dst) +{ + for (unsigned i = 0; i < 4; i++) { + /* + * Make sure that this is a 7-bit clean ASCII value. 
+ */ + char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?'; + + if (isalnum(c) || c == '.' || c == ' ') { + dst[i] = c; + } else { + dst[i] = '?'; + } + } + dst[4] = '\0'; +} + +/* + * Using an IDENTIFY CONTROLLER request, determine firmware and controller + * version details. See the comments for "smrt_ctlr_identify()" for more + * details about calling context. + */ +static int +smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv) +{ + smrt_identify_controller_t smic; + int r; + + if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) { + return (r); + } + + smrtv->smrtv_hardware_version = smic.smic_hardware_version; + smrt_copy_firmware_version(smic.smic_firmware_rev, + smrtv->smrtv_firmware_rev); + smrt_copy_firmware_version(smic.smic_recovery_rev, + smrtv->smrtv_recovery_rev); + smrt_copy_firmware_version(smic.smic_bootblock_rev, + smrtv->smrtv_bootblock_rev); + + return (0); +} + +int +smrt_ctlr_reset(smrt_t *smrt) +{ + smrt_command_t *smcm, *smcm_nop; + int r; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * Don't pile on. One reset is enough. Wait until + * it's complete, and then return success. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + return (0); + } + smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING; + smrt->smrt_last_reset_start = gethrtime(); + smrt->smrt_stats.smrts_ctlr_resets++; + +skip_check: + /* + * Allocate two commands: one for the soft reset message, which we + * cannot free until the controller has reset; and one for the ping we + * will use to determine when it is once again functional. + */ + mutex_exit(&smrt->smrt_mutex); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + mutex_enter(&smrt->smrt_mutex); + + /* + * Send a soft reset command to the controller. If this command + * succeeds, there will likely be no completion notification. Instead, + * the device should become unavailable for some period of time and + * then become available again. Once available again, we know the soft + * reset has completed and should abort all in-flight commands. + */ + smrt_write_message_reset_ctlr(smcm); + + /* + * Disable interrupts now. + */ + smrt_intr_set(smrt, B_FALSE); + + dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset"); + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "submit failed (%d)", r); + } + + /* + * Mark every currently inflight command as being reset, including the + * soft reset command we just sent. Once we confirm the reset works, + * we can safely report that these commands have failed. + */ + for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); + t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) { + t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT; + } + + /* + * Now that we have submitted our soft reset command, prevent + * the rest of the driver from interacting with the controller. 
+ */ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + /* + * We do not expect a completion from the controller for our soft + * reset command, but we also cannot remove it from the inflight + * list until we know the controller has actually reset. To do + * otherwise would potentially allow the controller to scribble + * on the memory we were using. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not become unready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready"); + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not become ready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready"); + + /* + * In at least the Smart Array P420i, the controller can take 30-45 + * seconds after the scratchpad register shows it as being available + * before it is ready to receive commands. In order to avoid hitting + * it too early with our post-reset ping, we will sleep for 10 seconds + * here. + */ + if (ddi_in_panic()) { + drv_usecwait(10 * MICROSEC); + } else { + delay(drv_usectohz(10 * MICROSEC)); + } + + smrt_ctlr_teardown(smrt); + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller transport could not be configured"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured"); + + smrt_write_message_nop(smcm_nop, 0); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED | + SMRT_CMD_IGNORE_RUNNING; + if ((r = smrt_submit(smrt, smcm_nop)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping could not be submitted (%d)", r); + } + + /* + * Interrupts are still masked at this stage. Poll manually in + * a way that will not trigger regular finish queue processing: + */ + VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT); + for (unsigned i = 0; i < 600; i++) { + smrt_retrieve_simple(smrt); + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * Remove the ping command from the finish queue and + * process it manually. This processing must mirror + * what would have been done in smrt_process_finishq(). + */ + VERIFY(list_link_active(&smcm_nop->smcm_link_finish)); + list_remove(&smrt->smrt_finishq, smcm_nop); + smrt_process_finishq_sync(smcm_nop); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + smrt_process_finishq_one(smcm_nop); + break; + } + + if (ddi_in_panic()) { + drv_usecwait(100 * 1000); + } else { + delay(drv_usectohz(100 * 1000)); + } + } + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping did not complete"); + } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) { + dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed " + "in error (status %u)", + (unsigned)smcm_nop->smcm_va_err->CommandStatus); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed"); + } + + /* + * Now that the controller is working again, we can abort any + * commands that were inflight during the reset.
+ */ + smrt_command_t *nt; + for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); + t != NULL; t = nt) { + nt = AVL_NEXT(&smrt->smrt_inflight, t); + + if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + avl_remove(&smrt->smrt_inflight, t); + t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + + list_insert_tail(&smrt->smrt_finishq, t); + } + } + + /* + * Quiesce our discovery thread. Note that because + * SMRT_CTLR_STATUS_RESETTING is still set, nothing can kick off + * another discovery in the meantime. + */ + if (!ddi_in_panic()) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + /* + * Re-enable interrupts. Now, we must kick off a discovery to make sure + * that the system is in a sane state and that we can perform I/O. + */ + smrt_intr_set(smrt, B_TRUE); + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING; + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED; + + /* + * Attempt a discovery to make sure that the driver sees a realistic + * view of the world. If we're not in panic context, wait for the + * asynchronous process to complete; in panic context the discovery + * runs synchronously whether we want it to or not. + * Before we kick off the request to run discovery, we reset the + * discovery request flags as we know that nothing else can consider + * running discovery and we don't want to delay until the next smrt + * periodic tick if we can avoid it. In panic context, if this failed, + * then we won't make it back. + */ + VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING); + smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK); + smrt_discover_request(smrt); + if (!ddi_in_panic()) { + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + } + + smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING; + smrt->smrt_last_reset_finish = gethrtime(); + + /* + * Wake anybody that was waiting for the reset to complete. + */ + cv_broadcast(&smrt->smrt_cv_finishq); + + /* + * Process the completion queue one last time before we let go + * of the mutex.
+ */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm_nop); + mutex_enter(&smrt->smrt_mutex); + return (0); +} + +int +smrt_event_init(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + + event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP); + if (event == NULL) + return (ENOMEM); + if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + return (ENOMEM); + } + smrt_write_message_event_notify(event); + + cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP); + if (cancel == NULL) { + smrt_command_free(event); + return (ENOMEM); + } + if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + smrt_command_free(cancel); + return (ENOMEM); + } + smrt_write_message_cancel_event_notify(cancel); + + cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL); + + mutex_enter(&smrt->smrt_mutex); + if ((ret = smrt_submit(smrt, event)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(event); + smrt_command_free(cancel); + return (ret); + } + + smrt->smrt_event_cmd = event; + smrt->smrt_event_cancel_cmd = cancel; + mutex_exit(&smrt->smrt_mutex); + + return (0); +} + +void +smrt_event_complete(smrt_command_t *smcm) +{ + smrt_event_notify_t *sen; + boolean_t log, rescan; + + boolean_t intervene = B_FALSE; + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY3P(smcm, ==, smrt->smrt_event_cmd); + VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION); + + smrt->smrt_stats.smrts_events_received++; + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + intervene = B_TRUE; + goto clean; + } + + /* + * The event notification command failed for some reason. Attempt to + * drive on and try again at the next intervention period. Because this + * may represent a programmer error (though it's hard to know), we wait + * until the next intervention period and don't panic. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + intervene = B_TRUE; + + smrt->smrt_stats.smrts_events_errors++; + dev_err(smrt->smrt_dip, CE_WARN, "!event notification request " + "error: status 0x%x", ei->CommandStatus); + goto clean; + } + + sen = smcm->smcm_internal->smcmi_va; + log = rescan = B_FALSE; + switch (sen->sen_class) { + case SMRT_EVENT_CLASS_PROTOCOL: + /* + * Most of the event protocol class events aren't really + * actionable. However, subclass 1 indicates errors. Today, + * the only error is an event overflow. If there's an event + * overflow, then we must assume that we need to rescan. + */ + if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HOTPLUG: + /* + * We want to log all hotplug events. However we only need to + * scan these if the subclass indicates the event is for a disk. + */ + log = B_TRUE; + if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HWERROR: + case SMRT_EVENT_CLASS_ENVIRONMENT: + log = B_TRUE; + break; + case SMRT_EVENT_CLASS_PHYS: + log = B_TRUE; + /* + * This subclass indicates some change for physical drives. As + * such, this should trigger a rescan. 
+ */ + if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_LOGVOL: + rescan = B_TRUE; + log = B_TRUE; + break; + default: + /* + * While there are other classes of events, it's hard to say how + * actionable they are for the moment. If we revamp this such + * that it becomes an ireport based system, then we should just + * always log these. We opt not to at the moment, to be + * kind to the system log. + */ + break; + } + + /* + * Ideally, this would be an ireport that we could pass on to + * administrators; however, since we don't have any way to generate + * that, we provide a subset of the event information. + */ + if (log) { + const char *rmsg; + if (rescan == B_TRUE) { + rmsg = "rescanning"; + } else { + rmsg = "not rescanning"; + } + if (sen->sen_message[0] != '\0') { + sen->sen_message[sizeof (sen->sen_message) - 1] = '\0'; + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x: %s; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + sen->sen_message, rmsg); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + rmsg); + } + } + + if (rescan) + smrt_discover_request(smrt); + +clean: + mutex_exit(&smrt->smrt_mutex); + smrt_command_reuse(smcm); + bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN); + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure we're not _now_ detaching or resetting. + */ + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 || + intervene == B_TRUE) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + /* + * Check our command count per tick. If it's too high, leave it for + * intervention to solve. Likely there is some serious driver or + * firmware error going on. + */ + smrt->smrt_event_count++; + if (smrt->smrt_event_count > smrt_event_intervention_threshold) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + if (smrt_submit(smrt, smcm) != 0) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + } +} + +void +smrt_event_fini(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + mutex_enter(&smrt->smrt_mutex); + + /* + * If intervention has been requested, there is nothing for us to do. We + * clear the flag so nothing else accidentally sees this and takes + * action. We also don't need to bother sending a cancellation request, + * as there is no outstanding event. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + goto free; + } + + /* + * Submit a cancel request for the event notification queue. Because we + * submit both the cancel event and the regular notification event as + * ordered commands, we know that by the time this completes, the + * existing one will have completed. + */ + smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + /* + * This is unfortunate. We've failed to submit the command. At + * this point all we can do is reset the device. If the reset + * succeeds, we're done and we can clear all the memory. If it + * fails, then all we can do is just leak the command and scream + * to the system, sorry.
+ */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to submit cancellation " + "(%d), abandoning smrt_command_t at address %p", + ret, smrt->smrt_event_cmd); + smrt->smrt_event_cmd = NULL; + goto free; + } + } + + smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() + + SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC; + if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out. All we can do is hope a reset will + * work. + */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to poll for async " + "cancellation command abandoning smrt_command_t " + "event command at address %p and cancellation " + "command at %p", smrt->smrt_event_cmd, + smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + + } + + /* + * Well, in the end, it's results that count. + */ + if (smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_RESET_SENT) { + goto free; + } + + if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err; + + /* + * This can return a CISS_CMD_TARGET_STATUS entry when the + * controller doesn't think a command is outstanding. It is + * possible we raced, so don't think too much about that case. + * Anything else leaves us between a rock and a hard place, the + * only way out is a reset. + */ + if (ei->CommandStatus != CISS_CMD_TARGET_STATUS && + smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after receiving an error on the async " + "cancellation command (%d); abandoning " + "smrt_command_t event command at address %p and " + "cancellation command at %p", ei->CommandStatus, + smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + } + +free: + event = smrt->smrt_event_cmd; + smrt->smrt_event_cmd = NULL; + cancel = smrt->smrt_event_cancel_cmd; + smrt->smrt_event_cancel_cmd = NULL; + mutex_exit(&smrt->smrt_mutex); + if (event != NULL) + smrt_command_free(event); + if (cancel != NULL) + smrt_command_free(cancel); + cv_destroy(&smrt->smrt_event_queue); +} + +/* + * We've been asked to do a discovery in panic context. This would have + * occurred because there was a device reset. Because we can't rely on the + * target maps, all we can do at the moment is go over all the active targets + * and note which ones no longer exist. If this target was required to dump, + * then the dump code will encounter a fatal error. If not, then we should + * count ourselves surprisingly lucky. + */ +static void +smrt_discover_panic_check(smrt_t *smrt) +{ + smrt_target_t *smtg; + + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + for (smtg = list_head(&smrt->smrt_targets); smtg != NULL; + smtg = list_next(&smrt->smrt_targets, smtg)) { + uint64_t gen; + + if (smtg->smtg_physical) { + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + /* + * Don't worry about drives that aren't visible. 
+ */ + if (!smpt->smpt_visible) + continue; + gen = smpt->smpt_gen; + } else { + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + gen = smlv->smlv_gen; + } + + if (gen != smrt->smrt_discover_gen) { + dev_err(smrt->smrt_dip, CE_WARN, "target %s " + "disappeared during post-panic discovery", + scsi_device_unit_address(smtg->smtg_scsi_dev)); + smtg->smtg_gone = B_TRUE; + } + } +} + +static void +smrt_discover(void *arg) +{ + int log = 0, phys = 0; + smrt_t *smrt = arg; + uint64_t gen; + boolean_t runphys, runvirt; + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING; + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED; + + smrt->smrt_discover_gen++; + gen = smrt->smrt_discover_gen; + runphys = smrt->smrt_phys_tgtmap != NULL; + runvirt = smrt->smrt_virt_tgtmap != NULL; + mutex_exit(&smrt->smrt_mutex); + if (runphys) + phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + if (runvirt) + log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + mutex_enter(&smrt->smrt_mutex); + + if (phys != 0 || log != 0) { + if (!ddi_in_panic()) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + } else { + panic("smrt_t %p failed to perform discovery after " + "a reset in panic context, unable to continue. " + "logvol: %d, phys: %d", smrt, log, phys); + } + } else { + if (!ddi_in_panic() && + smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED; + cv_broadcast(&smrt->smrt_cv_finishq); + } + + if (ddi_in_panic()) { + smrt_discover_panic_check(smrt); + } + } + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING; + if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED) + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + mutex_exit(&smrt->smrt_mutex); +} + +/* + * Request discovery, which is always run via a taskq. + */ +void +smrt_discover_request(smrt_t *smrt) +{ + boolean_t run; + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + smrt_discover(smrt); + return; + } + + run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0; + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED; + if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + smrt->smrt_stats.smrts_discovery_tq_errors++; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c new file mode 100644 index 0000000000..1b3d7b2602 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c @@ -0,0 +1,282 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +uint_t +smrt_isr_hw_simple(caddr_t arg1, caddr_t arg2) +{ + _NOTE(ARGUNUSED(arg2)) + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + smrt_t *smrt = (smrt_t *)arg1; + uint32_t isr = smrt_get32(smrt, CISS_I2O_INTERRUPT_STATUS); + hrtime_t now = gethrtime(); + + mutex_enter(&smrt->smrt_mutex); + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * We should not be receiving interrupts from the controller + * while the driver is not running. + */ + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + /* + * Check to see if this interrupt came from the device: + */ + if ((isr & CISS_ISR_BIT_SIMPLE_INTR) == 0) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * Check to see if the firmware has come to rest. If it has, + * this routine will panic the system. + */ + smrt_lockup_check(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + smrt->smrt_stats.smrts_claimed_interrupts++; + smrt->smrt_last_interrupt_claimed = now; + + /* + * The interrupt was from our controller, so collect any pending + * command completions. + */ + smrt_retrieve_simple(smrt); + + /* + * Process any commands in the completion queue. + */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_CLAIMED); +} + +/* + * Read tags and process completion of the associated command until the supply + * of tags is exhausted. + */ +void +smrt_retrieve_simple(smrt_t *smrt) +{ + uint32_t opq; + uint32_t none = 0xffffffff; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) != none) { + uint32_t tag = CISS_OPQ_READ_TAG(opq); + smrt_command_t *smcm; + + if ((smcm = smrt_lookup_inflight(smrt, tag)) == NULL) { + dev_err(smrt->smrt_dip, CE_WARN, "spurious tag %x", + tag); + continue; + } + + avl_remove(&smrt->smrt_inflight, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + + /* + * Push this command onto the completion queue. + */ + list_insert_tail(&smrt->smrt_finishq, smcm); + } +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. + */ +void +smrt_submit_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. Immediately begin polling on the completion of that command. + * + * NOTE: This function is for controller initialisation only. It discards + * completions of commands other than the expected command as spurious, and + * will not interact correctly with the rest of the driver once it is running. + */ +int +smrt_preinit_command_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + /* + * The controller must be initialised to use the Simple Transport + * Method, but not be marked RUNNING. The command to process must be a + * PREINIT command with the expected tag number, marked for polling. 
+ */ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + VERIFY(!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)); + VERIFY(smcm->smcm_type == SMRT_CMDTYPE_PREINIT); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + VERIFY3U(smcm->smcm_tag, ==, SMRT_PRE_TAG_NUMBER); + + /* + * Submit this command to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); + + /* + * Poll the controller for completions until we see the command we just + * sent, or the timeout expires. + */ + for (;;) { + uint32_t none = 0xffffffff; + uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q); + uint32_t tag; + + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (opq == none) { + delay(drv_usectohz(10 * 1000)); + continue; + } + + if ((tag = CISS_OPQ_READ_TAG(opq)) != SMRT_PRE_TAG_NUMBER) { + dev_err(smrt->smrt_dip, CE_WARN, "unexpected tag 0x%x" + " completed during driver init", tag); + delay(drv_usectohz(10 * 1000)); + continue; + } + + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + + return (0); + } +} + +int +smrt_ctlr_init_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_UNKNOWN); + + if (smrt_cfgtbl_transport_has_support(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_SIMPLE; + + /* + * Disable device interrupts while we are setting up. + */ + smrt_intr_set(smrt, B_FALSE); + + if ((smrt->smrt_maxcmds = smrt_ctlr_get_cmdsoutmax(smrt)) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding " + "commands set to zero"); + return (DDI_FAILURE); + } + + /* + * Determine the number of Scatter/Gather List entries this controller + * supports. The maximum number we allow is CISS_MAXSGENTRIES: the + * number of elements in the static struct we use for command + * submission. + */ + if ((smrt->smrt_sg_cnt = smrt_ctlr_get_maxsgelements(smrt)) == 0) { + /* + * The CISS specification states that if this value is + * zero, we should assume a value of 31 for compatibility + * with older firmware. + */ + smrt->smrt_sg_cnt = CISS_SGCNT_FALLBACK; + + } else if (smrt->smrt_sg_cnt > CISS_MAXSGENTRIES) { + /* + * If the controller supports more than we have allocated, + * just cap the count at the allocation size. + */ + smrt->smrt_sg_cnt = CISS_MAXSGENTRIES; + } + + /* + * Zero the upper 32 bits of the address in the Controller. + */ + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->Upper32Addr, 0); + + /* + * Set the Transport Method and flush the changes to the + * Configuration Table. + */ + smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE); + if (smrt_cfgtbl_flush(smrt) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + if (smrt_cfgtbl_transport_confirm(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + /* + * Check the outstanding command cap a second time now that we have + * flushed out the new Transport Method. This is entirely defensive; + * we do not expect this value to change. 
+ */ + uint32_t check_again = smrt_ctlr_get_cmdsoutmax(smrt); + if (check_again != smrt->smrt_maxcmds) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding commands " + "changed during initialisation (was %u, now %u)", + smrt->smrt_maxcmds, check_again); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + + /* + * Due to the nominal simplicity of the simple mode, we have no + * particular teardown to perform as we do not allocate anything + * on the way up. + */ + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_UNKNOWN; +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c new file mode 100644 index 0000000000..edcbfa65e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c @@ -0,0 +1,362 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + + +static ddi_dma_attr_t smrt_command_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x00000000, + .dma_attr_addr_hi = 0xFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + .dma_attr_seg = 0x0000FFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * These device access attributes are for command block allocation, where we do + * not use any of the structured byte swapping facilities. + */ +static ddi_device_acc_attr_t smrt_command_dev_attr = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + + +static void smrt_contig_free(smrt_dma_t *); + + +static int +smrt_check_command_type(smrt_command_type_t type) +{ + /* + * Note that we leave out the default case in order to utilise + * compiler warnings about missed enum values. + */ + switch (type) { + case SMRT_CMDTYPE_ABORTQ: + case SMRT_CMDTYPE_SCSA: + case SMRT_CMDTYPE_INTERNAL: + case SMRT_CMDTYPE_PREINIT: + case SMRT_CMDTYPE_EVENT: + return (type); + } + + panic("unexpected command type"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static int +smrt_contig_alloc(smrt_t *smrt, smrt_dma_t *smdma, size_t sz, int kmflags, + void **vap, uint32_t *pap) +{ + caddr_t va; + int rv; + dev_info_t *dip = smrt->smrt_dip; + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + /* + * Ensure we don't try to allocate a second time using the same + * tracking object. 
+ */ + VERIFY0(smdma->smdma_level); + + if ((rv = ddi_dma_alloc_handle(dip, &smrt_command_dma_attr, + dma_wait, NULL, &smdma->smdma_dma_handle)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)", + rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_ALLOC; + + if ((rv = ddi_dma_mem_alloc(smdma->smdma_dma_handle, sz, + &smrt_command_dev_attr, DDI_DMA_CONSISTENT, dma_wait, NULL, + &va, &smdma->smdma_real_size, &smdma->smdma_acc_handle)) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_MEMORY_ALLOC; + + if ((rv = ddi_dma_addr_bind_handle(smdma->smdma_dma_handle, + NULL, va, smdma->smdma_real_size, + DDI_DMA_CONSISTENT | DDI_DMA_RDWR, dma_wait, NULL, + smdma->smdma_dma_cookies, &smdma->smdma_dma_ncookies)) != + DDI_DMA_MAPPED) { + dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_BOUND; + + VERIFY3U(smdma->smdma_dma_ncookies, ==, 1); + *pap = smdma->smdma_dma_cookies[0].dmac_address; + *vap = (void *)va; + return (DDI_SUCCESS); + +fail: + *vap = NULL; + *pap = 0; + smrt_contig_free(smdma); + return (DDI_FAILURE); +} + +static void +smrt_contig_free(smrt_dma_t *smdma) +{ + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND) { + VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle), ==, + DDI_SUCCESS); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_BOUND; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC) { + ddi_dma_mem_free(&smdma->smdma_acc_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_MEMORY_ALLOC; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC) { + ddi_dma_free_handle(&smdma->smdma_dma_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_ALLOC; + } + + VERIFY(smdma->smdma_level == 0); + bzero(smdma, sizeof (*smdma)); +} + +static smrt_command_t * +smrt_command_alloc_impl(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + if ((smcm = kmem_zalloc(sizeof (*smcm), kmflags)) == NULL) { + return (NULL); + } + + smcm->smcm_ctlr = smrt; + smcm->smcm_type = smrt_check_command_type(type); + + /* + * Allocate a single contiguous chunk of memory for the command block + * (smcm_va_cmd) and the error information block (smcm_va_err). The + * physical address of each block should be 32-byte aligned. + */ + size_t contig_size = 0; + contig_size += P2ROUNDUP_TYPED(sizeof (CommandList_t), 32, size_t); + + size_t errorinfo_offset = contig_size; + contig_size += P2ROUNDUP_TYPED(sizeof (ErrorInfo_t), 32, size_t); + + if (smrt_contig_alloc(smrt, &smcm->smcm_contig, contig_size, + kmflags, (void **)&smcm->smcm_va_cmd, &smcm->smcm_pa_cmd) != + DDI_SUCCESS) { + kmem_free(smcm, sizeof (*smcm)); + return (NULL); + } + + smcm->smcm_va_err = (void *)((caddr_t)smcm->smcm_va_cmd + + errorinfo_offset); + smcm->smcm_pa_err = smcm->smcm_pa_cmd + errorinfo_offset; + + /* + * Ensure we asked for, and received, the correct physical alignment: + */ + VERIFY0(smcm->smcm_pa_cmd & 0x1f); + VERIFY0(smcm->smcm_pa_err & 0x1f); + + /* + * Populate Fields. 
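The allocation above packs the command block and the error-information block into a single contiguous DMA buffer, with each block starting on a 32-byte boundary. A worked example of the arithmetic, using hypothetical structure sizes chosen purely for illustration:

	/*
	 * Hypothetical sizes, for illustration only:
	 *
	 *	P2ROUNDUP_TYPED(564, 32, size_t) == 576	(command block)
	 *	P2ROUNDUP_TYPED(160, 32, size_t) == 160	(error block)
	 *
	 * The error block would then begin at byte offset 576 and the whole
	 * allocation would be 736 bytes.  For a power-of-two alignment the
	 * macro is equivalent to ((x + 31) & ~31); because dma_attr_align is
	 * 0x20, the base physical address is itself 32-byte aligned, so both
	 * smcm_pa_cmd and smcm_pa_err satisfy the VERIFY0(... & 0x1f) checks.
	 */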
+ */ + bzero(smcm->smcm_va_cmd, contig_size); + smcm->smcm_va_cmd->ErrDesc.Addr = smcm->smcm_pa_err; + smcm->smcm_va_cmd->ErrDesc.Len = sizeof (ErrorInfo_t); + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc_preinit(smrt_t *smrt, size_t datasize, int kmflags) +{ + smrt_command_t *smcm; + + if ((smcm = smrt_command_alloc_impl(smrt, SMRT_CMDTYPE_PREINIT, + kmflags)) == NULL) { + return (NULL); + } + + /* + * Note that most driver infrastructure has not been initialised at + * this time. All commands are submitted to the controller serially, + * using a pre-specified tag, and are not attached to the command + * tracking list. + */ + smcm->smcm_tag = SMRT_PRE_TAG_NUMBER; + smcm->smcm_va_cmd->Header.Tag.tag_value = SMRT_PRE_TAG_NUMBER; + + if (smrt_command_attach_internal(smrt, smcm, datasize, kmflags) != 0) { + smrt_command_free(smcm); + return (NULL); + } + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(type != SMRT_CMDTYPE_PREINIT); + + if ((smcm = smrt_command_alloc_impl(smrt, type, kmflags)) == NULL) { + return (NULL); + } + + /* + * Insert into the per-controller command list. + */ + mutex_enter(&smrt->smrt_mutex); + list_insert_tail(&smrt->smrt_commands, smcm); + mutex_exit(&smrt->smrt_mutex); + + return (smcm); +} + +int +smrt_command_attach_internal(smrt_t *smrt, smrt_command_t *smcm, size_t len, + int kmflags) +{ + smrt_command_internal_t *smcmi; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + VERIFY3U(len, <=, UINT32_MAX); + + if ((smcmi = kmem_zalloc(sizeof (*smcmi), kmflags)) == NULL) { + return (ENOMEM); + } + + if (smrt_contig_alloc(smrt, &smcmi->smcmi_contig, len, kmflags, + &smcmi->smcmi_va, &smcmi->smcmi_pa) != DDI_SUCCESS) { + kmem_free(smcmi, sizeof (*smcmi)); + return (ENOMEM); + } + + bzero(smcmi->smcmi_va, smcmi->smcmi_len); + + smcm->smcm_internal = smcmi; + + smcm->smcm_va_cmd->SG[0].Addr = smcmi->smcmi_pa; + smcm->smcm_va_cmd->SG[0].Len = (uint32_t)len; + smcm->smcm_va_cmd->Header.SGList = 1; + smcm->smcm_va_cmd->Header.SGTotal = 1; + + return (0); +} + +void +smrt_command_reuse(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure the command is not currently inflight, then + * reset the command status. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status = SMRT_CMD_STATUS_REUSED; + + /* + * Ensure we are not trying to reuse a command that is in the finish or + * abort queue. + */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + /* + * Clear the previous tag value. + */ + smcm->smcm_tag = 0; + smcm->smcm_va_cmd->Header.Tag.tag_value = 0; + + mutex_exit(&smrt->smrt_mutex); +} + +void +smrt_command_free(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + /* + * Ensure the object we are about to free is not currently in the + * inflight AVL. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + + if (smcm->smcm_internal != NULL) { + smrt_command_internal_t *smcmi = smcm->smcm_internal; + + smrt_contig_free(&smcmi->smcmi_contig); + kmem_free(smcmi, sizeof (*smcmi)); + } + + smrt_contig_free(&smcm->smcm_contig); + + if (smcm->smcm_type != SMRT_CMDTYPE_PREINIT) { + mutex_enter(&smrt->smrt_mutex); + + /* + * Ensure we are not trying to free a command that is in the + * finish or abort queue. 
+ */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + list_remove(&smrt->smrt_commands, smcm); + + mutex_exit(&smrt->smrt_mutex); + } + + kmem_free(smcm, sizeof (*smcm)); +} + +smrt_command_t * +smrt_lookup_inflight(smrt_t *smrt, uint32_t tag) +{ + smrt_command_t srch; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + bzero(&srch, sizeof (srch)); + srch.smcm_tag = tag; + + return (avl_find(&smrt->smrt_inflight, &srch, NULL)); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c new file mode 100644 index 0000000000..9e27448b68 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c @@ -0,0 +1,238 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * We must locate what the CISS specification describes as the "I2O + * registers". The Intelligent I/O (I2O) Architecture Specification describes + * this somewhat more coherently as "the memory region specified by the first + * base address configuration register indicating memory space (offset 10h, + * 14h, and so forth)". + */ +static int +smrt_locate_bar(pci_regspec_t *regs, unsigned nregs, + unsigned *i2o_bar) +{ + /* + * Locate the first memory-mapped BAR: + */ + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + + if (type == PCI_ADDR_MEM32 || type == PCI_ADDR_MEM64) { + *i2o_bar = i; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +static int +smrt_locate_cfgtbl(smrt_t *smrt, pci_regspec_t *regs, unsigned nregs, + unsigned *ct_bar, uint32_t *baseaddr) +{ + uint32_t cfg_offset, mem_offset; + unsigned want_type; + uint32_t want_bar; + + cfg_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_CFG_OFFSET); + mem_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_MEM_OFFSET); + + VERIFY3U(cfg_offset, !=, 0xffffffff); + VERIFY3U(mem_offset, !=, 0xffffffff); + + /* + * Locate the Configuration Table. Three different values read + * from two I2O registers allow us to determine the location: + * - the correct PCI BAR offset is in the low 16 bits of + * CISS_I2O_CFGTBL_CFG_OFFSET + * - bit 16 is 0 for a 32-bit space, and 1 for 64-bit + * - the memory offset from the base of this BAR is + * in CISS_I2O_CFGTBL_MEM_OFFSET + */ + want_bar = (cfg_offset & 0xffff); + want_type = (cfg_offset & (1UL << 16)) ? PCI_ADDR_MEM64 : + PCI_ADDR_MEM32; + + DTRACE_PROBE4(locate_cfgtbl, uint32_t, want_bar, unsigned, + want_type, uint32_t, cfg_offset, uint32_t, mem_offset); + + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + unsigned bar = PCI_REG_REG_G(regs[i].pci_phys_hi); + + if (type != PCI_ADDR_MEM32 && type != PCI_ADDR_MEM64) { + continue; + } + + if (bar == want_bar) { + *ct_bar = i; + *baseaddr = mem_offset; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +/* + * Determine the PCI vendor and device ID which is a proxy for which generation + * of controller we're working with. 
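[Editorial aside, not part of the patch] The configuration-table lookup in smrt_locate_cfgtbl() above decodes two I2O registers: the low 16 bits of the CFG_OFFSET value name the BAR by its PCI config-space offset, bit 16 selects 32- versus 64-bit space, and MEM_OFFSET is the byte offset within that BAR. A standalone sketch of the decode, using invented register values:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t cfg_offset = 0x00010014;	/* invented CISS_I2O_CFGTBL_CFG_OFFSET value */
	uint32_t mem_offset = 0x00002000;	/* invented CISS_I2O_CFGTBL_MEM_OFFSET value */

	uint32_t want_bar = cfg_offset & 0xffff;	/* BAR config offset, here 0x14 */
	int is64 = (cfg_offset & (1UL << 16)) != 0;	/* bit 16: 0 = 32-bit, 1 = 64-bit */

	printf("config table: BAR at config offset 0x%x (%s space), "
	    "0x%x bytes into that mapping\n",
	    want_bar, is64 ? "64-bit" : "32-bit", mem_offset);
	return (0);
}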
+ */ +static int +smrt_identify_device(smrt_t *smrt) +{ + ddi_acc_handle_t pci_hdl; + + if (pci_config_setup(smrt->smrt_dip, &pci_hdl) != DDI_SUCCESS) + return (DDI_FAILURE); + + smrt->smrt_pci_vendor = pci_config_get16(pci_hdl, PCI_CONF_VENID); + smrt->smrt_pci_device = pci_config_get16(pci_hdl, PCI_CONF_DEVID); + + pci_config_teardown(&pci_hdl); + + return (DDI_SUCCESS); +} + +static int +smrt_map_device(smrt_t *smrt) +{ + pci_regspec_t *regs; + uint_t regslen, nregs; + dev_info_t *dip = smrt->smrt_dip; + int r = DDI_FAILURE; + + /* + * Get the list of PCI registers from the DDI property "regs": + */ + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", (int **)®s, ®slen) != DDI_PROP_SUCCESS) { + dev_err(dip, CE_WARN, "could not load \"reg\" DDI prop"); + return (DDI_FAILURE); + } + nregs = regslen * sizeof (int) / sizeof (pci_regspec_t); + + if (smrt_locate_bar(regs, nregs, &smrt->smrt_i2o_bar) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "did not find any memory BARs"); + goto out; + } + + /* + * Map enough of the I2O memory space to enable us to talk to the + * device. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_i2o_bar, &smrt->smrt_i2o_space, + CISS_I2O_MAP_BASE, CISS_I2O_MAP_LIMIT - CISS_I2O_MAP_BASE, + &smrt_dev_attributes, &smrt->smrt_i2o_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to map I2O registers"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_I2O_MAPPED; + + if (smrt_locate_cfgtbl(smrt, regs, nregs, &smrt->smrt_ct_bar, + &smrt->smrt_ct_baseaddr) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not find config table"); + goto out; + } + + /* + * Map the Configuration Table. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_ct_bar, + (caddr_t *)&smrt->smrt_ct, smrt->smrt_ct_baseaddr, + sizeof (CfgTable_t), &smrt_dev_attributes, + &smrt->smrt_ct_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not map config table"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_CFGTBL_MAPPED; + + r = DDI_SUCCESS; + +out: + ddi_prop_free(regs); + return (r); +} + +int +smrt_device_setup(smrt_t *smrt) +{ + /* + * Ensure that the controller is installed in such a fashion that it + * may become a DMA master. 
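[Editorial aside, not part of the patch] smrt_map_device() above derives the number of regspec entries from the length of the integer "reg" property; under the PCI binding each entry is five 32-bit cells. A trivial standalone sketch of that conversion:

#include <stdio.h>
#include <stdint.h>

/*
 * One "reg" property entry under the PCI binding is five 32-bit cells:
 * phys.hi, phys.mid, phys.lo, size.hi, size.lo.
 */
typedef struct {
	uint32_t rs_phys_hi;
	uint32_t rs_phys_mid;
	uint32_t rs_phys_low;
	uint32_t rs_size_hi;
	uint32_t rs_size_low;
} fake_pci_regspec_t;

int
main(void)
{
	unsigned regslen = 30;	/* invented: number of ints returned for "reg" */
	unsigned nregs = regslen * sizeof (uint32_t) / sizeof (fake_pci_regspec_t);

	printf("%u ints -> %u regspec entries\n", regslen, nregs);	/* 30 -> 6 */
	return (0);
}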
+ */ + if (ddi_slaveonly(smrt->smrt_dip) == DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "device cannot become DMA " + "master"); + return (DDI_FAILURE); + } + + if (smrt_identify_device(smrt) != DDI_SUCCESS) + goto fail; + + if (smrt_map_device(smrt) != DDI_SUCCESS) { + goto fail; + } + + return (DDI_SUCCESS); + +fail: + smrt_device_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_device_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_CFGTBL_MAPPED) { + ddi_regs_map_free(&smrt->smrt_ct_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_CFGTBL_MAPPED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_I2O_MAPPED) { + ddi_regs_map_free(&smrt->smrt_i2o_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_I2O_MAPPED; + } +} + +uint32_t +smrt_get32(smrt_t *smrt, offset_t off) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + return (ddi_get32(smrt->smrt_i2o_handle, addr)); +} + +void +smrt_put32(smrt_t *smrt, offset_t off, uint32_t val) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + ddi_put32(smrt->smrt_i2o_handle, addr, val); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c new file mode 100644 index 0000000000..8f082ffc9c --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c @@ -0,0 +1,1457 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * The controller is not allowed to attach. + */ +static int +smrt_ctrl_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + return (DDI_FAILURE); +} + +/* + * The controller is not allowed to send packets. + */ +static int +smrt_ctrl_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + return (TRAN_BADPKT); +} + +static boolean_t +smrt_logvol_parse(const char *ua, uint_t *targp) +{ + long targ, lun; + const char *comma; + char *eptr; + + comma = strchr(ua, ','); + if (comma == NULL) { + return (B_FALSE); + } + + /* + * We expect the target number for a logical unit number to be zero for + * a logical volume. 
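[Editorial aside, not part of the patch] The unit address handled by the parse below has the form "target,lun" with both fields in hex and the LUN required to be zero. A user-level mirror of that parse (strtol stands in for ddi_strtol, and the logical-drive limit is a made-up stand-in for SMRT_MAX_LOGDRV):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	FAKE_MAX_LOGDRV	64	/* stand-in only; the real limit may differ */

static int
parse_ua(const char *ua, unsigned *targp)
{
	const char *comma = strchr(ua, ',');
	char *eptr;
	long targ, lun;

	if (comma == NULL)
		return (-1);

	/* The LUN portion must parse cleanly and be zero. */
	lun = strtol(comma + 1, &eptr, 16);
	if (*eptr != '\0' || lun != 0)
		return (-1);

	/* The target portion must end exactly at the comma and be in range. */
	targ = strtol(ua, &eptr, 16);
	if (eptr != comma || targ < 0 || targ >= FAKE_MAX_LOGDRV)
		return (-1);

	*targp = (unsigned)targ;
	return (0);
}

int
main(void)
{
	unsigned t = 0;

	printf("\"1f,0\": %s (target %u)\n",
	    parse_ua("1f,0", &t) == 0 ? "accepted" : "rejected", t);
	printf("\"2,1\": %s\n",
	    parse_ua("2,1", &t) == 0 ? "accepted" : "rejected");
	return (0);
}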
+ */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (B_FALSE); + } + + if (ddi_strtol(ua, &eptr, 16, &targ) != 0 || eptr != comma || + targ < 0 || targ >= SMRT_MAX_LOGDRV) { + return (B_FALSE); + } + + *targp = (uint_t)targ; + + return (B_TRUE); +} + +static int +smrt_logvol_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_volume_t *smlv; + smrt_target_t *smtg; + const char *ua; + uint_t targ; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) { + return (DDI_FAILURE); + } + + if (!smrt_logvol_parse(ua, &targ)) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + /* + * Look for a logical volume for the SCSI unit address of this target. + */ + if ((smlv = smrt_logvol_lookup_by_id(smrt, targ)) == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_lun.smtg_vol = smlv; + smtg->smtg_addr = &smlv->smlv_addr; + smtg->smtg_physical = B_FALSE; + list_insert_tail(&smlv->smlv_targets, smtg); + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + smtg->smtg_scsi_dev = sd; + VERIFY(sd->sd_dev == tgt_dip); + + scsi_device_hba_private_set(sd, smtg); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_SUCCESS); +} + +static void +smrt_logvol_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_FALSE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smlv->smlv_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + + mutex_exit(&smrt->smrt_mutex); + + kmem_free(smtg, sizeof (*smtg)); +} + +static int +smrt_phys_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_target_t *smtg; + smrt_physical_t *smpt; + const char *ua, *comma; + char *eptr; + long lun; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) + return (DDI_FAILURE); + + comma = strchr(ua, ','); + if (comma == NULL) { + return (DDI_FAILURE); + } + + /* + * Confirm the LUN is zero. 
We may want to instead check the scsi + * 'lun'/'lun64' property or do so in addition to this logic. + */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + + /* + * Look for a physical target based on the unit address of the target + * (which will encode its WWN and LUN). + */ + smpt = smrt_phys_lookup_by_ua(smrt, ua); + if (smpt == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_scsi_dev = sd; + smtg->smtg_physical = B_TRUE; + smtg->smtg_lun.smtg_phys = smpt; + list_insert_tail(&smpt->smpt_targets, smtg); + smtg->smtg_addr = &smpt->smpt_addr; + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + VERIFY(sd->sd_dev == tgt_dip); + smtg->smtg_scsi_dev = sd; + + scsi_device_hba_private_set(sd, smtg); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +static void +smrt_phys_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_TRUE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smpt->smpt_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); +} + +/* + * This function is called when the SCSI framework has allocated a packet and + * our private per-packet object. + * + * We choose not to have the framework pre-allocate memory for the CDB. + * Instead, we will make available the CDB area in the controller command block + * itself. + * + * Status block memory is allocated by the framework because we passed + * SCSI_HBA_TRAN_SCB to scsi_hba_attach_setup(9F). + */ +static int +smrt_tran_setup_pkt(struct scsi_pkt *pkt, int (*callback)(caddr_t), + caddr_t arg) +{ + _NOTE(ARGUNUSED(arg)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + smrt_command_scsa_t *smcms; + int kmflags = callback == SLEEP_FUNC ? KM_SLEEP : KM_NOSLEEP; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + + /* + * Check that we have enough space in the command object for the + * request from the target driver: + */ + if (pkt->pkt_cdblen > CISS_CDBLEN) { + /* + * The CDB member of the Request Block of a controller + * command is fixed at 16 bytes. 
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "oversize CDB: had %u, " + "needed %u", CISS_CDBLEN, pkt->pkt_cdblen); + return (-1); + } + + /* + * Allocate our command block: + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_SCSA, + kmflags)) == NULL) { + return (-1); + } + smcm->smcm_scsa = smcms; + smcms->smcms_command = smcm; + smcms->smcms_pkt = pkt; + + pkt->pkt_cdbp = &smcm->smcm_va_cmd->Request.CDB[0]; + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + + smcm->smcm_target = smtg; + + return (0); +} + +static void +smrt_tran_teardown_pkt(struct scsi_pkt *pkt) +{ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smrt_command_t *smcm = smcms->smcms_command; + + smrt_command_free(smcm); + + pkt->pkt_cdbp = NULL; +} + +static void +smrt_set_arq_data(struct scsi_pkt *pkt, uchar_t key) +{ + struct scsi_arq_status *sts; + + VERIFY3U(pkt->pkt_scblen, >=, sizeof (struct scsi_arq_status)); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sts = (struct scsi_arq_status *)(pkt->pkt_scbp); + bzero(sts, sizeof (*sts)); + + /* + * Mock up a CHECK CONDITION SCSI status for the original command: + */ + sts->sts_status.sts_chk = 1; + + /* + * Pretend that we successfully performed REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA; + sts->sts_rqpkt_statistics = 0; + + /* + * Return the key value we were provided in the fake sense data: + */ + sts->sts_sensedata.es_valid = 1; + sts->sts_sensedata.es_class = CLASS_EXTENDED_SENSE; + sts->sts_sensedata.es_key = key; + + pkt->pkt_state |= STATE_ARQ_DONE; +} + +/* + * When faking up a REPORT LUNS data structure, we simply report one LUN, LUN 0. + * We need 16 bytes for this, 4 for the size, 4 reserved bytes, and the 8 for + * the actual LUN. + */ +static void +smrt_fake_report_lun(smrt_command_t *smcm, struct scsi_pkt *pkt) +{ + size_t sz; + char resp[16]; + struct buf *bp; + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD | + STATE_GOT_STATUS; + + /* + * Check to make sure this is valid. If reserved bits are set or if the + * mode is one other than 0x00, 0x01, 0x02, then it's an illegal + * request. + */ + if (pkt->pkt_cdbp[1] != 0 || pkt->pkt_cdbp[3] != 0 || + pkt->pkt_cdbp[4] != 0 || pkt->pkt_cdbp[5] != 0 || + pkt->pkt_cdbp[10] != 0 || pkt->pkt_cdbp[11] != 0 || + pkt->pkt_cdbp[2] > 0x2) { + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + return; + } + + /* + * Construct the actual REPORT LUNS reply. We need to indicate a single + * LUN of all zeros. This means that the length needs to be 8 bytes, + * the size of the lun. Otherwise, the rest of this structure can be + * zeros. 
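[Editorial aside, not part of the patch] The 16-byte REPORT LUNS reply described above carries a big-endian LUN-list length of 8, four reserved bytes, and a single all-zero LUN. A standalone sketch that builds and prints those bytes:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int
main(void)
{
	uint8_t resp[16];

	memset(resp, 0, sizeof (resp));
	resp[3] = 8;	/* LUN list length in bytes: one 8-byte LUN entry */

	/*
	 * Bytes 0-3: list length (big-endian 8), bytes 4-7: reserved,
	 * bytes 8-15: the single LUN, all zeros (LUN 0).
	 */
	for (size_t i = 0; i < sizeof (resp); i++)
		printf("%02x%s", resp[i], (i == 7 || i == 15) ? "\n" : " ");
	return (0);
}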
+ */ + bzero(resp, sizeof (resp)); + resp[3] = sizeof (scsi_lun_t); + + bp = scsi_pkt2bp(pkt); + sz = MIN(sizeof (resp), bp->b_bcount); + + bp_mapin(bp); + bcopy(resp, bp->b_un.b_addr, sz); + bp_mapout(bp); + pkt->pkt_state |= STATE_XFERRED_DATA; + pkt->pkt_resid = bp->b_bcount - sz; + if (pkt->pkt_scblen >= 1) { + pkt->pkt_scbp[0] = STATUS_GOOD; + } +} + +static int +smrt_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + _NOTE(ARGUNUSED(sa)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_scsa_t *smcms; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + VERIFY(smcms != NULL); + smcm = smcms->smcms_command; + VERIFY(smcm != NULL); + + if (smcm->smcm_status & SMRT_CMD_STATUS_TRAN_START) { + /* + * This is a retry of a command that has already been + * used once. Assign it a new tag number. + */ + smrt_command_reuse(smcm); + } + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_START; + + /* + * The sophisticated firmware in this controller cannot possibly bear + * the following SCSI commands. It appears to return a response with + * the status STATUS_ACA_ACTIVE (0x30), which is not something we + * expect. Instead, fake up a failure response. + */ + switch (pkt->pkt_cdbp[0]) { + case SCMD_FORMAT: + case SCMD_LOG_SENSE_G1: + case SCMD_MODE_SELECT: + case SCMD_PERSISTENT_RESERVE_IN: + if (smtg->smtg_physical) { + break; + } + + smrt->smrt_stats.smrts_ignored_scsi_cmds++; + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_IGNORED; + + /* + * Mark the command as completed to the point where we + * received a SCSI status code: + */ + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_GOT_STATUS; + + /* + * Mock up sense data for an illegal request: + */ + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + case SCMD_REPORT_LUNS: + /* + * The SMRT controller does not accept a REPORT LUNS command for + * logical volumes. As such, we need to fake up a REPORT LUNS + * response that has a single LUN, LUN 0. + */ + if (smtg->smtg_physical) { + break; + } + + smrt_fake_report_lun(smcm, pkt); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + default: + break; + } + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * We must sleep and wait for the completion of this command. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + } + + /* + * Because we provide a tran_setup_pkt(9E) entrypoint, we must now + * set up the Scatter/Gather List in the Command to reflect any + * DMA resources passed to us by the framework. + */ + if (pkt->pkt_numcookies > smrt->smrt_sg_cnt) { + /* + * More DMA cookies than we are prepared to handle. 
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "too many DMA cookies (got %u;" + " expected %u)", pkt->pkt_numcookies, smrt->smrt_sg_cnt); + return (TRAN_BADPKT); + } + smcm->smcm_va_cmd->Header.SGList = pkt->pkt_numcookies; + smcm->smcm_va_cmd->Header.SGTotal = pkt->pkt_numcookies; + for (unsigned i = 0; i < pkt->pkt_numcookies; i++) { + smcm->smcm_va_cmd->SG[i].Addr = + LE_64(pkt->pkt_cookies[i].dmac_laddress); + smcm->smcm_va_cmd->SG[i].Len = + LE_32(pkt->pkt_cookies[i].dmac_size); + } + + /* + * Copy logical volume address from the target object: + */ + smcm->smcm_va_cmd->Header.LUN = *smcm->smcm_target->smtg_addr; + + /* + * Initialise the command block. + */ + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(pkt->pkt_time); + if (pkt->pkt_numcookies > 0) { + /* + * There are DMA resources; set the transfer direction + * appropriately: + */ + if (pkt->pkt_dma_flags & DDI_DMA_READ) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_READ; + } else if (pkt->pkt_dma_flags & DDI_DMA_WRITE) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_WRITE; + } else { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_NONE; + } + } else { + /* + * No DMA resources means no transfer. + */ + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + } + + /* + * Initialise the SCSI packet as described in tran_start(9E). We will + * progressively update these fields as the command moves through the + * submission and completion states. + */ + pkt->pkt_resid = 0; + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = 0; + + /* + * If this SCSI packet has a timeout, configure an appropriate + * expiry time: + */ + if (pkt->pkt_time != 0) { + smcm->smcm_expiry = gethrtime() + pkt->pkt_time * NANOSEC; + } + + /* + * Submit the command to the controller. + */ + mutex_enter(&smrt->smrt_mutex); + + /* + * If we're dumping, there's a chance that the target we're talking to + * could have ended up disappearing during the process of discovery. If + * this target is part of the dump device, we check here and return that + * we hit a fatal error. + */ + if (ddi_in_panic() && smtg->smtg_gone) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed: target " + "%s is gone, it did not come back after post-panic reset " + "device discovery", scsi_device_unit_address(sd)); + + return (TRAN_FATAL_ERROR); + } + + smrt->smrt_stats.smrts_tran_starts++; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed %d", r); + + /* + * Inform the SCSI framework that we could not submit + * the command. + */ + return (r == EAGAIN ? TRAN_BUSY : TRAN_FATAL_ERROR); + } + + /* + * Update the SCSI packet to reflect submission of the command. + */ + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD; + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * Poll the controller for completion of the command we + * submitted. Once this routine has returned, the completion + * callback will have been fired with either an active response + * (success or error) or a timeout. The command is freed by + * the completion callback, so it may not be referenced again + * after this call returns. 
+ */ + smrt_poll_for(smrt, smcm); + } + + mutex_exit(&smrt->smrt_mutex); + return (TRAN_ACCEPT); +} + +static int +smrt_tran_reset(struct scsi_address *sa, int level) +{ + _NOTE(ARGUNUSED(level)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + + /* + * The framework has requested some kind of SCSI reset. A + * controller-level soft reset can take a very long time -- often on + * the order of 30-60 seconds -- but might well be our only option if + * the controller is non-responsive. + * + * First, check if the controller is responding to pings. + */ +again: + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + return (0); + } + + smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT); + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_resets++; + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * The controller is already resetting. Wait for that + * to finish. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + } + +skip_check: + /* + * Submit our ping to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + SMRT_PING_CHECK_TIMEOUT * NANOSEC; + if (smrt_submit(smrt, smcm) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (0); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The ping command timed out. Abandon it now. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping timed out"); + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + + } else if ((smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command completed in error, or a controller reset + * was sent while we were trying to ping. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping error"); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + + } else { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE); + + /* + * The controller is responsive, and a full soft reset would be + * extremely disruptive to the system. Given our spotty + * support for some SCSI commands (which can upset the target + * drivers) and the historically lax behaviour of the "smrt" + * driver, we grit our teeth and pretend we were able to + * perform a reset. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (1); + } + + /* + * If a reset has been initiated in the last 90 seconds, try + * another ping. + */ + if (gethrtime() < smrt->smrt_last_reset_start + 90 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed, but " + "was recently reset; retrying ping"); + mutex_exit(&smrt->smrt_mutex); + + /* + * Sleep for a second first. 
+ */ + if (ddi_in_panic()) { + drv_usecwait(1 * MICROSEC); + } else { + delay(drv_usectohz(1 * MICROSEC)); + } + goto again; + } + + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed; resetting " + "controller"); + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller reset failure"); + mutex_exit(&smrt->smrt_mutex); + return (0); + } + + mutex_exit(&smrt->smrt_mutex); + return (1); +} + +static int +smrt_tran_abort(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm = NULL; + smrt_command_t *abort_smcm; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send an abort message. + */ + return (0); + } + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_aborts++; + if (pkt != NULL) { + /* + * The framework wants us to abort a specific SCSI packet. + */ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smcm = smcms->smcms_command; + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently in flight, so we + * cannot abort it. + */ + goto fail; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message for this command has already been + * sent to the controller. Return failure. + */ + goto fail; + } + + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + } else { + /* + * The framework wants us to abort every in flight command + * for the target with this address. + */ + smrt_write_message_abort_all(abort_smcm, smtg->smtg_addr); + } + + /* + * Submit the abort message to the controller. + */ + abort_smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if (smrt_submit(smrt, abort_smcm) != 0) { + goto fail; + } + + if (pkt != NULL) { + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * Mark the command as aborted so that we do not send + * a second abort message: + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + } + + /* + * Poll for completion of the abort message. Note that this function + * only fails if we set a timeout on the command, which we have not + * done. + */ + VERIFY0(smrt_poll_for(smrt, abort_smcm)); + + if ((abort_smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (abort_smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * Either the controller was reset or the abort command + * failed. + */ + goto fail; + } + + /* + * The command was successfully aborted. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (1); + +fail: + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (0); +} + +static void +smrt_hba_complete_status(smrt_command_t *smcm) +{ + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + bzero(pkt->pkt_scbp, pkt->pkt_scblen); + + if (ei->ScsiStatus != STATUS_CHECK) { + /* + * If the SCSI status is not CHECK CONDITION, we don't want + * to try and read the sense data buffer. + */ + goto simple_status; + } + + if (pkt->pkt_scblen < sizeof (struct scsi_arq_status)) { + /* + * There is not enough room for a request sense structure. 
+ * Fall back to reporting just the SCSI status code. + */ + goto simple_status; + } + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + struct scsi_arq_status *sts = (struct scsi_arq_status *)pkt->pkt_scbp; + + /* + * Copy in the SCSI status from the original command. + */ + bcopy(&ei->ScsiStatus, &sts->sts_status, sizeof (sts->sts_status)); + + /* + * Mock up a successful REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; + sts->sts_rqpkt_statistics = 0; + + /* + * The sense data from the controller should be copied into place + * starting at the "sts_sensedata" member of the auto request + * sense object. + */ + size_t sense_len = pkt->pkt_scblen - offsetof(struct scsi_arq_status, + sts_sensedata); + if (ei->SenseLen < sense_len) { + /* + * Only copy sense data bytes that are within the region + * the controller marked as valid. + */ + sense_len = ei->SenseLen; + } + bcopy(ei->SenseInfo, &sts->sts_sensedata, sense_len); + + pkt->pkt_state |= STATE_ARQ_DONE; + return; + +simple_status: + if (pkt->pkt_scblen < sizeof (struct scsi_status)) { + /* + * There is not even enough room for the SCSI status byte. + */ + return; + } + + bcopy(&ei->ScsiStatus, pkt->pkt_scbp, sizeof (struct scsi_status)); +} + +static void +smrt_hba_complete_log_error(smrt_command_t *smcm, const char *name) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + + dev_err(smrt->smrt_dip, CE_WARN, "!SCSI command failed: %s: " + "SCSI op %x, CISS status %x, SCSI status %x", name, + (unsigned)smcm->smcm_va_cmd->Request.CDB[0], + (unsigned)ei->CommandStatus, (unsigned)ei->ScsiStatus); +} + +/* + * Completion routine for commands submitted to the controller via the SCSI + * framework. + */ +void +smrt_hba_complete(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + pkt->pkt_resid = ei->ResidualCnt; + + /* + * Check if the controller was reset while this packet was in flight. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + if (pkt->pkt_reason != CMD_CMPLT) { + /* + * If another error status has already been written, + * do not overwrite it. + */ + pkt->pkt_reason = CMD_RESET; + } + pkt->pkt_statistics |= STAT_BUS_RESET | STAT_DEV_RESET; + goto finish; + } + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command was completed without error by the controller. + * + * As per the specification, if an error was not signalled + * by the controller through the CISS transport method, + * the error information (including CommandStatus) has not + * been written and should not be checked. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + goto finish; + } + + /* + * Check the completion status to determine what befell this request. + */ + switch (ei->CommandStatus) { + case CISS_CMD_SUCCESS: + /* + * In a certain sense, the specification contradicts itself. + * On the one hand, it suggests that a successful command + * will not result in a controller write to the error + * information block; on the other hand, it makes room + * for a status code (0) which denotes a successful + * execution. + * + * To be on the safe side, we check for that condition here. 
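[Editorial aside, not part of the patch] The sense-data copy in smrt_hba_complete_status() above clamps the copy length to whichever is smaller: the room left in the ARQ status area supplied by the framework, or the number of sense bytes the controller marked valid. A standalone sketch of that clamping, with invented sizes and deliberately simplified structures:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Simplified stand-ins; the real scsi_arq_status layout has more members. */
struct fake_sense {
	uint8_t	fs_bytes[20];
};
struct fake_arq_status {
	uint8_t			fas_status;
	uint8_t			fas_rqpkt_reason;
	struct fake_sense	fas_sensedata;
};

int
main(void)
{
	size_t scblen = 18;	/* invented pkt_scblen */
	size_t valid = 32;	/* invented ei->SenseLen from the controller */

	/* Space left for sense bytes inside the status area. */
	size_t room = scblen - offsetof(struct fake_arq_status, fas_sensedata);

	/* Copy only bytes that both fit and were marked valid by the controller. */
	size_t copy = (valid < room) ? valid : room;

	printf("room %zu, valid %zu -> copy %zu sense bytes\n", room, valid, copy);
	return (0);
}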
+ */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_UNDERRUN: + /* + * A data underrun occurred. Ideally this will result in + * an appropriate SCSI status and sense data. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_TARGET_STATUS: + /* + * The command completed, but an error occurred. We need + * to provide the sense data to the SCSI framework. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_OVERRUN: + /* + * Data overrun has occurred. + */ + smrt_hba_complete_log_error(smcm, "data overrun"); + pkt->pkt_reason = CMD_DATA_OVR; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_INVALID: + /* + * One or more fields in the command has invalid data. + */ + smrt_hba_complete_log_error(smcm, "invalid command"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_PROTOCOL_ERR: + /* + * An error occurred in communication with the end device. + */ + smrt_hba_complete_log_error(smcm, "protocol error"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_HARDWARE_ERR: + /* + * A hardware error occurred. + */ + smrt_hba_complete_log_error(smcm, "hardware error"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_CONNECTION_LOST: + /* + * The connection with the end device cannot be + * re-established. + */ + smrt_hba_complete_log_error(smcm, "connection lost"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_ABORTED: + case CISS_CMD_UNSOLICITED_ABORT: + if (smcm->smcm_status & SMRT_CMD_STATUS_TIMEOUT) { + /* + * This abort was arranged by the periodic routine + * in response to an elapsed timeout. + */ + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + } else { + pkt->pkt_reason = CMD_ABORTED; + } + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + pkt->pkt_statistics |= STAT_ABORTED; + break; + + case CISS_CMD_TIMEOUT: + smrt_hba_complete_log_error(smcm, "timeout"); + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + break; + + default: + /* + * This is an error that we were not prepared to handle. + * Signal a generic transport-level error to the framework. + */ + smrt_hba_complete_log_error(smcm, "unexpected error"); + pkt->pkt_reason = CMD_TRAN_ERR; + } + + /* + * Attempt to read a SCSI status code and any automatic + * request sense data that may exist: + */ + smrt_hba_complete_status(smcm); + +finish: + mutex_exit(&smrt->smrt_mutex); + scsi_hba_pkt_comp(pkt); + mutex_enter(&smrt->smrt_mutex); +} + +static int +smrt_getcap(struct scsi_address *sa, char *cap, int whom) +{ + _NOTE(ARGUNUSED(whom)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + int index; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + /* + * The CDB field in the CISS request block is fixed at 16 + * bytes. 
+ */ + return (CISS_CDBLEN); + + case SCSI_CAP_DMA_MAX: + if (smrt->smrt_dma_attr.dma_attr_maxxfer > INT_MAX) { + return (INT_MAX); + } + return ((int)smrt->smrt_dma_attr.dma_attr_maxxfer); + + case SCSI_CAP_SECTOR_SIZE: + if (smrt->smrt_dma_attr.dma_attr_granular > INT_MAX) { + return (-1); + } + return ((int)smrt->smrt_dma_attr.dma_attr_granular); + + /* + * If this target corresponds to a physical device, then we always + * indicate that we're on a SAS interconnect. Otherwise, we default to + * saying that we're on a parallel bus. We can't use SAS for + * everything, unfortunately. When you declare yourself to be a SAS + * interconnect, it's expected that you have a full 16-byte WWN as the + * target. If not, devfsadm will not be able to enumerate the device + * and create /dev/[r]dsk entries. + */ + case SCSI_CAP_INTERCONNECT_TYPE: + if (smtg->smtg_physical) { + return (INTERCONNECT_SAS); + } else { + return (INTERCONNECT_PARALLEL); + } + + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + /* + * These capabilities are supported by the driver and the + * controller. See scsi_ifgetcap(9F) for more information. + */ + return (1); + + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_RESET_NOTIFICATION: + /* + * These capabilities are not supported. + */ + return (0); + + default: + /* + * The property in question is not known to this driver. + */ + return (-1); + } +} + +/* ARGSUSED */ +static int +smrt_setcap(struct scsi_address *sa, char *cap, int value, int whom) +{ + int index; + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + if (whom == 0) { + /* + * When whom is 0, this is a request to set a capability for + * all targets. As per the recommendation in tran_setcap(9E), + * we do not support this mode of operation. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_SECTOR_SIZE: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_INTERCONNECT_TYPE: + /* + * We do not support changing any capabilities at this time. + */ + return (0); + + default: + /* + * The capability in question is not known to this driver. 
+ */ + return (-1); + } +} + +int +smrt_ctrl_hba_setup(smrt_t *smrt) +{ + int flags; + dev_info_t *dip = smrt->smrt_dip; + scsi_hba_tran_t *tran; + + if ((tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate SCSA resources"); + return (DDI_FAILURE); + } + + smrt->smrt_hba_tran = tran; + tran->tran_hba_private = smrt; + + tran->tran_tgt_init = smrt_ctrl_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + + tran->tran_start = smrt_ctrl_tran_start; + + tran->tran_getcap = smrt_getcap; + tran->tran_setcap = smrt_setcap; + + tran->tran_setup_pkt = smrt_tran_setup_pkt; + tran->tran_teardown_pkt = smrt_tran_teardown_pkt; + tran->tran_hba_len = sizeof (smrt_command_scsa_t); + tran->tran_interconnect_type = INTERCONNECT_SAS; + + flags = SCSI_HBA_HBA | SCSI_HBA_TRAN_SCB | SCSI_HBA_ADDR_COMPLEX; + if (scsi_hba_attach_setup(dip, &smrt->smrt_dma_attr, tran, flags) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not attach to SCSA framework"); + scsi_hba_tran_free(tran); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_SCSA; + return (DDI_SUCCESS); +} + +void +smrt_ctrl_hba_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_SCSA) { + VERIFY(scsi_hba_detach(smrt->smrt_dip) != DDI_FAILURE); + scsi_hba_tran_free(smrt->smrt_hba_tran); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_SCSA; + } +} + +int +smrt_logvol_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_logvol_tran_tgt_init; + tran->tran_tgt_free = smrt_logvol_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_logvol_tgtmap_activate, + smrt_logvol_tgtmap_deactivate, &smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_logvol_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_virt_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_virt_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. 
+ */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_virt_tgtmap; + smrt->smrt_virt_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_phys_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_phys_tran_tgt_init; + tran->tran_tgt_free = smrt_phys_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_phys_tgtmap_activate, + smrt_phys_tgtmap_deactivate, &smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_phys_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_phys_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_phys_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. + */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_phys_tgtmap; + smrt->smrt_phys_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c new file mode 100644 index 0000000000..18d5b8e936 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c @@ -0,0 +1,286 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static char * +smrt_interrupt_type_name(int type) +{ + switch (type) { + case DDI_INTR_TYPE_MSIX: + return ("MSI-X"); + case DDI_INTR_TYPE_MSI: + return ("MSI"); + case DDI_INTR_TYPE_FIXED: + return ("fixed"); + default: + return ("?"); + } +} + +static boolean_t +smrt_try_msix(smrt_t *smrt) +{ + char *fwver = smrt->smrt_versions.smrtv_firmware_rev; + + /* + * Generation 9 controllers end up having a different firmware + * versioning scheme than others. If this is a generation 9 controller, + * which all share the same PCI device ID, then we default to MSI. + */ + if (smrt->smrt_pci_vendor == SMRT_VENDOR_HP && + smrt->smrt_pci_device == SMRT_DEVICE_GEN9) { + return (B_FALSE); + } + + if (fwver[0] == '8' && fwver[1] == '.' && isdigit(fwver[2]) && + isdigit(fwver[3])) { + /* + * Version 8.00 of the Smart Array firmware appears to have + * broken MSI support on at least one controller. 
We could + * blindly try MSI-X everywhere, except that on at least some + * 6.XX firmware versions, MSI-X interrupts do not appear + * to be triggered for Simple Transport Method command + * completions. + * + * For now, assume we should try for MSI-X with all 8.XX + * versions of the firmware. + */ + dev_err(smrt->smrt_dip, CE_NOTE, "!trying MSI-X interrupts " + "to work around 8.XX firmware defect"); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +smrt_interrupts_disable(smrt_t *smrt) +{ + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + return (ddi_intr_block_disable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts)); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + return (ddi_intr_disable(smrt->smrt_interrupts[0])); + } +} + +int +smrt_interrupts_enable(smrt_t *smrt) +{ + int ret; + + VERIFY(!(smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED)); + + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + ret = ddi_intr_block_enable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + ret = ddi_intr_enable(smrt->smrt_interrupts[0]); + } + + if (ret == DDI_SUCCESS) { + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ENABLED; + } + + return (ret); +} + +static void +smrt_interrupts_free(smrt_t *smrt) +{ + for (int i = 0; i < smrt->smrt_ninterrupts; i++) { + (void) ddi_intr_free(smrt->smrt_interrupts[i]); + } + smrt->smrt_ninterrupts = 0; + smrt->smrt_interrupt_type = 0; + smrt->smrt_interrupt_cap = 0; + smrt->smrt_interrupt_pri = 0; +} + +static int +smrt_interrupts_alloc(smrt_t *smrt, int type) +{ + dev_info_t *dip = smrt->smrt_dip; + int nintrs = 0; + int navail = 0; + + if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count %s interrupts", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (nintrs < 1) { + dev_err(dip, CE_WARN, "no %s interrupts supported", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count available %s " + "interrupts", smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (navail < 1) { + dev_err(dip, CE_WARN, "no %s interrupts available", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_alloc(dip, smrt->smrt_interrupts, type, 0, 1, + &smrt->smrt_ninterrupts, DDI_INTR_ALLOC_STRICT) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "%s interrupt allocation failed", + smrt_interrupt_type_name(type)); + smrt_interrupts_free(smrt); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ALLOC; + smrt->smrt_interrupt_type = type; + return (DDI_SUCCESS); +} + +int +smrt_interrupts_setup(smrt_t *smrt) +{ + int types; + unsigned ipri; + uint_t (*hw_isr)(caddr_t, caddr_t); + dev_info_t *dip = smrt->smrt_dip; + + /* + * Select the correct hardware interrupt service routine for the + * Transport Method we have configured: + */ + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + hw_isr = smrt_isr_hw_simple; + break; + default: + panic("unknown controller mode"); + } + + if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get support interrupts"); + goto fail; + } + + /* + * At least one firmware version has been released for the Smart Array + * line with entirely defective MSI support. 
The specification is + * somewhat unclear on the precise nature of MSI-X support with Smart + * Array controllers, particularly with respect to the Simple Transport + * Method, but for those broken firmware versions we need to try + * anyway. + */ + if (smrt_try_msix(smrt) && (types & DDI_INTR_TYPE_MSIX)) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSIX) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If MSI-X is not available, or not expected to work, fall back to + * MSI. + */ + if (types & DDI_INTR_TYPE_MSI) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSI) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If neither MSI-X nor MSI is available, fall back to fixed + * interrupts. Note that the use of fixed interrupts has been + * observed, with some combination of controllers and systems, to + * result in interrupts stopping completely at random times. + */ + if (types & DDI_INTR_TYPE_FIXED) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_FIXED) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * We were unable to allocate any interrupts. + */ + dev_err(dip, CE_WARN, "interrupt allocation failed"); + goto fail; + +add_handler: + /* + * Ensure that we have not been given a high-level interrupt, as our + * interrupt handlers do not support them. + */ + if (ddi_intr_get_pri(smrt->smrt_interrupts[0], &ipri) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not determine interrupt priority"); + goto fail; + } + if (ipri >= ddi_intr_get_hilevel_pri()) { + dev_err(dip, CE_WARN, "high level interrupts not supported"); + goto fail; + } + smrt->smrt_interrupt_pri = ipri; + + if (ddi_intr_get_cap(smrt->smrt_interrupts[0], + &smrt->smrt_interrupt_cap) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get %s interrupt cap", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + + if (ddi_intr_add_handler(smrt->smrt_interrupts[0], hw_isr, + (caddr_t)smrt, NULL) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "adding %s interrupt failed", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ADDED; + + return (DDI_SUCCESS); + +fail: + smrt_interrupts_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_interrupts_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED) { + (void) smrt_interrupts_disable(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ENABLED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ADDED) { + (void) ddi_intr_remove_handler(smrt->smrt_interrupts[0]); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ADDED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ALLOC) { + smrt_interrupts_free(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ALLOC; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c new file mode 100644 index 0000000000..05963ac2e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c @@ -0,0 +1,367 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_logvol_free(smrt_volume_t *smlv) +{ + /* + * By this stage of teardown, all of the SCSI target drivers + * must have been detached from this logical volume. + */ + VERIFY(list_is_empty(&smlv->smlv_targets)); + list_destroy(&smlv->smlv_targets); + + kmem_free(smlv, sizeof (*smlv)); +} + +smrt_volume_t * +smrt_logvol_lookup_by_id(smrt_t *smrt, unsigned long id) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_volume_t *smlv = list_head(&smrt->smrt_volumes); + smlv != NULL; smlv = list_next(&smrt->smrt_volumes, smlv)) { + if (smlv->smlv_addr.LogDev.VolId == id) { + return (smlv); + } + } + + return (NULL); +} + +static int +smrt_read_logvols(smrt_t *smrt, smrt_report_logical_lun_t *smrll, uint64_t gen) +{ + smrt_report_logical_lun_ent_t *ents = smrll->smrll_data.ents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_ent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol, unsigned, i, + smrt_report_logical_lun_ent_t *, &ents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + ents[i].smrle_addr.VolId)) == NULL) { + + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date, regardless of where this came from. + */ + smlv->smlv_addr.LogDev = ents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +static int +smrt_read_logvols_ext(smrt_t *smrt, smrt_report_logical_lun_t *smrll, + uint64_t gen) +{ + smrt_report_logical_lun_extent_t *extents = + smrll->smrll_data.extents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_extent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol_ext, unsigned, i, + smrt_report_logical_lun_extent_t *, &extents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + extents[i].smrle_addr.VolId)) != NULL) { + if ((smlv->smlv_flags & SMRT_VOL_FLAG_WWN) && + bcmp(extents[i].smrle_wwn, smlv->smlv_wwn, + 16) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "logical " + "volume %u WWN changed unexpectedly", i); + } + } else { + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + bcopy(extents[i].smrle_wwn, smlv->smlv_wwn, 16); + smlv->smlv_flags |= SMRT_VOL_FLAG_WWN; + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date. 
The address may have changed on a reset. + */ + smlv->smlv_addr.LogDev = extents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +/* + * Discover the currently visible set of Logical Volumes exposed by the + * controller. + */ +int +smrt_logvol_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_logical_lun_t *smrll; + smrt_report_logical_lun_req_t smrllr = { 0 }; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Logical Volumes. + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (smrt_report_logical_lun_t), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrll = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrllr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Logical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrllr, sizeof (smrllr)); + smrllr.smrllr_opcode = CISS_SCMD_REPORT_LOGICAL_LUNS; + smrllr.smrllr_extflag = 1; + smrllr.smrllr_datasize = htonl(sizeof (smrt_report_logical_lun_t)); + bcopy(&smrllr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrllr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. 
+ */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "logical volume " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_virt_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_VIRT); + r = EIO; + goto out; + } + + if ((smrll->smrll_extflag & 0x1) != 0) { + r = smrt_read_logvols_ext(smrt, smrll, gen); + } else { + r = smrt_read_logvols(smrt, smrll, gen); + } + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_virt_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } + + if (r == 0) { + /* + * Update the time of the last successful Logical Volume + * discovery: + */ + smrt->smrt_last_log_discovery = gethrtime(); + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_logvol_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + mutex_enter(&smrt->smrt_mutex); + VERIFY(smrt_logvol_lookup_by_id(smrt, volume) != NULL); + mutex_exit(&smrt->smrt_mutex); + *privpp = NULL; +} + +boolean_t +smrt_logvol_tgtmap_deactivate(void *arg, char *addr, + scsi_tgtmap_tgt_type_t type, void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_volume_t *smlv; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY(priv == NULL); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + + mutex_enter(&smrt->smrt_mutex); + smlv = smrt_logvol_lookup_by_id(smrt, volume); + VERIFY(smlv != NULL); + + list_remove(&smrt->smrt_volumes, smlv); + smrt_logvol_free(smlv); + mutex_exit(&smrt->smrt_mutex); + + return (B_FALSE); +} + +void +smrt_logvol_teardown(smrt_t *smrt) +{ + smrt_volume_t *smlv; + + while ((smlv = list_remove_head(&smrt->smrt_volumes)) != NULL) { + smrt_logvol_free(smlv); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c new file mode 100644 index 0000000000..8ab3927673 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c @@ -0,0 +1,613 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_physical_free(smrt_physical_t *smpt) +{ + VERIFY(list_is_empty(&smpt->smpt_targets)); + VERIFY(smpt->smpt_info != NULL); + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + list_destroy(&smpt->smpt_targets); + kmem_free(smpt, sizeof (*smpt)); +} + +/* + * Determine if a physical device enumerated should be shown to the world. There + * are three conditions to satisfy for this to be true. + * + * 1. The device (SAS, SATA, SES, etc.) must not have a masked CISS address. A + * masked CISS address indicates a device that we should not be performing I/O + * to. + * 2. The drive (SAS or SATA device) must not be marked as a member of a logical + * volume. + * 3. The drive (SAS or SATA device) must not be marked as a spare. + */ +static boolean_t +smrt_physical_visible(PhysDevAddr_t *addr, smrt_identify_physical_drive_t *info) +{ + if (addr->Mode == SMRT_CISS_MODE_MASKED) { + return (B_FALSE); + } + + if ((info->sipd_more_flags & (SMRT_MORE_FLAGS_LOGVOL | + SMRT_MORE_FLAGS_SPARE)) != 0) { + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Note, the caller is responsible for making sure that the unit-address form of + * the WWN is pased in. Any additional information to target a specific LUN + * will be ignored. + */ +smrt_physical_t * +smrt_phys_lookup_by_ua(smrt_t *smrt, const char *ua) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Sanity check that the caller has provided us enough bytes for a + * properly formed unit-address form of a WWN. + */ + if (strlen(ua) < SCSI_WWN_UA_STRLEN) + return (NULL); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + char wwnstr[SCSI_WWN_BUFLEN]; + + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, wwnstr); + if (strncmp(wwnstr, ua, SCSI_WWN_UA_STRLEN) != 0) + continue; + + /* + * Verify that the UA string is either a comma or null there. + * We accept the comma in case it's being used as part of a + * normal UA with a LUN. + */ + if (ua[SCSI_WWN_UA_STRLEN] != '\0' && + ua[SCSI_WWN_UA_STRLEN] != ',') { + continue; + } + + return (smpt); + } + + return (NULL); +} + +static smrt_physical_t * +smrt_phys_lookup_by_wwn(smrt_t *smrt, uint64_t wwn) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + if (wwn == smpt->smpt_wwn) + return (smpt); + } + + return (NULL); +} + +static int +smrt_phys_identify(smrt_t *smrt, smrt_identify_physical_drive_t *info, + uint16_t bmic, uint16_t timeout) +{ + smrt_command_t *smcm = NULL; + smrt_identify_physical_drive_t *sipd; + smrt_identify_physical_drive_req_t sipdr; + int ret; + size_t sz, copysz; + + sz = sizeof (smrt_identify_physical_drive_t); + sz = P2ROUNDUP_TYPED(sz, 512, size_t); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*sipd), KM_NOSLEEP) != 0) { + ret = ENOMEM; + goto out; + } + + sipd = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (sipdr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY PHYSICAL DEVICE request CDB. 
Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&sipdr, sizeof (sipdr)); + sipdr.sipdr_opcode = CISS_SCMD_BMIC_READ; + sipdr.sipdr_lun = 0; + sipdr.sipdr_bmic_index1 = bmic & 0x00ff; + sipdr.sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE; + sipdr.sipdr_bmic_index2 = (bmic & 0xff00) >> 8; + bcopy(&sipdr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (sipdr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((ret = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + mutex_exit(&smrt->smrt_mutex); + goto out; + } + mutex_exit(&smrt->smrt_mutex); + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * physical volumes. Report failure. + */ + ret = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify physical " + "device error: status 0x%x", ei->CommandStatus); + ret = EIO; + goto out; + } + + copysz = MIN(sizeof (*sipd), sz - ei->ResidualCnt); + } else { + copysz = sizeof (*sipd); + } + + + sz = MIN(sizeof (*sipd), copysz); + bcopy(sipd, info, sizeof (*sipd)); + + ret = 0; +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + + return (ret); +} + +static int +smrt_read_phys_ext(smrt_t *smrt, smrt_report_physical_lun_t *smrpl, + uint16_t timeout, uint64_t gen) +{ + smrt_report_physical_lun_extent_t *extents = smrpl->smrpl_data.extents; + uint32_t count = BE_32(smrpl->smrpl_datasize) / + sizeof (smrt_report_physical_lun_extent_t); + uint32_t i; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_PHYSDEV) { + count = SMRT_MAX_PHYSDEV; + } + + for (i = 0; i < count; i++) { + int ret; + smrt_physical_t *smpt; + smrt_identify_physical_drive_t *info; + smrt_report_physical_opdi_t *opdi; + uint16_t bmic; + uint64_t wwn, satawwn; + char name[SCSI_MAXNAMELEN]; + + opdi = &extents[i].srple_extdata.srple_opdi; + + mutex_exit(&smrt->smrt_mutex); + + /* + * Get the extended information about this device. + */ + info = kmem_zalloc(sizeof (*info), KM_NOSLEEP); + if (info == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + + bmic = smrt_lun_addr_to_bmic(&extents[i].srple_addr); + ret = smrt_phys_identify(smrt, info, bmic, timeout); + if (ret != 0) { + mutex_enter(&smrt->smrt_mutex); + kmem_free(info, sizeof (*info)); + return (ret); + } + + wwn = *(uint64_t *)opdi->srpo_wwid; + wwn = BE_64(wwn); + + /* + * SATA devices may not have a proper WWN returned from firmware + * based on the SATL specification. Try to fetch the proper id + * for SATA devices, if the drive has one. If the drive doesn't + * have one or the SATL refuses to give us one, we use whatever + * the controller told us. 
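The IDENTIFY PHYSICAL DEVICE request assembled above splits the 16-bit BMIC drive index across two non-adjacent CDB bytes (sipdr_bmic_index1 carries the low byte, sipdr_bmic_index2 the high byte). A minimal standalone sketch of that split and its round trip, with illustrative names rather than the driver's types:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint16_t bmic = 0x1234;                         /* example drive index */
        uint8_t index1 = bmic & 0x00ff;                 /* low byte */
        uint8_t index2 = (bmic & 0xff00) >> 8;          /* high byte */
        uint16_t rebuilt = (uint16_t)((index2 << 8) | index1);

        assert(rebuilt == bmic);
        (void) printf("bmic=0x%04x low=0x%02x high=0x%02x\n",
            bmic, index1, index2);
        return (0);
}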
+ */ + if (opdi->srpo_dtype == SMRT_DTYPE_SATA && + smrt_sata_determine_wwn(smrt, &extents[i].srple_addr, + &satawwn, timeout) == 0) { + wwn = satawwn; + } + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_wwn(smrt, wwn); + if (smpt != NULL) { + /* + * Sanity check that the model and serial number of this + * device is the same for this WWN. If it's not, the + * controller is probably lying about something. + */ + if (bcmp(smpt->smpt_info->sipd_model, info->sipd_model, + sizeof (info->sipd_model)) != 0 || + bcmp(smpt->smpt_info->sipd_serial, + info->sipd_serial, sizeof (info->sipd_serial)) != + 0 || smpt->smpt_dtype != opdi->srpo_dtype) { + dev_err(smrt->smrt_dip, CE_PANIC, "physical " + "target with wwn 0x%" PRIx64 " changed " + "model, serial, or type unexpectedly: " + "smrt_physical_t %p, phys info: %p", wwn, + smpt, info); + } + + /* + * When panicking, we don't allow a device's visibility + * to change to being invisible and be able to actually + * panic. We only worry about devices which are used + * for I/O. We purposefully ignore SES devices. + */ + if (ddi_in_panic() && + (opdi->srpo_dtype == SMRT_DTYPE_SATA || + opdi->srpo_dtype == SMRT_DTYPE_SAS)) { + boolean_t visible; + + visible = smrt_physical_visible( + &smpt->smpt_addr.PhysDev, smpt->smpt_info); + + if (visible != smpt->smpt_visible) { + dev_err(smrt->smrt_dip, CE_PANIC, + "physical target with wwn 0x%" + PRIx64 " changed visibility status " + "unexpectedly", wwn); + } + } + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + smpt->smpt_info = NULL; + } else { + smpt = kmem_zalloc(sizeof (smrt_physical_t), + KM_NOSLEEP); + if (smpt == NULL) { + kmem_free(info, sizeof (*info)); + return (ENOMEM); + } + + smpt->smpt_wwn = wwn; + smpt->smpt_dtype = opdi->srpo_dtype; + list_create(&smpt->smpt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + smpt->smpt_ctlr = smrt; + list_insert_tail(&smrt->smrt_physicals, smpt); + } + + VERIFY3P(smpt->smpt_info, ==, NULL); + + /* + * Determine if this device is supported and if it's visible to + * the system. Some devices may not be visible to the system + * because they're used in logical volumes or spares. + * Unsupported devices are also not visible. + */ + switch (smpt->smpt_dtype) { + case SMRT_DTYPE_SATA: + case SMRT_DTYPE_SAS: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + case SMRT_DTYPE_SES: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + default: + smpt->smpt_visible = B_FALSE; + smpt->smpt_supported = B_FALSE; + } + + smpt->smpt_info = info; + smpt->smpt_addr.PhysDev = extents[i].srple_addr; + smpt->smpt_bmic = bmic; + smpt->smpt_gen = gen; + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, name); + if (!ddi_in_panic() && smpt->smpt_visible && + scsi_hba_tgtmap_set_add(smrt->smrt_phys_tgtmap, + SCSI_TGT_SCSI_DEVICE, name, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +int +smrt_phys_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_physical_lun_t *smrpl; + smrt_report_physical_lun_req_t smrplr; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Physical Volumes. 
+ */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*smrpl), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrpl = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrplr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Physical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrplr, sizeof (smrplr)); + smrplr.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS; + smrplr.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI; + smrplr.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t)); + bcopy(&smrplr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrplr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + /* + * If the controller doesn't support extended physical reporting, it + * likely doesn't even support physical devices that we'd care about + * exposing. As such, we treat this as an OK case. 
+ */ + if ((smrpl->smrpl_extflag & SMRT_REPORT_PHYSICAL_LUN_EXT_MASK) != + SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI) { + r = 0; + goto out; + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_phys_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_PHYS); + r = EIO; + goto out; + } + + r = smrt_read_phys_ext(smrt, smrpl, timeout, gen); + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_phys_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } + + if (r == 0) { + smrt_physical_t *smpt, *next; + + /* + * Prune physical devices that do not match the current + * generation and are not marked as visible devices. Visible + * devices will be dealt with as part of the target map work. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + next = list_next(&smrt->smrt_physicals, smpt); + if (smpt->smpt_visible || smpt->smpt_gen == gen) + continue; + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + } + + /* + * Update the time of the last successful Physical Volume + * discovery: + */ + smrt->smrt_last_phys_discovery = gethrtime(); + + /* + * Now, for each unsupported device that we haven't warned about + * encountering, try and give the administrator some hope of + * knowing about this. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + if (smpt->smpt_supported || smpt->smpt_unsup_warn) + continue; + smpt->smpt_unsup_warn = B_TRUE; + dev_err(smrt->smrt_dip, CE_WARN, "encountered " + "unsupported device with device type %d", + smpt->smpt_dtype); + } + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_phys_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + VERIFY(smpt != NULL); + VERIFY(smpt->smpt_supported); + VERIFY(smpt->smpt_visible); + *privpp = NULL; + mutex_exit(&smrt->smrt_mutex); +} + +boolean_t +smrt_phys_tgtmap_deactivate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + VERIFY3P(priv, ==, NULL); + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + + /* + * If the device disappeared or became invisible, then it may have + * already been removed. 
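The discovery pass above stamps each physical device it finds with the generation of the current scan, then prunes entries that are neither visible nor stamped with that generation. A small standalone sketch of the same generation-sweep idiom over a simplified singly-linked list; the types and names are stand-ins, not the driver's:

#include <stdint.h>
#include <stdlib.h>

typedef struct phys_entry {
        struct phys_entry *pe_next;
        uint64_t pe_gen;
        int pe_visible;
} phys_entry_t;

/* Drop every entry that is neither visible nor seen in this scan. */
void
prune_stale(phys_entry_t **headp, uint64_t gen)
{
        phys_entry_t **pp = headp;

        while (*pp != NULL) {
                phys_entry_t *pe = *pp;

                if (pe->pe_visible || pe->pe_gen == gen) {
                        pp = &pe->pe_next;
                        continue;
                }
                *pp = pe->pe_next;
                free(pe);
        }
}

int
main(void)
{
        phys_entry_t *head = NULL;
        uint64_t gen = 7;

        for (int i = 0; i < 3; i++) {
                phys_entry_t *pe = calloc(1, sizeof (*pe));

                pe->pe_gen = (i == 1) ? gen : gen - 1;  /* only one is fresh */
                pe->pe_next = head;
                head = pe;
        }
        prune_stale(&head, gen);
        /* Exactly the freshly-stamped entry should remain. */
        return (head != NULL && head->pe_next == NULL ? 0 : 1);
}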
+ */ + if (smpt == NULL || !smpt->smpt_visible) { + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); + } + + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); +} + +void +smrt_phys_teardown(smrt_t *smrt) +{ + smrt_physical_t *smpt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + while ((smpt = list_remove_head(&smrt->smrt_physicals)) != NULL) { + smrt_physical_free(smpt); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c new file mode 100644 index 0000000000..6224b97732 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c @@ -0,0 +1,160 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Collection of routines specific to SATA devices and attempting to make them + * work. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * This is a buffer size that should easily cover all of the data that we need + * to properly determine the buffer allocation. + */ +#define SMRT_SATA_INQ83_LEN 256 + +/* + * We need to try and determine if a SATA WWN exists on the device. SAT-2 + * defines that the response to the inquiry page 0x83. + */ +int +smrt_sata_determine_wwn(smrt_t *smrt, PhysDevAddr_t *addr, uint64_t *wwnp, + uint16_t timeout) +{ + smrt_command_t *smcm; + int r; + uint8_t *inq; + uint64_t wwn; + size_t resid; + + VERIFY3P(wwnp, !=, NULL); + + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + SMRT_SATA_INQ83_LEN, KM_NOSLEEP) != 0) { + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (ENOMEM); + } + + smcm->smcm_va_cmd->Header.LUN.PhysDev = *addr; + smcm->smcm_va_cmd->Request.CDBLen = CDB_GROUP0; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + + smcm->smcm_va_cmd->Request.CDB[0] = SCMD_INQUIRY; + smcm->smcm_va_cmd->Request.CDB[1] = 1; + smcm->smcm_va_cmd->Request.CDB[2] = 0x83; + smcm->smcm_va_cmd->Request.CDB[3] = (SMRT_SATA_INQ83_LEN & 0xff00) >> 8; + smcm->smcm_va_cmd->Request.CDB[4] = SMRT_SATA_INQ83_LEN & 0x00ff; + smcm->smcm_va_cmd->Request.CDB[5] = 0; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (r); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. 
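The INQUIRY command built above asks for VPD page 0x83, and the checks applied just below accept only a leading NAA designator (type 3) that is associated with the LUN itself and exactly eight bytes long. A self-contained sketch of those checks against a raw response buffer, following SPC's layout for the page; the function name and sample data are illustrative only:

#include <stddef.h>
#include <stdint.h>

int
vpd83_first_naa_wwn(const uint8_t *buf, size_t len, uint64_t *wwnp)
{
        uint64_t wwn = 0;

        if (len < 16 || buf[1] != 0x83)
                return (-1);

        /*
         * buf[4..7] is the first designation descriptor header: buf[5]
         * carries the association (bits 5:4) and designator type
         * (bits 3:0); buf[7] is the designator length.
         */
        if ((buf[5] & 0x30) != 0 || (buf[5] & 0x0f) != 3 || buf[7] != 8)
                return (-1);

        /* The 8-byte NAA identifier is big-endian on the wire. */
        for (int i = 0; i < 8; i++)
                wwn = (wwn << 8) | buf[8 + i];

        *wwnp = wwn;
        return (0);
}

int
main(void)
{
        /* Fabricated response: page 0x83, one NAA type 3, 8-byte designator. */
        const uint8_t sample[16] = {
                0x00, 0x83, 0x00, 0x0c,         /* VPD page header */
                0x01, 0x03, 0x00, 0x08,         /* designation descriptor header */
                0x50, 0x00, 0xc5, 0x00, 0x11, 0x22, 0x33, 0x44
        };
        uint64_t wwn;

        return (vpd83_first_naa_wwn(sample, sizeof (sample), &wwn) == 0 ? 0 : 1);
}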
+ */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + mutex_exit(&smrt->smrt_mutex); + return (r); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "SATA WWN error: status 0x%x", ei->CommandStatus); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + resid = ei->ResidualCnt; + } else { + resid = 0; + } + + mutex_exit(&smrt->smrt_mutex); + + /* + * We must have at least 12 bytes. The first four bytes are the header, + * the next four are for the LUN header, and the last 8 are for the + * actual WWN, which according to SAT-2 will always be first. + */ + if (SMRT_SATA_INQ83_LEN - resid < 16) { + smrt_command_free(smcm); + return (EINVAL); + } + inq = smcm->smcm_internal->smcmi_va; + + /* + * Sanity check we have the right page. + */ + if (inq[1] != 0x83) { + smrt_command_free(smcm); + return (EINVAL); + } + + /* + * Check to see if we have a proper Network Address Authority (NAA) + * based world wide number for this LUN. It is possible that firmware + * interposes on this and constructs a fake world wide number (WWN). If + * this is the case, we don't want to actually use it. We need to + * verify that the WWN declares the correct naming authority and is of + * the proper length. + */ + if ((inq[5] & 0x30) != 0 || (inq[5] & 0x0f) != 3 || inq[7] != 8) { + smrt_command_free(smcm); + return (ENOTSUP); + } + + bcopy(&inq[8], &wwn, sizeof (uint64_t)); + *wwnp = BE_64(wwn); + + smrt_command_free(smcm); + + return (0); +} diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index f968b54847..274c7caccf 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -3498,9 +3498,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4444,19 +4448,78 @@ static int sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; - int rval = SD_SUCCESS; + int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. 
However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. + */ + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -6723,7 +6786,7 @@ sdpower(dev_info_t *devi, int component, int level) time_t intvlp; struct pm_trans_data sd_pm_tran_data; uchar_t save_state = SD_STATE_NORMAL; - int sval; + int sval, tursval = 0; uchar_t state_before_pm; int got_semaphore_here; sd_ssc_t *ssc; @@ -7040,13 +7103,26 @@ sdpower(dev_info_t *devi, int component, int level) * a deadlock on un_pm_busy_cv will occur. */ if (SD_PM_IS_IO_CAPABLE(un, level)) { - sval = sd_send_scsi_TEST_UNIT_READY(ssc, + tursval = sd_send_scsi_TEST_UNIT_READY(ssc, SD_DONT_RETRY_TUR | SD_BYPASS_PM); - if (sval != 0) + if (tursval != 0) sd_ssc_assessment(ssc, SD_FMT_IGNORE); } - if (un->un_f_power_condition_supported) { + /* + * We've encountered certain classes of drives that pass a TUR, but fail + * the START STOP UNIT when using power conditions, or worse leave the + * drive in an unusable state despite passing SSU. Strictly speaking, + * for SPC-4 or greater, no additional actions are required to make the + * drive operational when a TUR passes. If we have something that + * matches this condition, we continue on and presume the drive is + * successfully powered on. + */ + if (un->un_f_power_condition_supported && + SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) && + level == SD_SPINDLE_ACTIVE && tursval == 0) { + sval = 0; + } else if (un->un_f_power_condition_supported) { char *pm_condition_name[] = {"STOPPED", "STANDBY", "IDLE", "ACTIVE"}; SD_TRACE(SD_LOG_IO_PM, un, @@ -7066,6 +7142,7 @@ sdpower(dev_info_t *devi, int component, int level) sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK); else sd_ssc_assessment(ssc, SD_FMT_IGNORE); + } /* Command failed, check for media present. 
*/ @@ -31324,7 +31401,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) { un->un_f_log_sense_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_INQUIRY(un)->inq_ansi == 6) { + SD_SCSI_VERS_IS_GE_SPC_4(un)) { un->un_f_power_condition_supported = TRUE; } } else { @@ -31342,7 +31419,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) /* SD_PM_CAPABLE_IS_TRUE case */ un->un_f_pm_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_PM_CAPABLE_IS_SPC_4(pm_cap)) { + (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) { un->un_f_power_condition_supported = TRUE; } diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c index 46d616dd79..4dce53e22c 100644 --- a/usr/src/uts/common/io/signalfd.c +++ b/usr/src/uts/common/io/signalfd.c @@ -107,6 +107,7 @@ #include <sys/schedctl.h> #include <sys/id_space.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/disp.h> #include <sys/taskq_impl.h> @@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) lwp->lwp_extsig = 0; mutex_exit(&p->p_lock); + if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate) + BROP(p)->b_sigfd_translate(infop); + /* Convert k_siginfo into external, datamodel independent, struct. */ bzero(ssp, sizeof (*ssp)); ssp->ssi_signo = infop->si_signo; diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c index 727fbbad8e..9bfe2fe7cf 100644 --- a/usr/src/uts/common/io/simnet/simnet.c +++ b/usr/src/uts/common/io/simnet/simnet.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ /* @@ -795,12 +797,6 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) continue; } - /* Fix mblk checksum as the pkt dest is local */ - if ((mp = mac_fix_cksum(mp)) == NULL) { - sdev->sd_stats.xmit_errors++; - continue; - } - /* Hold reference for taskq receive processing per-pkt */ if (!simnet_thread_ref(sdev_rx)) { freemsg(mp); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index ec76c6e2b9..55fd87db45 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1451,6 +1451,16 @@ copyb(mblk_t *bp) ndp = nbp->b_datap; /* + * Copy the various checksum information that came in + * originally. + */ + ndp->db_cksumstart = dp->db_cksumstart; + ndp->db_cksumend = dp->db_cksumend; + ndp->db_cksumstuff = dp->db_cksumstuff; + bcopy(dp->db_struioun.data, ndp->db_struioun.data, + sizeof (dp->db_struioun.data)); + + /* * Well, here is a potential issue. If we are trying to * trace a flow, and we copy the message, we might lose * information about where this message might have been. diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c index 03b93c6114..e77f33d31f 100644 --- a/usr/src/uts/common/io/tl.c +++ b/usr/src/uts/common/io/tl.c @@ -1419,8 +1419,9 @@ tl_closeok(tl_endpt_t *tep) static int tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) { - tl_endpt_t *tep; - minor_t minor = getminor(*devp); + tl_endpt_t *tep; + minor_t minor = getminor(*devp); + id_t inst_minor; /* * Driver is called directly. Both CLONEOPEN and MODOPEN @@ -1440,6 +1441,14 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) minor |= TL_SOCKET; } + /* + * Attempt to allocate a unique minor number for this instance. + * Avoid an uninterruptable sleep if none are available. 
+ */ + if ((inst_minor = id_alloc_nosleep(tl_minors)) == -1) { + return (ENOMEM); + } + tep = kmem_cache_alloc(tl_cache, KM_SLEEP); tep->te_refcnt = 1; tep->te_cpid = curproc->p_pid; @@ -1451,9 +1460,7 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) tep->te_flag = minor & TL_MINOR_MASK; tep->te_transport = &tl_transports[minor]; - - /* Allocate a unique minor number for this instance. */ - tep->te_minor = (minor_t)id_alloc(tl_minors); + tep->te_minor = (minor_t)inst_minor; /* Reserve hash handle for bind(). */ (void) mod_hash_reserve(tep->te_addrhash, &tep->te_hash_hndl); diff --git a/usr/src/uts/common/io/usb/clients/hid/hid.c b/usr/src/uts/common/io/usb/clients/hid/hid.c index 084fa7fedc..eccd48bf08 100644 --- a/usr/src/uts/common/io/usb/clients/hid/hid.c +++ b/usr/src/uts/common/io/usb/clients/hid/hid.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. */ @@ -139,6 +139,12 @@ static int hid_info(dev_info_t *, ddi_info_cmd_t, void *, void **); static int hid_attach(dev_info_t *, ddi_attach_cmd_t); static int hid_detach(dev_info_t *, ddi_detach_cmd_t); static int hid_power(dev_info_t *, int, int); +/* These are to enable ugen support: */ +static int hid_chropen(dev_t *, int, int, cred_t *); +static int hid_chrclose(dev_t, int, int, cred_t *); +static int hid_read(dev_t, struct uio *, cred_t *); +static int hid_write(dev_t, struct uio *, cred_t *); +static int hid_poll(dev_t, short, int, short *, struct pollhead **); /* * Warlock is not aware of the automatic locking mechanisms for @@ -198,18 +204,18 @@ struct streamtab hid_streamtab = { }; struct cb_ops hid_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ + hid_chropen, /* open */ + hid_chrclose, /* close */ nulldev, /* strategy */ nulldev, /* print */ nulldev, /* dump */ - nulldev, /* read */ - nulldev, /* write */ + hid_read, /* read */ + hid_write, /* write */ nulldev, /* ioctl */ nulldev, /* devmap */ nulldev, /* mmap */ nulldev, /* segmap */ - nochpoll, /* poll */ + hid_poll, /* poll */ ddi_prop_op, /* cb_prop_op */ &hid_streamtab, /* streamtab */ D_MP | D_MTPERQ @@ -349,6 +355,7 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_alt_if_data_t *altif_data; char minor_name[HID_MINOR_NAME_LEN]; usb_ep_data_t *ep_data; + usb_ugen_info_t usb_ugen_info; switch (cmd) { case DDI_ATTACH: @@ -491,6 +498,28 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_free_dev_data(dip, dev_data); hidp->hid_dev_data = NULL; + if (usb_owns_device(dip)) { + /* Get a ugen handle. 
*/ + bzero(&usb_ugen_info, sizeof (usb_ugen_info)); + + usb_ugen_info.usb_ugen_flags = 0; + usb_ugen_info.usb_ugen_minor_node_ugen_bits_mask = + (dev_t)HID_MINOR_UGEN_BITS_MASK; + usb_ugen_info.usb_ugen_minor_node_instance_mask = + (dev_t)HID_MINOR_INSTANCE_MASK; + hidp->hid_ugen_hdl = usb_ugen_get_hdl(dip, &usb_ugen_info); + + if (usb_ugen_attach(hidp->hid_ugen_hdl, cmd) != + USB_SUCCESS) { + USB_DPRINTF_L2(PRINT_MASK_ATTA, + hidp->hid_log_handle, + "usb_ugen_attach failed"); + + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + hidp->hid_ugen_hdl = NULL; + } + } + /* * Don't get the report descriptor if parsing hid descriptor earlier * failed since device probably won't return valid report descriptor @@ -769,6 +798,149 @@ hid_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (rval); } +static int +hid_chropen(dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int rval; + minor_t minor = getminor(*devp); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + hid_pm_busy_component(hidp); + (void) pm_raise_power(hidp->hid_dip, 0, USB_DEV_OS_FULL_PWR); + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_open(hidp->hid_ugen_hdl, devp, flag, + sflag, credp); + + mutex_exit(&hidp->hid_mutex); + + if (rval != 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_chrclose(dev_t dev, int flag, int otyp, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_close(hidp->hid_ugen_hdl, dev, flag, + otyp, credp); + + mutex_exit(&hidp->hid_mutex); + + if (rval == 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_read(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_write(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_poll(hidp->hid_ugen_hdl, dev, events, anyyet, + reventsp, phpp); + + return (rval); +} + /* * hid_open : * Open entry point: Opens the interrupt pipe. Sets up queues. 
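The ugen handle acquired above partitions the device's minor numbers between an instance mask and a ugen-bits mask, and each of the new character entry points recovers the instance (and whether the open is for a ugen node) from the minor it is handed. A hypothetical sketch of that decode; the real HID_MINOR_* masks are defined in the driver's header and are not visible in this diff, so the values below are placeholders:

#include <assert.h>
#include <stdint.h>

#define EX_UGEN_BITS_MASK       0x00ffu         /* placeholder layout */
#define EX_INSTANCE_MASK        0xff00u
#define EX_INSTANCE_SHIFT       8

#define EX_MINOR_TO_INSTANCE(m) (((m) & EX_INSTANCE_MASK) >> EX_INSTANCE_SHIFT)
#define EX_IS_UGEN_OPEN(m)      (((m) & EX_UGEN_BITS_MASK) != 0)

int
main(void)
{
        uint32_t minor = (3 << EX_INSTANCE_SHIFT) | 0x02;       /* instance 3, ugen node */

        assert(EX_MINOR_TO_INSTANCE(minor) == 3);
        assert(EX_IS_UGEN_OPEN(minor));
        return (0);
}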
@@ -787,13 +959,21 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) hidp = ddi_get_soft_state(hid_statep, instance); if (hidp == NULL) { - return (ENXIO); } USB_DPRINTF_L4(PRINT_MASK_OPEN, hidp->hid_log_handle, "hid_open: Begin"); + /* + * If this is a ugen device, return ENOSTR (no streams). This will + * cause spec_open to try hid_chropen from our regular ops_cb instead + * (and thus treat us as a plain character device). + */ + if (HID_IS_UGEN_OPEN(minor)) { + return (ENOSTR); + } + if (sflag) { /* clone open NOT supported here */ return (ENXIO); @@ -803,6 +983,8 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (EIO); } + mutex_enter(&hidp->hid_mutex); + /* * This is a workaround: * Currently, if we open an already disconnected device, and send @@ -812,7 +994,6 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * The consconfig_dacf module need this interface to detect if the * device is already disconnnected. */ - mutex_enter(&hidp->hid_mutex); if (HID_IS_INTERNAL_OPEN(minor) && (hidp->hid_dev_state == USB_DEV_DISCONNECTED)) { mutex_exit(&hidp->hid_mutex); @@ -1688,6 +1869,11 @@ hid_cpr_suspend(hid_state_t *hidp) } mutex_exit(&hidp->hid_mutex); + if ((retval == USB_SUCCESS) && hidp->hid_ugen_hdl != NULL) { + retval = usb_ugen_detach(hidp->hid_ugen_hdl, + DDI_SUSPEND); + } + return (retval); } @@ -1699,6 +1885,10 @@ hid_cpr_resume(hid_state_t *hidp) "hid_cpr_resume: dip=0x%p", (void *)hidp->hid_dip); hid_restore_device_state(hidp->hid_dip, hidp); + + if (hidp->hid_ugen_hdl != NULL) { + (void) usb_ugen_attach(hidp->hid_ugen_hdl, DDI_RESUME); + } } @@ -2136,6 +2326,12 @@ hid_detach_cleanup(dev_info_t *dip, hid_state_t *hidp) hidp->hid_pm = NULL; } + if (hidp->hid_ugen_hdl != NULL) { + rval = usb_ugen_detach(hidp->hid_ugen_hdl, DDI_DETACH); + VERIFY0(rval); + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + } + mutex_exit(&hidp->hid_mutex); if (hidp->hid_report_descr != NULL) { diff --git a/usr/src/uts/common/io/usb/usba/genconsole.c b/usr/src/uts/common/io/usb/usba/genconsole.c index 609c1d7456..5e48a2e672 100644 --- a/usr/src/uts/common/io/usb/usba/genconsole.c +++ b/usr/src/uts/common/io/usb/usba/genconsole.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019, Joyent, Inc. */ /* @@ -40,10 +41,8 @@ * layers to initialize any state information. */ int -usb_console_input_init(dev_info_t *dip, - usb_pipe_handle_t pipe_handle, - uchar_t **state_buf, - usb_console_info_t *console_input_info) +usb_console_input_init(dev_info_t *dip, usb_pipe_handle_t pipe_handle, + uchar_t **state_buf, usb_console_info_t *console_input_info) { int ret; usba_device_t *usba_device; @@ -168,10 +167,8 @@ usb_console_input_enter(usb_console_info_t console_input_info) /* * Call the lower layer to save state information. */ - usba_device->usb_hcdi_ops->usba_hcdi_console_input_enter( - usb_console_input); - - return (USB_SUCCESS); + return (usba_device->usb_hcdi_ops->usba_hcdi_console_input_enter( + usb_console_input)); } @@ -235,10 +232,8 @@ usb_console_input_exit(usb_console_info_t console_input_info) /* * Restore the state information. 
*/ - usba_device->usb_hcdi_ops->usba_hcdi_console_input_exit( - usb_console_input); - - return (USB_SUCCESS); + return (usba_device->usb_hcdi_ops->usba_hcdi_console_input_exit( + usb_console_input)); } /* @@ -345,10 +340,8 @@ usb_console_output_enter(usb_console_info_t console_output_info) /* * Call the lower layer to save state information. */ - usb_device->usb_hcdi_ops->usba_hcdi_console_output_enter( - usb_console_output); - - return (USB_SUCCESS); + return (usb_device->usb_hcdi_ops->usba_hcdi_console_output_enter( + usb_console_output)); } /* @@ -358,7 +351,7 @@ usb_console_output_enter(usb_console_info_t console_output_info) */ int usb_console_write(usb_console_info_t console_output_info, - uchar_t *buf, uint_t num_characters, uint_t *num_characters_written) + uchar_t *buf, uint_t num_characters, uint_t *num_characters_written) { usba_device_t *usb_device; usb_console_info_impl_t *usb_console_output; @@ -402,8 +395,6 @@ usb_console_output_exit(usb_console_info_t console_output_info) /* * Restore the state information. */ - usb_device->usb_hcdi_ops->usba_hcdi_console_output_exit( - usb_console_output); - - return (USB_SUCCESS); + return (usb_device->usb_hcdi_ops->usba_hcdi_console_output_exit( + usb_console_output)); } diff --git a/usr/src/uts/common/io/usb/usba/hubdi.c b/usr/src/uts/common/io/usb/usba/hubdi.c index e3f3722de8..99d75edce3 100644 --- a/usr/src/uts/common/io/usb/usba/hubdi.c +++ b/usr/src/uts/common/io/usb/usba/hubdi.c @@ -22,7 +22,7 @@ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019, Joyent, Inc. */ /* @@ -1797,6 +1797,10 @@ usba_hubdi_power(dev_info_t *dip, int comp, int level) retval = hubd_pwrlvl3(hubd); break; + default: + retval = USB_FAILURE; + + break; } mutex_exit(HUBD_MUTEX(hubd)); @@ -2133,11 +2137,11 @@ fail: kmem_free(pathname, MAXPATHLEN); } - mutex_enter(HUBD_MUTEX(hubd)); - hubd_pm_idle_component(hubd, dip, 0); - mutex_exit(HUBD_MUTEX(hubd)); + if (hubd != NULL) { + mutex_enter(HUBD_MUTEX(hubd)); + hubd_pm_idle_component(hubd, dip, 0); + mutex_exit(HUBD_MUTEX(hubd)); - if (hubd) { rval = hubd_cleanup(dip, hubd); if (rval != USB_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_ATTA, hubdi_log_handle, @@ -2180,7 +2184,7 @@ usba_hubdi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) static int hubd_setdevaddr(hubd_t *hubd, usb_port_t port) { - int rval; + int rval = USB_FAILURE; usb_cr_t completion_reason; usb_cb_flags_t cb_flags; usb_pipe_handle_t ph; @@ -2235,8 +2239,8 @@ hubd_setdevaddr(hubd_t *hubd, usb_port_t port) for (retry = 0; retry < hubd_retry_enumerate; retry++) { /* open child's default pipe with USBA_DEFAULT_ADDR */ - if (usb_pipe_open(child_dip, NULL, NULL, - USB_FLAGS_SLEEP | USBA_FLAGS_PRIVILEGED, &ph) != + if ((rval = usb_pipe_open(child_dip, NULL, NULL, + USB_FLAGS_SLEEP | USBA_FLAGS_PRIVILEGED, &ph)) != USB_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_ATTA, hubd->h_log_handle, "hubd_setdevaddr: Unable to open default pipe"); @@ -6071,7 +6075,6 @@ hubd_ready_device(hubd_t *hubd, dev_info_t *child_dip, usba_device_t *child_ud, return (child_dip); } - /* * hubd_create_child * - create child dip @@ -6480,6 +6483,8 @@ hubd_create_child(dev_info_t *dip, goto fail_cleanup; } + /* Read the BOS data */ + usba_get_binary_object_store(child_dip, child_ud); /* get the device string descriptor(s) */ usba_get_dev_string_descrs(child_dip, child_ud); @@ 
-9198,7 +9203,7 @@ usba_hubdi_reset_device(dev_info_t *dip, usb_dev_reset_lvl_t reset_level) usb_port_t port = 0; dev_info_t *hdip; usb_pipe_state_t prev_pipe_state = 0; - usba_device_t *usba_device; + usba_device_t *usba_device = NULL; hubd_reset_arg_t *arg; int i, ph_open_cnt; int rval = USB_FAILURE; @@ -9372,6 +9377,7 @@ usba_hubdi_reset_device(dev_info_t *dip, usb_dev_reset_lvl_t reset_level) == USB_SUCCESS) { mutex_exit(HUBD_MUTEX(hubd)); /* re-open the default pipe */ + ASSERT3P(usba_device, !=, NULL); rval = usba_persistent_pipe_open(usba_device); mutex_enter(HUBD_MUTEX(hubd)); if (rval != USB_SUCCESS) { diff --git a/usr/src/uts/common/io/usb/usba/parser.c b/usr/src/uts/common/io/usb/usba/parser.c index 965113374c..f81bcfdb39 100644 --- a/usr/src/uts/common/io/usb/usba/parser.c +++ b/usr/src/uts/common/io/usb/usba/parser.c @@ -23,6 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2019, Joyent, Inc. */ @@ -45,16 +46,13 @@ extern usba_cfg_pwr_descr_t default_cfg_power; extern usba_if_pwr_descr_t default_if_power; size_t -usb_parse_data(char *format, - uchar_t *data, - size_t datalen, - void *structure, - size_t structlen) +usb_parse_data(char *format, const uchar_t *data, size_t datalen, + void *structure, size_t structlen) { int fmt; int counter = 1; int multiplier = 0; - uchar_t *dataend = data + datalen; + const uchar_t *dataend = data + datalen; char *structstart = (char *)structure; void *structend = (void *)((intptr_t)structstart + structlen); @@ -170,11 +168,8 @@ usb_parse_data(char *format, size_t -usb_parse_CV_descr(char *format, - uchar_t *data, - size_t datalen, - void *structure, - size_t structlen) +usb_parse_CV_descr(char *format, const uchar_t *data, size_t datalen, + void *structure, size_t structlen) { return (usb_parse_data(format, data, datalen, structure, structlen)); @@ -186,16 +181,12 @@ usb_parse_CV_descr(char *format, * type descr_type, unless the end of the buffer or a descriptor * of type stop_descr_type1 or stop_descr_type2 is encountered first. 
*/ -static uchar_t * -usb_nth_descr(uchar_t *buf, - size_t buflen, - int descr_type, - uint_t n, - int stop_descr_type1, - int stop_descr_type2) +static const uchar_t * +usb_nth_descr(const uchar_t *buf, size_t buflen, int descr_type, uint_t n, + int stop_descr_type1, int stop_descr_type2) { - uchar_t *bufstart = buf; - uchar_t *bufend = buf + buflen; + const uchar_t *bufstart = buf; + const uchar_t *bufend = buf + buflen; if (buf == NULL) { @@ -229,10 +220,8 @@ usb_nth_descr(uchar_t *buf, size_t -usb_parse_dev_descr(uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */ - size_t buflen, - usb_dev_descr_t *ret_descr, - size_t ret_buf_len) +usb_parse_dev_descr(const uchar_t *buf, size_t buflen, + usb_dev_descr_t *ret_descr, size_t ret_buf_len) { if ((buf == NULL) || (ret_descr == NULL) || (buflen < 2) || (buf[1] != USB_DESCR_TYPE_DEV)) { @@ -246,10 +235,8 @@ usb_parse_dev_descr(uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */ size_t -usb_parse_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - usb_cfg_descr_t *ret_descr, - size_t ret_buf_len) +usb_parse_cfg_descr(const uchar_t *buf, size_t buflen, + usb_cfg_descr_t *ret_descr, size_t ret_buf_len) { if ((buf == NULL) || (ret_descr == NULL) || (buflen < 2) || (buf[1] != USB_DESCR_TYPE_CFG)) { @@ -263,13 +250,10 @@ usb_parse_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t -usba_parse_cfg_pwr_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - usba_cfg_pwr_descr_t *ret_descr, - size_t ret_buf_len) +usba_parse_cfg_pwr_descr(const uchar_t *buf, size_t buflen, + usba_cfg_pwr_descr_t *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL)) { @@ -298,13 +282,10 @@ usba_parse_cfg_pwr_descr( size_t -usb_parse_ia_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - size_t first_if, - usb_ia_descr_t *ret_descr, - size_t ret_buf_len) +usb_parse_ia_descr(const uchar_t *buf, size_t buflen, size_t first_if, + usb_ia_descr_t *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL)) { @@ -332,14 +313,10 @@ usb_parse_ia_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t -usb_parse_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - uint_t if_number, - uint_t alt_if_setting, - usb_if_descr_t *ret_descr, - size_t ret_buf_len) +usb_parse_if_descr(const uchar_t *buf, size_t buflen, uint_t if_number, + uint_t alt_if_setting, usb_if_descr_t *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL)) { @@ -367,14 +344,10 @@ usb_parse_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ } size_t -usba_parse_if_pwr_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - uint_t if_number, - uint_t alt_if_setting, - usba_if_pwr_descr_t *ret_descr, - size_t ret_buf_len) +usba_parse_if_pwr_descr(const uchar_t *buf, size_t buflen, uint_t if_number, + uint_t alt_if_setting, usba_if_pwr_descr_t *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL)) { @@ -422,15 +395,11 @@ usba_parse_if_pwr_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ * the first endpoint */ size_t -usb_parse_ep_descr(uchar_t *buf, /* from 
GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - uint_t if_number, - uint_t alt_if_setting, - uint_t ep_index, - usb_ep_descr_t *ret_descr, - size_t ret_buf_len) +usb_parse_ep_descr(const uchar_t *buf, size_t buflen, uint_t if_number, + uint_t alt_if_setting, uint_t ep_index, usb_ep_descr_t *ret_descr, + size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL)) { @@ -473,14 +442,12 @@ usb_parse_ep_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ */ /*ARGSUSED*/ size_t -usba_ascii_string_descr(uchar_t *buf, /* from GET_DESCRIPTOR(STRING) */ - size_t buflen, - char *ret_descr, - size_t ret_buf_len) +usba_ascii_string_descr(const uchar_t *buf, size_t buflen, char *ret_descr, + size_t ret_buf_len) { - int i = 1; - char *retstart = ret_descr; - uchar_t *bufend = buf + buflen; + int i = 1; + char *retstart = ret_descr; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL) || (ret_buf_len == 0) || (buflen < 2) || @@ -501,15 +468,10 @@ usba_ascii_string_descr(uchar_t *buf, /* from GET_DESCRIPTOR(STRING) */ size_t -usb_parse_CV_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - char *fmt, - uint_t descr_type, - uint_t descr_index, - void *ret_descr, - size_t ret_buf_len) +usb_parse_CV_cfg_descr(const uchar_t *buf, size_t buflen, char *fmt, + uint_t descr_type, uint_t descr_index, void *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL) || (buflen < 2) || ((buf = usb_nth_descr(buf, buflen, descr_type, @@ -525,17 +487,11 @@ usb_parse_CV_cfg_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t -usb_parse_CV_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - char *fmt, - uint_t if_number, - uint_t alt_if_setting, - uint_t descr_type, - uint_t descr_index, - void *ret_descr, - size_t ret_buf_len) +usb_parse_CV_if_descr(const uchar_t *buf, size_t buflen, char *fmt, + uint_t if_number, uint_t alt_if_setting, uint_t descr_type, + uint_t descr_index, void *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL)) { @@ -570,18 +526,11 @@ usb_parse_CV_if_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t -usb_parse_CV_ep_descr(uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ - size_t buflen, - char *fmt, - uint_t if_number, - uint_t alt_if_setting, - uint_t ep_index, - uint_t descr_type, - uint_t descr_index, - void *ret_descr, - size_t ret_buf_len) +usb_parse_CV_ep_descr(const uchar_t *buf, size_t buflen, char *fmt, + uint_t if_number, uint_t alt_if_setting, uint_t ep_index, uint_t descr_type, + uint_t descr_index, void *ret_descr, size_t ret_buf_len) { - uchar_t *bufend = buf + buflen; + const uchar_t *bufend = buf + buflen; if ((buf == NULL) || (ret_descr == NULL) || (fmt == NULL)) { diff --git a/usr/src/uts/common/io/usb/usba/usba.c b/usr/src/uts/common/io/usb/usba/usba.c index 7cc68e79df..6a37f8430a 100644 --- a/usr/src/uts/common/io/usb/usba/usba.c +++ b/usr/src/uts/common/io/usb/usba/usba.c @@ -24,6 +24,7 @@ * * Copyright 2014 Garrett D'Amore <garrett@damore.org> * Copyright 2016 James S. Blachly, MD <james.blachly@gmail.com> + * Copyright 2019 Joyent, Inc. 
*/ @@ -776,6 +777,8 @@ usba_free_usba_device(usba_device_t *usba_device) strlen(usba_device->usb_serialno_str) + 1); } + usba_free_binary_object_store(usba_device); + usba_unset_usb_address(usba_device); } @@ -2262,6 +2265,17 @@ usba_ready_device_node(dev_info_t *child_dip) } } + if (usba_device->usb_port_status == USBA_FULL_SPEED_DEV) { + /* create boolean property */ + rval = ndi_prop_create_boolean(DDI_DEV_T_NONE, child_dip, + "full-speed"); + if (rval != DDI_PROP_SUCCESS) { + USB_DPRINTF_L2(DPRINT_MASK_USBA, usba_log_handle, + "usba_ready_device_node: " + "full speed prop update failed"); + } + } + if (usba_device->usb_port_status == USBA_HIGH_SPEED_DEV) { /* create boolean property */ rval = ndi_prop_create_boolean(DDI_DEV_T_NONE, child_dip, @@ -2283,6 +2297,8 @@ usba_ready_device_node(dev_info_t *child_dip) } } + usba_add_binary_object_store_props(child_dip, usba_device); + USB_DPRINTF_L4(DPRINT_MASK_USBA, usba_log_handle, "%s%d at port %d: %s, dip=0x%p", ddi_node_name(ddi_get_parent(child_dip)), @@ -2906,7 +2922,6 @@ usba_get_mfg_prod_sn_str( return (buffer); } - /* * USB enumeration statistic functions */ diff --git a/usr/src/uts/common/io/usb/usba/usba10_calls.c b/usr/src/uts/common/io/usb/usba/usba10_calls.c index 2bdcfd11c4..9fe39418e8 100644 --- a/usr/src/uts/common/io/usb/usba/usba10_calls.c +++ b/usr/src/uts/common/io/usb/usba/usba10_calls.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019, Joyent, Inc. */ @@ -80,7 +81,7 @@ usba10_usb_free_descr_tree( size_t usba10_usb_parse_data( char *format, - uchar_t *data, + const uchar_t *data, size_t datalen, void *structure, size_t structlen) diff --git a/usr/src/uts/common/io/usb/usba/usba_bos.c b/usr/src/uts/common/io/usb/usba/usba_bos.c new file mode 100644 index 0000000000..df8bd00680 --- /dev/null +++ b/usr/src/uts/common/io/usb/usba/usba_bos.c @@ -0,0 +1,420 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +/* + * Routines to access, parse, and manage the USB Binary Object Store + */ + +#define USBA_FRAMEWORK +#include <sys/usb/usba/usba_impl.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> + +static size_t +usba_bos_parse_bos_descr(const uchar_t *buf, size_t buflen, + usb_bos_descr_t *bosp, size_t rlen) +{ + if (buf == NULL || bosp == NULL || buflen < USB_BOS_PACKED_SIZE || + buf[1] != USB_DESCR_TYPE_BOS) { + return (USB_PARSE_ERROR); + } + + return (usb_parse_data("ccsc", buf, buflen, bosp, rlen)); +} + +static boolean_t +usba_bos_parse_usb2ext(const uchar_t *buf, size_t buflen, usb_bos_t *bosp) +{ + size_t len; + + if (buflen != USB_BOS_USB2EXT_PACKED_SIZE) { + return (B_FALSE); + } + + len = usb_parse_data("cccl", buf, buflen, &bosp->ubos_caps.ubos_usb2, + sizeof (usb_bos_usb2ext_t)); + return (len == sizeof (usb_bos_usb2ext_t)); +} + +static boolean_t +usba_bos_parse_superspeed(const uchar_t *buf, size_t buflen, usb_bos_t *bosp) +{ + size_t len; + + if (buflen != USB_BOS_SSUSB_PACKED_SIZE) { + return (B_FALSE); + } + + len = usb_parse_data("ccccsccs", buf, buflen, + &bosp->ubos_caps.ubos_ssusb, sizeof (usb_bos_ssusb_t)); + return (len == sizeof (usb_bos_ssusb_t)); +} + +static boolean_t +usba_bos_parse_container(const uchar_t *buf, size_t buflen, usb_bos_t *bosp) +{ + size_t len; + + if (buflen != USB_BOS_CONTAINER_PACKED_SIZE) { + return (B_FALSE); + } + + len = usb_parse_data("cccc16c", buf, buflen, + &bosp->ubos_caps.ubos_container, sizeof (usb_bos_container_t)); + return (len == sizeof (usb_bos_container_t)); +} + +static boolean_t +usba_bos_parse_precision_time(const uchar_t *buf, size_t buflen, + usb_bos_t *bosp) +{ + size_t len; + + if (buflen != USB_BOS_PRECISION_TIME_PACKED_SIZE) { + return (B_FALSE); + } + + len = usb_parse_data("ccc", buf, buflen, &bosp->ubos_caps.ubos_time, + sizeof (usb_bos_precision_time_t)); + /* + * The actual size of this structure will usually be rounded up to four + * bytes by the compiler, therefore we need to compare against the + * packed size. + */ + return (len == USB_BOS_PRECISION_TIME_PACKED_SIZE); +} + +/* + * Validate that the BOS looks reasonable. This means the following: + * + * - We read the whole length of the descriptor + * - The total number of capabilities doesn't exceed the expected value + * - The length of each device capabilities fits within our expected range + * + * After we finish that up, go through and save all of the valid BOS + * descriptors, unpacking the ones that we actually understand. 
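 *
 * As a concrete (hypothetical) example of the layout this validates: a device
 * exposing a USB 2.0 Extension capability (7 bytes) and a SuperSpeed
 * capability (10 bytes) would return a BOS shaped roughly like:
 *
 *	offset  0: bLength = 5, bDescriptorType = BOS, wTotalLength = 22,
 *	           bNumDeviceCaps = 2
 *	offset  5: bLength = 7, bDescriptorType = DEVICE CAPABILITY,
 *	           bDevCapabilityType = USB 2.0 EXTENSION, ...
 *	offset 12: bLength = 10, bDescriptorType = DEVICE CAPABILITY,
 *	           bDevCapabilityType = SUPERSPEED USB, ...
 *
 * The first pass below steps from offset 5 to offset 12 by bLength, counts
 * two capabilities, and compares that count against bNumDeviceCaps; the
 * second pass then unpacks each capability type that we recognize.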
+ */ +static boolean_t +usba_bos_save(usba_device_t *ud, const mblk_t *mp, usb_bos_descr_t *bdesc) +{ + size_t len = MBLKL(mp); + const uchar_t *buf = mp->b_rptr; + uint_t ncaps, nalloc; + usb_bos_t *bos; + + if (bdesc->bLength != USB_BOS_PACKED_SIZE || + bdesc->bNumDeviceCaps == 0 || len < USB_BOS_PACKED_SIZE || + len < bdesc->wTotalLength) { + return (B_FALSE); + } + + len = MIN(len, bdesc->wTotalLength); + buf += USB_BOS_PACKED_SIZE; + len -= USB_BOS_PACKED_SIZE; + + if (len < USB_DEV_CAP_PACKED_SIZE) { + return (B_FALSE); + } + + ncaps = 0; + while (len > 0) { + usb_dev_cap_descr_t dev; + + if (usb_parse_data("ccc", buf, len, &dev, sizeof (dev)) != + USB_DEV_CAP_PACKED_SIZE) { + return (B_FALSE); + } + + if (dev.bDescriptorType != USB_DESCR_TYPE_DEV_CAPABILITY || + dev.bLength > len) { + return (B_FALSE); + } + + ncaps++; + len -= dev.bLength; + buf += dev.bLength; + } + + if (ncaps != bdesc->bNumDeviceCaps) { + return (B_FALSE); + } + + nalloc = ncaps; + bos = kmem_zalloc(sizeof (usb_bos_t) * nalloc, KM_SLEEP); + buf = mp->b_rptr + USB_BOS_PACKED_SIZE; + len = MIN(MBLKL(mp), bdesc->wTotalLength) - USB_BOS_PACKED_SIZE; + ncaps = 0; + while (len > 0) { + usb_dev_cap_descr_t dev; + boolean_t valid; + + if (usb_parse_data("ccc", buf, len, &dev, sizeof (dev)) != + USB_DEV_CAP_PACKED_SIZE) { + goto fail; + } + + bos[ncaps].ubos_length = dev.bLength; + bos[ncaps].ubos_type = dev.bDevCapabilityType; + + valid = B_FALSE; + switch (dev.bDevCapabilityType) { + case USB_BOS_TYPE_USB2_EXT: + valid = usba_bos_parse_usb2ext(buf, dev.bLength, + &bos[ncaps]); + break; + case USB_BOS_TYPE_SUPERSPEED: + valid = usba_bos_parse_superspeed(buf, dev.bLength, + &bos[ncaps]); + break; + case USB_BOS_TYPE_CONTAINER: + valid = usba_bos_parse_container(buf, dev.bLength, + &bos[ncaps]); + break; + case USB_BOS_TYPE_PRECISION_TIME: + valid = usba_bos_parse_precision_time(buf, dev.bLength, + &bos[ncaps]); + break; + default: + /* + * Override the type to one that we know isn't used to + * indicate that the caller can't rely on the type + * that's present here. + */ + bos[ncaps].ubos_type = USB_BOS_TYPE_INVALID; + bcopy(buf, bos[ncaps].ubos_caps.ubos_raw, dev.bLength); + valid = B_TRUE; + break; + } + + if (valid) { + ncaps++; + } else { + bos[ncaps].ubos_length = 0; + bos[ncaps].ubos_type = USB_BOS_TYPE_INVALID; + bzero(bos[ncaps].ubos_caps.ubos_raw, + sizeof (bos[ncaps].ubos_caps.ubos_raw)); + } + len -= dev.bLength; + buf += dev.bLength; + } + + ud->usb_bos_nalloc = nalloc; + ud->usb_bos_nents = ncaps; + ud->usb_bos = bos; + + return (B_TRUE); + +fail: + kmem_free(bos, sizeof (usb_bos_t) * nalloc); + return (B_FALSE); +} + +/* + * Read the Binary Object Store (BOS) data from the device and attempt to parse + * it. Do not fail to attach the device if we cannot get all of the information + * at this time. While certain aspects of the BOS are required for Windows, + * which suggests that we could actually rely on it, we haven't historically. + */ +void +usba_get_binary_object_store(dev_info_t *dip, usba_device_t *ud) +{ + int rval; + mblk_t *mp = NULL; + usb_cr_t completion_reason; + usb_cb_flags_t cb_flags; + usb_pipe_handle_t ph; + size_t size; + usb_bos_descr_t bos; + + /* + * The BOS is only supported on USB 3.x devices. Therefore if the bcdUSB + * is greater than USB 2.0, we can check this. Note, USB 3.x devices + * that are linked on a USB device will report version 2.1 in the bcdUSB + * field. 
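 *
 * For example, a device reporting bcdUSB 0x0300 (USB 3.0), or 0x0210 (USB
 * 2.1, per the note above), proceeds past the check below, while one
 * reporting 0x0200 (USB 2.0) or less causes us to return without fetching a
 * BOS.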
+ */ + if (ud->usb_dev_descr->bcdUSB <= 0x200) { + return; + } + + ph = usba_get_dflt_pipe_handle(dip); + + /* + * First get just the BOS descriptor itself. + */ + rval = usb_pipe_sync_ctrl_xfer(dip, ph, + USB_DEV_REQ_DEV_TO_HOST | USB_DEV_REQ_TYPE_STANDARD, + USB_REQ_GET_DESCR, /* bRequest */ + (USB_DESCR_TYPE_BOS << 8), /* wValue */ + 0, /* wIndex */ + USB_BOS_PACKED_SIZE, /* wLength */ + &mp, USB_ATTRS_SHORT_XFER_OK, + &completion_reason, &cb_flags, 0); + + if (rval != USB_SUCCESS) { + return; + } + + size = usba_bos_parse_bos_descr(mp->b_rptr, MBLKL(mp), &bos, + sizeof (bos)); + freemsg(mp); + mp = NULL; + if (size < USB_BOS_PACKED_SIZE) { + return; + } + + /* + * Check to see if there are any capabilities and if it's worth getting + * the whole BOS. + */ + if (bos.bLength != USB_BOS_PACKED_SIZE || bos.bNumDeviceCaps == 0) { + return; + } + + rval = usb_pipe_sync_ctrl_xfer(dip, ph, + USB_DEV_REQ_DEV_TO_HOST | USB_DEV_REQ_TYPE_STANDARD, + USB_REQ_GET_DESCR, /* bRequest */ + (USB_DESCR_TYPE_BOS << 8), /* wValue */ + 0, /* wIndex */ + bos.wTotalLength, /* wLength */ + &mp, USB_ATTRS_SHORT_XFER_OK, + &completion_reason, &cb_flags, 0); + + if (rval != USB_SUCCESS) { + return; + } + + size = usba_bos_parse_bos_descr(mp->b_rptr, MBLKL(mp), &bos, + sizeof (bos)); + if (size < USB_BOS_PACKED_SIZE) { + freemsg(mp); + return; + } + + if (!usba_bos_save(ud, mp, &bos)) { + freemsg(mp); + return; + } + + ud->usb_bos_mp = mp; +} + +static void +usba_add_superspeed_props(dev_info_t *dip, usb_bos_ssusb_t *ssusb) +{ + char *supported[4]; + uint_t nsup = 0; + char *min; + + if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_LOW) { + supported[nsup++] = "low-speed"; + } + + if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_FULL) { + supported[nsup++] = "full-speed"; + } + + if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_HIGH) { + supported[nsup++] = "high-speed"; + } + + if (ssusb->wSpeedsSupported & USB_BOS_SSUSB_SPEED_SUPER) { + supported[nsup++] = "super-speed"; + } + + if (nsup != 0 && ndi_prop_update_string_array(DDI_DEV_T_NONE, dip, + "usb-supported-speeds", supported, nsup) != DDI_PROP_SUCCESS) { + USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add " + "usb-supported-speeds property"); + } + + switch (ssusb->bFunctionalitySupport) { + case 0: + min = "low-speed"; + break; + case 1: + min = "full-speed"; + break; + case 2: + min = "high-speed"; + break; + case 3: + min = "super-speed"; + break; + default: + min = NULL; + } + + if (min != NULL && ndi_prop_update_string(DDI_DEV_T_NONE, dip, + "usb-minimum-speed", min) != DDI_PROP_SUCCESS) { + USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add " + "usb-minimum-speed property"); + } +} + +static void +usba_add_container_props(dev_info_t *dip, usb_bos_container_t *cp) +{ + if (ndi_prop_update_byte_array(DDI_DEV_T_NONE, dip, "usb-container-id", + cp->ContainerId, sizeof (cp->ContainerId)) != DDI_PROP_SUCCESS) { + USB_DPRINTF_L2(DPRINT_MASK_USBA, NULL, "failed to add " + "usb-container-id property"); + } +} + +void +usba_add_binary_object_store_props(dev_info_t *dip, usba_device_t *ud) +{ + uint_t i; + + if (ud->usb_bos == NULL) { + return; + } + + for (i = 0; i < ud->usb_bos_nents; i++) { + usb_bos_t *bos = &ud->usb_bos[i]; + + switch (bos->ubos_type) { + case USB_BOS_TYPE_SUPERSPEED: + usba_add_superspeed_props(dip, + &bos->ubos_caps.ubos_ssusb); + break; + case USB_BOS_TYPE_CONTAINER: + usba_add_container_props(dip, + &bos->ubos_caps.ubos_container); + break; + default: + /* + * This is a capability that we're not going to add + * devinfo 
properties to describe. + */ + continue; + } + } +} + +void +usba_free_binary_object_store(usba_device_t *ud) +{ + if (ud->usb_bos_mp != NULL) { + freemsg(ud->usb_bos_mp); + ud->usb_bos_mp = NULL; + } + + if (ud->usb_bos != NULL) { + kmem_free(ud->usb_bos, sizeof (usb_bos_t) * ud->usb_bos_nalloc); + ud->usb_bos = NULL; + ud->usb_bos_nalloc = ud->usb_bos_nents = 0; + } +} diff --git a/usr/src/uts/common/io/usb/usba/usba_devdb.c b/usr/src/uts/common/io/usb/usba/usba_devdb.c index 4fd1748bf0..e3d14f90c6 100644 --- a/usr/src/uts/common/io/usb/usba/usba_devdb.c +++ b/usr/src/uts/common/io/usb/usba/usba_devdb.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019, Joyent, Inc. */ @@ -140,13 +141,13 @@ usba_devdb_get_conf_rec(struct _buf *file, usba_configrec_t **rec) token_t token; char tokval[MAXPATHLEN]; usba_configrec_t *cfgrec; - config_field_t cfgvar; + config_field_t cfgvar = USB_NONE; u_longlong_t llptr; u_longlong_t value; enum { USB_NEWVAR, USB_CONFIG_VAR, USB_VAR_EQUAL, USB_VAR_VALUE, USB_ERROR - } parse_state = USB_NEWVAR; + } parse_state = USB_NEWVAR; cfgrec = (usba_configrec_t *)kmem_zalloc( sizeof (usba_configrec_t), KM_SLEEP); diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c index 745497f590..bcc658a001 100644 --- a/usr/src/uts/common/io/usb/usba/usba_ugen.c +++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -409,11 +409,9 @@ usb_ugen_attach(usb_ugen_hdl_t usb_ugen_hdl, ddi_attach_cmd_t cmd) return (DDI_SUCCESS); fail: - if (ugenp) { - USB_DPRINTF_L2(UGEN_PRINT_ATTA, ugenp->ug_log_hdl, - "attach fail"); - (void) ugen_cleanup(ugenp); - } + USB_DPRINTF_L2(UGEN_PRINT_ATTA, ugenp->ug_log_hdl, + "attach fail"); + (void) ugen_cleanup(ugenp); return (DDI_FAILURE); } @@ -2545,7 +2543,7 @@ ugen_epx_ctrl_req(ugen_state_t *ugenp, ugen_ep_t *epp, goto fail; } -done: + *wait = B_TRUE; return (USB_SUCCESS); diff --git a/usr/src/uts/common/io/usb/usba/usbai.c b/usr/src/uts/common/io/usb/usba/usbai.c index 1ff8507ff1..f6ac391bd8 100644 --- a/usr/src/uts/common/io/usb/usba/usbai.c +++ b/usr/src/uts/common/io/usb/usba/usbai.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019, Joyent, Inc. 
*/ @@ -115,8 +116,8 @@ usba_usbai_destroy() */ usb_log_handle_t usb_alloc_log_hdl(dev_info_t *dip, char *name, - uint_t *errlevel, uint_t *mask, uint_t *instance_filter, - usb_flags_t flags) + uint_t *errlevel, uint_t *mask, uint_t *instance_filter, + usb_flags_t flags) { usba_log_handle_impl_t *hdl; @@ -147,8 +148,8 @@ usb_alloc_log_hdl(dev_info_t *dip, char *name, /*ARGSUSED*/ usb_log_handle_t usb_alloc_log_handle(dev_info_t *dip, char *name, - uint_t *errlevel, uint_t *mask, uint_t *instance_filter, - uint_t reserved, usb_flags_t flags) + uint_t *errlevel, uint_t *mask, uint_t *instance_filter, + uint_t reserved, usb_flags_t flags) { return (usb_alloc_log_hdl(dip, name, errlevel, mask, instance_filter, flags)); @@ -215,7 +216,7 @@ static void usb_vprintf(dev_info_t *dip, int level, char *label, char *fmt, va_list ap) { size_t len; - int instance; + int instance = 0; char driver_name[USBA_DRVNAME_LEN]; char *msg_ptr; @@ -383,7 +384,7 @@ usb_vprintf(dev_info_t *dip, int level, char *label, char *fmt, va_list ap) int usba_vlog(usb_log_handle_t, uint_t, uint_t, char *, va_list) - __KVPRINTFLIKE(4); + __KVPRINTFLIKE(4); /* When usba10_calls.c goes away, this function can be made static again. */ int @@ -579,7 +580,7 @@ usba_async_req_raise_power(void *arg) /* usb function to perform async pm_request_power_change */ int usb_req_raise_power(dev_info_t *dip, int comp, int level, - void (*callback)(void *, int), void *arg, usb_flags_t flags) + void (*callback)(void *, int), void *arg, usb_flags_t flags) { usba_pm_req_t *pmrq; @@ -633,7 +634,7 @@ usba_async_req_lower_power(void *arg) /* usb function to perform async pm_request_power_change */ int usb_req_lower_power(dev_info_t *dip, int comp, int level, - void (*callback)(void *, int), void *arg, usb_flags_t flags) + void (*callback)(void *, int), void *arg, usb_flags_t flags) { usba_pm_req_t *pmrq; @@ -1100,7 +1101,7 @@ usb_unregister_hotplug_cbs(dev_info_t *dip) /*ARGSUSED*/ int usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, - usb_flags_t flags) + usb_flags_t flags) { usba_device_t *usba_device; usba_evdata_t *evdata; diff --git a/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c b/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c index 296fcab878..455774b1b4 100644 --- a/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c +++ b/usr/src/uts/common/io/usb/usba/usbai_pipe_mgmt.c @@ -23,7 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ @@ -540,7 +540,7 @@ usba_init_pipe_handle(dev_info_t *dip, } /* fix up the MaxPacketSize if it is the default endpoint descr */ - if ((ep == &usba_default_ep_descr) && usba_device) { + if (ep == &usba_default_ep_descr) { uint16_t maxpktsize; maxpktsize = usba_device->usb_dev_descr->bMaxPacketSize0; diff --git a/usr/src/uts/common/io/usb/usba/usbai_register.c b/usr/src/uts/common/io/usb/usba/usbai_register.c index 6d22a188df..8b75a7619b 100644 --- a/usr/src/uts/common/io/usb/usba/usbai_register.c +++ b/usr/src/uts/common/io/usb/usba/usbai_register.c @@ -23,7 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019, Joyent, Inc. 
*/ /* @@ -1313,7 +1313,7 @@ usba_make_alts_sparse(usb_alt_if_data_t **array, uint_t *n_elements) uint8_t largest_value; uint8_t curr_value; uint_t in_order = 0; - usb_alt_if_data_t *orig_addr = *array; /* Non-sparse array base ptr */ + usb_alt_if_data_t *orig_addr; /* Non-sparse array base ptr */ usb_alt_if_data_t *repl_array; /* Base ptr to sparse array */ uint_t n_repl_elements; /* Number elements in the new array */ uint_t i; @@ -1328,6 +1328,7 @@ usba_make_alts_sparse(usb_alt_if_data_t **array, uint_t *n_elements) "make_sparse: array=0x%p, n_orig_elements=%d", (void *)array, n_orig_elements); + orig_addr = *array; curr_value = orig_addr[0].altif_descr.bAlternateSetting; smallest_value = largest_value = curr_value; @@ -1635,7 +1636,7 @@ usba_dump_descr_tree(dev_info_t *dip, usb_client_dev_data_t *usb_reg, usb_cfg_descr_t *config_descr; /* and its USB descriptor. */ char *string; char *name_string = NULL; - int name_string_size; + int name_string_size = 0; if ((usb_reg == NULL) || ((log_handle == NULL) && (dip == NULL))) { diff --git a/usr/src/uts/common/io/usb/usba/usbai_req.c b/usr/src/uts/common/io/usb/usba/usbai_req.c index 4792d32efb..3a99185225 100644 --- a/usr/src/uts/common/io/usb/usba/usbai_req.c +++ b/usr/src/uts/common/io/usb/usba/usbai_req.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ @@ -113,7 +114,7 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req, mblk_t *data; usb_cr_t *cr; usb_req_attrs_t attrs; - usb_opaque_t cb, exc_cb; + usb_opaque_t cb = NULL, exc_cb = NULL; uint_t timeout = 0; uchar_t direction = ph_data->p_ep.bEndpointAddress & USB_EP_DIR_MASK; @@ -144,6 +145,8 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req, case USB_EP_ATTR_ISOCH: cr = &isoc_req->isoc_completion_reason; break; + default: + return (USB_INVALID_REQUEST); } *cr = USB_CR_UNSPECIFIED_ERR; @@ -220,6 +223,8 @@ _usba_check_req(usba_pipe_handle_data_t *ph_data, usb_opaque_t req, cb = (usb_opaque_t)isoc_req->isoc_cb; exc_cb = (usb_opaque_t)isoc_req->isoc_exc_cb; break; + default: + return (USB_INVALID_REQUEST); } USB_DPRINTF_L4(DPRINT_MASK_USBAI, usbai_log_handle, diff --git a/usr/src/uts/common/io/usb/usba/usbai_util.c b/usr/src/uts/common/io/usb/usba/usbai_util.c index dd942e35f2..58fbd472ae 100644 --- a/usr/src/uts/common/io/usb/usba/usbai_util.c +++ b/usr/src/uts/common/io/usb/usba/usbai_util.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
*/ @@ -301,7 +302,7 @@ usb_get_string_descr(dev_info_t *dip, usba_get_dflt_pipe_handle(dip), USB_DEV_REQ_DEV_TO_HOST, USB_REQ_GET_DESCR, - USB_DESCR_TYPE_STRING << 8 | index & 0xff, + (USB_DESCR_TYPE_STRING << 8) | (index & 0xff), langid, 4, &data, USB_ATTRS_SHORT_XFER_OK, @@ -345,7 +346,7 @@ usb_get_string_descr(dev_info_t *dip, usba_get_dflt_pipe_handle(dip), USB_DEV_REQ_DEV_TO_HOST, USB_REQ_GET_DESCR, - USB_DESCR_TYPE_STRING << 8 | index & 0xff, + (USB_DESCR_TYPE_STRING << 8) | (index & 0xff), langid, length, &data, USB_ATTRS_SHORT_XFER_OK, @@ -2009,7 +2010,7 @@ usb_serialize_access( usb_serialization_t tokenp, uint_t how_to_wait, uint_t delta_timeout) { int rval = 1; /* Must be initialized > 0 */ - clock_t abs_timeout; + clock_t abs_timeout = 0; usba_serialization_impl_t *impl_tokenp; impl_tokenp = (usba_serialization_impl_t *)tokenp; diff --git a/usr/src/uts/common/io/usb/usba10/usba10.c b/usr/src/uts/common/io/usb/usba10/usba10.c index 9c8b0bed49..0c8e4af630 100644 --- a/usr/src/uts/common/io/usb/usba10/usba10.c +++ b/usr/src/uts/common/io/usb/usba10/usba10.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019, Joyent, Inc. */ @@ -107,7 +108,7 @@ usb_free_descr_tree( size_t usb_parse_data( char *format, - uchar_t *data, + const uchar_t *data, size_t datalen, void *structure, size_t structlen) diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c index d5dd1e8e39..ec6684f040 100644 --- a/usr/src/uts/common/io/vioif/vioif.c +++ b/usr/src/uts/common/io/vioif/vioif.c @@ -12,7 +12,7 @@ /* * Copyright 2013 Nexenta Inc. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* Based on the NetBSD virtio driver by Minoura Makoto. */ @@ -60,7 +60,6 @@ #include <sys/dlpi.h> #include <sys/taskq.h> -#include <sys/cyclic.h> #include <sys/pattr.h> #include <sys/strsun.h> @@ -216,6 +215,9 @@ static struct modlinkage modlinkage = { }, }; +/* Interval for the periodic TX reclaim */ +uint_t vioif_reclaim_ms = 200; + ddi_device_acc_attr_t vioif_attr = { DDI_DEVICE_ATTR_V0, DDI_NEVERSWAP_ACC, /* virtio is always native byte order */ @@ -278,7 +280,11 @@ struct vioif_softc { struct virtqueue *sc_tx_vq; struct virtqueue *sc_ctrl_vq; - unsigned int sc_tx_stopped:1; + /* TX virtqueue management resources */ + kmutex_t sc_tx_lock; + boolean_t sc_tx_corked; + boolean_t sc_tx_drain; + timeout_id_t sc_tx_reclaim_tid; /* Feature bits. */ unsigned int sc_rx_csum:1; @@ -406,6 +412,8 @@ static char *vioif_priv_props[] = { NULL }; +static void vioif_reclaim_restart(struct vioif_softc *); + /* Add up to ddi? 
*/ static ddi_dma_cookie_t * vioif_dma_curr_cookie(ddi_dma_handle_t dmah) @@ -707,27 +715,26 @@ exit_txalloc: } /* ARGSUSED */ -int +static int vioif_multicst(void *arg, boolean_t add, const uint8_t *macaddr) { return (DDI_SUCCESS); } /* ARGSUSED */ -int +static int vioif_promisc(void *arg, boolean_t on) { return (DDI_SUCCESS); } /* ARGSUSED */ -int +static int vioif_unicst(void *arg, const uint8_t *macaddr) { return (DDI_FAILURE); } - static uint_t vioif_add_rx(struct vioif_softc *sc, int kmflag) { @@ -902,23 +909,25 @@ static uint_t vioif_reclaim_used_tx(struct vioif_softc *sc) { struct vq_entry *ve; - struct vioif_tx_buf *buf; uint32_t len; - mblk_t *mp; uint_t num_reclaimed = 0; while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) { + struct vioif_tx_buf *buf; + mblk_t *mp; + /* We don't chain descriptors for tx, so don't expect any. */ - ASSERT(!ve->qe_next); + ASSERT(ve->qe_next == NULL); buf = &sc->sc_txbufs[ve->qe_index]; mp = buf->tb_mp; buf->tb_mp = NULL; if (mp != NULL) { - for (int i = 0; i < buf->tb_external_num; i++) + for (uint_t i = 0; i < buf->tb_external_num; i++) { (void) ddi_dma_unbind_handle( buf->tb_external_mapping[i].vbm_dmah); + } } virtio_free_chain(ve); @@ -929,14 +938,107 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) num_reclaimed++; } - if (sc->sc_tx_stopped && num_reclaimed > 0) { - sc->sc_tx_stopped = 0; - mac_tx_update(sc->sc_mac_handle); + /* Return ring to transmitting state if descriptors were reclaimed. */ + if (num_reclaimed > 0) { + boolean_t do_update = B_FALSE; + + mutex_enter(&sc->sc_tx_lock); + if (sc->sc_tx_corked) { + /* + * TX was corked on a lack of available descriptors. + * That dire state has passed so the TX interrupt can + * be disabled and MAC can be notified that + * transmission is possible again. + */ + sc->sc_tx_corked = B_FALSE; + virtio_stop_vq_intr(sc->sc_tx_vq); + do_update = B_TRUE; + } + mutex_exit(&sc->sc_tx_lock); + + /* Notify MAC outside the above lock */ + if (do_update) { + mac_tx_update(sc->sc_mac_handle); + } } return (num_reclaimed); } +static void +vioif_reclaim_periodic(void *arg) +{ + struct vioif_softc *sc = arg; + uint_t num_reclaimed; + + num_reclaimed = vioif_reclaim_used_tx(sc); + + mutex_enter(&sc->sc_tx_lock); + sc->sc_tx_reclaim_tid = 0; + /* + * If used descriptors were reclaimed or TX descriptors appear to be + * outstanding, the ring is considered active and periodic reclamation + * is necessary for now. + */ + if (num_reclaimed != 0 || vq_num_used(sc->sc_tx_vq) != 0) { + /* Do not reschedule if the ring is being drained. */ + if (!sc->sc_tx_drain) { + vioif_reclaim_restart(sc); + } + } + mutex_exit(&sc->sc_tx_lock); +} + +static void +vioif_reclaim_restart(struct vioif_softc *sc) +{ + ASSERT(MUTEX_HELD(&sc->sc_tx_lock)); + ASSERT(!sc->sc_tx_drain); + + if (sc->sc_tx_reclaim_tid == 0) { + sc->sc_tx_reclaim_tid = timeout(vioif_reclaim_periodic, sc, + MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms)); + } +} + +static void +vioif_tx_drain(struct vioif_softc *sc) +{ + mutex_enter(&sc->sc_tx_lock); + sc->sc_tx_drain = B_TRUE; + /* Put a stop to the periodic reclaim if it is running */ + if (sc->sc_tx_reclaim_tid != 0) { + timeout_id_t tid = sc->sc_tx_reclaim_tid; + + /* + * With sc_tx_drain set, there is no risk that a racing + * vioif_reclaim_periodic() call will reschedule itself. + * + * Being part of the mc_stop hook also guarantees that + * vioif_tx() will not be called to restart it. 
+ */ + sc->sc_tx_reclaim_tid = 0; + mutex_exit(&sc->sc_tx_lock); + (void) untimeout(tid); + mutex_enter(&sc->sc_tx_lock); + } + virtio_stop_vq_intr(sc->sc_tx_vq); + mutex_exit(&sc->sc_tx_lock); + + /* + * Wait for all of the TX descriptors to be processed by the host so + * they can be reclaimed. + */ + while (vq_num_used(sc->sc_tx_vq) != 0) { + (void) vioif_reclaim_used_tx(sc); + delay(5); + } + + VERIFY(!sc->sc_tx_corked); + VERIFY3U(sc->sc_tx_reclaim_tid, ==, 0); + VERIFY3U(vq_num_used(sc->sc_tx_vq), ==, 0); +} + /* sc will be used to update stat counters. */ /* ARGSUSED */ static inline void @@ -1178,28 +1280,60 @@ exit_tx_external: return (B_TRUE); } -mblk_t * +static mblk_t * vioif_tx(void *arg, mblk_t *mp) { struct vioif_softc *sc = arg; - mblk_t *nmp; + mblk_t *nmp; + + /* + * Prior to attempting to send any more frames, do a reclaim to pick up + * any descriptors which have been processed by the host. + */ + if (vq_num_used(sc->sc_tx_vq) != 0) { + (void) vioif_reclaim_used_tx(sc); + } while (mp != NULL) { nmp = mp->b_next; mp->b_next = NULL; if (!vioif_send(sc, mp)) { - sc->sc_tx_stopped = 1; + /* + * If there are no descriptors available, try to + * reclaim some, allowing a retry of the send if some + * are found. + */ mp->b_next = nmp; - break; + if (vioif_reclaim_used_tx(sc) != 0) { + continue; + } + + /* + * Otherwise, enable the TX ring interrupt so that as + * soon as a descriptor becomes available, transmission + * can begin again. For safety, make sure the periodic + * reclaim is running as well. + */ + mutex_enter(&sc->sc_tx_lock); + sc->sc_tx_corked = B_TRUE; + virtio_start_vq_intr(sc->sc_tx_vq); + vioif_reclaim_restart(sc); + mutex_exit(&sc->sc_tx_lock); + return (mp); } mp = nmp; } - return (mp); + /* Ensure the periodic reclaim has been started. */ + mutex_enter(&sc->sc_tx_lock); + vioif_reclaim_restart(sc); + mutex_exit(&sc->sc_tx_lock); + + return (NULL); } -int +static int vioif_start(void *arg) { struct vioif_softc *sc = arg; @@ -1211,10 +1345,11 @@ vioif_start(void *arg) virtio_start_vq_intr(sc->sc_rx_vq); /* - * Don't start interrupts on sc_tx_vq. We use VIRTIO_F_NOTIFY_ON_EMPTY, - * so the device will send a transmit interrupt when the queue is empty - * and we can reclaim it in one sweep. + * Starting interrupts on the TX virtqueue is unnecessary at this time. + * Descriptor reclamation is handling during transmit, via a periodic + * timer, and when resources are tight, via the then-enabled interrupt. 
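 *
 * The interval for that periodic timer is the vioif_reclaim_ms global
 * defined earlier in this file (200 ms by default).  As a plain module
 * global it can presumably be tuned without a rebuild, e.g. via /etc/system:
 *
 *	set vioif:vioif_reclaim_ms = 100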
*/ + sc->sc_tx_drain = B_FALSE; /* * Clear any data that arrived early on the receive queue and populate @@ -1228,15 +1363,17 @@ vioif_start(void *arg) return (DDI_SUCCESS); } -void +static void vioif_stop(void *arg) { struct vioif_softc *sc = arg; + /* Ensure all TX descriptors have been processed and reclaimed */ + vioif_tx_drain(sc); + virtio_stop_vq_intr(sc->sc_rx_vq); } -/* ARGSUSED */ static int vioif_stat(void *arg, uint_t stat, uint64_t *val) { @@ -1519,8 +1656,7 @@ vioif_dev_features(struct vioif_softc *sc) VIRTIO_NET_F_HOST_ECN | VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | - VIRTIO_F_RING_INDIRECT_DESC | - VIRTIO_F_NOTIFY_ON_EMPTY); + VIRTIO_F_RING_INDIRECT_DESC); vioif_show_features(sc, "Host features: ", host_features); vioif_show_features(sc, "Negotiated features: ", @@ -1535,7 +1671,7 @@ vioif_dev_features(struct vioif_softc *sc) return (DDI_SUCCESS); } -static int +static boolean_t vioif_has_feature(struct vioif_softc *sc, uint32_t feature) { return (virtio_has_feature(&sc->sc_virtio, feature)); @@ -1585,7 +1721,7 @@ vioif_get_mac(struct vioif_softc *sc) * Virtqueue interrupt handlers */ /* ARGSUSED */ -uint_t +static uint_t vioif_rx_handler(caddr_t arg1, caddr_t arg2) { struct virtio_softc *vsc = (void *) arg1; @@ -1604,7 +1740,7 @@ vioif_rx_handler(caddr_t arg1, caddr_t arg2) } /* ARGSUSED */ -uint_t +static uint_t vioif_tx_handler(caddr_t arg1, caddr_t arg2) { struct virtio_softc *vsc = (void *)arg1; @@ -1612,9 +1748,8 @@ vioif_tx_handler(caddr_t arg1, caddr_t arg2) struct vioif_softc, sc_virtio); /* - * The return value of this function is not needed but makes debugging - * interrupts simpler because you can use it to detect if anything was - * reclaimed in this handler. + * The TX interrupt could race with other reclamation activity, so + * interpreting the return value is unimportant. */ (void) vioif_reclaim_used_tx(sc); @@ -1770,6 +1905,9 @@ vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) goto exit_alloc2; virtio_stop_vq_intr(sc->sc_tx_vq); + mutex_init(&sc->sc_tx_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(sc->sc_virtio.sc_intr_prio)); + if (vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ)) { sc->sc_ctrl_vq = virtio_alloc_vq(&sc->sc_virtio, 2, VIOIF_CTRL_QLEN, 0, "ctrl"); diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..198c14d4be --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +/* + * Frame I/O utility functions + */ + +#include <sys/frameio.h> + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/inttypes.h> + +static kmem_cache_t *frameio_cache; + +int +frameio_init(void) +{ + frameio_cache = kmem_cache_create("frameio_cache", + sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX, + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (frameio_cache == NULL) + return (1); + + return (0); +} + +void +frameio_fini(void) +{ + if (frameio_cache != NULL) + kmem_cache_destroy(frameio_cache); +} + +frameio_t * +frameio_alloc(int kmflags) +{ + return (kmem_cache_alloc(frameio_cache, kmflags)); +} + +void +frameio_free(frameio_t *fio) +{ + kmem_cache_free(frameio_cache, fio); +} + +/* + * Ensure that we don't see any garbage in the framevecs that we're nominally + * supposed to work with. Specifically we want to make sure that the buflen and + * the address are not zero. + */ +static int +frameio_hdr_check_vecs(frameio_t *fio) +{ + int i; + for (i = 0; i < fio->fio_nvecs; i++) + if (fio->fio_vecs[i].fv_buf == NULL || + fio->fio_vecs[i].fv_buflen == 0) + return (EINVAL); + + return (0); +} + +/* + * We have to copy in framevec32_t's. To work around the data model issues and + * trying not to copy memory we first copy in the framevec32_t data into the + * standard fio_vec space. Next we work backwards copying a given framevec32_t + * to a temporaory framevec_t and then overwrite the frameio_t's data. Note that + * it is important that we do this in reverse so as to ensure that we don't + * clobber data as the framevec_t is larger than the framevec32_t. + */ +static int +frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr) +{ + framevec32_t *vec32p; + framevec_t fv; + int i; + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + + if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs, + 0) != 0) + return (EFAULT); + + for (i = fio->fio_nvecs - 1; i >= 0; i--) { + fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf; + fv.fv_buflen = vec32p[i].fv_buflen; + fv.fv_actlen = vec32p[i].fv_actlen; + fio->fio_vecs[i].fv_buf = fv.fv_buf; + fio->fio_vecs[i].fv_buflen = fv.fv_buflen; + fio->fio_vecs[i].fv_actlen = fv.fv_actlen; + } + + return (frameio_hdr_check_vecs(fio)); +} + +/* + * Copy in a frame io header into fio with space for up to nvecs. If the frameio + * contains more vectors than specified it will be ignored. mode should contain + * information about the datamodel. + */ +int +frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode) +{ + int model = ddi_model_convert_from(mode & FMODELS); + int cpf = mode & FKIOCTL ? FKIOCTL : 0; + size_t fsize = model == DDI_MODEL_ILP32 ? + sizeof (frameio32_t) : sizeof (frameio_t); + + /* + * The start of the header is the same in all data models for the + * current verison. 
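 *
 * For illustration, a 64-bit consumer moving 4 frames of 2 vectors each
 * might set up the header along these lines before issuing the (not shown
 * here) frame I/O ioctl -- a sketch using only the fields visible in this
 * file, not a complete initializer:
 *
 *	fio->fio_version = FRAMEIO_VERSION_ONE;
 *	fio->fio_nvpf = 2;		vectors per frame
 *	fio->fio_nvecs = 8;		4 frames * 2 vectors
 *	for each of the 8 fio->fio_vecs[i]:
 *		fv_buf = pointer to a caller-owned buffer
 *		fv_buflen = size of that buffer
 *		fv_actlen is reported back when frames flow to the consumer
 *
 * fio_nvecs must be a multiple of fio_nvpf and may not exceed
 * FRAMEIO_NVECS_MAX, as the checks below enforce.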
+ */ + if (ddi_copyin(addr, fio, fsize, cpf) != 0) + return (EFAULT); + + if (fio->fio_version != FRAMEIO_VERSION_ONE) + return (EINVAL); + + if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0) + return (EINVAL); + + if (fio->fio_nvpf == 0) + return (EINVAL); + + if (fio->fio_nvecs % fio->fio_nvpf != 0) + return (EINVAL); + + if (fio->fio_nvecs > max_vecs) + return (EOVERFLOW); + + addr = (void *)((uintptr_t)addr + fsize); + if (model == DDI_MODEL_ILP32) { + if (cpf != 0) + return (EINVAL); + return (frameio_hdr_copyin_ilp32(fio, addr)); + } + + if (ddi_copyin(addr, &fio->fio_vecs[0], + sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0) + return (EFAULT); + + return (frameio_hdr_check_vecs(fio)); +} + +static mblk_t * +frameio_allocb(size_t sz) +{ + mblk_t *mp; + + mp = allocb(sz, 0); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_DATA; + return (mp); +} + +static int +framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf) +{ + mblk_t *mp; + cpf = cpf != 0 ? FKIOCTL : 0; + + mp = frameio_allocb(fv->fv_buflen); + + if (mp == NULL) { + freemsg(mp); + return (EAGAIN); + } + + if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, + cpf) != 0) { + freemsg(mp); + return (EFAULT); + } + + mp->b_wptr += fv->fv_buflen; + *mpp = mp; + return (0); +} + +/* + * Read a set of frame vectors that make up a single message boundary and return + * that as a single message in *mpp that consists of multiple data parts. + */ +static int +frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf) +{ + int nparts = fio->fio_nvpf; + int part, error; + mblk_t *mp; + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + /* + * Construct the initial frame + */ + for (part = 0; part < nparts; part++) { + error = framevec_mblk_read(fv, &mp, cpf); + if (error != 0) { + freemsg(*mpp); + return (error); + } + + if (*mpp == NULL) + *mpp = mp; + else + linkb(*mpp, mp); + fv++; + } + + return (0); +} + +/* + * Read data from a series of frameio vectors into a message block chain. A + * given frameio request has a number of discrete messages divided into + * individual vectors based on fio->fio_nvcspframe. Each discrete message will + * be constructed into a message block chain pointed to by b_next. + * + * If we get an EAGAIN while trying to construct a given message block what we + * return depends on what else we've done so far. If we have succesfully + * completed at least one message then we free everything else we've done so + * far and return that. If no messages have been completed we return EAGAIN. If + * instead we encounter a different error, say EFAULT, then all of the fv_actlen + * entries values are undefined. + */ +int +frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf) +{ + int error = ENOTSUP; + int nframes = fio->fio_nvecs / fio->fio_nvpf; + int frame; + framevec_t *fv; + mblk_t *mp, *bmp = NULL; + + /* + * Protect against bogus kernel subsystems. + */ + VERIFY(fio->fio_nvecs > 0); + VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0); + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + fv = &fio->fio_vecs[0]; + for (frame = 0; frame < nframes; frame++) { + error = frameio_mblk_read(fio, fv, &mp, cpf); + if (error != 0) + goto failed; + + if (bmp != NULL) + bmp->b_next = mp; + else + *mpp = mp; + bmp = mp; + } + + *nvecs = nframes; + return (0); +failed: + /* + * On EAGAIN we've already taken care of making sure that we have no + * leftover messages, eg. they were never linked in. 
+ */ + if (error == EAGAIN) { + if (frame != 0) + error = 0; + if (*nvecs != NULL) + *nvecs = frame; + ASSERT(*mpp != NULL); + } else { + for (mp = *mpp; mp != NULL; mp = bmp) { + bmp = mp->b_next; + freemsg(mp); + } + if (nvecs != NULL) + *nvecs = 0; + *mpp = NULL; + } + return (error); +} + +size_t +frameio_frame_length(frameio_t *fio, framevec_t *fv) +{ + int i; + size_t len = 0; + + for (i = 0; i < fio->fio_nvpf; i++, fv++) + len += fv->fv_buflen; + + return (len); +} + +/* + * Write a portion of an mblk to the current. + */ +static int +framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff, + size_t foff, int cpf) +{ + ASSERT(len <= MBLKL(mp) - moff); + ASSERT(len <= fv->fv_buflen - fv->fv_actlen); + cpf = cpf != 0 ? FKIOCTL : 0; + + if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len, + cpf) != 0) + return (EFAULT); + fv->fv_actlen += len; + + return (0); +} + +/* + * Because copying this out to the user might fail we don't want to update the + * b_rptr in case we need to copy it out again. + */ +static int +framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf) +{ + int err; + size_t msize, blksize, len, moff, foff; + + msize = msgsize(mp); + if (msize > frameio_frame_length(fio, fv)) + return (EOVERFLOW); + + moff = 0; + foff = 0; + blksize = MBLKL(mp); + fv->fv_actlen = 0; + while (msize != 0) { + len = MIN(blksize, fv->fv_buflen - fv->fv_actlen); + err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf); + if (err != 0) + return (err); + + msize -= len; + blksize -= len; + moff += len; + foff += len; + + if (blksize == 0 && msize != 0) { + mp = mp->b_cont; + ASSERT(mp != NULL); + moff = 0; + blksize = MBLKL(mp); + } + + if (fv->fv_buflen == fv->fv_actlen && msize != 0) { + fv++; + fv->fv_actlen = 0; + foff = 0; + } + } + + return (0); +} + +int +frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map, + mblk_t *mp, int *nwrite, int cpf) +{ + int mcount = 0; + int ret = 0; + + if (map != MAP_BLK_FRAME) + return (EINVAL); + + while (mp != NULL && mcount < fio->fio_nvecs) { + ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf); + if (ret != 0) + break; + mcount += fio->fio_nvpf; + mp = mp->b_next; + } + + if (ret != 0 && mcount == 0) { + if (nwrite != NULL) + *nwrite = 0; + return (ret); + } + + if (nwrite != NULL) + *nwrite = mcount / fio->fio_nvpf; + + return (0); +} + +/* + * Copy out nframes worth of frameio header data back to userland. 
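 *
 * Taken together with the routines above, a consumer of this file (vnd's
 * frame I/O ioctls, presumably) would hand received frames back to userland
 * along roughly these lines -- a sketch, not the actual ioctl code:
 *
 *	fio = frameio_alloc(KM_SLEEP);
 *	frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, uaddr, mode);
 *	dequeue up to fio_nvecs / fio_nvpf frames as a b_next-linked chain
 *	frameio_mblk_chain_write(fio, MAP_BLK_FRAME, chain, &nwrote, cpf);
 *	frameio_hdr_copyout(fio, nwrote, uaddr, mode);
 *	frameio_free(fio);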
+ */ +int +frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode) +{ + int i; + int model = ddi_model_convert_from(mode & FMODELS); + framevec32_t *vec32p; + framevec32_t f; + + if (fio->fio_nvecs / fio->fio_nvpf < nframes) + return (EINVAL); + + fio->fio_nvecs = nframes * fio->fio_nvpf; + + if (model == DDI_MODEL_NONE) { + if (ddi_copyout(fio, addr, + sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); + } + + ASSERT(model == DDI_MODEL_ILP32); + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + for (i = 0; i < fio->fio_nvecs; i++) { + f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf; + if (fio->fio_vecs[i].fv_buflen > UINT_MAX || + fio->fio_vecs[i].fv_actlen > UINT_MAX) + return (EOVERFLOW); + f.fv_buflen = fio->fio_vecs[i].fv_buflen; + f.fv_actlen = fio->fio_vecs[i].fv_actlen; + vec32p[i].fv_buf = f.fv_buf; + vec32p[i].fv_buflen = f.fv_buflen; + vec32p[i].fv_actlen = f.fv_actlen; + } + + if (ddi_copyout(fio, addr, + sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +void +frameio_mark_consumed(frameio_t *fio, int nframes) +{ + int i; + + ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes); + for (i = 0; i < nframes * fio->fio_nvpf; i++) + fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen; +} diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c new file mode 100644 index 0000000000..d03c7ce4ec --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -0,0 +1,5857 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +/* + * vnd - virtual (machine) networking datapath + * + * vnd's purpose is to provide a highly performant data path for Layer 2 network + * traffic and exist side by side an active IP netstack, each servicing + * different datalinks. vnd provides many of the same capabilities as the + * current TCP/IP stack does and some specific to layer two. Specifically: + * + * o Use of the DLD fastpath + * o Packet capture hooks + * o Ability to use hardware capabilities + * o Useful interfaces for handling multiple frames + * + * The following image shows where vnd fits into today's networking stack: + * + * +---------+----------+----------+ + * | libdlpi | libvnd | libsocket| + * +---------+----------+----------+ + * | · · VFS | + * | VFS · VFS +----------+ + * | · | sockfs | + * +---------+----------+----------+ + * | | VND | IP | + * | +----------+----------+ + * | DLD/DLS | + * +-------------------------------+ + * | MAC | + * +-------------------------------+ + * | GLDv3 | + * +-------------------------------+ + * + * ----------------------------------------- + * A Tale of Two Devices - DDI Device Basics + * ----------------------------------------- + * + * vnd presents itself to userland as a character device; however, it also is a + * STREAMS device so that it can interface with dld and the rest of the + * networking stack. Users never interface with the STREAMs devices directly and + * they are purely an implementation detail of vnd. 
Opening the STREAMS device + * require kcred and as such userland cannot interact with it or push it onto + * the stream head. + * + * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every + * clone gets its own minor number; however, minor nodes are not created in the + * devices tree for these instances. In this state a user may do two different + * things. They may issue ioctls that affect global state or they may issue + * ioctls that try to attach it to a given datalink. Once a minor device has + * been attached to a datalink, all operations on it are scoped to that context, + * therefore subsequent global operations are not permitted. + * + * A given device can be linked into the /devices and /dev name space via a link + * ioctl. That ioctl causes a minor node to be created in /devices and then it + * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar + * to, but simpler than, IP's persistence mechanism. + * + * --------------------- + * Binding to a datalink + * --------------------- + * + * Datalinks are backed by the dld (datalink device) and dls (datalink services) + * drivers. These drivers provide a STREAMS device for datalinks on the system + * which are exposed through /dev/net. Userland generally manipulates datalinks + * through libdlpi. When an IP interface is being plumbed up what actually + * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink + * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may + * then can negotiate with dld and dls to obtain access to various capabilities + * and fast paths via a series of STREAMS messages. + * + * In vnd, we do the same thing, but we leave our STREAMS module as an + * implementation detail of the system. We don't want users to be able to + * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require + * kcred to manipulate it. Thus, when a user issues a request to attach a + * datalink to a minor instance of the character device, that vnd minor instance + * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. + * vnd does that open using the passed in credentials from the ioctl, not kcred. + * This ensures that users who doesn't have permissions to open the device + * cannot. Once that's been opened, we push on the vnd streams module. + * + * Once the vnd STREAMS instance has been created for this device, eg. the + * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl + * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. + * This association begins the STREAM device's initialization. We start up an + * asynchronous state machine that takes care of all the different aspects of + * plumbing up the device with dld and dls and enabling the MAC fast path. We + * need to guarantee to consumers of the character device that by the time their + * ioctl returns, the data path has been fully initialized. + * + * The state progression is fairly linear. There are two general steady states. + * The first is VND_S_ONLINE, which means that everything is jacked up and good + * to go. The alternative is VND_S_ZOMBIE, which means that the streams device + * encountered an error or we have finished tearing it down and the character + * device can clean it up. The following is our state progression and the + * meaning of each state: + * + * | + * | + * V + * +---------------+ + * | VNS_S_INITIAL | This is our initial state. Every + * +---------------+ vnd STREAMS device starts here. 
+ * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, eg. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. + * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DLPI_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of three promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the physical level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the second promiscuous + * | | mode request. + * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by enabling the final flag + * | | DL_PROMISC_FIXUPS. + * | v + * | +--------------------------+ + * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ enabled FIXUP promiscuous mode. + * | | We specifically do this as we need + * v | to ensure that traffic which is + * | | received by being looped back to us + * | | correctly has checksums fixed. We + * | | leave this state by requesting the + * | | dld/dls capabilities that we can + * v | process. 
+ * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * | | VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. + * | v + * | +---------------------+ + * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and + * | | flushing all data. At this point + * | | any additional data that we receive + * | | will be dropped. We leave this + * v | state by trying to remove multicast + * | | promiscuity. + * | | + * | v + * | +---------------------------------+ + * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------------+ successfully removed multicast + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. We leave this state by trying + * | | to disable SAP level promiscuous + * | | mode. + * | v + * | +---------------------------+ + * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------+ successfully removed SAP level + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. Note that we don't worry + * | | about either of + * | | DL_PROMISC_FIXUPS or + * | | DL_PROMISC_RX_ONLY. If these are + * | | the only two entries left, then we + * | | should have anything that MAC is + * | | doing for us at this point, + * | | therefore it's safe for us to + * | | proceed to unbind, which is how we + * | | leave this state via a + * | v DL_UNBIND_REQ. + * | +-------------------+ + * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind + * | +-------------------+ request went. Regardless of its + * | | success, we always transition to + * | | a zombie state. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finish being + * reaped. Because we have no more + * ways to receive data it should be + * safe to destroy all remaining data + * structures. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. 
+ * + * It's quite important that we end up sending the full series of STREAMS + * messages when tearing down. While it's tempting to say that we should just + * rely on the STREAMS device being closed to properly ensure that we have no + * more additional data, that's not sufficient due to our use of direct + * callbacks. DLS does not ensure that by the time we change the direct + * callback (vnd_mac_input) that all callers to it will have been quiesced. + * However, it does guarantee that if we disable promiscuous mode ourselves and + * we turn off the main data path via DL_UNBIND_REQ that it will work. + * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do + * it as part of tearing down the STREAMS device. This ensures that we'll + * quiesce all data before we destroy our data structures and thus we should + * eliminate the race in changing the data function. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * e.g. synchronization primitives are left out. + * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | nestackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. 
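The split between the vnd_dev_t and its data queues is easiest to see from a consumer's point of view. The following userland sketch is illustrative only: it assumes a datalink has already been attached to vnd and linked under the name net0, and that a 9000 byte buffer is large enough for one frame on that link. It opens the device non-blocking, waits for the read data queue to have something via poll(2), and then consumes a single frame, since read(2) on a vnd device returns exactly one frame at a time (see the data path discussion below).

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char frame[9000];	/* assumed to cover the link MTU plus headers */
		struct pollfd pfd;
		ssize_t n;
		int fd;

		/* Assumes the datalink was already attached and linked as "net0". */
		fd = open("/dev/vnd/net0", O_RDWR | O_NONBLOCK);
		if (fd < 0) {
			perror("open");
			return (1);
		}

		pfd.fd = fd;
		pfd.events = POLLIN;

		/* Block until the device's read data queue has at least one frame. */
		if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN)) {
			n = read(fd, frame, sizeof (frame));
			if (n > 0)
				(void) printf("read one %zd byte frame\n", n);
		}

		(void) close(fd);
		return (0);
	}

Writes behave symmetrically: with O_NONBLOCK set, a write(2) that cannot reserve space in the write data queue fails with EAGAIN, and the consumer can wait for POLLOUT instead.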
+ * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . . . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targetting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. 
After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restorted our flow. + * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . * | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. 
+ * There is an upper bound on the total amount of data and the total number
+ * of mblk_t chains. If we hit either limit, then we will schedule another
+ * drain in the gsqueue and go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in
+ * the kernel. When we open a STREAMS device in vnd_s_open, we get a random
+ * gsqueue. Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at
+ * random is likely to be just as good as any fancy algorithm we might come
+ * up with, especially as any two devices could have radically different
+ * transmit profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf
+ * via the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system,
+ * which comes in the form of a barrier. The barrier is an internal mechanism
+ * used for ensuring that a gsqueue is drained for a given device. We use
+ * this as part of tearing down. Specifically, we disable the write path so
+ * nothing new can be inserted into the gsqueue and then insert a barrier
+ * block. Once the barrier block comes out of the gsqueue, we know that
+ * nothing else remains in the gsqueue which could refer to the vnd_str_t
+ * being destroyed.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a
+ * netstack. Because of that, vnd is also a netstack module. It registers
+ * with the netstack sub-system and receives callbacks every time a netstack
+ * is created, shut down, and destroyed. The netstack callbacks drive the
+ * creation and destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is
+ * scoped to a netstack and known about by a given vnd_pnsd_t. When that
+ * netstack is torn down, we also tear down any vnd devices that are hanging
+ * around. When the netstack is torn down, we know that any zones that are
+ * scoped to that netstack are being shut down and have no processes
+ * remaining. This is going to be the case whether they are shared or
+ * exclusive stack zones. We have to perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down: the first is a
+ * shutdown callback and the second is a destroy callback. When the shutdown
+ * callback is fired, we need to prepare for the netstack to go away and
+ * ensure that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shut down, we first
+ * remove the netstack from the global netstack list to ensure that no one
+ * new can come in, find the netstack, and get a reference to it. After that,
+ * we notify the neti hooks that they're going away. Once that's all done, we
+ * get to the heart of the matter.
+ *
+ * When shutting down, there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However,
+ * we know that no one new will be able to find the vnd_pnsd_t. To account
+ * for things that have existing references, we mark the
+ * vnd_pnsd_t`vpnd_flags with VND_NS_CONDEMNED. This is checked by code paths
+ * that wish to append a device to the netstack's list. If it is set, then
+ * they must not append to the list. Once this is set, we know that the
+ * netstack's list of devices can never grow, only shrink.
+ *
+ * Next, we tag each device with VND_D_ZONE_DYING. This indicates that the
+ * container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open or
+ * linking. The presence of this bit also tells things like the list ioctl
+ * and sdev not to consider the device's existence. Once it has been set, we
+ * know that no one else should be able to obtain a new reference to the
+ * device.
+ *
+ * Once that has been set for all devices, we go through and remove any
+ * existing links that have been established in sdev. Because doing that may
+ * cause the final reference for a device, which itself still holds a
+ * reference to the netstack, to be dropped, we have to restart our walk due
+ * to dropped locks. We know that this walk will eventually complete because
+ * the device cannot be relinked and no new devices will be attached in this
+ * netstack due to VND_NS_CONDEMNED. Once that's finished, the shutdown
+ * callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes
+ * in it that have open references have been terminated and reaped. Any
+ * threads that are newly trying to reference it will fail. However, there is
+ * one thing that can halt this that we have no control over, which is the
+ * global zone holding open a reference to the device. In this case the zone
+ * halt will hang in vnd_stack_destroy. Once the last reference is dropped,
+ * we finish destroying the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers an sdev plugin which allows it to dynamically fill out
+ * /dev/vnd for both the global and non-global zones. In any given zone we
+ * always supply a control node via /dev/vnd/ctl. This is the self-cloning
+ * node. Each zone will also have an entry per-link in that zone under
+ * /dev/vnd/%datalink, eg. if a link was named net0, there would be a
+ * /dev/vnd/net0. The global zone can also see every link for every zone, a
+ * la /dev/net, under /dev/vnd/%zonename/%datalink, eg. if a zone named
+ * 'turin' had a vnd device named net0, the global zone would have
+ * /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is
+ * to validate that a given node is still valid. The next is a callback from
+ * sdev to say that it is no longer using the node. The third and final one
+ * is from sdev where it asks us to fill a directory. All of the heavy
+ * lifting is done in directory filling and in validation. We opt not to
+ * maintain a reference on the device while there is an sdev node present.
+ * This makes the removal of nodes much simpler and most of the possible
+ * failure modes shouldn't cause any real problems. For example, the open
+ * path has to handle both dev_t's which no longer exist and dev_t's which
+ * are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently
+ * vnd provides these for L3 IPv4 and IPv6 traffic. Each netstack provides
+ * these hooks in a minimal fashion. While we will allow traffic to be
+ * filtered through the hooks, we do not provide means for packet injection
+ * or additional inspection at this time. There are a total of four different
+ * events created:
+ *
+ * o IPv4 physical in
+ * o IPv4 physical out
+ * o IPv6 physical in
+ * o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each
+ * major structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and
+ * vnd_data_queue_t, is annotated with the protection that its members
+ * receive. The following annotations are used:
+ *
+ * A    Atomics; these values are only modified using atomic operations.
+ *      Currently this only applies to kstat values.
+ * E    Existence; no lock is needed to access this member, it does not
+ *      change while the structure is valid.
+ * GL   Global Lock; these members are protected by the global
+ *      vnd_dev_lock.
+ * L    Locked; access to the member is controlled by a lock that is in
+ *      the structure.
+ * NSL  netstack lock; this member is protected by the containing
+ *      netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ * X    This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetime of the
+ * structure. With rare exception, once a reference count is decremented, the
+ * consumer should not assume that the data is valid any more. The only
+ * exception to this is the case where we're removing an extant reference
+ * count from a link into /devices or /dev. Reference counts are obtained on
+ * these structures as a part of looking them up.
+ *
+ * # Global Lock Ordering
+ * ######################
+ *
+ * The following is the order in which you must take locks in vnd:
+ *
+ * 1) vnd`vnd_dev_lock
+ * 2) vnd_pnsd_t`vpnd_lock
+ * 3) vnd_dev_t`vdd_lock
+ * 4) vnd_str_t`vns_lock
+ * 5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ * o You must acquire a lower numbered lock before a higher numbered lock.
+ * o It is NOT legal to hold two locks of the same level concurrently, eg.
+ *   you can not hold two different vnd_dev_t's vdd_lock at the same time.
+ * o You may release locks in any order.
+ * o If you release a lock, you must honor the locking rules before
+ *   acquiring it again.
+ * o You should not hold any locks when calling any of the rele functions.
+ *
+ * # Special Considerations
+ * ########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold. Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how
+ * you should interact with them.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If the code path in question
+ * requires an attached vnd_dev_t, eg. the data path and tear down path, then
+ * it is always legal to dereference that member without a lock held.
When they are + * added to the system, they should be done under the vdd_lock and done as part + * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the + * lifetime of the vnd_dev_t. + * + * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it + * always exists as it is a part of the structure. The only time that it's valid + * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag + * set or during tear down. Outside of those paths which are naturally + * serialized, there is no explicit locking around the member. + * + * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not + * initially set as part of creating the structure, but are set as part of + * responding to the association ioctl. Anything in the data path or metadata + * path that requires association may assume that they exist, as we do not kick + * off the state machine until they're set. + * + * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The + * members are designed to be used as part of various operations with the + * gsqueues. A lock isn't needed to use them, but to work with them, the + * appropriate flag in the vnd_str_t`vns_flags must have been set by the current + * thread. Otherwise, it is always fair game to refer to their addresses. Their + * contents are ignored by vnd, but some members are manipulated by the gsqueue + * subsystem. + */ + +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/ddi.h> +#include <sys/ethernet.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/ksynch.h> +#include <sys/taskq_impl.h> +#include <sys/sdt.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/dlpi.h> +#include <sys/cred.h> +#include <sys/id_space.h> +#include <sys/list.h> +#include <sys/ctype.h> +#include <sys/policy.h> +#include <sys/sunldi.h> +#include <sys/cred.h> +#include <sys/strsubr.h> +#include <sys/poll.h> +#include <sys/neti.h> +#include <sys/hook.h> +#include <sys/hook_event.h> +#include <sys/vlan.h> +#include <sys/dld.h> +#include <sys/mac_client.h> +#include <sys/netstack.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/kstat.h> +#include <sys/atomic.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/gsqueue.h> +#include <sys/ht.h> + +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/vnd.h> + +/* + * Globals + */ +static dev_info_t *vnd_dip; +static taskq_t *vnd_taskq; +static kmem_cache_t *vnd_str_cache; +static kmem_cache_t *vnd_dev_cache; +static kmem_cache_t *vnd_pnsd_cache; +static id_space_t *vnd_minors; +static int vnd_list_init = 0; +static sdev_plugin_hdl_t vnd_sdev_hdl; +static gsqueue_set_t *vnd_sqset; + +static kmutex_t vnd_dev_lock; +static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ +static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ + +/* + * STREAMs ioctls + * + * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such + * they aren't a part of the header file. + */ +#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) + +/* + * Private ioctl to associate a given streams instance with a minor instance of + * the character device. 
+ */ +#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) + +typedef struct vnd_strioc_associate { + minor_t vsa_minor; /* minor device node */ + netstackid_t vsa_nsid; /* netstack id */ + vnd_errno_t vsa_errno; /* errno */ +} vnd_strioc_associate_t; + +typedef enum vnd_strioc_state { + VSS_UNKNOWN = 0, + VSS_COPYIN = 1, + VSS_COPYOUT = 2, +} vnd_strioc_state_t; + +typedef struct vnd_strioc { + vnd_strioc_state_t vs_state; + caddr_t vs_addr; +} vnd_strioc_t; + +/* + * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though + * really, overlap is at the end of the day, inevitable. + */ +#define VND_SQUEUE_TAG_TX_DRAIN 0x42 +#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 +#define VND_SQUEUE_TAG_VND_WRITE 0x44 +#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 +#define VND_SQUEUE_TAG_STRBARRIER 0x46 + +/* + * vnd reserved names. These are names which are reserved by vnd and thus + * shouldn't be used by some external program. + */ +static char *vnd_reserved_names[] = { + "ctl", + "zone", + NULL +}; + +/* + * vnd's DTrace probe macros + * + * DTRACE_VND* are all for a stable provider. We also have an unstable internal + * set of probes for reference count manipulation. + */ +#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_VND_REFINC(vdp) \ + DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); +#define DTRACE_VND_REFDEC(vdp) \ + DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); + + +/* + * Tunables + */ +size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ +size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ + +/* + * These numbers are designed as per-device tunables that are applied when a new + * vnd device is attached. They're a rough stab at what may be a reasonable + * amount of work to do in one burst in an squeue. + */ +size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ +size_t vnd_flush_nburst = 10; /* 10 frames */ + +/* + * Constants related to our sdev plugins + */ +#define VND_SDEV_NAME "vnd" +#define VND_SDEV_ROOT "/dev/vnd" +#define VND_SDEV_ZROOT "/dev/vnd/zone" + +/* + * vnd relies on privileges, not mode bits to limit access. As such, device + * files are read-write to everyone. 
+ */ +#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \ + S_IROTH | S_IWOTH) + +/* + * Statistic macros + */ +#define VND_STAT_INC(vsp, field, val) \ + atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) +#define VND_LATENCY_1MS 1000000 +#define VND_LATENCY_10MS 10000000 +#define VND_LATENCY_100MS 100000000 +#define VND_LATENCY_1S 1000000000 +#define VND_LATENCY_10S 10000000000 + +/* + * Constants for vnd hooks + */ +static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; +#define IPV4_MCAST_LEN 3 +static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +#define IPV6_MCAST_LEN 2 +static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* + * vnd internal data structures and types + */ + +struct vnd_str; +struct vnd_dev; +struct vnd_pnsd; + +/* + * As part of opening the device stream we need to properly communicate with our + * underlying stream. This is a bit of an asynchronous dance and we need to + * properly work with dld to get everything set up. We have to initiate the + * conversation with dld and as such we keep track of our state here. + */ +typedef enum vnd_str_state { + VNS_S_INITIAL = 0, + VNS_S_INFO_SENT, + VNS_S_EXCLUSIVE_SENT, + VNS_S_ATTACH_SENT, + VNS_S_BIND_SENT, + VNS_S_SAP_PROMISC_SENT, + VNS_S_MULTI_PROMISC_SENT, + VNS_S_RX_ONLY_PROMISC_SENT, + VNS_S_FIXUP_PROMISC_SENT, + VNS_S_CAPAB_Q_SENT, + VNS_S_CAPAB_E_SENT, + VNS_S_ONLINE, + VNS_S_SHUTTING_DOWN, + VNS_S_MULTICAST_PROMISCOFF_SENT, + VNS_S_SAP_PROMISCOFF_SENT, + VNS_S_UNBIND_SENT, + VNS_S_ZOMBIE +} vnd_str_state_t; + +typedef enum vnd_str_flags { + VNS_F_NEED_ZONE = 0x1, + VNS_F_TASKQ_DISPATCHED = 0x2, + VNS_F_CONDEMNED = 0x4, + VNS_F_FLOW_CONTROLLED = 0x8, + VNS_F_DRAIN_SCHEDULED = 0x10, + VNS_F_BARRIER = 0x20, + VNS_F_BARRIER_DONE = 0x40 +} vnd_str_flags_t; + +typedef enum vnd_capab_flags { + VNS_C_HCKSUM = 0x1, + VNS_C_DLD = 0x2, + VNS_C_DIRECT = 0x4, + VNS_C_HCKSUM_BADVERS = 0x8 +} vnd_capab_flags_t; + +/* + * Definitions to interact with direct callbacks + */ +typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, + mac_header_info_t *); +typedef uintptr_t vnd_mac_cookie_t; +/* DLD Direct capability function */ +typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); +/* DLD Direct tx function */ +typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +/* DLD Direct function to set flow control callback */ +typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), + void *); +/* DLD Direct function to see if flow controlled still */ +typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); + +/* + * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. + */ +typedef struct vnd_str_capab { + vnd_capab_flags_t vsc_flags; + t_uscalar_t vsc_hcksum_opts; + vnd_dld_cap_t vsc_capab_f; + void *vsc_capab_hdl; + vnd_dld_tx_t vsc_tx_f; + void *vsc_tx_hdl; + vnd_dld_set_fcb_t vsc_set_fcb_f; + void *vsc_set_fcb_hdl; + vnd_dld_is_fc_t vsc_is_fc_f; + void *vsc_is_fc_hdl; + vnd_mac_cookie_t vsc_fc_cookie; + void *vsc_tx_fc_hdl; +} vnd_str_capab_t; + +/* + * The vnd_data_queue is a simple construct for storing a series of messages in + * a queue. + * + * See synchronization section of the big theory statement for member + * annotations. 
+ */ +typedef struct vnd_data_queue { + struct vnd_str *vdq_vns; /* E */ + kmutex_t vdq_lock; + kcondvar_t vdq_ready; /* Uses vdq_lock */ + ssize_t vdq_max; /* L */ + ssize_t vdq_cur; /* L */ + mblk_t *vdq_head; /* L */ + mblk_t *vdq_tail; /* L */ +} vnd_data_queue_t; + +typedef struct vnd_str_stat { + kstat_named_t vks_rbytes; + kstat_named_t vks_rpackets; + kstat_named_t vks_obytes; + kstat_named_t vks_opackets; + kstat_named_t vks_nhookindrops; + kstat_named_t vks_nhookoutdrops; + kstat_named_t vks_ndlpidrops; + kstat_named_t vks_ndataindrops; + kstat_named_t vks_ndataoutdrops; + kstat_named_t vks_tdrops; + kstat_named_t vks_linkname; + kstat_named_t vks_zonename; + kstat_named_t vks_nmacflow; + kstat_named_t vks_tmacflow; + kstat_named_t vks_mac_flow_1ms; + kstat_named_t vks_mac_flow_10ms; + kstat_named_t vks_mac_flow_100ms; + kstat_named_t vks_mac_flow_1s; + kstat_named_t vks_mac_flow_10s; +} vnd_str_stat_t; + +/* + * vnd stream structure + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_str { + kmutex_t vns_lock; + kcondvar_t vns_cancelcv; /* Uses vns_lock */ + kcondvar_t vns_barriercv; /* Uses vns_lock */ + kcondvar_t vns_stcv; /* Uses vns_lock */ + vnd_str_state_t vns_state; /* L */ + vnd_str_state_t vns_laststate; /* L */ + vnd_errno_t vns_errno; /* L */ + vnd_str_flags_t vns_flags; /* L */ + vnd_str_capab_t vns_caps; /* L */ + taskq_ent_t vns_tqe; /* L */ + vnd_data_queue_t vns_dq_read; /* E */ + vnd_data_queue_t vns_dq_write; /* E */ + mblk_t *vns_dlpi_inc; /* L */ + queue_t *vns_rq; /* E */ + queue_t *vns_wq; /* E */ + queue_t *vns_lrq; /* E */ + t_uscalar_t vns_dlpi_style; /* L */ + t_uscalar_t vns_minwrite; /* L */ + t_uscalar_t vns_maxwrite; /* L */ + hrtime_t vns_fclatch; /* L */ + hrtime_t vns_fcupdate; /* L */ + kstat_t *vns_kstat; /* E */ + gsqueue_t *vns_squeue; /* E */ + mblk_t vns_drainblk; /* E + X */ + mblk_t vns_barrierblk; /* E + X */ + vnd_str_stat_t vns_ksdata; /* A */ + size_t vns_nflush; /* L */ + size_t vns_bsize; /* L */ + struct vnd_dev *vns_dev; /* E + X */ + struct vnd_pnsd *vns_nsd; /* E + X */ +} vnd_str_t; + +typedef enum vnd_dev_flags { + VND_D_ATTACH_INFLIGHT = 0x001, + VND_D_ATTACHED = 0x002, + VND_D_LINK_INFLIGHT = 0x004, + VND_D_LINKED = 0x008, + VND_D_CONDEMNED = 0x010, + VND_D_ZONE_DYING = 0x020, + VND_D_OPENED = 0x040 +} vnd_dev_flags_t; + +/* + * This represents the data associated with a minor device instance. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_dev { + kmutex_t vdd_lock; + list_node_t vdd_link; /* GL */ + list_node_t vdd_nslink; /* NSL */ + int vdd_ref; /* L */ + vnd_dev_flags_t vdd_flags; /* L */ + minor_t vdd_minor; /* E */ + dev_t vdd_devid; /* E */ + ldi_ident_t vdd_ldiid; /* E */ + ldi_handle_t vdd_ldih; /* X */ + cred_t *vdd_cr; /* X */ + vnd_str_t *vdd_str; /* L */ + struct pollhead vdd_ph; /* E */ + struct vnd_pnsd *vdd_nsd; /* E + X */ + char vdd_datalink[VND_NAMELEN]; /* L */ + char vdd_lname[VND_NAMELEN]; /* L */ +} vnd_dev_t; + +typedef enum vnd_pnsd_flags { + VND_NS_CONDEMNED = 0x1 +} vnd_pnsd_flags_t; + +/* + * Per netstack data structure. + * + * See synchronization section of the big theory statement for member + * annotations. 
+ */ +typedef struct vnd_pnsd { + list_node_t vpnd_link; /* protected by global dev lock */ + zoneid_t vpnd_zid; /* E */ + netstackid_t vpnd_nsid; /* E */ + boolean_t vpnd_hooked; /* E */ + net_handle_t vpnd_neti_v4; /* E */ + hook_family_t vpnd_family_v4; /* E */ + hook_event_t vpnd_event_in_v4; /* E */ + hook_event_t vpnd_event_out_v4; /* E */ + hook_event_token_t vpnd_token_in_v4; /* E */ + hook_event_token_t vpnd_token_out_v4; /* E */ + net_handle_t vpnd_neti_v6; /* E */ + hook_family_t vpnd_family_v6; /* E */ + hook_event_t vpnd_event_in_v6; /* E */ + hook_event_t vpnd_event_out_v6; /* E */ + hook_event_token_t vpnd_token_in_v6; /* E */ + hook_event_token_t vpnd_token_out_v6; /* E */ + kmutex_t vpnd_lock; /* Protects remaining members */ + kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ + int vpnd_ref; /* L */ + vnd_pnsd_flags_t vpnd_flags; /* L */ + list_t vpnd_dev_list; /* L */ +} vnd_pnsd_t; + +static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); + +/* + * Drop function signature. + */ +typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); + +static void +vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +/* ARGSUSED */ +static void +vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + panic("illegal vnd drop"); +} + +/* ARGSUSED */ +static void +vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + mblk_t *mp; + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp->b_next; + vnd_drop_hook_in(vsp, mp, "stream not associated"); + } +} + +static vnd_pnsd_t * +vnd_nsd_lookup(netstackid_t nsid) +{ + vnd_pnsd_t *nsp; + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + if (nsp->vpnd_nsid == nsid) { + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref >= 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zid(zoneid_t zid) +{ + netstack_t *ns; + vnd_pnsd_t *nsp; + ns = netstack_find_by_zoneid(zid); + 
if (ns == NULL) + return (NULL); + nsp = vnd_nsd_lookup(ns->netstack_stackid); + netstack_rele(ns); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zonename(char *zname) +{ + zone_t *zonep; + vnd_pnsd_t *nsp; + + zonep = zone_find_by_name(zname); + if (zonep == NULL) + return (NULL); + + nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); + zone_rele(zonep); + return (nsp); +} + +static void +vnd_nsd_ref(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + /* + * This can only be used on something that has been obtained through + * some other means. As such, the caller should already have a reference + * before adding another one. This function should not be used as a + * means of creating the initial reference. + */ + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static void +vnd_nsd_rele(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref--; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static vnd_dev_t * +vnd_dev_lookup(minor_t m) +{ + vnd_dev_t *vdp; + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + if (vdp->vdd_minor == m) { + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (vdp); +} + +static void +vnd_dev_free(vnd_dev_t *vdp) +{ + /* + * When the STREAM exists we need to go through and make sure + * communication gets torn down. As part of closing the stream, we + * guarantee that nothing else should be able to enter the stream layer + * at this point. That means no one should be able to call + * read(),write() or one of the frameio ioctls. + */ + if (vdp->vdd_flags & VND_D_ATTACHED) { + (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + + /* + * We have to remove ourselves from our parents list now. It is + * really quite important that we have already set the condemend + * flag here so that our containing netstack basically knows + * that we're on the way down and knows not to wait for us. It's + * also important that we do that before we put a rele on the + * the device as that is the point at which it will check again. + */ + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(vdp->vdd_nsd); + vdp->vdd_nsd = NULL; + } + ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); + id_free(vnd_minors, vdp->vdd_minor); + mutex_destroy(&vdp->vdd_lock); + kmem_cache_free(vnd_dev_cache, vdp); +} + +static void +vnd_dev_ref(vnd_dev_t *vdp) +{ + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); +} + +/* + * As part of releasing the hold on this we may tear down a given vnd_dev_t As + * such we need to make sure that we grab the list lock first before grabbing + * the vnd_dev_t's lock to ensure proper lock ordering. + */ +static void +vnd_dev_rele(vnd_dev_t *vdp) +{ + mutex_enter(&vnd_dev_lock); + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref--; + DTRACE_VND_REFDEC(vdp); + if (vdp->vdd_ref > 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return; + } + + /* + * Now that we've removed this from the list, we can go ahead and + * drop the list lock. 
No one else can find this device and reference + * it. As its reference count is zero, it by definition does not have + * any remaining entries in /devices that could lead someone back to + * this. + */ + vdp->vdd_flags |= VND_D_CONDEMNED; + list_remove(&vnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + vnd_dev_free(vdp); +} + +/* + * Insert a mesage block chain if there's space, otherwise drop it. Return one + * so someone who was waiting for data would now end up having found it. eg. + * caller should consider a broadcast. + */ +static int +vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, + vnd_dropper_f dropf) +{ + size_t msize; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + if (reserved == B_FALSE) { + msize = msgsize(mp); + if (vqp->vdq_cur + msize > vqp->vdq_max) { + dropf(vqp->vdq_vns, mp, "buffer full"); + return (0); + } + vqp->vdq_cur += msize; + } + + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + vqp->vdq_head = mp; + vqp->vdq_tail = mp; + } else { + vqp->vdq_tail->b_next = mp; + vqp->vdq_tail = mp; + } + + return (1); +} + +/* + * Remove a message message block chain. If the amount of space in the buffer + * has changed we return 1. We have no way of knowing whether or not there is + * enough space overall for a given writer who is blocked, so we always end up + * having to return true and thus tell consumers that they should consider + * signalling. + */ +static int +vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) +{ + size_t msize; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(mpp != NULL); + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + *mpp = NULL; + return (0); + } + + mp = vqp->vdq_head; + msize = msgsize(mp); + + vqp->vdq_cur -= msize; + if (mp->b_next == NULL) { + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + /* + * We can't be certain that this is always going to be zero. + * Someone may have basically taken a reservation of space on + * the data queue, eg. claimed spae but not yet pushed it on + * yet. + */ + ASSERT(vqp->vdq_cur >= 0); + } else { + vqp->vdq_head = mp->b_next; + ASSERT(vqp->vdq_cur > 0); + } + mp->b_next = NULL; + *mpp = mp; + return (1); +} + +/* + * Reserve space in the queue. This will bump up the size of the queue and + * entitle the user to push something on later without bumping the space. + */ +static int +vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size >= 0); + + if (size == 0) + return (0); + + if (size + vqp->vdq_cur > vqp->vdq_max) + return (0); + + vqp->vdq_cur += size; + return (1); +} + +static void +vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size > 0); + ASSERT(size <= vqp->vdq_cur); + + vqp->vdq_cur -= size; +} + +static void +vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) +{ + mblk_t *mp, *next; + + mutex_enter(&vqp->vdq_lock); + for (mp = vqp->vdq_head; mp != NULL; mp = next) { + next = mp->b_next; + mp->b_next = NULL; + dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); + } + vqp->vdq_cur = 0; + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + mutex_exit(&vqp->vdq_lock); +} + +static boolean_t +vnd_dq_is_empty(vnd_data_queue_t *vqp) +{ + boolean_t ret; + + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head == NULL) + ret = B_TRUE; + else + ret = B_FALSE; + mutex_exit(&vqp->vdq_lock); + + return (ret); +} + +/* + * Get a network uint16_t from the message and translate it into something the + * host understands. 
+ */ +static int +vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. + */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. + */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. + */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. 
+ */ + mblen = msgsize(*mpp); + if ((etype == ETHERTYPE_IP && + mblen - offset < IP_SIMPLE_HDR_LENGTH) || + (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { + ddrop(vsp, *mpp, "packet has invalid IP header"); + *mpp = NULL; + return (1); + } + + info.hpe_protocol = neti; + info.hpe_ifp = (phy_if_t)vsp; + info.hpe_ofp = (phy_if_t)vsp; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) + info.hpe_flags |= HPE_BROADCAST; + else if (etype == ETHERTYPE_IP && + bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + else if (etype == ETHERTYPE_IPV6 && + bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + + if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, + (uintptr_t *)&info.hpe_hdr) != 0) { + ddrop(vsp, *mpp, "packet too small -- " + "unable to find payload"); + *mpp = NULL; + return (1); + } + + if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { + hdrop(vsp, *mpp, "drooped by hooks"); + return (1); + } + + return (0); +} + +/* + * This should not be used for DL_INFO_REQ. + */ +static mblk_t * +vnd_dlpi_alloc(size_t len, t_uscalar_t prim) +{ + mblk_t *mp; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_PROTO; + mp->b_wptr = mp->b_rptr + len; + bzero(mp->b_rptr, len); + ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; + + return (mp); +} + +static void +vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) +{ + mblk_t **mpp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + ASSERT(mp->b_next == NULL); + mpp = &vsp->vns_dlpi_inc; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + *mpp = mp; +} + +static mblk_t * +vnd_dlpi_inc_pop(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vsp->vns_dlpi_inc; + if (mp != NULL) { + VERIFY(mp->b_next == NULL || mp->b_next != mp); + vsp->vns_dlpi_inc = mp->b_next; + mp->b_next = NULL; + } + return (mp); +} + +static int +vnd_st_sinfo(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_info_req_t *dlir; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + vsp->vns_state = VNS_S_INFO_SENT; + cv_broadcast(&vsp->vns_stcv); + + mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_info(vnd_str_t *vsp) +{ + dl_info_ack_t *dlia; + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + dlia = (dl_info_ack_t *)mp->b_rptr; + vsp->vns_dlpi_style = dlia->dl_provider_style; + vsp->vns_minwrite = dlia->dl_min_sdu; + vsp->vns_maxwrite = dlia->dl_max_sdu; + + /* + * At this time we only support DL_ETHER devices. + */ + if (dlia->dl_mac_type != DL_ETHER) { + freemsg(mp); + vsp->vns_errno = VND_E_NOTETHER; + return (1); + } + + /* + * Because vnd operates on entire packets, we need to manually account + * for the ethernet header information. We add the size of the + * ether_vlan_header to account for this, regardless if it is using + * vlans or not. 
+ */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. + */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = 
vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +/* ARGSUSED */ +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path, we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. + */ + ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); + mp->b_rptr -= mhip->mhi_hdrsize; + vid = VLAN_ID(mhip->mhi_tci); + if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { + /* + * This is an overlapping copy. Do not use bcopy(9F). 
+ */ + (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); + mp->b_rptr += 4; + } + + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, + nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, + nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) + continue; + + VND_STAT_INC(vsp, vks_rpackets, 1); + VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); + DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + mutex_enter(&vsp->vns_dq_read.vdq_lock); + signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, + vnd_drop_in); + mutex_exit(&vsp->vns_dq_read.vdq_lock); + } + + if (signal != 0) { + cv_broadcast(&vsp->vns_dq_read.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); + } + +} + +static void +vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) +{ + VND_STAT_INC(vsp, vks_nmacflow, 1); + VND_STAT_INC(vsp, vks_tmacflow, diff); + if (diff >= VND_LATENCY_1MS) + VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); + if (diff >= VND_LATENCY_10MS) + VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); + if (diff >= VND_LATENCY_100MS) + VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); + if (diff >= VND_LATENCY_1S) + VND_STAT_INC(vsp, vks_mac_flow_1s, 1); + if (diff >= VND_LATENCY_10S) + VND_STAT_INC(vsp, vks_mac_flow_10s, 1); +} + +/* + * This is a callback from MAC that indicates that we are allowed to send + * packets again. + */ +static void +vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) +{ + vnd_str_t *vsp = arg; + hrtime_t now; + + mutex_enter(&vsp->vns_lock); + now = gethrtime(); + + /* + * Check for the case that we beat vnd_squeue_tx_one to the punch. + * There's also an additional case here that we got notified because + * we're sharing a device that ran out of tx descriptors, even though it + * wasn't because of us. + */ + if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { + vsp->vns_fcupdate = now; + mutex_exit(&vsp->vns_lock); + return; + } + + ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); + ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); + vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = NULL; + vsp->vns_fclatch = 0; + DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, + vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); + /* + * If someone has asked to flush the squeue and thus inserted a barrier, + * than we shouldn't schedule a drain. 
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. + */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = 
(dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. + */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } +} + +static boolean_t +vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + boolean_t ret = B_TRUE; + mblk_t *mp; + dl_promiscoff_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "promiscoff request"); + ret = B_FALSE; + goto next; + } + + dprp = (dl_promiscoff_req_t *)mp->b_rptr; + dprp->dl_level = type; + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_promiscoff(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. 
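+ * Likewise, a DL_ERROR_ACK below is only logged; teardown has to keep
+ * making forward progress whether or not the request succeeded.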
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ return;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_promiscoff");
+ return;
+ }
+
+ if (cprim != DL_PROMISCOFF_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_promiscoff: Got ack/nack for wrong primitive");
+ return;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to disable promiscuous mode during "
+ "vnd teardown");
+ }
+}
+
+static boolean_t
+vnd_st_sunbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ boolean_t ret = B_TRUE;
+
+ mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
+ if (mp == NULL) {
+ cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+ "unbind request");
+ ret = B_FALSE;
+ goto next;
+ }
+
+ putnext(vsp->vns_wq, mp);
+next:
+ vsp->vns_state = VNS_S_UNBIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ return (ret);
+}
+
+static void
+vnd_st_unbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ /*
+ * Unlike other cases where we guard against the incoming packet being
+ * NULL, during tear down we try to keep driving and therefore we may
+ * have gotten here due to an earlier failure, so there's nothing to do.
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ goto next;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_unbind");
+ goto next;
+ }
+
+ if (cprim != DL_UNBIND_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_unbind: Got ack/nack for wrong primitive");
+ goto next;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to unbind stream during vnd "
+ "teardown");
+ }
+
+next:
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Perform state transitions. This is a one way shot down the flow chart
+ * described in the big theory statement.
+ */
+static void
+vnd_str_state_transition(void *arg)
+{
+ boolean_t died = B_FALSE;
+ vnd_str_t *vsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
+ vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ /*
+ * When trying to shut down, or unwinding from a failed enabling, rather
+ * than immediately entering the ZOMBIE state, we may instead opt to try
+ * and enter the next state in the progression. This is especially
+ * important when trying to tear everything down.
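+ * Concretely, the teardown path below walks VNS_S_SHUTTING_DOWN ->
+ * VNS_S_MULTICAST_PROMISCOFF_SENT -> VNS_S_SAP_PROMISCOFF_SENT ->
+ * VNS_S_UNBIND_SENT -> VNS_S_ZOMBIE, using the goto loop below to keep
+ * moving whenever a request could not even be allocated.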
+ */ +loop: + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, + VNS_S_FIXUP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_FIXUP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, + VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_MULTICAST_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_SAP_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_sunbind(vsp) == B_FALSE) + goto loop; + break; + case VNS_S_UNBIND_SENT: + vnd_st_unbind(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +/* ARGSUSED */ +static 
int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version 
= HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + 
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void +vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +/* ARGSUSED */ +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. + */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. 
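+ * In practice that means DLPI M_PROTO/M_PCPROTO messages get queued for
+ * the state machine, M_DATA is dropped (data should arrive through the
+ * direct callback rather than put(9E)), and everything else is passed
+ * along with putnext.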
+ */
+static int
+vnd_s_rput(queue_t *q, mblk_t *mp)
+{
+ t_uscalar_t prim;
+ int dispatch = 0;
+ vnd_str_t *vsp = q->q_ptr;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+ vnd_drop_ctl(vsp, mp, "PROTO message too short");
+ break;
+ }
+
+ prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
+ if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
+ vnd_drop_ctl(vsp, mp,
+ "received an unsupported dlpi DATA req");
+ break;
+ }
+
+ /*
+ * Enqueue the entry and fire off a taskq dispatch.
+ */
+ mutex_enter(&vsp->vns_lock);
+ vnd_dlpi_inc_push(vsp, mp);
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ dispatch = 1;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ }
+ mutex_exit(&vsp->vns_lock);
+ if (dispatch != 0)
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
+ vsp, 0, &vsp->vns_tqe);
+ break;
+ case M_DATA:
+ vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
+ break;
+ default:
+ putnext(vsp->vns_rq, mp);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
+{
+ int error;
+ vnd_strioc_t *visp;
+
+ if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
+ iocp->ioc_count != TRANSPARENT) {
+ error = EINVAL;
+ goto nak;
+ }
+
+ /*
+ * All streams ioctls that we support must use kcred as a means to
+ * distinguish that this is a layered open by the kernel as opposed to
+ * one by a user who has done an I_PUSH of the module.
+ */
+ if (iocp->ioc_cr != kcred) {
+ error = EPERM;
+ goto nak;
+ }
+
+ if (mp->b_cont == NULL) {
+ error = EAGAIN;
+ goto nak;
+ }
+
+ visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
+ ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
+ visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
+ visp->vs_state = VSS_COPYIN;
+
+ mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
+ qreply(q, mp);
+
+ return;
+
+nak:
+ if (mp->b_cont != NULL) {
+ freemsg(mp->b_cont);
+ mp->b_cont = NULL;
+ }
+
+ iocp->ioc_error = error;
+ mp->b_datap->db_type = M_IOCNAK;
+ iocp->ioc_count = 0;
+ qreply(q, mp);
+}
+
+static void
+vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ vnd_str_state_t state;
+ struct copyreq *crp;
+ vnd_strioc_associate_t *vss;
+ vnd_dev_t *vdp = NULL;
+ vnd_pnsd_t *nsp = NULL;
+ char iname[2*VND_NAMELEN];
+ zone_t *zone;
+ vnd_strioc_t *visp;
+
+ visp = (vnd_strioc_t *)csp->cp_private;
+
+ /* If it's not ours, it's not our problem */
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA");
+ }
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* Data is sitting for us in b_cont */
+ if (mp->b_cont == NULL ||
+ MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ miocnak(q, mp, 0, EINVAL);
+ return;
+ }
+
+ vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
+ vdp = vnd_dev_lookup(vss->vsa_minor);
+ if (vdp == NULL) {
+ vss->vsa_errno = VND_E_NODEV;
+ goto nak;
+ }
+
+ nsp = vnd_nsd_lookup(vss->vsa_nsid);
+ if (nsp == NULL) {
+ vss->vsa_errno = VND_E_NONETSTACK;
+ goto nak;
+ }
+
+ mutex_enter(&vsp->vns_lock);
+ if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
+ mutex_exit(&vsp->vns_lock);
+ vss->vsa_errno =
VND_E_ASSOCIATED;
+ goto nak;
+ }
+
+ vsp->vns_nsd = nsp;
+ vsp->vns_flags &= ~VNS_F_NEED_ZONE;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ mutex_exit(&vsp->vns_lock);
+
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0,
+ &vsp->vns_tqe);
+
+
+ /* At this point we need to wait until we have transitioned to ONLINE */
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE)
+ cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ state = vsp->vns_state;
+ mutex_exit(&vsp->vns_lock);
+
+ if (state == VNS_S_ZOMBIE) {
+ vss->vsa_errno = vsp->vns_errno;
+ goto nak;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vdp->vdd_str == NULL);
+ /*
+ * Now initialize the remaining kstat properties and let's go ahead and
+ * create it.
+ */
+ (void) snprintf(iname, sizeof (iname), "z%d_%d",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor);
+ vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net",
+ KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+ if (vsp->vns_kstat == NULL) {
+ vss->vsa_errno = VND_E_KSTATCREATE;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+ vdp->vdd_str = vsp;
+ vsp->vns_dev = vdp;
+
+ /*
+ * Now, it's time to do the last thing that can fail, changing out the
+ * input function. After this we know that we can receive data, so we
+ * should make sure that we're ready.
+ */
+ if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) {
+ vss->vsa_errno = VND_E_DIRECTFAIL;
+ vdp->vdd_str = NULL;
+ vsp->vns_dev = NULL;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+
+ zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid);
+ ASSERT(zone != NULL);
+ vsp->vns_kstat->ks_data = &vsp->vns_ksdata;
+ /* Account for zone name */
+ vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1;
+ /* Account for eventual link name */
+ vsp->vns_kstat->ks_data_size += VND_NAMELEN;
+ kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name);
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ zone_rele(zone);
+ kstat_install(vsp->vns_kstat);
+
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * Note that the vnd_str_t does not keep a permanent hold on the
+ * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what
+ * the netstack goes through to take care of everything.
+ */
+ vss->vsa_errno = VND_E_SUCCESS;
+nak:
+ if (vdp != NULL)
+ vnd_dev_rele(vdp);
+ if (nsp != NULL)
+ vnd_nsd_rele(nsp);
+ /*
+ * Change the copyin request to a copyout. Note that we can't use
+ * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's
+ * okay, as the copyin vs. copyout is basically the same.
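+ * (M_COPYIN and M_COPYOUT both carry a struct copyreq, so filling in
+ * cq_private, cq_addr, and cq_size by hand works for either direction.)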
+ */
+ DB_TYPE(mp) = M_COPYOUT;
+ visp->vs_state = VSS_COPYOUT;
+ crp = (struct copyreq *)mp->b_rptr;
+ crp->cq_private = (void *)visp;
+ crp->cq_addr = visp->vs_addr;
+ crp->cq_size = sizeof (vnd_strioc_associate_t);
+ qreply(q, mp);
+}
+
+static void
+vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ ASSERT(csp->cp_private != NULL);
+ kmem_free(csp->cp_private, sizeof (vnd_strioc_t));
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA");
+ }
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYOUT failed");
+ return;
+ }
+
+ /* Ack and let's be done with it all */
+ miocack(q, mp, 0, 0);
+}
+
+static int
+vnd_s_wput(queue_t *q, mblk_t *mp)
+{
+ vnd_str_t *vsp = q->q_ptr;
+ struct copyresp *crp;
+ vnd_strioc_state_t vstate;
+ vnd_strioc_t *visp;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr);
+ return (0);
+ case M_IOCDATA:
+ crp = (struct copyresp *)mp->b_rptr;
+ ASSERT(crp->cp_private != NULL);
+ visp = (vnd_strioc_t *)crp->cp_private;
+ vstate = visp->vs_state;
+ ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT);
+ if (vstate == VSS_COPYIN)
+ vnd_striocdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ else
+ vnd_stroutdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ return (0);
+ default:
+ break;
+ }
+ if (q->q_next != NULL)
+ putnext(q, mp);
+ else
+ vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput");
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp)
+{
+ vnd_str_t *vsp;
+ uint_t rand;
+
+ if (q->q_ptr != NULL)
+ return (EINVAL);
+
+ if (!(sflag & MODOPEN))
+ return (ENXIO);
+
+ if (credp != kcred)
+ return (EPERM);
+
+ vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP);
+ bzero(vsp, sizeof (*vsp));
+ mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL);
+ vsp->vns_state = VNS_S_INITIAL;
+
+ mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_enter(&vnd_dev_lock);
+ vsp->vns_dq_read.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_read.vdq_vns = vsp;
+ vsp->vns_dq_write.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_write.vdq_vns = vsp;
+ mutex_exit(&vnd_dev_lock);
+ vsp->vns_rq = q;
+ vsp->vns_wq = WR(q);
+ q->q_ptr = WR(q)->q_ptr = vsp;
+ vsp->vns_flags = VNS_F_NEED_ZONE;
+ vsp->vns_nflush = vnd_flush_nburst;
+ vsp->vns_bsize = vnd_flush_burst_size;
+
+ (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
+ vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand);
+
+ /*
+ * We create our kstat and initialize all of its fields now, but we
+ * don't install it until we actually do the zone association so we can
+ * get everything.
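+ * (kstat_create_zone() and kstat_install() happen later, in
+ * vnd_striocdata(), once the zone association has actually been made.)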
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. + */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. 
+ * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
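+ * Put differently: if vns_fcupdate is already newer than txtime, the
+ * resume notification has fired and we only record the flow control
+ * statistics; otherwise we latch VNS_F_FLOW_CONTROLLED and rely on a
+ * later vnd_mac_flow_control() call to clear it.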
+ */ + if (vc != NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return (NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return (NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. + */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + ht_begin_unsafe(); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != NULL) { + blocked = B_TRUE; + break; + } + } + + ht_end_unsafe(); + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. + */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. 
Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. + */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. + */ +static int +vnd_validate_name(const char *buf, size_t buflen) +{ + int i, len; + + /* First make sure a null terminator exists */ + for (i = 0; i < buflen; i++) + if (buf[i] == '\0') + break; + len = i; + if (i == 0 || i == buflen) + return (0); + + for (i = 0; i < len; i++) + if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && + buf[i] != '_') + return (0); + + return (1); +} + +static int +vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) +{ + vnd_ioc_attach_t via; + vnd_strioc_associate_t vss; + vnd_pnsd_t *nsp; + zone_t *zonep; + zoneid_t zid; + char buf[2*VND_NAMELEN]; + int ret, rp; + + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) + return (EFAULT); + via.via_errno = VND_E_SUCCESS; + + if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + + /* + * Only the global zone can request to create a device in a different + * zone. + */ + zid = crgetzoneid(credp); + if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && + zid != via.via_zoneid) { + via.via_errno = VND_E_PERM; + ret = EIO; + goto errcopyout; + } + + if (via.via_zoneid == -1) + via.via_zoneid = zid; + + /* + * Establish the name we'll use now. We want to be extra paranoid about + * the device we're opening so check that now. + */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. 
+ */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. + */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. + */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. + */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. 
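+ * (The first corresponds to the hold returned by vnd_nsd_lookup_by_zid()
+ * above, the second to the explicit vnd_nsd_ref().)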
+ */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakingly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. + */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + ret = EFAULT; + return (ret); +} + +/* + * Common unlink function. 
This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. + */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, 
(void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +/* ARGSUSED */ +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + 
frameio_free(fio); + return (EWOULDBLOCK); + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + } + + ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, + &nwrite, mode & FKIOCTL); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + while (nwrite > 0) { + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); + nwrite--; + } + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + + return (0); +} + +static int +vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + frameio_t *fio; + int ret, nonblock, nframes, i, nread; + size_t maxwrite, minwrite, total, flen; + mblk_t *mp_chain, *mp, *nmp; + vnd_data_queue_t *vqp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + /* + * Make sure no single frame is larger than we can accept. + */ + mutex_enter(&vdp->vdd_str->vns_lock); + minwrite = vdp->vdd_str->vns_minwrite; + maxwrite = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + + nframes = fio->fio_nvpf / fio->fio_nvecs; + total = 0; + for (i = 0; i < nframes; i++) { + flen = frameio_frame_length(fio, + &fio->fio_vecs[i*fio->fio_nvpf]); + if (flen < minwrite || flen > maxwrite) { + frameio_free(fio); + return (ERANGE); + } + total += flen; + } + + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, total) == 0) { + if (nonblock != 0) { + frameio_free(fio); + mutex_exit(&vqp->vdq_lock); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * We've reserved our space, let's copyin and go from here. + */ + ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL); + if (ret != 0) { + frameio_free(fio); + vnd_dq_unreserve(vqp, total); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + return (ret); + } + + for (mp = mp_chain; mp != NULL; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + } + + /* + * Update the frameio structure to indicate that we wrote those frames. + */ + frameio_mark_consumed(fio, nread); + ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode); + frameio_free(fio); + + return (ret); +} + +static int +vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode) +{ + const char *link; + uint32_t vers = 1; + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + /* + * Copy all of the members out to userland. 
+ */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +/* ARGSUSED */ +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. 
+ */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up new a state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. */ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio. 
+ */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +/* ARGSUSED */ +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + short ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if ((ready == 0 && !anyyet) || (events & POLLET)) { + *phpp = &vdp->vdd_ph; + } + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); +} + +/* ARGSUSED */ +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. 
Note that as + * part of unlinking we end up doing a rele of the vnd_dev_t. If + * this is the final hold on the vnd_dev_t then it might try and + * remove itself. Our locking rules requires not to be holding + * any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + * it with the vnd_dev_t->vdd_lock held and will take care of it + * for us. Because we don't have a hold on it, we're done at + * this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +/* ARGSUSED */ +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename and + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack. 
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + vnd_dev_t *vdp; + minor_t minor; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + if (sdev_ctx_minor(ctx, &minor) != 0) + return (SDEV_VTOR_STALE); + + vdp = vnd_dev_lookup(minor); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +/* ARGSUSED */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, + VND_SDEV_MODE, vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. 
+ */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. + */ + ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +/* ARGSUSED */ +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); +} + +static int +vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + 
netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + &vnd_minfo, + NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. + */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index d671153967..e532a551e7 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -354,7 +354,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, rw_enter(&vnic_lock, RW_WRITER); - /* does a VNIC with the same id already exist? */ + /* Does a VNIC with the same id already exist? */ err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id), (mod_hash_val_t *)&vnic); if (err == 0) { @@ -370,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, bzero(vnic, sizeof (*vnic)); + vnic->vn_ls = LINK_STATE_UNKNOWN; vnic->vn_id = vnic_id; vnic->vn_link_id = linkid; vnic->vn_vrid = vrid; @@ -455,6 +456,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, } else { vnic->vn_hcksum_txflags = 0; } + + /* + * Check for LSO capabilities. LSO implementations + * depend on hardware checksumming, so the same + * requirement is enforced here. + */ + if (vnic->vn_hcksum_txflags != 0) { + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO, + &vnic->vn_cap_lso)) { + vnic->vn_cap_lso.lso_flags = 0; + } + } else { + vnic->vn_cap_lso.lso_flags = 0; + } } /* register with the MAC module */ @@ -580,11 +595,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, vnic->vn_enabled = B_TRUE; if (is_anchor) { - mac_link_update(vnic->vn_mh, LINK_STATE_UP); + vnic->vn_ls = LINK_STATE_UP; } else { - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); } + mac_link_update(vnic->vn_mh, vnic->vn_ls); rw_exit(&vnic_lock); @@ -824,6 +840,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_PARTIAL); break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (vnic->vn_cap_lso.lso_flags == 0) { + return (B_FALSE); + } + *cap_lso = vnic->vn_cap_lso; + break; + } case MAC_CAPAB_VNIC: { mac_capab_vnic_t *vnic_capab = cap_data; @@ -1092,6 +1117,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = vnic_set_secondary_macs(vn, &msa); break; } + case MAC_PROP_PRIVATE: { + long val, i; + const char *v; + + if (vn->vn_link_id != DATALINK_INVALID_LINKID || + strcmp(pr_name, "_linkstate") != 0) { + err = ENOTSUP; + break; + } + + for (v = pr_val, i = 0; i < pr_valsize; i++, v++) { + if (*v == '\0') + break; + } + if (i == pr_valsize) { + err = EINVAL; + break; + } + + (void) ddi_strtol(pr_val, (char **)NULL, 0, &val); + if (val != LINK_STATE_UP && val != LINK_STATE_DOWN) { + err = EINVAL; + break; + } + vn->vn_ls = val; + mac_link_update(vn->vn_mh, vn->vn_ls); + break; + } default: err = ENOTSUP; break; @@ -1117,6 +1170,18 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_SECONDARY_ADDRS: ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val); break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + ret = EINVAL; + break; + } + + if (strcmp(pr_name, "_linkstate") != 0) { + ret = EINVAL; + break; + } + (void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls); + break; default: ret = ENOTSUP; break; @@ -1126,7 +1191,8 @@ vnic_m_getprop(void 
*arg, const char *pr_name, mac_prop_id_t pr_num, } /* ARGSUSED */ -static void vnic_m_propinfo(void *m_driver, const char *pr_name, +static void +vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { vnic_t *vn = m_driver; @@ -1169,6 +1235,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_perim_exit(mph); } break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) + break; + + if (strcmp(pr_name, "_linkstate") == 0) { + char buf[16]; + + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + (void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls); + mac_prop_info_set_default_str(prh, buf); + } + break; } } @@ -1241,8 +1319,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type) break; case MAC_NOTE_LINK: - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); + mac_link_update(vnic->vn_mh, vnic->vn_ls); break; default: diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..2da310ab8d --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,1154 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. + * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. + * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + * + * The driver can also act as a multiplexer so that data written to the + * slave side within the zone is also redirected back to another zfd device + * inside the zone for consumption (i.e. it can be read). The intention is + * that a logging process within the zone can consume data that is being + * written by an application onto the primary stream. This is essentially + * a tee off of the primary stream into a log stream. This tee can also be + * configured to be flow controlled via an ioctl. 
Flow control happens on the + * primary stream and is used to ensure that the log stream receives all of + * the messages off the primary stream when consumption of the data off of + * the log stream gets behind. Configuring for flow control implies that the + * application writing to the primary stream will be blocked when the log + * consumer gets behind. Note that closing the log stream (e.g. when the zone + * halts) will cause the loss of all messages queued in the stream. + * + * The zone's zfd device configuration is driven by zoneadmd and a zone mode. + * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat + * of a misnomer since its purpose has evolved. The attribute can have a + * variety of values, but the lowest two positions are used to control how many + * zfd devices are created inside the zone and if the primary stream is a tty. + * + * Here is a summary of how the 4 modes control what zfd devices are created + * and how they're used: + * + * t-: 1 stdio zdev (0) configured as a tty + * --: 3 stdio zdevs (0, 1, 2), not configured as a tty + * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1) + * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4) + * + * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex + * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are + * autopushed onto the stream when the slave side is opened. There is only a + * single zfd dev (0) needed for the primary stream. + * + * When the 'n' flag is set, it is assumed that output logging will be done + * within the zone itself. In this configuration 1 or 2 additional zfd devices, + * depending on tty mode ('t' flag) are created within the zone. An application + * can then configure the zfd streams driver into a multiplexer. Output from + * the stdout/stderr zfd(s) will be teed into the correspond logging zfd(s) + * within the zone. + * + * The following is a diagram of how this works for a '-n' configuration: + * + * + * zoneadmd (for zlogin -I stdout) + * GZ: ^ + * | + * -------------------------- + * ^ + * NGZ: | + * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout) + * + * There would be a similar path for the app's stderr into zfd4 for the logger + * to consume stderr. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> +#include <sys/sdt.h> + +static kmutex_t zfd_mux_lock; + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static void zfd_wput(queue_t *, mblk_t *); +static void zfd_rsrv(queue_t *); +static void zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. 
The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. + */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. + */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + (int (*)()) zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + (int (*)()) zfd_wput, + (int (*)()) zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. + */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef enum { + ZFD_NO_MUX, + ZFD_PRIMARY_STREAM, + ZFD_LOG_STREAM +} zfd_mux_type_t; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; /* instance info */ + queue_t *zfd_master_rdq; /* GZ read queue */ + queue_t *zfd_slave_rdq; /* in-zone read queue */ + int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */ + int zfd_tty; /* ZFD_MAKETTY - strm mods will push */ + boolean_t zfd_is_flowcon; /* primary stream flow stopped */ + boolean_t zfd_allow_flowcon; /* use flow control */ + zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */ + struct zfd_state *zfd_inst_pri; /* log state's primary ptr */ + struct zfd_state *zfd_inst_log; /* primary state's log ptr */ +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of STREAMS modules that are autopushed onto a slave instance when its + * opened, but only if the ZFD_MAKETTY ioctl has first been received by the + * master. 
+ */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL); + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + mutex_destroy(&zfd_mux_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. + */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + zfds->zfd_muxt = ZFD_NO_MUX; + zfds->zfd_inst_log = NULL; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. + * e.g.: given the slave's write queue, return the master's write queue. 
+ */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + /* A log stream is read-only */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + (oflag & (FREAD | FWRITE)) != FREAD) + return (EINVAL); + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. 
A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. + */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. + */ +static int +zfd_open(queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + int instance = ZFD_INSTANCE(*devp); + int ret; + zfd_state_t *zfds; + + if (sflag != 0) + return (EINVAL); + + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (ENXIO); + + switch (ZFD_NODE(*devp)) { + case ZFD_MASTER_MINOR: + ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp); + break; + case ZFD_SLAVE_MINOR: + ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp); + /* + * If we just opened the log stream and flow control has + * been enabled, we want to make sure the primary stream can + * start flowing. + */ + if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + break; + default: + ret = ENXIO; + break; + } + + return (ret); +} + +/* + * close(9e) entrypoint. 
+ */ +/*ARGSUSED1*/ +static int +zfd_close(queue_t *rqp, int flag, cred_t *credp) +{ + queue_t *wqp; + mblk_t *bp; + zfd_state_t *zfds; + major_t major; + minor_t minor; + + zfds = (zfd_state_t *)rqp->q_ptr; + + if (rqp == zfds->zfd_master_rdq) { + DBG("Closing master side"); + + zfds->zfd_master_rdq = NULL; + zfds->zfd_state &= ~ZFD_STATE_MOPEN; + + /* + * qenable slave side write queue so that it can flush + * its messages as master's read queue is going away + */ + if (zfds->zfd_slave_rdq != NULL) { + qenable(WR(zfds->zfd_slave_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + } else if (rqp == zfds->zfd_slave_rdq) { + + DBG("Closing slave side"); + zfds->zfd_state &= ~ZFD_STATE_SOPEN; + zfds->zfd_slave_rdq = NULL; + + wqp = WR(rqp); + while ((bp = getq(wqp)) != NULL) { + if (zfds->zfd_master_rdq != NULL) + putnext(zfds->zfd_master_rdq, bp); + else if (bp->b_datap->db_type == M_IOCTL) + miocnak(wqp, bp, 0, 0); + else + freemsg(bp); + } + + /* + * Qenable master side write queue so that it can flush its + * messages as slaves's read queue is going away. + */ + if (zfds->zfd_master_rdq != NULL) + qenable(WR(zfds->zfd_master_rdq)); + + /* + * Qenable primary stream if necessary. + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + if (zfds->zfd_tty == 1) { + /* + * Clear the sad configuration so that reopening + * doesn't fail to set up sad configuration. + */ + major = ddi_driver_major(zfds->zfd_devinfo); + minor = ddi_get_instance(zfds->zfd_devinfo) << 1 | + ZFD_SLAVE_MINOR; + (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor, + NULL, NULL, NULL); + } + } + + return (0); +} + +static void +handle_mflush(queue_t *qp, mblk_t *mp) +{ + mblk_t *nmp; + DBG1("M_FLUSH on %s side", zfd_side(qp)); + + if (*mp->b_rptr & FLUSHW) { + DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp)); + flushq(qp, FLUSHDATA); + *mp->b_rptr &= ~FLUSHW; + if ((*mp->b_rptr & FLUSHR) == 0) { + /* + * FLUSHW only. Change to FLUSHR and putnext other side, + * then we are done. + */ + *mp->b_rptr |= FLUSHR; + if (zfd_switch(RD(qp)) != NULL) { + putnext(zfd_switch(RD(qp)), mp); + return; + } + } else if ((zfd_switch(RD(qp)) != NULL) && + (nmp = copyb(mp)) != NULL) { + /* + * It is a FLUSHRW; we copy the mblk and send + * it to the other side, since we still need to use + * the mblk in FLUSHR processing, below. + */ + putnext(zfd_switch(RD(qp)), nmp); + } + } + + if (*mp->b_rptr & FLUSHR) { + DBG("qreply(qp) turning FLUSHR around\n"); + qreply(qp, mp); + return; + } + freemsg(mp); +} + +/* + * Evaluate the various conditionals to determine if we're teeing into a log + * stream and if the primary stream should be flow controlled. This function + * can set the zfd_is_flowcon flag as a side effect. + * + * When teeing with flow control, we always queue the teed msg here and if + * the queue is getting full, we set zfd_is_flowcon. The primary stream will + * always queue when zfd_is_flowcon and will also not be served when + * zfd_is_flowcon is set. This causes backpressure on the primary stream + * until the teed queue can drain. 
+ */ +static void +zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp) +{ + queue_t *log_qp; + zfd_state_t *log_zfds; + mblk_t *lmp; + + if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM) + return; + + if (type != M_DATA) + return; + + log_zfds = zfds->zfd_inst_log; + if (log_zfds == NULL) + return; + + ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM); + + if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + /* The zfd_slave_rdq is null until the log dev is opened in the zone */ + log_qp = RD(log_zfds->zfd_slave_rdq); + DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds); + + if (!zfds->zfd_allow_flowcon) { + /* + * We're not supposed to tee with flow control and the tee is + * full so we skip teeing into the log stream. + */ + if ((log_qp->q_flag & QFULL) != 0) + return; + } + + /* + * Tee the message into the log stream. + */ + lmp = dupmsg(mp); + if (lmp == NULL) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) { + putnext(log_qp, lmp); + } else { + if (putq(log_qp, lmp) == 0) { + /* The logger queue is full, free the msg. */ + freemsg(lmp); + } + /* + * If we're supposed to tee with flow control and the tee is + * over the high water mark then we want the primary stream to + * stop flowing. We'll stop queueing the primary stream after + * the log stream has drained. + */ + if (zfds->zfd_allow_flowcon && + log_qp->q_count > log_qp->q_hiwat) { + zfds->zfd_is_flowcon = B_TRUE; + } + } +} + +/* + * wput(9E) is symmetric for master and slave sides, so this handles both + * without splitting the codepath. (The only exception to this is the + * processing of zfd ioctls, which is restricted to the master side.) + * + * zfd_wput() looks at the other side; if there is no process holding that + * side open, it frees the message. This prevents processes from hanging + * if no one is holding open the fd. Otherwise, it putnext's high + * priority messages, putnext's normal messages if possible, and otherwise + * enqueues the messages; in the case that something is enqueued, wsrv(9E) + * will take care of eventually shuttling I/O to the other side. + * + * When configured as a multiplexer, then anything written to the stream + * from inside the zone is also teed off to the corresponding log stream + * for consumption within the zone (i.e. the log stream can be read, but never + * written to, by an application inside the zone). + */ +static void +zfd_wput(queue_t *qp, mblk_t *mp) +{ + unsigned char type = mp->b_datap->db_type; + zfd_state_t *zfds; + struct iocblk *iocbp; + boolean_t must_queue = B_FALSE; + + ASSERT(qp->q_ptr); + + DBG1("entering zfd_wput, %s side", zfd_side(qp)); + + /* + * Process zfd ioctl messages if qp is the master side's write queue. + */ + zfds = (zfd_state_t *)qp->q_ptr; + + if (type == M_IOCTL) { + iocbp = (struct iocblk *)(void *)mp->b_rptr; + + switch (iocbp->ioc_cmd) { + case ZFD_MAKETTY: + zfds->zfd_tty = 1; + miocack(qp, mp, 0, 0); + return; + case ZFD_EOF: + if (zfds->zfd_slave_rdq != NULL) + (void) putnextctl(zfds->zfd_slave_rdq, + M_HANGUP); + miocack(qp, mp, 0, 0); + return; + case ZFD_HAS_SLAVE: + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + miocack(qp, mp, 0, 0); + } else { + miocack(qp, mp, 0, ENOTTY); + } + return; + case ZFD_MUX: { + /* + * Setup the multiplexer configuration for the two + * streams. 
+ * + * We expect to be called on the stream that will + * become the log stream and be passed one data block + * with the minor number of the slave side of the + * primary stream. + */ + int to; + int instance; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Get the primary slave minor device number */ + to = *(int *)mp->b_cont->b_rptr; + instance = ZFD_INSTANCE(to); + + if ((prim_zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Disallow changing primary/log once set. */ + mutex_enter(&zfd_mux_lock); + if (zfds->zfd_muxt != ZFD_NO_MUX || + prim_zfds->zfd_muxt != ZFD_NO_MUX) { + mutex_exit(&zfd_mux_lock); + miocack(qp, mp, 0, EINVAL); + return; + } + + zfds->zfd_muxt = ZFD_LOG_STREAM; + zfds->zfd_inst_pri = prim_zfds; + prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM; + prim_zfds->zfd_inst_log = zfds; + mutex_exit(&zfd_mux_lock); + DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds, + void *, zfds); + + miocack(qp, mp, 0, 0); + return; + } + case ZFD_MUX_FLOWCON: { + /* + * We expect this ioctl to be issued against the + * log stream. We don't use the primary stream since + * there can be other streams modules pushed onto that + * stream which would interfere with the ioctl. + */ + int val; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + if (zfds->zfd_muxt != ZFD_LOG_STREAM) { + miocack(qp, mp, 0, EINVAL); + return; + } + prim_zfds = zfds->zfd_inst_pri; + + /* Get the flow control setting */ + val = *(int *)mp->b_cont->b_rptr; + if (val != 0 && val != 1) { + miocack(qp, mp, 0, EINVAL); + return; + } + + prim_zfds->zfd_allow_flowcon = (boolean_t)val; + if (!prim_zfds->zfd_allow_flowcon) + prim_zfds->zfd_is_flowcon = B_FALSE; + + DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds); + miocack(qp, mp, 0, 0); + return; + } + default: + break; + } + } + + /* if on the write side, may need to tee */ + if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) { + /* tee output to any attached log stream */ + zfd_tee_handler(zfds, type, mp); + + /* high-priority msgs are not subject to flow control */ + if (zfds->zfd_is_flowcon && type == M_DATA) + must_queue = B_TRUE; + } + + if (zfd_switch(RD(qp)) == NULL) { + DBG1("wput to %s side (no one listening)", zfd_side(qp)); + switch (type) { + case M_FLUSH: + handle_mflush(qp, mp); + break; + case M_IOCTL: + miocnak(qp, mp, 0, 0); + break; + default: + freemsg(mp); + break; + } + return; + } + + if (type >= QPCTL) { + DBG1("(hipri) wput, %s side", zfd_side(qp)); + switch (type) { + case M_READ: /* supposedly from ldterm? */ + DBG("zfd_wput: tossing M_READ\n"); + freemsg(mp); + break; + case M_FLUSH: + handle_mflush(qp, mp); + break; + default: + /* + * Put this to the other side. + */ + ASSERT(zfd_switch(RD(qp)) != NULL); + putnext(zfd_switch(RD(qp)), mp); + break; + } + DBG1("done (hipri) wput, %s side", zfd_side(qp)); + return; + } + + /* + * If the primary stream has been stopped for flow control then + * enqueue the msg, otherwise only putnext if there isn't already + * something in the queue. If we don't do this then things would wind + * up out of order. + */ + if (!must_queue && qp->q_first == NULL && + bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + putnext(RD(zfd_switch(qp)), mp); + } else { + /* + * zfd_wsrv expects msgs queued on the primary queue. 
Those + * will be handled by zfd_wsrv after zfd_rsrv performs the + * qenable on the proper queue. + */ + (void) putq(qp, mp); + } + + DBG1("done wput, %s side", zfd_side(qp)); +} + +/* + * Read server + * + * For primary stream: + * Under normal execution rsrv(9E) is symmetric for master and slave, so + * zfd_rsrv() can handle both without splitting up the codepath. We do this by + * enabling the write side of the partner. This triggers the partner to send + * messages queued on its write side to this queue's read side. + * + * For log stream: + * Internally we've queued up the msgs that we've teed off to the log stream + * so when we're invoked we need to pass these along. + */ +static void +zfd_rsrv(queue_t *qp) +{ + zfd_state_t *zfds; + zfds = (zfd_state_t *)qp->q_ptr; + + /* + * log stream server + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) { + queue_t *log_qp; + mblk_t *mp; + + log_qp = RD(zfds->zfd_slave_rdq); + + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + zfd_state_t *pzfds = zfds->zfd_inst_pri; + + while ((mp = getq(qp)) != NULL) { + if (bcanputnext(log_qp, mp->b_band)) { + putnext(log_qp, mp); + } else { + (void) putbq(log_qp, mp); + break; + } + } + + if (log_qp->q_count < log_qp->q_lowat) { + DTRACE_PROBE(zfd__flow__on); + pzfds->zfd_is_flowcon = B_FALSE; + if (pzfds->zfd_master_rdq != NULL) + qenable(RD(pzfds->zfd_master_rdq)); + } + } else { + /* No longer open, drain the queue */ + while ((mp = getq(qp)) != NULL) { + freemsg(mp); + } + flushq(qp, FLUSHALL); + } + return; + } + + /* + * Care must be taken here, as either of the master or slave side + * qptr could be NULL. + */ + ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq); + if (zfd_switch(qp) == NULL) { + DBG("zfd_rsrv: other side isn't listening\n"); + return; + } + qenable(WR(zfd_switch(qp))); +} + +/* + * Write server + * + * This routine is symmetric for master and slave, so it handles both without + * splitting up the codepath. + * + * If there are messages on this queue that can be sent to the other, send + * them via putnext(). Else, if queued messages cannot be sent, leave them + * on this queue. + */ +static void +zfd_wsrv(queue_t *qp) +{ + queue_t *swq; + mblk_t *mp; + zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr; + + ASSERT(zfds != NULL); + + /* + * Partner has no read queue, so take the data, and throw it away. + */ + if (zfd_switch(RD(qp)) == NULL) { + DBG("zfd_wsrv: other side isn't listening"); + while ((mp = getq(qp)) != NULL) { + if (mp->b_datap->db_type == M_IOCTL) + miocnak(qp, mp, 0, 0); + else + freemsg(mp); + } + flushq(qp, FLUSHALL); + return; + } + + swq = RD(zfd_switch(qp)); + + /* + * while there are messages on this write queue... + */ + while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) { + /* + * Due to the way zfd_wput is implemented, we should never + * see a high priority control message here. + */ + ASSERT(mp->b_datap->db_type < QPCTL); + + if (bcanputnext(swq, mp->b_band)) { + putnext(swq, mp); + } else { + (void) putbq(qp, mp); + break; + } + } +} diff --git a/usr/src/uts/common/klm/klmmod.c b/usr/src/uts/common/klm/klmmod.c index 51ed43e198..58e0f2d874 100644 --- a/usr/src/uts/common/klm/klmmod.c +++ b/usr/src/uts/common/klm/klmmod.c @@ -12,6 +12,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* @@ -278,6 +279,10 @@ lm_svc(struct lm_svc_args *args) rfs4_lease_time = args->grace; } + if (args->n_v4_only == -1) { + g->nlm_v4_only = B_TRUE; + } + mutex_exit(&g->lock); err = nlm_svc_starting(g, fp, netid, &knc); mutex_enter(&g->lock); diff --git a/usr/src/uts/common/klm/mapfile-mod b/usr/src/uts/common/klm/mapfile-mod index 0debe6d986..b7789d81fd 100644 --- a/usr/src/uts/common/klm/mapfile-mod +++ b/usr/src/uts/common/klm/mapfile-mod @@ -11,6 +11,7 @@ # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# Copyright 2017 Joyent, Inc. # @@ -49,6 +50,11 @@ SYMBOL_SCOPE { nlm_frlock; nlm_register_lock_locally; nlm_shrlock; +# These four functions are available for use within a branded zone. + nlm_nsm_clnt_init; + nlm_netbuf_to_netobj; + sm_mon_1; + sm_unmon_1; local: *; diff --git a/usr/src/uts/common/klm/nlm_dispatch.c b/usr/src/uts/common/klm/nlm_dispatch.c index a0ca2a56c4..8fa9940eae 100644 --- a/usr/src/uts/common/klm/nlm_dispatch.c +++ b/usr/src/uts/common/klm/nlm_dispatch.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* @@ -412,13 +413,13 @@ nlm_prog_3_dtable[] = { 0, 0 }, - { /* 16: not used */ - NLM_SVC_FUNC(0), - (xdrproc_t)0, - (xdrproc_t)0, + { /* 16: Linux NLMPROC_NSM_NOTIFY (same handling as NLM_SM_NOTIFY1) */ + NLM_SVC_FUNC(nlm_sm_notify1_2_svc), + (xdrproc_t)xdr_nlm_sm_status, + (xdrproc_t)xdr_void, NULL, 0, - 0 }, + NLM_DISP_NOREMOTE }, { /* 17: NLM_SM_NOTIFY1 */ NLM_SVC_FUNC(nlm_sm_notify1_2_svc), diff --git a/usr/src/uts/common/klm/nlm_impl.c b/usr/src/uts/common/klm/nlm_impl.c index 1e9033a17c..e787f70ebd 100644 --- a/usr/src/uts/common/klm/nlm_impl.c +++ b/usr/src/uts/common/klm/nlm_impl.c @@ -28,6 +28,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* @@ -57,6 +58,7 @@ #include <sys/queue.h> #include <sys/bitmap.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <netinet/in.h> #include <rpc/rpc.h> @@ -202,6 +204,12 @@ static struct nlm_knc nlm_netconfigs[] = { /* (g) */ }; /* + * NLM functions which can be called by a brand hook. + */ +void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); +void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); + +/* * NLM misc. 
function */ static void nlm_copy_netbuf(struct netbuf *, struct netbuf *); @@ -210,8 +218,6 @@ static void nlm_kmem_reclaim(void *); static void nlm_pool_shutdown(void); static void nlm_suspend_zone(struct nlm_globals *); static void nlm_resume_zone(struct nlm_globals *); -static void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); -static void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); /* * NLM thread functions @@ -1839,6 +1845,12 @@ nlm_host_unmonitor(struct nlm_globals *g, struct nlm_host *host) return; host->nh_flags &= ~NLM_NH_MONITORED; + + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) { + ZBROP(curzone)->b_rpc_statd(SM_UNMON, g, host); + return; + } + stat = nlm_nsm_unmon(&g->nlm_nsm, host->nh_name); if (stat != RPC_SUCCESS) { NLM_WARN("NLM: Failed to contact statd, stat=%d\n", stat); @@ -1877,6 +1889,11 @@ nlm_host_monitor(struct nlm_globals *g, struct nlm_host *host, int state) host->nh_flags |= NLM_NH_MONITORED; mutex_exit(&host->nh_lock); + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) { + ZBROP(curzone)->b_rpc_statd(SM_MON, g, host); + return; + } + /* * Before we begin monitoring the host register the network address * associated with this hostname. @@ -2353,6 +2370,13 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp, VERIFY(g->run_status == NLM_ST_STARTING); VERIFY(g->nlm_gc_thread == NULL); + if (g->nlm_v4_only) { + NLM_WARN("Zone %d has no rpcbind, NLM is v4 only", getzoneid()); + bzero(&g->nlm_nsm, sizeof (struct nlm_nsm)); + g->nlm_nsm.ns_addr_handle = (void *)-1; + goto v4_only; + } + error = nlm_nsm_init_local(&g->nlm_nsm); if (error != 0) { NLM_ERR("Failed to initialize NSM handler " @@ -2389,6 +2413,7 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp, "(rpcerr=%d)\n", stat); goto shutdown_lm; } +v4_only: g->grace_threshold = ddi_get_lbolt() + SEC_TO_TICK(g->grace_period); @@ -2512,7 +2537,9 @@ nlm_svc_stopping(struct nlm_globals *g) ASSERT(TAILQ_EMPTY(&g->nlm_slocks)); - nlm_nsm_fini(&g->nlm_nsm); + /* If started with rpcbind (the normal case) */ + if (g->nlm_nsm.ns_addr_handle != (void *)-1) + nlm_nsm_fini(&g->nlm_nsm); g->lockd_pid = 0; g->run_status = NLM_ST_DOWN; } @@ -2781,14 +2808,14 @@ nlm_cprresume(void) rw_exit(&lm_lck); } -static void +void nlm_nsm_clnt_init(CLIENT *clnt, struct nlm_nsm *nsm) { (void) clnt_tli_kinit(clnt, &nsm->ns_knc, &nsm->ns_addr, 0, NLM_RPC_RETRIES, kcred); } -static void +void nlm_netbuf_to_netobj(struct netbuf *addr, int *family, netobj *obj) { /* LINTED pointer alignment */ diff --git a/usr/src/uts/common/klm/nlm_impl.h b/usr/src/uts/common/klm/nlm_impl.h index 6b2df7f8b0..2ac711f3c7 100644 --- a/usr/src/uts/common/klm/nlm_impl.h +++ b/usr/src/uts/common/klm/nlm_impl.h @@ -30,6 +30,7 @@ /* * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. 
 */
 /*
@@ -459,6 +460,7 @@ struct nlm_globals {
 int cn_idle_tmo; /* (z) */
 int grace_period; /* (z) */
 int retrans_tmo; /* (z) */
+ boolean_t nlm_v4_only; /* (z) */
 kmutex_t clean_lock; /* (c) */
 TAILQ_ENTRY(nlm_globals) nlm_link; /* (g) */
 };
diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c
index 62eaabfb03..1038875bbc 100644
--- a/usr/src/uts/common/krtld/kobj.c
+++ b/usr/src/uts/common/krtld/kobj.c
@@ -2180,6 +2180,7 @@ static void
 free_module_data(struct module *mp)
 {
 struct module_list *lp, *tmp;
+ hotinline_desc_t *hid, *next;
 int ksyms_exported = 0;
 lp = mp->head;
@@ -2189,6 +2190,15 @@ free_module_data(struct module *mp)
 kobj_free((char *)tmp, sizeof (*tmp));
 }
+ /* release hotinlines */
+ hid = mp->hi_calls;
+ while (hid != NULL) {
+ next = hid->hid_next;
+ kobj_free(hid->hid_symname, strlen(hid->hid_symname) + 1);
+ kobj_free(hid, sizeof (hotinline_desc_t));
+ hid = next;
+ }
+
 rw_enter(&ksyms_lock, RW_WRITER);
 if (mp->symspace) {
 if (vmem_contains(ksyms_arena, mp->symspace, mp->symsize)) {
@@ -3034,8 +3044,18 @@ do_symbols(struct module *mp, Elf64_Addr bss_base)
 if (sp->st_shndx == SHN_UNDEF) {
 resolved = 0;
+ /*
+ * Skip over sdt probes and smap calls,
+ * they're relocated later.
+ */
 if (strncmp(name, sdt_prefix, strlen(sdt_prefix)) == 0)
 continue;
+#if defined(__x86)
+ if (strcmp(name, "smap_enable") == 0 ||
+ strcmp(name, "smap_disable") == 0)
+ continue;
+#endif /* defined(__x86) */
+
 /*
 * If it's not a weak reference and it's
diff --git a/usr/src/uts/common/mapfiles/README b/usr/src/uts/common/mapfiles/README
new file mode 100644
index 0000000000..5b65771325
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/README
@@ -0,0 +1,68 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+Kernel Module Build Time Symbol Verification
+--------------------------------------------
+
+Historically, kernel modules have all been built as relocatable objects.
+They are not dynamic objects and dependency information is always noted
+in individual makefiles. Along with this, there has never been any
+verification of the symbols that are being used. This means that it's
+possible for a kernel module author to refer to a symbol that doesn't
+exist and not find out until they try to install the module.
+
+To help find these problems at build time, we provide an opt-in system
+for modules to use, leveraging the link-editor's '-z defs' option. This
+option ensures that there are no unknown definitions at link-edit time.
+To supply these definitions we provide a series of mapfiles in this
+directory.
+
+These mapfiles are not the traditional versioning mapfiles like those in
+usr/src/lib/README.mapfiles! Please review the following differences
+closely:
+
+* These mapfiles do not declare any versions!
+* These mapfiles do not use the 'SYMBOL_VERSION' directive; instead they
+ use the 'SYMBOL_SCOPE' directive.
+* These mapfiles do not hide symbols! Library mapfiles always have
+ something to catch all local symbols. That should *never* be used
+ here. These mapfiles should not affect visibility.
+* All symbols in these mapfiles should be marked 'EXTERN' to indicate
+ that they are not provided by the kernel module but by another.
+* These mapfiles do not declare what is or isn't a public interface,
+ though they are often grouped around interfaces, to make it easier for
+ a driver author to get this right.
+
+Mapfiles are organized based on kernel module. For example, the GLDv3
+device driver interface is provided by the 'mac' module and thus is
+found in the file 'mac.mapfile'. The DDI is currently in the 'ddi'
+mapfile. Functions that are found in genunix and unix that aren't in
+the DDI should not be put in that mapfile.
+
+Note, the existing files may not be complete. These are intended to only
+have the public interfaces provided by modules and thus should not
+include every symbol in them. As the need arises, add new symbols or
+modules as appropriate.
+
+To opt a module into this, first declare a series of MAPFILES that they
+should check against in the module. This should be a series of one or
+more files, for example:
+
+MAPFILES += ddi mac
+
+Next, you should add an include of Makefile.mapfile right before you
+include Makefile.targ. You can do this with the following line:
+
+include $(UTSBASE)/Makefile.mapfile
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
new file mode 100644
index 0000000000..1377af5857
--- /dev/null
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -0,0 +1,192 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object scoping must comply with the rules detailed in
+#
+# usr/src/uts/common/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+#
+# This file contains core functions provided by the DDI and also items
+# required as part of the platform's runtime ABI (think compiler
+# functions).
+# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + __divdi3 { FLAGS = EXTERN }; + __stack_chk_fail { FLAGS = EXTERN }; + __stack_chk_guard { FLAGS = EXTERN }; + allocb { FLAGS = EXTERN }; + assfail { FLAGS = EXTERN }; + assfail3 { FLAGS = EXTERN }; + atomic_dec_32_nv { FLAGS = EXTERN }; + bcmp { FLAGS = EXTERN }; + bcopy { FLAGS = EXTERN }; + bzero { FLAGS = EXTERN }; + cmn_err { FLAGS = EXTERN }; + cv_broadcast { FLAGS = EXTERN }; + cv_destroy { FLAGS = EXTERN }; + cv_init { FLAGS = EXTERN }; + cv_reltimedwait { FLAGS = EXTERN }; + ddi_cb_register { FLAGS = EXTERN }; + ddi_cb_unregister { FLAGS = EXTERN }; + ddi_dev_regsize { FLAGS = EXTERN }; + ddi_dma_addr_bind_handle { FLAGS = EXTERN }; + ddi_dma_alloc_handle { FLAGS = EXTERN }; + ddi_dma_free_handle { FLAGS = EXTERN }; + ddi_dma_mem_alloc { FLAGS = EXTERN }; + ddi_dma_mem_free { FLAGS = EXTERN }; + ddi_dma_nextcookie { FLAGS = EXTERN }; + ddi_dma_sync { FLAGS = EXTERN }; + ddi_dma_unbind_handle { FLAGS = EXTERN }; + ddi_fls { FLAGS = EXTERN }; + ddi_fm_acc_err_clear { FLAGS = EXTERN }; + ddi_fm_acc_err_get { FLAGS = EXTERN }; + ddi_fm_dma_err_get { FLAGS = EXTERN }; + ddi_fm_ereport_post { FLAGS = EXTERN }; + ddi_fm_fini { FLAGS = EXTERN }; + ddi_fm_handler_register { FLAGS = EXTERN }; + ddi_fm_handler_unregister { FLAGS = EXTERN }; + ddi_fm_init { FLAGS = EXTERN }; + ddi_fm_service_impact { FLAGS = EXTERN }; + ddi_get_driver_private { FLAGS = EXTERN }; + ddi_get_instance { FLAGS = EXTERN }; + ddi_get_lbolt { FLAGS = EXTERN }; + ddi_get_lbolt64 { FLAGS = EXTERN }; + ddi_get_name { FLAGS = EXTERN }; + ddi_get_parent { FLAGS = EXTERN }; + ddi_get16 { FLAGS = EXTERN }; + ddi_get32 { FLAGS = EXTERN }; + ddi_get64 { FLAGS = EXTERN }; + ddi_intr_add_handler { FLAGS = EXTERN }; + ddi_intr_alloc { FLAGS = EXTERN }; + ddi_intr_block_disable { FLAGS = EXTERN }; + ddi_intr_block_enable { FLAGS = EXTERN }; + ddi_intr_disable { FLAGS = EXTERN }; + ddi_intr_enable { FLAGS = EXTERN }; + ddi_intr_free { FLAGS = EXTERN }; + ddi_intr_get_cap { FLAGS = EXTERN }; + ddi_intr_get_navail { FLAGS = EXTERN }; + ddi_intr_get_nintrs { FLAGS = EXTERN }; + ddi_intr_get_pri { FLAGS = EXTERN }; + ddi_intr_get_supported_types { FLAGS = EXTERN }; + ddi_intr_remove_handler { FLAGS = EXTERN }; + ddi_periodic_add { FLAGS = EXTERN }; + ddi_periodic_delete { FLAGS = EXTERN }; + ddi_power { FLAGS = EXTERN }; + ddi_prop_free { FLAGS = EXTERN }; + ddi_prop_get_int { FLAGS = EXTERN }; + ddi_prop_lookup_int_array { FLAGS = EXTERN }; + ddi_prop_op { FLAGS = EXTERN }; + ddi_prop_remove_all { FLAGS = EXTERN }; + ddi_prop_update_int_array { FLAGS = EXTERN }; + ddi_prop_update_string { FLAGS = EXTERN }; + ddi_ptob { FLAGS = EXTERN }; + ddi_put16 { FLAGS = EXTERN }; + ddi_put32 { FLAGS = EXTERN }; + ddi_quiesce_not_supported { FLAGS = EXTERN }; + ddi_regs_map_free { FLAGS = EXTERN }; + ddi_regs_map_setup { FLAGS = EXTERN }; + ddi_set_driver_private { FLAGS = EXTERN }; + ddi_strtol { FLAGS = EXTERN }; + ddi_taskq_create { FLAGS = EXTERN }; + ddi_taskq_destroy { FLAGS = EXTERN }; + ddi_taskq_dispatch { FLAGS = EXTERN }; + delay { FLAGS = EXTERN }; + desballoc { FLAGS = EXTERN }; + dev_err { FLAGS = EXTERN }; + drv_usectohz { FLAGS = EXTERN }; + drv_usecwait { FLAGS = EXTERN }; + fm_ena_generate { FLAGS = EXTERN }; + freeb { FLAGS = EXTERN }; + freemsg { FLAGS = EXTERN }; + freemsgchain { FLAGS = EXTERN }; + gethrtime { FLAGS = EXTERN }; + kmem_alloc { FLAGS = EXTERN }; + kmem_free { FLAGS = EXTERN }; + kmem_zalloc { FLAGS = EXTERN }; + kstat_create { FLAGS = EXTERN }; + 
kstat_delete { FLAGS = EXTERN }; + kstat_install { FLAGS = EXTERN }; + kstat_named_init { FLAGS = EXTERN }; + list_create { FLAGS = EXTERN }; + list_destroy { FLAGS = EXTERN }; + list_head { FLAGS = EXTERN }; + list_insert_tail { FLAGS = EXTERN }; + list_next { FLAGS = EXTERN }; + list_remove { FLAGS = EXTERN }; + list_remove_head { FLAGS = EXTERN }; + memcpy { FLAGS = EXTERN }; + memset { FLAGS = EXTERN }; + miocack { FLAGS = EXTERN }; + miocnak { FLAGS = EXTERN }; + mod_driverops { FLAGS = EXTERN }; + mod_info { FLAGS = EXTERN }; + mod_install { FLAGS = EXTERN }; + mod_remove { FLAGS = EXTERN }; + msgpullup { FLAGS = EXTERN }; + msgsize { FLAGS = EXTERN }; + mutex_destroy { FLAGS = EXTERN }; + mutex_enter { FLAGS = EXTERN }; + mutex_exit { FLAGS = EXTERN }; + mutex_init { FLAGS = EXTERN }; + mutex_owned { FLAGS = EXTERN }; + mutex_tryenter { FLAGS = EXTERN }; + nochpoll { FLAGS = EXTERN }; + nodev { FLAGS = EXTERN }; + nulldev { FLAGS = EXTERN }; + panic { FLAGS = EXTERN }; + pci_config_get16 { FLAGS = EXTERN }; + pci_config_get32 { FLAGS = EXTERN }; + pci_config_get64 { FLAGS = EXTERN }; + pci_config_get8 { FLAGS = EXTERN }; + pci_config_put16 { FLAGS = EXTERN }; + pci_config_put32 { FLAGS = EXTERN }; + pci_config_put64 { FLAGS = EXTERN }; + pci_config_put8 { FLAGS = EXTERN }; + pci_config_setup { FLAGS = EXTERN }; + pci_config_teardown { FLAGS = EXTERN }; + pci_ereport_post { FLAGS = EXTERN }; + pci_ereport_setup { FLAGS = EXTERN }; + pci_ereport_teardown { FLAGS = EXTERN }; + pci_lcap_locate { FLAGS = EXTERN }; + qreply { FLAGS = EXTERN }; + rw_destroy { FLAGS = EXTERN }; + rw_enter { FLAGS = EXTERN }; + rw_exit { FLAGS = EXTERN }; + rw_init { FLAGS = EXTERN }; + snprintf { FLAGS = EXTERN }; + sprintf { FLAGS = EXTERN }; + strcat { FLAGS = EXTERN }; + strcmp { FLAGS = EXTERN }; + strcpy { FLAGS = EXTERN }; + strlen { FLAGS = EXTERN }; + timeout { FLAGS = EXTERN }; + untimeout { FLAGS = EXTERN }; + vsnprintf { FLAGS = EXTERN }; + vsprintf { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/dtrace.mapfile.awk b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk new file mode 100644 index 0000000000..b8a7e2d372 --- /dev/null +++ b/usr/src/uts/common/mapfiles/dtrace.mapfile.awk @@ -0,0 +1,34 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# This script is designed to assemble a mapfile for DTrace probes. +# +BEGIN { + print "#" + print "# This file is autogenerated by dtrace.mapfile.awk" + print "#" + print "$mapfile_version 2" + print "SYMBOL_SCOPE {" + print " global:" +} + +/__dtrace_probe_/ { + printf "\t%s\t{ FLAGS = EXTERN };\n", $1 +} + +END { + print "};" +} diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile new file mode 100644 index 0000000000..6bddb3c7ef --- /dev/null +++ b/usr/src/uts/common/mapfiles/kernel.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +# +# This file contains functions provided by the kernel that various +# modules use. This is a combination of things in both unix and genunix. +# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + bt_getlowbit { FLAGS = EXTERN }; + servicing_interrupt { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/mac.mapfile b/usr/src/uts/common/mapfiles/mac.mapfile new file mode 100644 index 0000000000..d40c09b311 --- /dev/null +++ b/usr/src/uts/common/mapfiles/mac.mapfile @@ -0,0 +1,57 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + mac_alloc { FLAGS = EXTERN }; + mac_fini_ops { FLAGS = EXTERN }; + mac_free { FLAGS = EXTERN }; + mac_hcksum_get { FLAGS = EXTERN }; + mac_hcksum_set { FLAGS = EXTERN }; + mac_init_ops { FLAGS = EXTERN }; + mac_link_update { FLAGS = EXTERN }; + mac_lso_get { FLAGS = EXTERN }; + mac_maxsdu_update { FLAGS = EXTERN }; + mac_prop_info_set_default_link_flowctrl { FLAGS = EXTERN }; + mac_prop_info_set_default_str { FLAGS = EXTERN }; + mac_prop_info_set_default_uint8 { FLAGS = EXTERN }; + mac_prop_info_set_perm { FLAGS = EXTERN }; + mac_prop_info_set_range_uint32 { FLAGS = EXTERN }; + mac_ring_intr_set { FLAGS = EXTERN }; + mac_register { FLAGS = EXTERN }; + mac_rx { FLAGS = EXTERN }; + mac_rx_ring { FLAGS = EXTERN }; + mac_transceiver_info_set_present { FLAGS = EXTERN }; + mac_transceiver_info_set_usable { FLAGS = EXTERN }; + mac_tx_ring_update { FLAGS = EXTERN }; + mac_tx_update { FLAGS = EXTERN }; + mac_unregister { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/mapfiles/random.mapfile b/usr/src/uts/common/mapfiles/random.mapfile new file mode 100644 index 0000000000..d3d8bc89fa --- /dev/null +++ b/usr/src/uts/common/mapfiles/random.mapfile @@ -0,0 +1,37 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object scoping must comply with the rules detailed in +# +# usr/src/uts/common/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_SCOPE { + global: + random_get_bytes { FLAGS = EXTERN }; + random_get_blocking_bytes { FLAGS = EXTERN }; + random_get_pseudo_bytes { FLAGS = EXTERN }; +}; diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index 9ac3066362..6a4f538c97 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -3,6 +3,7 @@ * Use is subject to license terms. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* * Copyright (c) 1982, 1986 Regents of the University of California. @@ -225,6 +226,7 @@ typedef uint16_t sa_family_t; #define IPPORT_SLP 427 #define IPPORT_MIP 434 #define IPPORT_SMB 445 /* a.k.a. microsoft-ds */ +#define IPPORT_VXLAN 4789 /* * Internet Key Exchange (IKE) ports @@ -268,6 +270,11 @@ typedef uint16_t sa_family_t; #define IPPORT_RESERVED 1024 #define IPPORT_USERRESERVED 5000 +#ifdef _KERNEL +#define IPPORT_DYNAMIC_MIN 49152 +#define IPPORT_DYNAMIC_MAX 65535 +#endif + /* * Link numbers */ diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h index c65a9bad3a..74cff75d43 100644 --- a/usr/src/uts/common/netinet/udp.h +++ b/usr/src/uts/common/netinet/udp.h @@ -1,6 +1,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -17,9 +18,6 @@ #ifndef _NETINET_UDP_H #define _NETINET_UDP_H -#pragma ident "%Z%%M% %I% %E% SMI" -/* udp.h 1.7 88/08/19 SMI; from UCB 7.1 6/5/86 */ - #ifdef __cplusplus extern "C" { #endif @@ -36,6 +34,16 @@ struct udphdr { #define UDP_EXCLBIND 0x0101 /* for internal use only */ #define UDP_RCVHDR 0x0102 /* for internal use only */ #define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */ +#define UDP_SRCPORT_HASH 0x0104 /* for internal use only */ +#define UDP_SND_TO_CONNECTED 0x0105 /* for internal use only */ + +/* + * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to go + * handle UDP_SRCPORT_HASH. + */ +#define UDP_HASH_DISABLE 0x0000 /* for internal use only */ +#define UDP_HASH_VXLAN 0x0001 /* for internal use only */ + /* * Following option in UDP_ namespace required to be exposed through * <xti.h> (It also requires exposing options not implemented). The options diff --git a/usr/src/uts/common/nfs/nfssys.h b/usr/src/uts/common/nfs/nfssys.h index e9a2746017..7d2401856c 100644 --- a/usr/src/uts/common/nfs/nfssys.h +++ b/usr/src/uts/common/nfs/nfssys.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -122,13 +123,20 @@ struct nfs_revauth_args32 { enum lm_fmly { LM_INET, LM_INET6, LM_LOOPBACK }; enum lm_proto { LM_TCP, LM_UDP }; +/* + * The 'n_v4_only' member was formerly called 'debug'. This member is not used + * in the kernel. To avoid a new version of this user/kernel interface + * structure, the member was renamed in a binary compatible way. 
It is now used + * by the user-level code to indicate that the zone is not running + * rpcbind/rpc.statd and that only NFSv4 locking is needed. + */ struct lm_svc_args { int version; /* keep this first */ int fd; enum lm_fmly n_fmly; /* protocol family */ enum lm_proto n_proto; /* protocol */ dev_t n_rdev; /* device ID */ - int debug; /* debugging level */ + int n_v4_only; /* NFSv4 locking only */ time_t timout; /* client handle life (asynch RPCs) */ int grace; /* secs in grace period */ time_t retransmittimeout; /* retransmission interval */ @@ -141,7 +149,7 @@ struct lm_svc_args32 { enum lm_fmly n_fmly; /* protocol family */ enum lm_proto n_proto; /* protocol */ dev32_t n_rdev; /* device ID */ - int32_t debug; /* debugging level */ + int32_t n_v4_only; /* NFSv4 locking only */ time32_t timout; /* client handle life (asynch RPCs) */ int32_t grace; /* secs in grace period */ time32_t retransmittimeout; /* retransmission interval */ diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 773ecc9c6a..ecf396f926 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -310,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). 
When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from a exec() context tolerating the + * existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all. + */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. + * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. 
+ */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. */ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -600,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp) /*ARGSUSED*/ int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, - intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + intpdata_t *idatap, int level, size_t *execsz, int setid, + caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, + char *bname, char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 805813037d..1280c8a1b6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent Inc. All rights reserved. + * Copyright 2016, Joyent Inc. 
*/ #include <sys/timer.h> @@ -41,6 +41,9 @@ static clock_backend_t clock_highres; +/* minimum non-privileged interval (200us) */ +long clock_highres_interval_min = 200000; + /*ARGSUSED*/ static int clock_highres_settime(timespec_t *ts) @@ -68,17 +71,6 @@ clock_highres_getres(timespec_t *ts) static int clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { - /* - * CLOCK_HIGHRES timers of sufficiently high resolution can deny - * service; only allow privileged users to create such timers. - * Sites that do not wish to have this restriction should - * give users the "proc_clock_highres" privilege. - */ - if (secpolicy_clock_highres(CRED()) != 0) { - it->it_arg = NULL; - return (EPERM); - } - it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP); it->it_fire = fire; @@ -111,6 +103,49 @@ clock_highres_timer_settime(itimer_t *it, int flags, cpu_t *cpu; cpupart_t *cpupart; int pset; + boolean_t value_need_clamp = B_FALSE; + boolean_t intval_need_clamp = B_FALSE; + cred_t *cr = CRED(); + struct itimerspec clamped; + + /* + * CLOCK_HIGHRES timers of sufficiently high resolution can deny + * service; only allow privileged users to create such timers. + * Non-privileged users (those without the "proc_clock_highres" + * privilege) can create timers with lower resolution but if they + * attempt to use a very low time value (< 200us) then their + * timer will be clamped at 200us. + */ + if (when->it_value.tv_sec == 0 && + when->it_value.tv_nsec > 0 && + when->it_value.tv_nsec < clock_highres_interval_min) + value_need_clamp = B_TRUE; + + if (when->it_interval.tv_sec == 0 && + when->it_interval.tv_nsec > 0 && + when->it_interval.tv_nsec < clock_highres_interval_min) + intval_need_clamp = B_TRUE; + + if ((value_need_clamp || intval_need_clamp) && + secpolicy_clock_highres(cr) != 0) { + clamped.it_value.tv_sec = when->it_value.tv_sec; + clamped.it_interval.tv_sec = when->it_interval.tv_sec; + + if (value_need_clamp) { + clamped.it_value.tv_nsec = clock_highres_interval_min; + } else { + clamped.it_value.tv_nsec = when->it_value.tv_nsec; + } + + if (intval_need_clamp) { + clamped.it_interval.tv_nsec = + clock_highres_interval_min; + } else { + clamped.it_interval.tv_nsec = when->it_interval.tv_nsec; + } + + when = &clamped; + } cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 909a6c2860..1a3502a710 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d5e272c16a..a147b1cf0f 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent Inc. 
* Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); @@ -793,7 +795,7 @@ clock_t core_delay_usec = 10000; * using core_write() below, and so it has the same failure semantics. */ int -core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, +core_seg(proc_t *p, vnode_t *vp, u_offset_t offset, caddr_t addr, size_t size, rlim64_t rlimit, cred_t *credp) { caddr_t eaddr; @@ -801,6 +803,11 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, size_t len; int err = 0; + if (offset > OFF_MAX || offset + size > OFF_MAX || + offset + size < offset) { + return (EOVERFLOW); + } + eaddr = addr + size; for (base = addr; base < eaddr; base += len) { len = eaddr - base; @@ -841,15 +848,20 @@ core_seg(proc_t *p, vnode_t *vp, offset_t offset, caddr_t addr, size_t size, * unexpectedly returns zero but no progress has been made, we return ENOSPC. */ int -core_write(vnode_t *vp, enum uio_seg segflg, offset_t offset, +core_write(vnode_t *vp, enum uio_seg segflg, u_offset_t offset, const void *buf, size_t len, rlim64_t rlimit, cred_t *credp) { ssize_t resid = len; int error = 0; + if (offset > OFF_MAX || offset + len > OFF_MAX || + offset + len < offset) { + return (EOVERFLOW); + } + while (len != 0) { - error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, offset, - segflg, 0, rlimit, credp, &resid); + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, len, + (offset_t)offset, segflg, 0, rlimit, credp, &resid); if (error != 0) break; diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 87c0896814..620f26034f 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* @@ -108,7 +109,8 @@ kmutex_t cpu_lock; cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_active_set; /* cached set of active CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ @@ -386,36 +388,56 @@ force_thread_migrate(kthread_id_t tp) /* * Set affinity for a specified CPU. - * A reference count is incremented and the affinity is held until the - * reference count is decremented to zero by thread_affinity_clear(). - * This is so regions of code requiring affinity can be nested. - * Caller needs to ensure that cpu_id remains valid, which can be - * done by holding cpu_lock across this call, unless the caller - * specifies CPU_CURRENT in which case the cpu_lock will be acquired - * by thread_affinity_set and CPU->cpu_id will be the target CPU. 
+ * + * Specifying a cpu_id of CPU_CURRENT, allowed _only_ when setting affinity for + * curthread, will set affinity to the CPU on which the thread is currently + * running. For other cpu_id values, the caller must ensure that the + * referenced CPU remains valid, which can be done by holding cpu_lock across + * this call. + * + * CPU affinity is guaranteed after return of thread_affinity_set(). If a + * caller setting affinity to CPU_CURRENT requires that its thread not migrate + * CPUs prior to a successful return, it should take extra precautions (such as + * their own call to kpreempt_disable) to ensure that safety. + * + * CPU_BEST can be used to pick a "best" CPU to migrate to, including + * potentially the current CPU. + * + * A CPU affinity reference count is maintained by thread_affinity_set and + * thread_affinity_clear (incrementing and decrementing it, respectively), + * maintaining CPU affinity while the count is non-zero, and allowing regions + * of code which require affinity to be nested. */ void thread_affinity_set(kthread_id_t t, int cpu_id) { - cpu_t *cp; - int c; + cpu_t *cp; ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL)); - if ((c = cpu_id) == CPU_CURRENT) { - mutex_enter(&cpu_lock); - cpu_id = CPU->cpu_id; + if (cpu_id == CPU_CURRENT) { + VERIFY3P(t, ==, curthread); + kpreempt_disable(); + cp = CPU; + } else if (cpu_id == CPU_BEST) { + VERIFY3P(t, ==, curthread); + kpreempt_disable(); + cp = disp_choose_best_cpu(); + } else { + /* + * We should be asserting that cpu_lock is held here, but + * the NCA code doesn't acquire it. The following assert + * should be uncommented when the NCA code is fixed. + * + * ASSERT(MUTEX_HELD(&cpu_lock)); + */ + VERIFY((cpu_id >= 0) && (cpu_id < NCPU)); + cp = cpu[cpu_id]; + + /* user must provide a good cpu_id */ + VERIFY(cp != NULL); } - /* - * We should be asserting that cpu_lock is held here, but - * the NCA code doesn't acquire it. The following assert - * should be uncommented when the NCA code is fixed. - * - * ASSERT(MUTEX_HELD(&cpu_lock)); - */ - ASSERT((cpu_id >= 0) && (cpu_id < NCPU)); - cp = cpu[cpu_id]; - ASSERT(cp != NULL); /* user must provide a good cpu_id */ + /* * If there is already a hard affinity requested, and this affinity * conflicts with that, panic. @@ -432,13 +454,14 @@ thread_affinity_set(kthread_id_t t, int cpu_id) * Make sure we're running on the right CPU. */ if (cp != t->t_cpu || t != curthread) { + ASSERT(cpu_id != CPU_CURRENT); force_thread_migrate(t); /* drops thread lock */ } else { thread_unlock(t); } - if (c == CPU_CURRENT) - mutex_exit(&cpu_lock); + if (cpu_id == CPU_CURRENT || cpu_id == CPU_BEST) + kpreempt_enable(); } /* @@ -1194,7 +1217,7 @@ cpu_online(cpu_t *cp) * Handle on-line request. * This code must put the new CPU on the active list before * starting it because it will not be paused, and will start - * using the active list immediately. The real start occurs + * using the active list immediately. The real start occurs * when the CPU_QUIESCED flag is turned off. 
*/ @@ -1473,8 +1496,8 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ if (t->t_cpu == cp && t->t_bound_cpu != cp) - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); + t->t_cpu = disp_lowpri_cpu(ncp, t, + t->t_pri); ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); @@ -1516,10 +1539,9 @@ again: for (loop_count = 0; (*bound_func)(cp, 0); loop_count++) { * Update CPU last ran on if it was this CPU */ - if (t->t_cpu == cp && t->t_bound_cpu != cp) { - t->t_cpu = disp_lowpri_cpu(ncp, - t->t_lpl, t->t_pri, NULL); - } + if (t->t_cpu == cp && t->t_bound_cpu != cp) + t->t_cpu = disp_lowpri_cpu(ncp, t, t->t_pri); + ASSERT(t->t_cpu != cp || t->t_bound_cpu == cp || t->t_weakbound_cpu == cp); t = t->t_next; @@ -1724,6 +1746,7 @@ cpu_list_init(cpu_t *cp) cp->cpu_part = &cp_default; CPUSET_ADD(cpu_available, cp->cpu_id); + CPUSET_ADD(cpu_active_set, cp->cpu_id); } /* @@ -1895,6 +1918,7 @@ cpu_add_active_internal(cpu_t *cp) cp->cpu_prev_onln = cpu_active->cpu_prev_onln; cpu_active->cpu_prev_onln->cpu_next_onln = cp; cpu_active->cpu_prev_onln = cp; + CPUSET_ADD(cpu_active_set, cp->cpu_id); if (pp->cp_cpulist) { cp->cpu_next_part = pp->cp_cpulist; @@ -1965,6 +1989,7 @@ cpu_remove_active(cpu_t *cp) } cp->cpu_next_onln = cp; cp->cpu_prev_onln = cp; + CPUSET_DEL(cpu_active_set, cp->cpu_id); cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; @@ -2704,13 +2729,18 @@ cpu_bind_thread(kthread_id_t tp, processorid_t bind, processorid_t *obind, return (0); } -#if CPUSET_WORDS > 1 -/* - * Functions for implementing cpuset operations when a cpuset is more - * than one word. On platforms where a cpuset is a single word these - * are implemented as macros in cpuvar.h. 
- */ +cpuset_t * +cpuset_alloc(int kmflags) +{ + return (kmem_alloc(sizeof (cpuset_t), kmflags)); +} + +void +cpuset_free(cpuset_t *s) +{ + kmem_free(s, sizeof (cpuset_t)); +} void cpuset_all(cpuset_t *s) @@ -2722,43 +2752,66 @@ cpuset_all(cpuset_t *s) } void -cpuset_all_but(cpuset_t *s, uint_t cpu) +cpuset_all_but(cpuset_t *s, const uint_t cpu) { cpuset_all(s); CPUSET_DEL(*s, cpu); } void -cpuset_only(cpuset_t *s, uint_t cpu) +cpuset_only(cpuset_t *s, const uint_t cpu) { CPUSET_ZERO(*s); CPUSET_ADD(*s, cpu); } +long +cpu_in_set(const cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + return (BT_TEST(s->cpub, cpu)); +} + +void +cpuset_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_SET(s->cpub, cpu); +} + +void +cpuset_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_CLEAR(s->cpub, cpu); +} + int -cpuset_isnull(cpuset_t *s) +cpuset_isnull(const cpuset_t *s) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s->cpub[i] != 0) return (0); + } return (1); } int -cpuset_cmp(cpuset_t *s1, cpuset_t *s2) +cpuset_isequal(const cpuset_t *s1, const cpuset_t *s2) { int i; - for (i = 0; i < CPUSET_WORDS; i++) + for (i = 0; i < CPUSET_WORDS; i++) { if (s1->cpub[i] != s2->cpub[i]) return (0); + } return (1); } uint_t -cpuset_find(cpuset_t *s) +cpuset_find(const cpuset_t *s) { uint_t i; @@ -2778,7 +2831,7 @@ cpuset_find(cpuset_t *s) } void -cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid) +cpuset_bounds(const cpuset_t *s, uint_t *smallestid, uint_t *largestid) { int i, j; uint_t bit; @@ -2822,7 +2875,72 @@ cpuset_bounds(cpuset_t *s, uint_t *smallestid, uint_t *largestid) *smallestid = *largestid = CPUSET_NOTINSET; } -#endif /* CPUSET_WORDS */ +void +cpuset_atomic_del(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR(s->cpub, (cpu)) +} + +void +cpuset_atomic_add(cpuset_t *s, const uint_t cpu) +{ + VERIFY(cpu < NCPU); + BT_ATOMIC_SET(s->cpub, (cpu)) +} + +long +cpuset_atomic_xadd(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_SET_EXCL(s->cpub, cpu, res); + return (res); +} + +long +cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) +{ + long res; + + VERIFY(cpu < NCPU); + BT_ATOMIC_CLEAR_EXCL(s->cpub, cpu, res); + return (res); +} + +void +cpuset_or(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] |= src->cpub[i]; + } +} + +void +cpuset_xor(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] ^= src->cpub[i]; + } +} + +void +cpuset_and(cpuset_t *dst, cpuset_t *src) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] &= src->cpub[i]; + } +} + +void +cpuset_zero(cpuset_t *dst) +{ + for (int i = 0; i < CPUSET_WORDS; i++) { + dst->cpub[i] = 0; + } +} + /* * Unbind threads bound to specified CPU. 
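The hunk above makes the cpuset_t helpers unconditional functions (previously they were compiled only when CPUSET_WORDS > 1, with single-word platforms using the macros in cpuvar.h), renames cpuset_cmp to cpuset_isequal, adds VERIFY() bounds checks on the cpu argument, and introduces allocation, atomic, and bulk (or/xor/and/zero) operations. The fragment below is an illustrative sketch only, not part of the patch; it assumes a kernel context in which a KM_SLEEP allocation is legal and the CPU->cpu_id read is stable (for example under kpreempt_disable()).

    cpuset_t *set = cpuset_alloc(KM_SLEEP);

    cpuset_zero(set);                       /* kmem_alloc'd memory is uninitialized */
    cpuset_add(set, CPU->cpu_id);           /* non-atomic accessor for a private set */
    VERIFY(cpu_in_set(set, CPU->cpu_id));
    cpuset_atomic_del(set, CPU->cpu_id);    /* atomic variant for sets shared across CPUs */
    VERIFY(cpuset_isnull(set));
    cpuset_free(set);

The atomic add/del/xadd/xdel entry points exist for sets that are updated concurrently from multiple CPUs; the plain accessors suffice when the set is private to the caller, as in the sketch.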
@@ -3112,9 +3230,9 @@ cpu_get_state_str(cpu_t *cpu) static void cpu_stats_kstat_create(cpu_t *cp) { - int instance = cp->cpu_id; - char *module = "cpu"; - char *class = "misc"; + int instance = cp->cpu_id; + char *module = "cpu"; + char *class = "misc"; kstat_t *ksp; zoneid_t zoneid; @@ -3350,18 +3468,18 @@ cpu_stat_ks_update(kstat_t *ksp, int rw) cso->cpu_sysinfo.cpu[CPU_USER] = msnsecs[CMS_USER]; if (cso->cpu_sysinfo.cpu[CPU_KERNEL] < msnsecs[CMS_SYSTEM]) cso->cpu_sysinfo.cpu[CPU_KERNEL] = msnsecs[CMS_SYSTEM]; - cso->cpu_sysinfo.cpu[CPU_WAIT] = 0; - cso->cpu_sysinfo.wait[W_IO] = 0; + cso->cpu_sysinfo.cpu[CPU_WAIT] = 0; + cso->cpu_sysinfo.wait[W_IO] = 0; cso->cpu_sysinfo.wait[W_SWAP] = 0; cso->cpu_sysinfo.wait[W_PIO] = 0; - cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread); - cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite); - cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread); - cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite); - cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread); - cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite); - cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch); - cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap); + cso->cpu_sysinfo.bread = CPU_STATS(cp, sys.bread); + cso->cpu_sysinfo.bwrite = CPU_STATS(cp, sys.bwrite); + cso->cpu_sysinfo.lread = CPU_STATS(cp, sys.lread); + cso->cpu_sysinfo.lwrite = CPU_STATS(cp, sys.lwrite); + cso->cpu_sysinfo.phread = CPU_STATS(cp, sys.phread); + cso->cpu_sysinfo.phwrite = CPU_STATS(cp, sys.phwrite); + cso->cpu_sysinfo.pswitch = CPU_STATS(cp, sys.pswitch); + cso->cpu_sysinfo.trap = CPU_STATS(cp, sys.trap); cso->cpu_sysinfo.intr = 0; for (i = 0; i < PIL_MAX; i++) cso->cpu_sysinfo.intr += CPU_STATS(cp, sys.intr[i]); diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 25727d54c5..0bd6cfd44f 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -729,6 +729,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 0aa54eeaee..316dffc326 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2018 Joyent Inc. */ /* @@ -112,6 +112,7 @@ * cyclic_remove() <-- Removes a cyclic * cyclic_bind() <-- Change a cyclic's CPU or partition binding * cyclic_reprogram() <-- Reprogram a cyclic's expiration + * cyclic_move_here() <-- Shuffle cyclic to current CPU * * Inter-subsystem Interfaces * @@ -3111,6 +3112,61 @@ cyclic_reprogram(cyclic_id_t id, hrtime_t expiration) return (1); } +/* + * void cyclic_move_here(cyclic_id_t) + * + * Overview + * + * cyclic_move_here() attempts to shuffle a cyclic onto the current CPU. + * + * Arguments and notes + * + * The first argument is a cyclic_id returned from cyclic_add(). + * cyclic_move_here() may _not_ be called on a cyclic_id returned from + * cyclic_add_omni() or one bound to a CPU or partition via cyclic_bind(). + * + * This cyclic shuffling is performed on a best-effort basis. If for some + * reason the current CPU is unsuitable or the thread migrates between CPUs + * during the call, the function may return with the cyclic residing on some + * other CPU. + * + * Return value + * + * None; cyclic_move_here() always reports success. 
+ * + * Caller's context + * + * cpu_lock must be held by the caller, and the caller must not be in + * interrupt context. The caller may not hold any locks which are also + * grabbed by any cyclic handler. + */ +void +cyclic_move_here(cyclic_id_t id) +{ + cyc_id_t *idp = (cyc_id_t *)id; + cyc_cpu_t *cc = idp->cyi_cpu; + cpu_t *dest = CPU; + + ASSERT(MUTEX_HELD(&cpu_lock)); + CYC_PTRACE("move_here", idp, dest); + VERIFY3P(cc, !=, NULL); + VERIFY3U(cc->cyp_cyclics[idp->cyi_ndx].cy_flags & + (CYF_CPU_BOUND|CYF_PART_BOUND), ==, 0); + + if (cc->cyp_cpu == dest) { + return; + } + + /* Is the destination CPU suitable for a migration target? */ + if (dest->cpu_cyclic == NULL || + dest->cpu_cyclic->cyp_state == CYS_OFFLINE || + (dest->cpu_flags & CPU_ENABLE) == 0) { + return; + } + + cyclic_juggle_one_to(idp, dest->cpu_cyclic); +} + hrtime_t cyclic_getres() { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index f51e2c5ca1..24b6f0e2eb 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -99,6 +99,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -143,7 +144,7 @@ exec_common(const char *fname, const char **argp, const char **envp, proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); struct user *up = PTOU(p); - long execsz; /* temporary count of exec size */ + size_t execsz; /* temporary count of exec size */ int i; int error; char exec_file[MAXCOMLEN+1]; @@ -265,8 +266,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. */ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -275,6 +278,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -322,14 +326,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -361,7 +394,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -385,6 +418,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -444,8 +479,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. */ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -566,10 +603,10 @@ gexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -890,8 +927,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1448,7 +1491,7 @@ noexec( struct uarg *args, struct intpdata *idatap, int level, - long *execsz, + size_t *execsz, int setid, caddr_t exec_file, struct cred *cred) @@ -1555,6 +1598,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1591,6 +1655,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1673,8 +1738,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. 
+ * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1687,6 +1753,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1793,7 +1873,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. */ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1806,6 +1886,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1818,6 +1903,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1961,6 +2051,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 1b9359da47..06e0117cd6 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -138,6 +138,27 @@ rexit(int rval) } /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. + */ +static void +restart_init_notify(zone_t *zone) +{ + nvlist_t *nvl = NULL; + + zone->zone_proc_init_restarts++; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && + nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, + zone->zone_proc_init_restarts) == 0) { + zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, + ZONE_EVENT_INIT_RESTART_SC, nvl); + } + + nvlist_free(nvl); +} + +/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. 
As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things @@ -230,7 +251,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -260,6 +281,8 @@ restart_init(int what, int why) ASSERT(p == curproc); (void) freectty(B_TRUE); + restart_init_notify(p->p_zone); + /* * Now exec() the new init(1M) on top of the current process. If we * succeed, the caller will treat this like a successful system call. @@ -320,6 +343,119 @@ proc_is_exiting(proc_t *p) } /* + * Return true if zone's init is restarted, false if exit processing should + * proceeed. + */ +static boolean_t +zone_init_exit(zone_t *z, int why, int what) +{ + /* + * Typically we don't let the zone's init exit unless zone_start_init() + * failed its exec, or we are shutting down the zone or the machine, + * although the various flags handled within this function will control + * the behavior. + * + * Since we are single threaded, we don't need to lock the following + * accesses to zone_proc_initpid. + */ + if (z->zone_boot_err != 0 || + zone_status_get(z) >= ZONE_IS_SHUTTING_DOWN || + zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + /* + * Clear the zone's init pid and proceed with exit processing. + */ + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + /* + * There are a variety of configuration flags on the zone to control + * init exit behavior. + * + * If the init process should be restarted, the "zone_restart_init" + * member will be set. + */ + if (!z->zone_restart_init) { + /* + * The zone has been setup to halt when init exits. + */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + /* + * At this point we know we're configured to restart init, but there + * are various modifiers to that behavior. + */ + + if (z->zone_reboot_on_init_exit) { + /* + * Some init programs in branded zones do not tolerate a + * restart in the traditional manner; setting + * "zone_reboot_on_init_exit" will cause the entire zone to be + * rebooted instead. + */ + + if (z->zone_restart_init_0) { + /* + * Some init programs in branded zones only want to + * restart if they exit 0, otherwise the zone should + * shutdown. Setting the "zone_restart_init_0" member + * controls this behavior. + */ + if (why == CLD_EXITED && what == 0) { + /* Trigger a zone reboot */ + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + /* Shutdown instead of reboot */ + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, + zone_kcred()); + } + } else { + /* Trigger a zone reboot */ + (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); + } + + z->zone_init_status = wstat(why, what); + z->zone_proc_initpid = -1; + return (B_FALSE); + } + + if (z->zone_restart_init_0) { + /* + * Some init programs in branded zones only want to restart if + * they exit 0, otherwise the zone should shutdown. Setting the + * "zone_restart_init_0" member controls this behavior. + * + * In this case we only restart init if it exited successfully. + */ + if (why == CLD_EXITED && what == 0 && + restart_init(what, why) == 0) { + return (B_TRUE); + } + } else { + /* + * No restart modifiers on the zone, attempt to restart init. 
+ */ + if (restart_init(what, why) == 0) { + return (B_TRUE); + } + } + + + /* + * The restart failed, the zone will shut down. + */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); + z->zone_proc_initpid = -1; + return (B_FALSE); +} + +/* * Return value: * 1 - exitlwps() failed, call (or continue) lwp_exit() * 0 - restarting init. Return through system call path @@ -366,45 +502,36 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); + if (p->p_pid == z->zone_proc_initpid) { + /* If zone's init restarts, we're done here. */ + if (zone_init_exit(z, why, what)) + return (0); + } /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); /* - * Don't let init exit unless zone_start_init() failed its exec, or - * we are shutting down the zone or the machine. - * - * Since we are single threaded, we don't need to lock the - * following accesses to zone_proc_initpid. + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. */ - if (p->p_pid == z->zone_proc_initpid) { - if (z->zone_boot_err == 0 && - zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && - zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); - } else { - (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); - } - } - + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); /* - * Since we didn't or couldn't restart init, we clear - * the zone's init state and proceed with exit - * processing. + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
*/ - z->zone_proc_initpid = -1; + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); } lwp_pcb_exit(); @@ -565,7 +692,7 @@ proc_exit(int why, int what) semexit(p); rv = wstat(why, what); - acct(rv & 0xff); + acct(rv); exacct_commit_proc(p, rv); /* @@ -658,10 +785,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -673,7 +812,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -847,8 +987,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. 
+ */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -927,10 +1109,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -958,7 +1139,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -978,10 +1160,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; + + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } - proc_gone = 0; + if (pp->p_child == NULL) { + goto no_real_children; + } + } + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -989,6 +1198,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1033,12 +1247,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1107,11 +1325,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1130,7 +1349,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1226,6 +1445,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 76eddd4e50..41e7e63d2b 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -386,6 +386,7 @@ flist_grow(int maxfd) dst->uf_flag = src->uf_flag; dst->uf_busy = src->uf_busy; dst->uf_portfd = src->uf_portfd; + dst->uf_gen = src->uf_gen; } /* @@ -487,7 +488,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -575,13 +576,12 @@ is_active_fd(kthread_t *t, int fd) } /* - * Convert a user supplied file descriptor into a pointer to a file - * structure. Only task is to check range of the descriptor (soft - * resource limit was enforced at open time and shouldn't be checked - * here). + * Convert a user supplied file descriptor into a pointer to a file structure. + * Only task is to check range of the descriptor (soft resource limit was + * enforced at open time and shouldn't be checked here). */ file_t * -getf(int fd) +getf_gen(int fd, uf_entry_gen_t *genp) { uf_info_t *fip = P_FINFO(curproc); uf_entry_t *ufp; @@ -607,6 +607,9 @@ getf(int fd) return (NULL); } ufp->uf_refcnt++; + if (genp != NULL) { + *genp = ufp->uf_gen; + } set_active_fd(fd); /* record the active file descriptor */ @@ -615,6 +618,12 @@ getf(int fd) return (fp); } +file_t * +getf(int fd) +{ + return (getf_gen(fd, NULL)); +} + /* * Close whatever file currently occupies the file descriptor slot * and install the new file, usually NULL, in the file descriptor slot. @@ -667,6 +676,7 @@ closeandsetf(int fd, file_t *newfp) ASSERT(ufp->uf_flag == 0); fd_reserve(fip, fd, 1); ufp->uf_file = newfp; + ufp->uf_gen++; UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (0); @@ -852,7 +862,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? 
NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { @@ -860,6 +871,7 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) cufp->uf_alloc = pufp->uf_alloc; cufp->uf_flag = pufp->uf_flag; cufp->uf_busy = pufp->uf_busy; + cufp->uf_gen = pufp->uf_gen; if (pufp->uf_file == NULL) { ASSERT(pufp->uf_flag == 0); if (pufp->uf_busy) { @@ -1028,6 +1040,9 @@ ufalloc_file(int start, file_t *fp) fd_reserve(fip, fd, 1); ASSERT(ufp->uf_file == NULL); ufp->uf_file = fp; + if (fp != NULL) { + ufp->uf_gen++; + } UF_EXIT(ufp); mutex_exit(&fip->fi_lock); return (fd); @@ -1183,6 +1198,7 @@ setf(int fd, file_t *fp) } else { UF_ENTER(ufp, fip, fd); ASSERT(ufp->uf_busy); + ufp->uf_gen++; } ASSERT(ufp->uf_fpollinfo == NULL); ASSERT(ufp->uf_flag == 0); @@ -1212,8 +1228,7 @@ f_getfl(int fd, int *flagp) error = EBADF; else { vnode_t *vp = fp->f_vnode; - int flag = fp->f_flag | - ((fp->f_flag2 & ~FEPOLLED) << 16); + int flag = fp->f_flag | (fp->f_flag2 << 16); /* * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index a63931459f..7e198910b4 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -705,7 +706,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -754,7 +755,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -791,6 +792,9 @@ extern struct as kas; /* * fork a kernel process. + * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. */ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -809,6 +813,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -852,8 +857,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -914,7 +929,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? 
curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -955,7 +971,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1016,6 +1036,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1083,9 +1106,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1170,6 +1190,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1187,6 +1219,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1200,6 +1233,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index de2a4f26c4..07fd623a95 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \ !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int smmap_common(caddr_t *addrp, size_t len, int prot, int flags, struct file *fp, offset_t pos) { @@ -780,8 +792,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -789,10 +799,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. 
- * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. - * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) - */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 86cb867da8..bf917ef716 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. */ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index bc0cda418b..ed2c7fc346 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. @@ -1011,6 +1012,7 @@ size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ size_t kmem_content_log_size; /* content log size [2% of memory] */ size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */ size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ @@ -1018,6 +1020,14 @@ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +#ifdef DEBUG +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ +#else +int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */ +#endif + +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ + #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ #else @@ -1098,6 +1108,7 @@ kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; kmem_log_header_t *kmem_slab_log; +kmem_log_header_t *kmem_zerosized_log; static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ @@ -2853,8 +2864,33 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we log to the kmem_zerosized_log that + * that this condition has occurred (which gives us + * enough information to be able to debug it). 
+ */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0; " + "see kmem_zerosized_log for details"); + } + + kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL); + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -4397,8 +4433,8 @@ kmem_init(void) } kmem_failure_log = kmem_log_init(kmem_failure_log_size); - kmem_slab_log = kmem_log_init(kmem_slab_log_size); + kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size); /* * Initialize STREAMS message caches so allocb() is available. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -198,6 +198,9 @@ struct { kstat_named_t pagesfree; kstat_named_t pageslocked; kstat_named_t pagestotal; + kstat_named_t lowmemscan; + kstat_named_t zonecapscan; + kstat_named_t nthrottle; } system_pages_kstat = { { "physmem", KSTAT_DATA_ULONG }, { "nalloc", KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct { { "pagesfree", KSTAT_DATA_ULONG }, { "pageslocked", KSTAT_DATA_ULONG }, { "pagestotal", KSTAT_DATA_ULONG }, + { "low_mem_scan", KSTAT_DATA_ULONG }, + { "zone_cap_scan", KSTAT_DATA_ULONG }, + { "n_throttle", KSTAT_DATA_ULONG }, }; static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw) system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial - availrmem); system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages; + system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan; + system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan; + system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle; /* * pp_kernel represents total pages used by the kernel since the * startup. This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index 6288f47bed..6f6aced619 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -90,6 +91,7 @@ #include <sys/pg.h> #include <sys/promif.h> #include <sys/sdt.h> +#include <sys/ht.h> lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ @@ -520,6 +522,8 @@ lgrp_main_mp_init(void) { klgrpset_t changed; + ht_init(); + /* * Update lgroup topology (if necessary) */ diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..06c03dd38e 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2019, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif @@ -491,7 +491,7 @@ log_console(log_t *lp, log_ctl_t *lc) mblk_t * log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, - size_t size, int on_intr) + size_t size, int on_intr) { mblk_t *mp = NULL; mblk_t *mp2; diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index b2adae570f..341e4ae356 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/param.h> @@ -57,6 +57,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -115,7 +117,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +285,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +645,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +657,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -696,8 +698,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. 
*/ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -718,6 +739,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + lwp_fp_init(lwp); if (state == TS_RUN) { @@ -755,8 +783,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -829,8 +858,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -838,21 +886,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. */ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -893,13 +953,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -943,6 +996,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. 
* Otherwise null out the lwp's le_thread pointer in the lwp @@ -1103,7 +1168,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7bc41b6954..3364d1e523 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -158,7 +158,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -288,7 +288,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)exec_fnamep, (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 3571747e9c..6be46fa422 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp) * Put pressure on pageout. */ page_needfree(free_get); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 142c10754e..0410e6f47b 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1381,10 +1381,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that it is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.) */ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index e2a3335eb4..f1003f7834 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 Joyent, Inc. */ /* @@ -3470,6 +3471,11 @@ mod_load(struct modctl *mp, int usepath) retval = install_stubs_by_name(mp, mp->mod_modname); /* + * Perform hotinlines before module is started. + */ + do_hotinlines(mp->mod_mp); + + /* * Now that the module is loaded, we need to give DTrace * a chance to notify its providers. This is done via * the dtrace_modload function pointer.
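The mmapobj.c hunk above is truncated before the body of the check, but the constraint described in the new comment reduces to a page-mask test on the first PT_LOAD segment's p_vaddr. A minimal sketch of that test follows; the PAGEMASK macro and the ENOTSUP return value are assumptions based on the surrounding mmapobj.c context, not part of the diff itself.

	/*
	 * Sketch only: for an ET_DYN object, reject a first PT_LOAD whose
	 * p_vaddr lies beyond the first page.  PAGEMASK clears the in-page
	 * offset, so anything non-NULL after masking is above page zero.
	 * The ENOTSUP error code is an assumption from surrounding code.
	 */
	if (e_type == ET_DYN &&
	    ((caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK)) != NULL) {
		return (ENOTSUP);
	}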
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 3605104ae7..a04294eed5 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Nexenta Systems, Inc. */ @@ -57,10 +58,12 @@ struct hwc_class *hcl_head; /* head of list of classes */ static kmutex_t hcl_lock; /* for accessing list of classes */ #define DAFILE "/etc/driver_aliases" +#define PPTFILE "/etc/ppt_aliases" #define CLASSFILE "/etc/driver_classes" #define DACFFILE "/etc/dacf.conf" static char class_file[] = CLASSFILE; +static char pptfile[] = PPTFILE; static char dafile[] = DAFILE; static char dacffile[] = DACFFILE; @@ -2150,14 +2153,13 @@ hwc_parse_now(char *fname, struct par_list **pl, ddi_prop_t **props) return (0); /* always return success */ } -void -make_aliases(struct bind **bhash) +static void +parse_aliases(struct bind **bhash, struct _buf *file) { enum { AL_NEW, AL_DRVNAME, AL_DRVNAME_COMMA, AL_ALIAS, AL_ALIAS_COMMA } state; - struct _buf *file; char tokbuf[MAXPATHLEN]; char drvbuf[MAXPATHLEN]; token_t token; @@ -2166,9 +2168,6 @@ make_aliases(struct bind **bhash) static char dupwarn[] = "!Driver alias \"%s\" conflicts with " "an existing driver name or alias."; - if ((file = kobj_open_file(dafile)) == (struct _buf *)-1) - return; - state = AL_NEW; major = DDI_MAJOR_T_NONE; while (!done) { @@ -2253,8 +2252,22 @@ make_aliases(struct bind **bhash) kobj_file_err(CE_WARN, file, tok_err, tokbuf); } } +} - kobj_close_file(file); +void +make_aliases(struct bind **bhash) +{ + struct _buf *file; + + if ((file = kobj_open_file(pptfile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } + + if ((file = kobj_open_file(dafile)) != (struct _buf *)-1) { + parse_aliases(bhash, file); + kobj_close_file(file); + } } diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..eba6147fab 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -112,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -522,6 +535,20 @@ sprunlock(proc_t *p) THREAD_KPRI_RELEASE(); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + THREAD_KPRI_RELEASE(); +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index d3d362a8a7..861c748cff 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -56,6 +56,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1244,6 +1245,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. + */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2092,6 +2109,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2607,3 +2631,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bc1787c9ca..854fb602da 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 09b80323d5..e0a1126567 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. */ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2872,12 +2875,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
* * Return values * 0 - success @@ -2895,6 +2898,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2938,11 +2942,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. 
Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 5721083751..18b396a765 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -327,12 +328,17 @@ schedctl_sigblock(kthread_t *t) /* - * If the sc_sigblock field is set for the specified thread, set - * its signal mask to block all maskable signals, then clear the - * sc_sigblock field. This finishes what user-level code requested - * to be done when it set tdp->sc_shared->sc_sigblock non-zero. - * Called from signal-related code either by the current thread for - * itself or by a thread that holds the process's p_lock (/proc code). + * If the sc_sigblock field is set for the specified thread, set its signal + * mask to block all maskable signals, then clear the sc_sigblock field. This + * accomplishes what user-level code requested to be done when it set + * tdp->sc_shared->sc_sigblock non-zero. + * + * This is generally called by signal-related code in the current thread. In + * order to call against a thread other than curthread, p_lock for the + * containing process must be held. Even then, the caller is not protected + * from races with the thread in question updating its own fields. It is the + * responsibility of the caller to perform additional synchronization. + * */ void schedctl_finish_sigblock(kthread_t *t) diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index bacc595f78..5deae96d73 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). @@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. 
+ */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. */ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... 
*/ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void) } /* + * The brand hook name 'b_issig_stop' is a misnomer. + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + int r; + + /* + * The brand hook will return 0 if it would like + * us to drive on, -1 if we should restart + * the loop to check other conditions, or 1 if we + * should terminate the loop. + */ + r = BROP(p)->b_issig_stop(p, lwp); + if (r < 0) { + continue; + } else if (r > 0) { + break; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. * Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +761,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +993,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1114,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child procs + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1863,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals. + */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index d4c2f7023d..68afeef013 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -986,12 +987,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). 
*/ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1030,6 +1039,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1049,8 +1060,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). @@ -2545,6 +2562,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2636,13 +2655,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! 
uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2660,17 +2687,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2678,6 +2711,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2694,6 +2729,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3179,6 +3216,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 1ffb561428..ac1ee2d1ce 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -8470,6 +8471,12 @@ mblk_copycred(mblk_t *mp, const mblk_t *src) dbp->db_cpid = cpid; } + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. + */ int hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, uint32_t start, uint32_t stuff, uint32_t end, uint32_t value, diff --git a/usr/src/uts/common/os/subr.c b/usr/src/uts/common/os/subr.c index 8ca338a986..ee7293db9a 100644 --- a/usr/src/uts/common/os/subr.c +++ b/usr/src/uts/common/os/subr.c @@ -23,8 +23,12 @@ * Use is subject to license terms. */ +/* + * Copyright 2019 Joyent, Inc. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ #include <sys/types.h> #include <sys/sysmacros.h> @@ -308,46 +312,60 @@ uchar_t bcd_to_byte[256] = { /* CSTYLED */ /* * Hot-patch a single instruction in the kernel's text. - * If you want to patch multiple instructions you must - * arrange to do it so that all intermediate stages are - * sane -- we don't stop other cpus while doing this. + * + * If you want to patch multiple instructions you must arrange to do it so that + * all intermediate stages are sane -- we don't stop other cpus while doing + * this. + * * Size must be 1, 2, or 4 bytes with iaddr aligned accordingly. + * + * The instruction itself might straddle a page boundary, so we have to account + * for that. 
*/ void hot_patch_kernel_text(caddr_t iaddr, uint32_t new_instr, uint_t size) { + const uintptr_t pageoff = (uintptr_t)iaddr & PAGEOFFSET; + const boolean_t straddles = (pageoff + size > PAGESIZE); + const size_t mapsize = straddles ? PAGESIZE * 2 : PAGESIZE; + caddr_t ipageaddr = iaddr - pageoff; caddr_t vaddr; page_t **ppp; - uintptr_t off = (uintptr_t)iaddr & PAGEOFFSET; - vaddr = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); + vaddr = vmem_alloc(heap_arena, mapsize, VM_SLEEP); - (void) as_pagelock(&kas, &ppp, iaddr - off, PAGESIZE, S_WRITE); + (void) as_pagelock(&kas, &ppp, ipageaddr, mapsize, S_WRITE); hat_devload(kas.a_hat, vaddr, PAGESIZE, - hat_getpfnum(kas.a_hat, iaddr - off), - PROT_READ | PROT_WRITE, HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); + hat_getpfnum(kas.a_hat, ipageaddr), PROT_READ | PROT_WRITE, + HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); + + if (straddles) { + hat_devload(kas.a_hat, vaddr + PAGESIZE, PAGESIZE, + hat_getpfnum(kas.a_hat, ipageaddr + PAGESIZE), + PROT_READ | PROT_WRITE, HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); + } switch (size) { case 1: - *(uint8_t *)(vaddr + off) = new_instr; + *(uint8_t *)(vaddr + pageoff) = new_instr; break; case 2: - *(uint16_t *)(vaddr + off) = new_instr; + *(uint16_t *)(vaddr + pageoff) = new_instr; break; case 4: - *(uint32_t *)(vaddr + off) = new_instr; + *(uint32_t *)(vaddr + pageoff) = new_instr; break; default: panic("illegal hot-patch"); } membar_enter(); - sync_icache(vaddr + off, size); + sync_icache(vaddr + pageoff, size); sync_icache(iaddr, size); - as_pageunlock(&kas, ppp, iaddr - off, PAGESIZE, S_WRITE); - hat_unload(kas.a_hat, vaddr, PAGESIZE, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, vaddr, PAGESIZE); + as_pageunlock(&kas, ppp, ipageaddr, mapsize, S_WRITE); + hat_unload(kas.a_hat, vaddr, mapsize, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, vaddr, mapsize); } /* diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index c39819156d..e0cc20fa45 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5903,6 +5903,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. Take mask and clear * all but the most significant bit, and @@ -5914,8 +5920,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index fb8bf07077..fb64000e4d 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. 
*/ @@ -61,8 +62,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -647,7 +647,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1002,7 +1002,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1094,18 +1094,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1113,7 +1115,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1127,14 +1130,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1146,7 +1150,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1154,7 +1159,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop 
a process @@ -1168,14 +1174,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1203,5 +1210,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index b25a6cbcf1..5453ebf380 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,11 +25,12 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/timer.h> #include <sys/systm.h> +#include <sys/sysmacros.h> #include <sys/param.h> #include <sys/kmem.h> #include <sys/debug.h> @@ -81,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -123,6 +125,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) timer_lock(p, it); } + ASSERT(p->p_itimer_sz > tid); ASSERT(p->p_itimer[tid] == it); p->p_itimer[tid] = NULL; @@ -137,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -199,18 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; - if (tid >= timer_max || tid < 0) + if (tid < 0) { return (NULL); + } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -232,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. */ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -245,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. */ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -258,6 +263,13 @@ clock_timer_init() { clock_timer_cache = kmem_cache_create("timer_cache", sizeof (itimer_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Push the timer_max limit up to at least 4 * NCPU. Due to the way + * NCPU is defined, proper initialization of the timer limit is + * performed at runtime. 
+ */ + timer_max = MAX(NCPU * 4, timer_max); } void @@ -453,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -466,159 +481,175 @@ timer_fire(itimer_t *it) mutex_exit(&p->p_lock); } -int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +/* + * Allocate an itimer_t and find an appropriate slot for it in p_itimer. + * Acquires p_lock and holds it on return, regardless of success. + */ +static itimer_t * +timer_alloc(proc_t *p, timer_t *id) { - struct sigevent ev; - proc_t *p = curproc; - clock_backend_t *backend; - itimer_t *it, **itp; - sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; + itimer_t *it, **itp = NULL; + uint_t i; - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); + ASSERT(MUTEX_NOT_HELD(&p->p_lock)); - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. - */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); + it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); + bzero(it, sizeof (itimer_t)); + mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); + mutex_enter(&p->p_lock); +retry: + if (p->p_itimer != NULL) { + for (i = 0; i < p->p_itimer_sz; i++) { + if (p->p_itimer[i] == NULL) { + itp = &(p->p_itimer[i]); + break; } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; + } + } - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; + /* + * A suitable slot was not found. If possible, allocate (or resize) + * the p_itimer array and try again. + */ + if (itp == NULL) { + uint_t target_sz = _TIMER_ALLOC_INIT; + itimer_t **itp_new; + + if (p->p_itimer != NULL) { + ASSERT(p->p_itimer_sz != 0); + + target_sz = p->p_itimer_sz * 2; + } + /* + * Protect against exceeding the max or overflow + */ + if (target_sz > timer_max || target_sz > INT_MAX || + target_sz < p->p_itimer_sz) { + kmem_cache_free(clock_timer_cache, it); + return (NULL); + } + mutex_exit(&p->p_lock); + itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), + KM_SLEEP); + mutex_enter(&p->p_lock); + if (target_sz <= p->p_itimer_sz) { /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. + * A racing thread performed the resize while we were + * waiting outside p_lock. Discard our now-useless + * allocation and retry.
*/ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + goto retry; + } else { + /* + * Instantiate the larger allocation and select the + * first fresh entry for use. + */ + if (p->p_itimer != NULL) { + uint_t old_sz; + + old_sz = p->p_itimer_sz; + bcopy(p->p_itimer, itp_new, + old_sz * sizeof (itimer_t *)); + kmem_free(p->p_itimer, + old_sz * sizeof (itimer_t *)); + + /* + * Short circuit to use the first free entry in + * the new allocation. It's possible that + * other lower-indexed timers were freed while + * p_lock was dropped, but skipping over them + * is not harmful at all. In the common case, + * we skip the need to walk over an array + * filled with timers before arriving at the + * slot we know is fresh from the allocation. + */ + i = old_sz; + } else { + /* + * For processes lacking any existing timers, + * we can simply select the first entry. + */ + i = 0; } -#endif + p->p_itimer = itp_new; + p->p_itimer_sz = target_sz; } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; } + ASSERT(i <= INT_MAX); + *id = (timer_t)i; + return (it); +} + +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ +int +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) +{ + proc_t *p = curproc; + int error = 0; + itimer_t *it; + sigqueue_t *sigq; + timer_t tid; + /* - * We'll allocate our timer and sigqueue now, before we grab p_lock. - * If we can't find an empty slot, we'll free them before returning. + * We'll allocate our sigqueue now, before we grab p_lock. + * If we can't find an empty slot, we'll free it before returning. */ - it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); - bzero(it, sizeof (itimer_t)); - mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); - mutex_enter(&p->p_lock); - /* - * If this is this process' first timer, we need to attempt to allocate - * an array of timerstr_t pointers. We drop p_lock to perform the - * allocation; if we return to discover that p_itimer is non-NULL, - * we will free our allocation and drive on. + * Allocate a timer and choose a slot for it. This acquires p_lock. 
*/ - if ((itp = p->p_itimer) == NULL) { - mutex_exit(&p->p_lock); - itp = kmem_zalloc(timer_max * sizeof (itimer_t *), KM_SLEEP); - mutex_enter(&p->p_lock); - - if (p->p_itimer == NULL) - p->p_itimer = itp; - else { - kmem_free(itp, timer_max * sizeof (itimer_t *)); - itp = p->p_itimer; - } - } - - for (i = 0; i < timer_max && itp[i] != NULL; i++) - continue; + it = timer_alloc(p, &tid); + ASSERT(MUTEX_HELD(&p->p_lock)); - if (i == timer_max) { - /* - * We couldn't find a slot. Drop p_lock, free the preallocated - * timer and sigqueue, and return an error. - */ + if (it == NULL) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - - return (set_errno(EAGAIN)); + return (EAGAIN); } - ASSERT(i < timer_max && itp[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); + ASSERT(evp != NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. */ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; it->it_sigq = sigq; it->it_backend = backend; it->it_lock = ITLK_LOCKED; - itp[i] = it; - - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -638,18 +669,17 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, (port_source_t **)&it->it_portsrc, timer_close_port, (void *)it, NULL); if (error) { - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -658,23 +688,24 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) if (error) { (void) port_dissociate_ksource(port, PORT_SOURCE_TIMER, (port_source_t *)it->it_portsrc); - itp[i] = NULL; /* clear slot */ mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } + /* Populate the slot now that the timer is prepped. 
*/ + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -687,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -708,11 +730,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); - return (set_errno(error)); + return (error); } + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. + */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; + + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. + */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. 
+ */ + timer_release(p, it); + + return (0); +} + + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -832,20 +958,23 @@ timer_getoverrun(timer_t tid) void timer_lwpexit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -876,20 +1005,22 @@ timer_lwpexit(void) void timer_lwpbind() { - timer_t i; + uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } - for (i = 0; i < timer_max; i++) { - if ((it = itp[i]) == NULL) + for (i = 0; i < p->p_itimer_sz; i++) { + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -911,16 +1042,19 @@ timer_lwpbind() void timer_exit(void) { - timer_t i; + uint_t i; proc_t *p = curproc; ASSERT(p->p_itimer != NULL); + ASSERT(p->p_itimer_sz != 0); - for (i = 0; i < timer_max; i++) - (void) timer_delete(i); + for (i = 0; i < p->p_itimer_sz; i++) { + (void) timer_delete((timer_t)i); + } - kmem_free(p->p_itimer, timer_max * sizeof (itimer_t *)); + kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *)); p->p_itimer = NULL; + p->p_itimer_sz = 0; } /* @@ -977,7 +1111,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 61acc6cf97..53be806026 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* @@ -1172,6 +1173,14 @@ timespectohz64(timespec_t *tv) void hrt2ts(hrtime_t hrt, timestruc_t *tsp) { +#if defined(__amd64) + /* + * The cleverness explained above is unnecessary on x86_64 CPUs where + * modern compilers are able to optimize down to faster operations. + */ + tsp->tv_sec = hrt / NANOSEC; + tsp->tv_nsec = hrt % NANOSEC; +#else uint32_t sec, nsec, tmp; tmp = (uint32_t)(hrt >> 30); @@ -1193,20 +1202,28 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp) } tsp->tv_sec = (time_t)sec; tsp->tv_nsec = nsec; +#endif /* defined(__amd64) */ } /* * Convert from timestruc_t to hrtime_t. - * - * The code below is equivalent to: - * - * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; - * - * but requires no integer multiply. */ hrtime_t ts2hrt(const timestruc_t *tsp) { +#if defined(__amd64) || defined(__i386) + /* + * On modern x86 CPUs, the simple version is faster. + */ + return ((tsp->tv_sec * NANOSEC) + tsp->tv_nsec); +#else + /* + * The code below is equivalent to: + * + * hrt = tsp->tv_sec * NANOSEC + tsp->tv_nsec; + * + * but requires no integer multiply.
+ */ hrtime_t hrt; hrt = tsp->tv_sec; @@ -1215,6 +1232,7 @@ ts2hrt(const timestruc_t *tsp) hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); +#endif /* defined(__amd64) || defined(__i386) */ } /* @@ -1246,6 +1264,13 @@ tv2hrt(struct timeval *tvp) void hrt2tv(hrtime_t hrt, struct timeval *tvp) { +#if defined(__amd64) + /* + * Like hrt2ts, the simple version is faster on x86_64. + */ + tvp->tv_sec = hrt / NANOSEC; + tvp->tv_usec = (hrt % NANOSEC) / (NANOSEC / MICROSEC); +#else uint32_t sec, nsec, tmp; uint32_t q, r, t; @@ -1267,17 +1292,17 @@ hrt2tv(hrtime_t hrt, struct timeval *tvp) sec++; } tvp->tv_sec = (time_t)sec; -/* - * this routine is very similar to hr2ts, but requires microseconds - * instead of nanoseconds, so an interger divide by 1000 routine - * completes the conversion - */ + /* + * this routine is very similar to hrt2ts, but requires microseconds + * instead of nanoseconds, so an integer divide by 1000 routine + * completes the conversion + */ t = (nsec >> 7) + (nsec >> 8) + (nsec >> 12); q = (nsec >> 1) + t + (nsec >> 15) + (t >> 11) + (t >> 14); q = q >> 9; r = nsec - q*1000; tvp->tv_usec = q + ((r + 24) >> 10); - +#endif /* defined(__amd64) */ } int diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index 608208bbca..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -58,6 +59,7 @@ #include <sys/tnf_probe.h> #include <sys/mem_cage.h> #include <sys/time.h> +#include <sys/zone.h> #include <vm/hat.h> #include <vm/as.h> @@ -73,7 +75,7 @@ static int checkpage(page_t *, int); * algorithm. They are initialized to 0, and then computed at boot time * based on the size of the system. If they are patched non-zero in * a loaded vmunix they are left alone and may thus be changed per system - * using adb on the loaded system. + * using mdb on the loaded system. */ pgcnt_t slowscan = 0; pgcnt_t fastscan = 0; @@ -81,6 +83,7 @@ pgcnt_t fastscan = 0; static pgcnt_t handspreadpages = 0; static int loopfraction = 2; static pgcnt_t looppages; +/* See comment below describing 4% and 80% */ static int min_percent_cpu = 4; static int max_percent_cpu = 80; static pgcnt_t maxfastscan = 0; @@ -98,14 +101,34 @@ pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; +uint64_t n_throttle; + +clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */ + /* * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks * are the number of ticks in each wakeup cycle that gives the * equivalent of some underlying %CPU duty cycle. - * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is - * awakened every 25 clock ticks. So, converting from %CPU to ticks - * per wakeup cycle would be x% of 25, that is (x * 100) / 25. - * So, for example, 4% == 1 tick and 80% == 20 ticks. + * + * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() + * will run 4 times/sec to update pageout scanning parameters and kick off + * the pageout_scanner() thread if necessary. + * + * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU).
When + * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * + * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When + * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed + * by the scanner in a 1 second interval is 80% of a CPU + * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 + * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * + * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks + * will be 200, so the CPU percentages are the same as when hz is 100. * * min_pageout_ticks: * ticks/wakeup equivalent of min_percent_cpu. @@ -117,19 +140,29 @@ pgcnt_t desscan; * Number of clock ticks budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_ticks .. max_pageout_ticks, - * depending on memory pressure. - * - * pageout_lbolt: - * Timestamp of the last time pageout_scanner woke up and started - * (or resumed) scanning for not recently referenced pages. + * depending on memory pressure or zones over their cap. */ static clock_t min_pageout_ticks; static clock_t max_pageout_ticks; static clock_t pageout_ticks; -static clock_t pageout_lbolt; -static uint_t reset_hands; +#define MAX_PSCAN_THREADS 16 +static boolean_t reset_hands[MAX_PSCAN_THREADS]; + +/* + * These can be tuned in /etc/system or set with mdb. + * 'des_page_scanners' is the desired number of page scanner threads. The + * system will bring the actual number of threads into line with the desired + * number. If des_page_scanners is set to an invalid value, the system will + * correct the setting. + */ +uint_t des_page_scanners; +uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */ + +uint_t n_page_scanners; +static pgcnt_t pscan_region_sz; /* informational only */ + #define PAGES_POLL_MASK 1023 @@ -145,33 +178,37 @@ static uint_t reset_hands; * pageout_sample_pages: * The accumulated number of pages scanned during sampling. * - * pageout_sample_ticks: - * The accumulated clock ticks for the sample. + * pageout_sample_etime: + * The accumulated number of nanoseconds for the sample. * * pageout_rate: - * Rate in pages/nanosecond, computed at the end of sampling. + * Rate in pages/second, computed at the end of sampling. * * pageout_new_spread: - * The new value to use for fastscan and handspreadpages. - * Calculated after enough samples have been taken. + * The new value to use for maxfastscan and (perhaps) handspreadpages. + * Intended to be the number pages that can be scanned per sec using ~10% + * of a CPU. Calculated after enough samples have been taken. + * pageout_rate / 10 */ typedef hrtime_t hrrate_t; -static uint64_t pageout_sample_lim = 4; -static uint64_t pageout_sample_cnt = 0; +static uint_t pageout_sample_lim = 4; +static uint_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static clock_t pageout_cycle_ticks; -static hrtime_t sample_start, sample_end; static hrtime_t pageout_sample_etime = 0; +/* True if page scanner is first starting up */ +#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) + /* * Record number of times a pageout_scanner wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. 
This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap. */ uint64_t pageout_timeouts = 0; @@ -194,25 +231,35 @@ kcondvar_t memavail_cv; #define LOOPPAGES total_pages /* - * Set up the paging constants for the clock algorithm. - * Called after the system is initialized and the amount of memory - * and number of paging devices is known. + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * Set up the paging constants for the page scanner clock-hand algorithm. + * Called at startup after the system is initialized and the amount of memory + * and number of paging devices is known (recalc will be 0). Called again once + * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples + * (recalc will be 1). + * + * Will also be called after a memory dynamic reconfiguration operation and + * recalc will be 1 in those cases too. * - * lotsfree is 1/64 of memory, but at least 512K. + * lotsfree is 1/64 of memory, but at least 512K (ha!). * desfree is 1/2 of lotsfree. * minfree is 1/2 of desfree. - * - * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: - * - * lotsfree = btop(512K) - * desfree = btop(200K) - * minfree = btop(100K) - * throttlefree = INT_MIN - * max_percent_cpu = 4 */ void setupclock(int recalc) { + uint_t i; + pgcnt_t sz, tmp; static spgcnt_t init_lfree, init_dfree, init_mfree; static spgcnt_t init_tfree, init_preserve, init_mpgio; @@ -221,8 +268,8 @@ setupclock(int recalc) looppages = LOOPPAGES; /* - * setupclock can now be called to recalculate the paging - * parameters in the case of dynamic addition of memory. + * setupclock can be called to recalculate the paging + * parameters in the case of dynamic reconfiguration of memory. * So to make sure we make the proper calculations, if such a * situation should arise, we save away the initial values * of each parameter so we can recall them when needed. This @@ -311,105 +358,98 @@ setupclock(int recalc) maxpgio = init_mpgio; /* - * The clock scan rate varies between fastscan and slowscan - * based on the amount of free memory available. Fastscan - * rate should be set based on the number pages that can be - * scanned per sec using ~10% of processor time. Since this - * value depends on the processor, MMU, Mhz etc., it is - * difficult to determine it in a generic manner for all - * architectures. + * When the system is in a low memory state, the page scan rate varies + * between fastscan and slowscan based on the amount of free memory + * available. When only zones are over their memory cap, the scan rate + * is always fastscan. * - * Instead of trying to determine the number of pages scanned - * per sec for every processor, fastscan is set to be the smaller - * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling - * time is limited to ~4% of processor time. + * The fastscan rate should be set based on the number pages that can + * be scanned per sec using ~10% of a CPU. 
Since this value depends on + * the processor, MMU, Ghz etc., it must be determined dynamically. * - * Setting fastscan to be 1/2 of memory allows pageout to scan - * all of memory in ~2 secs. This implies that user pages not - * accessed within 1 sec (assuming, handspreadpages == fastscan) - * can be reclaimed when free memory is very low. Stealing pages - * not accessed within 1 sec seems reasonable and ensures that - * active user processes don't thrash. + * When the scanner first starts up, fastscan will be set to 0 and + * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). + * However, once the scanner has collected enough samples, then fastscan + * is set to be the smaller of 1/2 of memory (looppages / loopfraction) + * or maxfastscan (which is set from pageout_new_spread). Thus, + * MAXHANDSPREADPAGES is irrelevant after the scanner is fully + * initialized. * - * Smaller values of fastscan result in scanning fewer pages - * every second and consequently pageout may not be able to free - * sufficient memory to maintain the minimum threshold. Larger - * values of fastscan result in scanning a lot more pages which - * could lead to thrashing and higher CPU usage. + * pageout_new_spread is calculated when the scanner first starts + * running. During this initial sampling period the nscan_limit + * is set to the total_pages of system memory. Thus, the scanner could + * theoretically scan all of memory in one pass. However, each sample + * is also limited by the %CPU budget. This is controlled by + * pageout_ticks which is set in schedpaging(). During the sampling + * period, pageout_ticks is set to max_pageout_ticks. This tick value + * is derived from the max_percent_cpu (80%) described above. On a + * system with more than a small amount of memory (~8GB), the scanner's + * %CPU will be the limiting factor in calculating pageout_new_spread. * - * Fastscan needs to be limited to a maximum value and should not - * scale with memory to prevent pageout from consuming too much - * time for scanning on slow CPU's and avoid thrashing, as a - * result of scanning too many pages, on faster CPU's. - * The value of 64 Meg was chosen for MAXHANDSPREADPAGES - * (the upper bound for fastscan) based on the average number - * of pages that can potentially be scanned in ~1 sec (using ~4% - * of the CPU) on some of the following machines that currently - * run Solaris 2.x: + * At the end of the sampling period, the pageout_rate indicates how + * many pages could be scanned per second. The pageout_new_spread is + * then set to be 1/10th of that (i.e. approximating 10% of a CPU). + * Of course, this value could still be more than the physical memory + * on the system. If so, fastscan is set to 1/2 of memory, as + * mentioned above. * - * average memory scanned in ~1 sec + * All of this leads up to the setting of handspreadpages, which is + * set to fastscan. This is the distance, in pages, between the front + * and back hands during scanning. It will dictate which pages will + * be considered "hot" on the backhand and which pages will be "cold" + * and reclaimed * - * 25 Mhz SS1+: 23 Meg - * LX: 37 Meg - * 50 Mhz SC2000: 68 Meg + * If the scanner is limited by desscan, then at the highest rate it + * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the + * scanner is limited by the %CPU, then at the highest rate (20% of a + * CPU per cycle) the number of pages scanned could be much less. 
* - * 40 Mhz 486: 26 Meg - * 66 Mhz 486: 42 Meg + * Thus, if the scanner is limited by desscan, then the handspreadpages + * setting means 1sec between the front and back hands, but if the + * scanner is limited by %CPU, it could be several seconds between the + * two hands. * - * When free memory falls just below lotsfree, the scan rate - * goes from 0 to slowscan (i.e., pageout starts running). This + * The basic assumption is that at the worst case, stealing pages + * not accessed within 1 sec seems reasonable and ensures that active + * user processes don't thrash. This is especially true when the system + * is in a low memory state. + * + * There are some additional factors to consider for the case of + * scanning when zones are over their cap. In this situation it is + * also likely that the machine will have a large physical memory which + * will take many seconds to fully scan (due to the %CPU and desscan + * limits per cycle). It is probable that there will be few (or 0) + * pages attributed to these zones in any single scanning cycle. The + * result is that reclaiming enough pages for these zones might take + * several additional seconds (this is generally not a problem since + * the zone physical cap is just a soft cap). + * + * This is similar to the typical multi-processor situation in which + * pageout is often unable to maintain the minimum paging thresholds + * under heavy load due to the fact that user processes running on + * other CPU's can be dirtying memory at a much faster pace than + * pageout can find pages to free. + * + * One potential approach to address both of these cases is to enable + * more than one CPU to run the page scanner, in such a manner that the + * various clock hands don't overlap. However, this also makes it more + * difficult to determine the values for fastscan, slowscan and + * handspreadpages. This is left as a future enhancement, if necessary. + * + * When free memory falls just below lotsfree, the scan rate goes from + * 0 to slowscan (i.e., the page scanner starts running). This * transition needs to be smooth and is achieved by ensuring that * pageout scans a small number of pages to satisfy the transient * memory demand. This is set to not exceed 100 pages/sec (25 per * wakeup) since scanning that many pages has no noticible impact * on system performance. * - * In addition to setting fastscan and slowscan, pageout is - * limited to using ~4% of the CPU. This results in increasing - * the time taken to scan all of memory, which in turn means that - * user processes have a better opportunity of preventing their - * pages from being stolen. This has a positive effect on - * interactive and overall system performance when memory demand - * is high. - * - * Thus, the rate at which pages are scanned for replacement will - * vary linearly between slowscan and the number of pages that - * can be scanned using ~4% of processor time instead of varying - * linearly between slowscan and fastscan. - * - * Also, the processor time used by pageout will vary from ~1% - * at slowscan to ~4% at fastscan instead of varying between - * ~1% at slowscan and ~10% at fastscan. - * - * The values chosen for the various VM parameters (fastscan, - * handspreadpages, etc) are not universally true for all machines, - * but appear to be a good rule of thumb for the machines we've - * tested. 
They have the following ranges: - * - * cpu speed: 20 to 70 Mhz - * page size: 4K to 8K - * memory size: 16M to 5G - * page scan rate: 4000 - 17400 4K pages per sec - * - * The values need to be re-examined for machines which don't - * fall into the various ranges (e.g., slower or faster CPUs, - * smaller or larger pagesizes etc) shown above. - * - * On an MP machine, pageout is often unable to maintain the - * minimum paging thresholds under heavy load. This is due to - * the fact that user processes running on other CPU's can be - * dirtying memory at a much faster pace than pageout can find - * pages to free. The memory demands could be met by enabling - * more than one CPU to run the clock algorithm in such a manner - * that the various clock hands don't overlap. This also makes - * it more difficult to determine the values for fastscan, slowscan - * and handspreadpages. - * - * The swapper is currently used to free up memory when pageout - * is unable to meet memory demands by swapping out processes. - * In addition to freeing up memory, swapping also reduces the - * demand for memory by preventing user processes from running - * and thereby consuming memory. + * The swapper is currently used to free up memory when pageout is + * unable to meet memory demands. It does this by swapping out entire + * processes. In addition to freeing up memory, swapping also reduces + * the demand for memory because the swapped out processes cannot + * run, and thereby consume memory. However, this is a pathological + * state and performance will generally be considered unacceptable. */ if (init_mfscan == 0) { if (pageout_new_spread != 0) @@ -419,12 +459,13 @@ setupclock(int recalc) } else { maxfastscan = init_mfscan; } - if (init_fscan == 0) + if (init_fscan == 0) { fastscan = MIN(looppages / loopfraction, maxfastscan); - else + } else { fastscan = init_fscan; - if (fastscan > looppages / loopfraction) - fastscan = looppages / loopfraction; + if (fastscan > looppages / loopfraction) + fastscan = looppages / loopfraction; + } /* * Set slow scan time to 1/10 the fast scan time, but @@ -444,12 +485,10 @@ setupclock(int recalc) * decreases as the scan rate rises. It must be < the amount * of pageable memory. * - * Since pageout is limited to ~4% of the CPU, setting handspreadpages - * to be "fastscan" results in the front hand being a few secs - * (varies based on the processor speed) ahead of the back hand - * at fastscan rates. This distance can be further reduced, if - * necessary, by increasing the processor time used by pageout - * to be more than ~4% and preferrably not more than ~10%. + * Since pageout is limited to the %CPU per cycle, setting + * handspreadpages to be "fastscan" results in the front hand being + * a few secs (varies based on the processor speed) ahead of the back + * hand at fastscan rates. * * As a result, user processes have a much better chance of * referencing their pages before the back hand examines them. @@ -471,29 +510,78 @@ setupclock(int recalc) if (handspreadpages >= looppages) handspreadpages = looppages - 1; + if (recalc == 0) { + /* + * Setup basic values at initialization. + */ + pscan_region_sz = total_pages; + des_page_scanners = n_page_scanners = 1; + reset_hands[0] = B_TRUE; + return; + } + /* - * If we have been called to recalculate the parameters, - * set a flag to re-evaluate the clock hand pointers. + * Recalculating + * + * We originally set the number of page scanners to 1. 
Now that we + * know what handspreadpages is for a scanner, figure out how many + * scanners we should run. We want to ensure that the regions don't + * overlap and that they are not touching. + * + * A default 64GB region size is used as the initial value to calculate + * how many scanner threads we should create on lower memory systems. + * The idea is to limit the number of threads to a practical value + * (e.g. a 64GB machine really only needs one scanner thread). For very + * large memory systems, we limit ourselves to MAX_PSCAN_THREADS + * threads. + * + * The scanner threads themselves are evenly spread out around the + * memory "clock" in pageout_scanner when we reset the hands, and each + * thread will scan all of memory. */ - if (recalc) - reset_hands = 1; + sz = (btop(64ULL * 0x40000000ULL)); + if (sz < handspreadpages) { + /* + * 64GB is smaller than the separation between the front + * and back hands; use double handspreadpages. + */ + sz = handspreadpages << 1; + } + if (sz > total_pages) { + sz = total_pages; + } + /* Record region size for inspection with mdb, otherwise unused */ + pscan_region_sz = sz; + + tmp = sz; + for (i = 1; tmp < total_pages; i++) { + tmp += sz; + } + + if (i > MAX_PSCAN_THREADS) + i = MAX_PSCAN_THREADS; + + des_page_scanners = i; } /* * Pageout scheduling. * * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables nscan and desscan RATETOSCHEDPAGING - * times a second. Nscan records the number of pages pageout has examined - * in its current pass; schedpaging resets this value to zero each time - * it runs. Desscan records the number of pages pageout should examine - * in its next pass; schedpaging sets this value based on the amount of - * currently available memory. + * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING + * times a second. The pageout_ticks variable controls the percent of one + * CPU that each page scanner thread should consume (see min_percent_cpu + * and max_percent_cpu descriptions). The desscan variable records the number + * of pages pageout should examine in its next pass; schedpaging sets this + * value based on the amount of currently available memory. In addition, the + * nscan variable records the number of pages pageout has examined in its + * current pass; schedpaging resets this value to zero each time it runs. */ -#define RATETOSCHEDPAGING 4 /* hz that is */ +#define RATETOSCHEDPAGING 4 /* times/second */ -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ +/* held while pageout_scanner or schedpaging are modifying shared data */ +static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests.
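As an aside on the scheduling arithmetic described in the comment above, the following is a minimal stand-alone C sketch (illustrative only, not part of this patch) of how the per-wakeup CPU budget (pageout_ticks) and page budget (desscan) interpolate between their bounds as free memory falls below lotsfree. The sample numbers are assumptions, and the sketch ignores the deficit/needfree handling that the real schedpaging() performs.

/* Illustrative sketch of the schedpaging() budget math; not kernel code. */
#include <stdio.h>

#define RATETOSCHEDPAGING	4	/* schedpaging() wakeups per second */

int
main(void)
{
	long long hz = 100;			/* clock ticks per second */
	long long min_percent_cpu = 4, max_percent_cpu = 80;
	long long lotsfree = 250000;		/* pages */
	long long slowscan = 100, fastscan = 125000;
	long long freemem = 50000;		/* assume memory pressure */

	/* Per-wakeup tick budgets, as for min/max_pageout_ticks. */
	long long min_ticks = ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING;
	long long max_ticks = ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING;
	if (min_ticks < 1)
		min_ticks = 1;

	/* Both budgets scale with how far below lotsfree we have fallen. */
	long long vavail = (freemem < lotsfree) ? freemem : lotsfree;
	long long ticks = min_ticks +
	    (lotsfree - vavail) * (max_ticks - min_ticks) / lotsfree;
	long long desscan = (slowscan * vavail +
	    fastscan * (lotsfree - vavail)) / lotsfree / RATETOSCHEDPAGING;

	(void) printf("CPU budget %lld ticks/wakeup, page budget %lld pages\n",
	    ticks, desscan);
	return (0);
}

With hz at 100 this yields the 1 and 20 tick bounds mentioned earlier (4% and 80% of a CPU once multiplied by the 4 wakeups per second).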
@@ -506,7 +594,7 @@ static kcondvar_t push_cv; static int async_list_size = 256; /* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times @@ -535,67 +623,153 @@ schedpaging(void *arg) if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) kcage_cageout_wakeup(); - if (mutex_tryenter(&pageout_mutex)) { - /* pageout() not running */ - nscan = 0; - vavail = freemem - deficit; - if (pageout_new_spread != 0) - vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + (void) atomic_swap_ulong(&nscan, 0); + vavail = freemem - deficit; + if (pageout_new_spread != 0) + vavail -= needfree; + if (vavail < 0) + vavail = 0; + if (vavail > lotsfree) + vavail = lotsfree; + /* + * Fix for 1161438 (CRS SPR# 73922). All variables + * in the original calculation for desscan were 32 bit signed + * ints. As freemem approaches 0x0 on a system with 1 Gig or + * more of memory, the calculation can overflow. When this + * happens, desscan becomes negative and pageout_scanner() + * stops paging out. + */ + if ((needfree) && (pageout_new_spread == 0)) { /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. + * If we've not yet collected enough samples to + * calculate a spread, kick into high gear anytime + * needfree is non-zero. Note that desscan will not be + * the limiting factor for systems with larger memory; + * the %CPU will limit the scan. That will also be + * maxed out below. */ - if ((needfree) && (pageout_new_spread == 0)) { - /* - * If we've not yet collected enough samples to - * calculate a spread, use the old logic of kicking - * into high gear anytime needfree is non-zero. - */ - desscan = fastscan / RATETOSCHEDPAGING; - } else { - /* - * Once we've calculated a spread based on system - * memory and usage, just treat needfree as another - * form of deficit. - */ - spgcnt_t faststmp, slowstmp, result; + desscan = fastscan / RATETOSCHEDPAGING; + } else { + /* + * Once we've calculated a spread based on system + * memory and usage, just treat needfree as another + * form of deficit. + */ + spgcnt_t faststmp, slowstmp, result; + + slowstmp = slowscan * vavail; + faststmp = fastscan * (lotsfree - vavail); + result = (slowstmp + faststmp) / + nz(lotsfree) / RATETOSCHEDPAGING; + desscan = (pgcnt_t)result; + } + + /* + * If we've not yet collected enough samples to calculate a + * spread, also kick %CPU to the max. + */ + if (pageout_new_spread == 0) { + pageout_ticks = max_pageout_ticks; + } else { + pageout_ticks = min_pageout_ticks + + (lotsfree - vavail) * + (max_pageout_ticks - min_pageout_ticks) / + nz(lotsfree); + } - slowstmp = slowscan * vavail; - faststmp = fastscan * (lotsfree - vavail); - result = (slowstmp + faststmp) / - nz(lotsfree) / RATETOSCHEDPAGING; - desscan = (pgcnt_t)result; + if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { + /* + * We have finished the pagescan initialization and the desired + * number of page scanners has changed, either because + * initialization just finished, because of a memory DR, or + * because des_page_scanners has been modified on the fly (i.e. + * by mdb). 
If we need more scanners, start them now, otherwise + * the excess scanners will terminate on their own when they + * reset their hands. + */ + uint_t i; + uint_t curr_nscan = n_page_scanners; + pgcnt_t max = total_pages / handspreadpages; + + if (des_page_scanners > max) + des_page_scanners = max; + + if (des_page_scanners > MAX_PSCAN_THREADS) { + des_page_scanners = MAX_PSCAN_THREADS; + } else if (des_page_scanners == 0) { + des_page_scanners = 1; } - pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * - (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); + /* + * Each thread has its own entry in the reset_hands array, so + * we don't need any locking in pageout_scanner to check the + * thread's reset_hands entry. Thus, we use a pre-allocated + * fixed size reset_hands array and upper limit on the number + * of pagescan threads. + * + * The reset_hands entries need to be true before we start new + * scanners, but if we're reducing, we don't want a race on the + * recalculation for the existing threads, so we set + * n_page_scanners first. + */ + n_page_scanners = des_page_scanners; + for (i = 0; i < MAX_PSCAN_THREADS; i++) { + reset_hands[i] = B_TRUE; + } - if (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim) { - TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, - "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); - } else { - /* - * There are enough free pages, no need to - * kick the scanner thread. And next time - * around, keep more of the `highly shared' - * pages. - */ - cv_signal_pageout(); - if (po_share > MIN_PO_SHARE) { - po_share >>= 1; + if (des_page_scanners > curr_nscan) { + /* Create additional pageout scanner threads. */ + for (i = curr_nscan; i < des_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, (void *)(uintptr_t)i, + TS_RUN, curthread->t_pri); } } + } + + zones_over = B_FALSE; + + if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { + if (!PAGE_SCAN_STARTUP) + low_mem_scan++; + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + + } else if (zone_num_over_cap > 0) { + /* One or more zones are over their cap. */ + + /* No page limit */ + desscan = total_pages; + + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_ticks in /etc/system or with mdb. + */ + pageout_ticks = (zone_pageout_ticks != 0) ? + zone_pageout_ticks : max_pageout_ticks; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); + + } else { + /* + * There are enough free pages, no need to + * kick the scanner thread. And next time + * around, keep more of the `highly shared' + * pages. + */ + cv_signal_pageout(); + + mutex_enter(&pageout_mutex); + if (po_share > MIN_PO_SHARE) { + po_share >>= 1; + } mutex_exit(&pageout_mutex); } @@ -617,36 +791,46 @@ ulong_t push_list_size; /* # of requests on pageout queue */ #define FRONT 1 #define BACK 2 -int dopageout = 1; /* must be non-zero to turn page stealing on */ +int dopageout = 1; /* /etc/system tunable to disable page reclamation */ /* * The page out daemon, which runs as process 2. * - * As long as there are at least lotsfree pages, - * this process is not run. When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging(). Pageout manages - * two hands on the clock. 
The front hand moves through - memory, clearing the reference bit, - and stealing pages from procs that are over maxrss. - The back hand travels a distance behind the front hand, - freeing the pages that have not been referenced in the time - since the front hand passed. If modified, they are pushed to - swap before being freed. + * Page out occurs when either: + * a) there are fewer than lotsfree pages, + * b) there are one or more zones over their physical memory cap. + * + * The daemon treats physical memory as a circular array of pages and scans the + * pages using a 'two-handed clock' algorithm. The front hand moves through + * the pages, clearing the reference bit. The back hand travels a distance + * (handspreadpages) behind the front hand, freeing the pages that have not + * been referenced in the time since the front hand passed. If modified, they + * are first written to their backing store before being freed. + * + * In order to make page invalidation more responsive on machines with larger + * memory, multiple pageout_scanner threads may be created. In this case, the + * threads are evenly distributed around the memory "clock face" so that + * memory can be reclaimed more quickly (that is, there can be large regions in + * which no pages can be reclaimed by a single thread, leading to lag which + * causes undesirable behavior such as htable stealing). + * + * As long as there are at least lotsfree pages and no zones over their cap, + * then pageout_scanner threads are not run. When pageout_scanner threads are + * running for case (a), all pages are considered for pageout. For case (b), + * only pages belonging to a zone over its cap will be considered for pageout. * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if + * There are multiple threads that act on behalf of the pageout process. + * A set of threads scans pages (pageout_scanner) and frees them up if * they don't require any VOP_PUTPAGE operation. If a page must be * written back to its backing store, the request is put on a list * and the other (pageout) thread is signaled. The pageout thread * grabs VOP_PUTPAGE requests from the list, and processes them. * Some filesystems may require resources for the VOP_PUTPAGE * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. There is still + * thread, but the pageout_scanner threads can still operate. There is still * no guarantee that memory deadlocks cannot occur. * - * For now, this thing is in very rough form. + * The pageout_scanner parameters are determined in schedpaging(). */ void pageout() @@ -684,9 +868,9 @@ pageout() pageout_pri = curthread->t_pri; - /* Create the pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, - pageout_pri - 1); + /* Create the (first) pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, + TS_RUN, pageout_pri - 1); /* * kick off pageout scheduler.
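To make the multi-threaded hand placement described in the rewritten comment above concrete, here is a small stand-alone sketch (an illustration under assumed sizes, not part of this patch) of how each pageout_scanner instance's back hand can be offset by total_pages / n_page_scanners around the memory "clock face", with the front hand handspreadpages ahead. Page indices stand in for the page_t walk the kernel does with page_first()/page_nextn(), and the modulo mimics the wraparound at the end of memory.

/* Illustrative hand placement for multiple scanner instances. */
#include <stdio.h>

int
main(void)
{
	unsigned long total_pages = 4UL * 1024 * 1024;	/* e.g. 16GB of 4K pages */
	unsigned long handspreadpages = 512UL * 1024;	/* assumed hand spread */
	unsigned int n_page_scanners = 4;
	unsigned long offset = total_pages / n_page_scanners;
	unsigned int inst;

	for (inst = 0; inst < n_page_scanners; inst++) {
		unsigned long backhand = (offset * inst) % total_pages;
		unsigned long fronthand =
		    (backhand + handspreadpages) % total_pages;
		(void) printf("scanner %u: backhand %lu fronthand %lu\n",
		    inst, backhand, fronthand);
	}
	return (0);
}

Each instance still walks all of memory from its starting point; the staggered start only keeps the regions being actively scanned from overlapping, which is why the hands are periodically repositioned when they drift.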
@@ -720,6 +904,7 @@ pageout() arg->a_next = NULL; mutex_exit(&push_lock); + DTRACE_PROBE(pageout__push); if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -740,32 +925,24 @@ pageout() * Kernel thread that scans pages looking for ones to free */ static void -pageout_scanner(void) +pageout_scanner(void *a) { struct page *fronthand, *backhand; - uint_t count; + uint_t count, iter = 0; callb_cpr_t cprinfo; - pgcnt_t nscan_limit; + pgcnt_t nscan_cnt, nscan_limit; pgcnt_t pcount; + uint_t inst = (uint_t)(uintptr_t)a; + hrtime_t sample_start, sample_end; + clock_t pageout_lbolt; + kmutex_t pscan_mutex; - CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); - mutex_enter(&pageout_mutex); + VERIFY3U(inst, <, MAX_PSCAN_THREADS); - /* - * The restart case does not attempt to point the hands at roughly - * the right point on the assumption that after one circuit things - * will have settled down - and restarts shouldn't be that often. - */ + mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); - /* - * Set the two clock hands to be separated by a reasonable amount, - * but no more than 360 degrees apart. - */ - backhand = page_first(); - if (handspreadpages >= total_pages) - fronthand = page_nextn(backhand, total_pages - 1); - else - fronthand = page_nextn(backhand, handspreadpages); + CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); + mutex_enter(&pscan_mutex); min_pageout_ticks = MAX(1, ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); @@ -776,71 +953,116 @@ loop: cv_signal_pageout(); CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&proc_pageout->p_cv, &pageout_mutex); - CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + cv_wait(&proc_pageout->p_cv, &pscan_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex); if (!dopageout) goto loop; - if (reset_hands) { - reset_hands = 0; + if (reset_hands[inst]) { + struct page *first; + pgcnt_t offset = total_pages / n_page_scanners; - backhand = page_first(); - if (handspreadpages >= total_pages) + reset_hands[inst] = B_FALSE; + if (inst >= n_page_scanners) { + /* + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. + */ + VERIFY3U(inst, !=, 0); + mutex_exit(&pscan_mutex); + mutex_enter(&curproc->p_lock); + lwp_exit(); + } + + /* + * The reset case repositions the hands at the proper place + * on the memory clock face to prevent creep into another + * thread's active region or when the number of threads has + * changed. + * + * Set the two clock hands to be separated by a reasonable + * amount, but no more than 360 degrees apart. + * + * If inst == 0, backhand starts at first page, otherwise + * it is (inst * offset) around the memory "clock face" so that + * we spread out each scanner instance evenly. + */ + first = page_first(); + backhand = page_nextn(first, offset * inst); + if (handspreadpages >= total_pages) { fronthand = page_nextn(backhand, total_pages - 1); - else + } else { fronthand = page_nextn(backhand, handspreadpages); + } } + /* + * This CPU kstat is only incremented here and we're obviously on this + * CPU, so no lock. 
+ */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); count = 0; - TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, - "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", - freemem, lotsfree, nscan, desscan); - /* Kernel probe */ TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); pcount = 0; - if (pageout_sample_cnt < pageout_sample_lim) { + nscan_cnt = 0; + if (PAGE_SCAN_STARTUP) { nscan_limit = total_pages; } else { nscan_limit = desscan; } + + DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, + page_t *, backhand, page_t *, fronthand); + pageout_lbolt = ddi_get_lbolt(); sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. - * However, stop scanning as soon as there is enough free memory. - * For a short while, we will be sampling the performance of the - * scanner and need to keep running just to get sample data, in - * which case we keep going and don't pay attention to whether - * or not there is enough free memory. + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data */ - - while (nscan < nscan_limit && (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim)) { + while (nscan_cnt < nscan_limit && + (zones_over || + freemem < lotsfree + needfree || + PAGE_SCAN_STARTUP)) { int rvfront, rvback; + DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); + /* * Check to see if we have exceeded our %CPU budget * for this wakeup, but not on every single page visited, * just every once in a while. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { + clock_t pageout_cycle_ticks; + pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; if (pageout_cycle_ticks >= pageout_ticks) { - ++pageout_timeouts; + /* + * This is where we normally break out of the + * loop when scanning zones or sampling. + */ + if (!zones_over) { + atomic_inc_64(&pageout_timeouts); + } + DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } /* * If checkpage manages to add a page to the free list, - * we give ourselves another couple of trips around the loop. + * we give ourselves another couple of trips around memory. */ if ((rvfront = checkpage(fronthand, FRONT)) == 1) count = 0; @@ -850,7 +1072,8 @@ loop: ++pcount; /* - * protected by pageout_mutex instead of cpu_stat_lock + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -858,7 +1081,7 @@ loop: * Don't include ineligible pages in the number scanned. */ if (rvfront != -1 || rvback != -1) - nscan++; + nscan_cnt++; backhand = page_next(backhand); @@ -868,56 +1091,89 @@ loop: */ if ((fronthand = page_next(fronthand)) == page_first()) { - TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, - "pageout_hand_wrap:freemem %ld whichhand %d", - freemem, FRONT); + DTRACE_PROBE1(pageout__wrap__front, uint_t, inst); /* - * protected by pageout_mutex instead of cpu_stat_lock + * Every 64 wraps we reposition our hands within our + * region to prevent creep into another thread. + */ + if ((++iter % pageout_reset_cnt) == 0) + reset_hands[inst] = B_TRUE; + + /* + * This CPU kstat is only incremented here and we're + * obviously on this CPU, so no lock. 
*/ CPU_STATS_ADDQ(CPU, vm, rev, 1); - if (++count > 1) { + + /* + * If scanning because the system is low on memory, + * then when we wraparound memory we want to try to + * reclaim more pages. + * If scanning only because zones are over their cap, + * then wrapping is common and we simply keep going. + */ + if (freemem < lotsfree + needfree && ++count > 1) { /* + * The system is low on memory. * Extremely unlikely, but it happens. - * We went around the loop at least once - * and didn't get far enough. + * We went around memory at least once + * and didn't reclaim enough. * If we are still skipping `highly shared' * pages, skip fewer of them. Otherwise, * give up till the next clock tick. */ + mutex_enter(&pageout_mutex); if (po_share < MAX_PO_SHARE) { po_share <<= 1; + mutex_exit(&pageout_mutex); } else { /* - * Really a "goto loop", but - * if someone is TRACing or - * TNF_PROBE_ing, at least - * make records to show - * where we are. + * Really a "goto loop", but if someone + * is tracing or TNF_PROBE_ing, hit + * those probes first. */ + mutex_exit(&pageout_mutex); break; } } } } + atomic_add_long(&nscan, nscan_cnt); + sample_end = gethrtime(); - TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, - "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", - freemem, lotsfree, nscan, desscan, count); + DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, + uint_t, inst); /* Kernel probe */ TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem); + tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, + freemem); - if (pageout_sample_cnt < pageout_sample_lim) { + /* + * The following two blocks are only relevant when the scanner is + * first started up. After the scanner runs for a while, neither of + * the conditions will ever be true again. + * + * The global variables used below are only modified by this thread and + * only during initial scanning when there is a single page scanner + * thread running. Thus, we don't use any locking. + */ + if (PAGE_SCAN_STARTUP) { + VERIFY3U(inst, ==, 0); pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; - } - if (pageout_sample_cnt >= pageout_sample_lim && - pageout_new_spread == 0) { + + } else if (pageout_new_spread == 0) { + uint_t i; + + /* + * We have run enough samples, set the spread. + */ + VERIFY3U(inst, ==, 0); pageout_rate = (hrrate_t)pageout_sample_pages * (hrrate_t)(NANOSEC) / pageout_sample_etime; pageout_new_spread = pageout_rate / 10; @@ -931,9 +1187,8 @@ loop: * Look at the page at hand. If it is locked (e.g., for physical i/o), * system (u., page table) or free, then leave it alone. Otherwise, * if we are running the front hand, turn off the page's reference bit. - * If the proc is over maxrss, we take it. If running the back hand, - * check whether the page has been reclaimed. If not, free the page, - * pushing it to disk first if necessary. + * If running the back hand, check whether the page has been reclaimed. + * If not, free the page, pushing it to disk first if necessary. 
 * * Return values: * -1 if the page is not a candidate at all, @@ -947,6 +1202,7 @@ checkpage(struct page *pp, int whichhand) int isfs = 0; int isexec = 0; int pagesync_flag; + zoneid_t zid = ALL_ZONES; /* * Skip pages: @@ -989,6 +1245,21 @@ checkpage(struct page *pp, int whichhand) return (-1); } + if (zones_over) { + ASSERT(pp->p_zoneid == ALL_ZONES || + pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); + if (pp->p_zoneid == ALL_ZONES || + zone_pdata[pp->p_zoneid].zpers_over == 0) { + /* + * Cross-zone shared page, or zone not over its cap. + * Leave the page alone. + */ + page_unlock(pp); + return (-1); + } + zid = pp->p_zoneid; + } + /* * Maintain statistics for what we are freeing */ @@ -1016,31 +1287,24 @@ checkpage(struct page *pp, int whichhand) recheck: /* - * If page is referenced; make unreferenced but reclaimable. - * If this page is not referenced, then it must be reclaimable - * and we can add it to the free list. + * If the page is referenced, the fronthand makes it unreferenced and + * reclaimable. For the backhand, a process referenced the page since + * the front hand went by, so it's not a candidate for freeing up. */ if (ppattr & P_REF) { - TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, - "pageout_isref:pp %p whichhand %d", pp, whichhand); + DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand); if (whichhand == FRONT) { - /* - * Checking of rss or madvise flags needed here... - * - * If not "well-behaved", fall through into the code - * for not referenced. - */ hat_clrref(pp); } - /* - * Somebody referenced the page since the front - * hand went by, so it's not a candidate for - * freeing up. - */ page_unlock(pp); return (0); } + /* + * This page is not referenced, so it must be reclaimable and we can + * add it to the free list. This can be done by either hand. + */ + VM_STAT_ADD(pageoutvmstats.checkpage[0]); /* @@ -1073,8 +1337,9 @@ recheck: u_offset_t offset = pp->p_offset; /* - * XXX - Test for process being swapped out or about to exit? - * [Can't get back to process(es) using the page.] + * Note: There is no possibility to test for process being + * swapped out or about to exit since we can't get back to + * process(es) from the page. */ /* @@ -1092,6 +1357,11 @@ recheck: VN_RELE(vp); return (0); } + if (isfs) { + zone_pageout_stat(zid, ZPO_DIRTY); + } else { + zone_pageout_stat(zid, ZPO_ANONDIRTY); + } return (1); } @@ -1102,8 +1372,7 @@ recheck: * the pagesync but before it was unloaded we catch it * and handle the page properly. */ - TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, - "pageout_free:pp %p whichhand %d", pp, whichhand); + DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ppattr = hat_page_getattr(pp, P_MOD | P_REF); if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) @@ -1120,8 +1389,10 @@ recheck: } else { CPU_STATS_ADD_K(vm, fsfree, 1); } + zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); + zone_pageout_stat(zid, ZPO_ANON); } return (1); /* freed a page! */ diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index c177ecfd75..ad35fd7187 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1627,7 +1627,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
"identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index c759f7e010..1db130797c 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -106,14 +106,16 @@ * removed from the list of active zones. zone_destroy() returns, and * the zone can be recreated. * - * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - * callbacks are executed, and all memory associated with the zone is - * freed. + * ZONE_IS_FREE (internal state): All references have been dropped and + * the zone_t is no longer in the zone_active nor zone_deathrow lists. + * The zone_t is in the process of being freed. This state exists + * only for publishing a sysevent to indicate that the zone by this + * name can be booted again. * - * Threads can wait for the zone to enter a requested state by using - * zone_status_wait() or zone_status_timedwait() with the desired - * state passed in as an argument. Zone state transitions are - * uni-directional; it is not possible to move back to an earlier state. + * Threads can wait for the zone to enter a requested state (other than + * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + * with the desired state passed in as an argument. Zone state transitions + * are uni-directional; it is not possible to move back to an earlier state. * * * Zone-Specific Data: @@ -252,6 +254,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +332,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char *zone_status_table[] = { ZONE_EVENT_SHUTTING_DOWN, /* down */ ZONE_EVENT_SHUTTING_DOWN, /* dying */ ZONE_EVENT_UNINITIALIZED, /* dead */ + ZONE_EVENT_FREE, /* free */ }; /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t); static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); static int zone_set_network(zoneid_t, zone_net_data_t *); static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t); typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. 
The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and physical + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * look up the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority).
+ */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. + */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. 
Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS 
statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + 
kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts; zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_restarts, "init_restarts", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + 
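The zone_vfs, zone_zfs and extended memory_cap kstats wired up in zone_kstat_create() above are ordinary named kstats, so they can be read from userland with libkstat(3LIB). The following is a minimal reader sketch (not taken from this change) assuming the module/instance/name tuple ("zone_vfs", zone ID, zone name) and the statistic names from the kstat_named_init() calls above; build with -lkstat.

    #include <kstat.h>
    #include <stdio.h>
    #include <zone.h>

    int
    main(void)
    {
        kstat_ctl_t *kc;
        kstat_t *ksp;
        kstat_named_t *reads, *rtime;
        char zname[ZONENAME_MAX];
        zoneid_t zid = getzoneid();

        if (getzonenamebyid(zid, zname, sizeof (zname)) < 0 ||
            (kc = kstat_open()) == NULL)
            return (1);

        /* The instance is the zone ID and the kstat name is the zone name. */
        if ((ksp = kstat_lookup(kc, "zone_vfs", zid, zname)) == NULL ||
            kstat_read(kc, ksp, NULL) == -1)
            return (1);

        /* rtime has already been scaled by zone_vfs_kstat_update(). */
        reads = kstat_data_lookup(ksp, "reads");
        rtime = kstat_data_lookup(ksp, "rtime");
        if (reads != NULL && rtime != NULL) {
            (void) printf("reads %llu rtime %llu\n",
                (u_longlong_t)reads->value.ui64,
                (u_longlong_t)rtime->value.ui64);
        }

        (void) kstat_close(kc);
        return (0);
    }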
zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2281,6 +2797,9 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2362,6 +2881,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2377,6 +2898,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2390,8 +2914,30 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. 
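Once registered in zone_init() above, zone.zfs-io-priority behaves like any other zone resource control, so its privileged value can be inspected from inside a zone with getrctl(2) (prctl(1) gives the same view from the shell). A small sketch under that assumption, using only the documented rctlblk interfaces:

    #include <rctl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        rctlblk_t *cur = malloc(rctlblk_size());
        rctlblk_t *next = malloc(rctlblk_size());

        if (cur == NULL || next == NULL)
            return (1);
        if (getrctl("zone.zfs-io-priority", NULL, cur, RCTL_FIRST) != 0) {
            perror("getrctl");
            return (1);
        }
        for (;;) {
            if (rctlblk_get_privilege(cur) == RCPRIV_PRIVILEGED) {
                (void) printf("zone.zfs-io-priority = %llu\n",
                    (unsigned long long)rctlblk_get_value(cur));
                break;
            }
            if (getrctl("zone.zfs-io-priority", cur, next, RCTL_NEXT) != 0)
                break;
            /* Advance to the next value in the sequence. */
            rctlblk_t *tmp = cur;
            cur = next;
            next = tmp;
        }
        return (0);
    }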
We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); + /* + * This zone_t can no longer inhibit creation of another zone_t + * with the same name or debug ID. Generate a sysevent so that + * userspace tools know it is safe to carry on. + */ + mutex_enter(&zone_status_lock); + zone_status_set(zone, ZONE_IS_FREE); + mutex_exit(&zone_status_lock); + cpu_uarray_free(zone->zone_ustate); if (zone->zone_rootvp != NULL) @@ -2436,11 +2982,17 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); - ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && - status >= zone_status_get(zone)); + ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || + status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2449,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG (void) printf( "Failed to allocate and send zone state change event.\n"); +#else + /* EMPTY */ #endif } nvlist_free(nvl); @@ -2474,6 +3028,38 @@ zone_status_get(zone_t *zone) return (zone->zone_status); } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". + */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, + nvlist_t *ev_nvl) +{ + nvlist_t *nvl = NULL; + timestruc_t now; + uint64_t t; + + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + + if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || + sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", + "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG + (void) printf("Failed to allocate and send zone misc event.\n"); +#else + /* EMPTY */ +#endif + } + nvlist_free(nvl); +} + static int zone_set_bootargs(zone_t *zone, const char *zone_bootargs) { @@ -2527,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. 
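The state-change events published by zone_status_set() and zone_sysevent_publish() above travel over the sysevent channel bound to zone_event_chan; note that ZONE_CB_TIMESTAMP now carries wall-clock nanoseconds rather than an hrtime_t value. A rough subscriber sketch using libsysevent's channel API follows; it assumes the channel name "com.sun:zones:status" mentioned above and that "zonename"/"newstate" are the strings behind the ZONE_CB_* macros. Build with -lsysevent -lnvpair and run with sufficient privilege.

    #include <libsysevent.h>
    #include <libnvpair.h>
    #include <stdio.h>
    #include <unistd.h>

    static int
    zone_status_handler(sysevent_t *ev, void *cookie)
    {
        nvlist_t *nvl = NULL;
        char *zname, *newstate;

        if (sysevent_get_attr_list(ev, &nvl) != 0)
            return (0);
        if (nvlist_lookup_string(nvl, "zonename", &zname) == 0 &&
            nvlist_lookup_string(nvl, "newstate", &newstate) == 0)
            (void) printf("%s -> %s\n", zname, newstate);
        nvlist_free(nvl);
        return (0);
    }

    int
    main(void)
    {
        evchan_t *ch;

        if (sysevent_evc_bind("com.sun:zones:status", &ch, 0) != 0) {
            perror("sysevent_evc_bind");
            return (1);
        }
        if (sysevent_evc_subscribe(ch, "example-sub", "status",
            zone_status_handler, NULL, 0) != 0) {
            perror("sysevent_evc_subscribe");
            return (1);
        }
        (void) pause();    /* the handler runs on a door thread */
        return (0);
    }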
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2602,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3020,6 +3599,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3766,6 +4351,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3794,9 +4390,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -4282,8 +4923,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4402,7 +5044,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4474,6 +5116,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4481,6 +5124,9 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4547,8 +5193,13 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4557,6 +5208,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4705,8 +5363,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
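For reference, the in-kernel parmsset() call used above to pin a "high priority" zone's init into FX is the kernel-side analogue of what a userland process would do with priocntl(2). A rough sketch under that assumption (the priority values are illustrative, and raising the FX user priority limit needs the proc_priocntl privilege):

    #include <sys/types.h>
    #include <sys/procset.h>
    #include <sys/priocntl.h>
    #include <sys/fxpriocntl.h>
    #include <string.h>
    #include <stdio.h>

    int
    main(void)
    {
        pcinfo_t pcinfo;
        pcparms_t pcparms;
        fxparms_t fx;

        /* Look up the class ID of the FX scheduling class. */
        (void) strcpy(pcinfo.pc_clname, "FX");
        if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
            perror("PC_GETCID");
            return (1);
        }

        fx.fx_upri = 30;          /* example fixed user priority */
        fx.fx_uprilim = 30;
        fx.fx_tqsecs = 0;
        fx.fx_tqnsecs = FX_TQDEF; /* default time quantum */

        pcparms.pc_cid = pcinfo.pc_cid;
        (void) memcpy(pcparms.pc_clparms, &fx, sizeof (fx));
        if (priocntl(P_PID, P_MYID, PC_SETPARMS, (caddr_t)&pcparms) == -1) {
            perror("PC_SETPARMS");
            return (1);
        }
        return (0);
    }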
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4845,6 +5503,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4855,7 +5514,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5184,6 +5852,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5217,6 +5886,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5606,14 +6281,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5677,6 +6344,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5708,10 +6392,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5724,11 +6407,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. 
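The new debug ID is exposed both through the getzonedid() helper above and as the ZONE_ATTR_DID zone attribute. A minimal userland sketch using zone_getattr(3C); ZONE_ATTR_DID is introduced by this patch, so the program only builds against headers that include it.

    #include <zone.h>
    #include <stdio.h>

    int
    main(void)
    {
        zoneid_t did;

        if (zone_getattr(getzoneid(), ZONE_ATTR_DID, &did,
            sizeof (did)) < 0) {
            perror("zone_getattr(ZONE_ATTR_DID)");
            return (1);
        }
        (void) printf("zone %d has debug id %d\n", getzoneid(), did);
        return (0);
    }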
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5741,6 +6424,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone->zone_restart_init = B_FALSE; err = 0; break; + case ZONE_ATTR_INITRESTART0: + zone->zone_restart_init_0 = B_TRUE; + err = 0; + break; + case ZONE_ATTR_INITREBOOT: + zone->zone_reboot_on_init_exit = B_TRUE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5753,9 +6444,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5783,6 +6471,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6486,6 +7190,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6496,7 +7201,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6597,6 +7302,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6774,7 +7480,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6837,16 +7543,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6914,7 +7619,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6951,6 +7657,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7052,6 +7770,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7084,6 +7823,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove them either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7101,25 +7860,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES it's query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. 
An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned. */ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enuqire about itself. + */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7130,6 +7927,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7150,6 +7948,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7175,6 +7979,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyways. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7388,3 +8199,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. 
+ * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). + */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis. */ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. + */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. 
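The table above follows directly from the shift: zone_decr_capped() only clears the over-cap state once usage drops below limit - (limit >> 7), i.e. roughly 1/128 (about 0.78%) under the cap. A throwaway userland sketch that reproduces the table rows for a 4K page size:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Caps in megabytes, matching the rows of the table above. */
        uint64_t caps_mb[] = { 128, 512, 1024, 4096, 8192, 16384, 32768, 65536 };
        size_t i;

        for (i = 0; i < sizeof (caps_mb) / sizeof (caps_mb[0]); i++) {
            uint32_t pages = (uint32_t)(caps_mb[i] * 1024 * 1024 / 4096);
            uint32_t hyst = pages >> 7;

            (void) printf("%6lluM  pages=%8u  shift7=%6u  resume below=%8u\n",
                (unsigned long long)caps_mb[i], pages, hyst, pages - hyst);
        }
        return (0);
    }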
*/ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. + */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/refhash/refhash.c index 8f96c2d9f1..e2de00597e 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c +++ b/usr/src/uts/common/refhash/refhash.c @@ -10,16 +10,18 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> +#include <sys/refhash.h> #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/kmem.h> #include <sys/list.h> #include <sys/ddi.h> +#define RHL_F_DEAD 0x01 + #ifdef lint extern refhash_link_t *obj_to_link(refhash_t *, void *); extern void *link_to_obj(refhash_t *, refhash_link_t *); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 8d26a71342..909160f2db 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -21,7 +21,7 @@ # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2019, Joyent, Inc. # Copyright 2013 Garrett D'Amore <garrett@damore.org> # Copyright 2013 Saso Kiselkov. All rights reserved. 
# Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com> @@ -258,6 +258,7 @@ CHKHDRS= \ flock.h \ flock_impl.h \ fork.h \ + frameio.h \ fss.h \ fsspriocntl.h \ fsid.h \ @@ -283,6 +284,7 @@ CHKHDRS= \ idmap.h \ ieeefp.h \ id_space.h \ + inotify.h \ instance.h \ int_const.h \ int_fmtio.h \ @@ -351,6 +353,7 @@ CHKHDRS= \ lgrp.h \ lgrp_user.h \ libc_kernel.h \ + limits.h \ link.h \ list.h \ list_impl.h \ @@ -435,6 +438,9 @@ CHKHDRS= \ ontrap.h \ open.h \ openpromio.h \ + overlay.h \ + overlay_common.h \ + overlay_target.h \ panic.h \ param.h \ pathconf.h \ @@ -511,6 +517,7 @@ CHKHDRS= \ sema_impl.h \ semaphore.h \ sendfile.h \ + sensors.h \ ser_sync.h \ session.h \ sha1.h \ @@ -659,6 +666,8 @@ CHKHDRS= \ vmem.h \ vmem_impl.h \ vmsystm.h \ + vnd.h \ + vnd_errno.h \ vnic.h \ vnic_impl.h \ vnode.h \ @@ -670,11 +679,13 @@ CHKHDRS= \ vuid_queue.h \ vuid_state.h \ vuid_store.h \ + vxlan.h \ wait.h \ waitq.h \ watchpoint.h \ winlockio.h \ zcons.h \ + zfd.h \ zone.h \ xti_inet.h \ xti_osi.h \ @@ -840,13 +851,14 @@ FSHDRS= \ autofs.h \ decomp.h \ dv_node.h \ - sdev_impl.h \ fifonode.h \ hsfs_isospec.h \ hsfs_node.h \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ @@ -856,6 +868,8 @@ FSHDRS= \ pc_label.h \ pc_node.h \ pxfs_ki.h \ + sdev_impl.h \ + sdev_plugin.h \ snode.h \ swapnode.h \ tmp.h \ @@ -980,6 +994,7 @@ SATAGENHDRS= \ SYSEVENTHDRS= \ ap_driver.h \ + datalink.h \ dev.h \ domain.h \ dr.h \ diff --git a/usr/src/uts/common/sys/acct.h b/usr/src/uts/common/sys/acct.h index f00884681b..e01ad61025 100644 --- a/usr/src/uts/common/sys/acct.h +++ b/usr/src/uts/common/sys/acct.h @@ -22,6 +22,7 @@ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -88,7 +89,7 @@ extern int acct(const char *); #if defined(_KERNEL) -void acct(char); +void acct(int); int sysacct(char *); struct vnode; diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 547c9cc241..80733aa31e 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_AGGR_IMPL_H @@ -54,25 +56,47 @@ extern "C" { */ #define MAC_PSEUDO_RING_INUSE 0x01 +#define MAX_GROUPS_PER_PORT 128 + +/* + * VLAN filters placed on the Rx pseudo group. + */ +typedef struct aggr_vlan { + list_node_t av_link; + uint16_t av_vid; /* VLAN ID */ + uint_t av_refs; /* num aggr clients using this VID */ +} aggr_vlan_t; + typedef struct aggr_unicst_addr_s { uint8_t aua_addr[ETHERADDRL]; struct aggr_unicst_addr_s *aua_next; } aggr_unicst_addr_t; typedef struct aggr_pseudo_rx_ring_s { - mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */ - struct aggr_port_s *arr_port; - mac_ring_handle_t arr_hw_rh; - uint_t arr_flags; - uint64_t arr_gen; + mac_ring_handle_t arr_rh; /* set by aggr_fill_ring() */ + struct aggr_port_s *arr_port; + struct aggr_pseudo_rx_group_s *arr_grp; + mac_ring_handle_t arr_hw_rh; + uint_t arr_flags; + uint64_t arr_gen; } aggr_pseudo_rx_ring_t; +/* + * An aggr pseudo group abstracts the underlying ports' HW groups. For + * example, if each port has 8 groups (mac_group_t), then the aggr + * will create 8 pseudo groups. 
Each pseudo group represents a + * collection of HW groups: one group from each port. If you have + * three ports then the pseudo group stands in for three HW groups. + */ typedef struct aggr_pseudo_rx_group_s { + uint_t arg_index; struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */ mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */ aggr_unicst_addr_t *arg_macaddr; aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP]; uint_t arg_ring_cnt; + uint_t arg_untagged; /* num clients untagged */ + list_t arg_vlans; /* VLANs on this group */ } aggr_pseudo_rx_group_t; typedef struct aggr_pseudo_tx_ring_s { @@ -106,12 +130,13 @@ typedef struct aggr_port_s { lp_collector_enabled : 1, lp_promisc_on : 1, lp_no_link_update : 1, - lp_rx_grp_added : 1, lp_tx_grp_added : 1, lp_closing : 1, - lp_pad_bits : 24; + lp_pad_bits : 25; mac_handle_t lp_mh; - mac_client_handle_t lp_mch; + + mac_client_handle_t lp_mch; + const mac_info_t *lp_mip; mac_notify_handle_t lp_mnh; uint_t lp_tx_idx; /* idx in group's tx array */ @@ -123,13 +148,19 @@ typedef struct aggr_port_s { aggr_lacp_port_t lp_lacp; /* LACP state */ lacp_stats_t lp_lacp_stats; uint32_t lp_margin; - mac_promisc_handle_t lp_mphp; + mac_unicast_handle_t lp_mah; /* List of non-primary addresses that requires promiscous mode set */ aggr_unicst_addr_t *lp_prom_addr; - /* handle of the underlying HW RX group */ - mac_group_handle_t lp_hwgh; + + /* + * References to the underlying HW Rx groups of this port. + * Used by aggr to program HW classification for the pseudo + * groups. + */ + mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT]; + int lp_tx_ring_cnt; /* handles of the underlying HW TX rings */ mac_ring_handle_t *lp_tx_rings; @@ -176,7 +207,7 @@ typedef struct aggr_grp_s { lg_lso : 1, lg_pad_bits : 8; aggr_port_t *lg_ports; /* list of configured ports */ - aggr_port_t *lg_mac_addr_port; + aggr_port_t *lg_mac_addr_port; /* using address of this port */ mac_handle_t lg_mh; zoneid_t lg_zoneid; uint_t lg_nattached_ports; @@ -186,11 +217,18 @@ typedef struct aggr_grp_s { uint_t lg_tx_ports_size; /* size of lg_tx_ports */ uint32_t lg_tx_policy; /* outbound policy */ uint8_t lg_mac_tx_policy; - uint64_t lg_ifspeed; link_state_t lg_link_state; + + + /* + * The lg_stat_lock must be held when accessing these fields. 
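The av_refs field in aggr_vlan_t above implies simple reference counting: each aggr client that adds a VID bumps the count, the HW filter is only programmed on the 0-to-1 transition, and it is only torn down on the 1-to-0 transition. A hedged, self-contained sketch of that idea (the illumos list_t and aggr_port_addvlan()/aggr_port_remvlan() calls are replaced here with a plain array and printfs purely for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_VIDS    8

    typedef struct vlan_ref {
        uint16_t vr_vid;
        unsigned vr_refs;
    } vlan_ref_t;

    static vlan_ref_t filters[MAX_VIDS];

    /* Add a reference to a VID; program the HW filter on first use. */
    static int
    vlan_hold(uint16_t vid)
    {
        int i, free_slot = -1;

        for (i = 0; i < MAX_VIDS; i++) {
            if (filters[i].vr_refs != 0 && filters[i].vr_vid == vid) {
                filters[i].vr_refs++;
                return (0);
            }
            if (filters[i].vr_refs == 0 && free_slot == -1)
                free_slot = i;
        }
        if (free_slot == -1)
            return (-1);
        filters[free_slot].vr_vid = vid;
        filters[free_slot].vr_refs = 1;
        (void) printf("program HW VLAN filter for VID %u\n", vid);
        return (0);
    }

    /* Drop a reference; clear the HW filter when the last user goes away. */
    static void
    vlan_rele(uint16_t vid)
    {
        int i;

        for (i = 0; i < MAX_VIDS; i++) {
            if (filters[i].vr_refs != 0 && filters[i].vr_vid == vid) {
                if (--filters[i].vr_refs == 0)
                    (void) printf("remove HW VLAN filter for VID %u\n", vid);
                return;
            }
        }
    }

    int
    main(void)
    {
        (void) vlan_hold(100);  /* programs the filter */
        (void) vlan_hold(100);  /* just bumps the refcount */
        vlan_rele(100);         /* still one user left */
        vlan_rele(100);         /* removes the filter */
        return (0);
    }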
+ */ + kmutex_t lg_stat_lock; + uint64_t lg_ifspeed; link_duplex_t lg_link_duplex; uint64_t lg_stat[MAC_NSTAT]; uint64_t lg_ether_stat[ETHER_NSTAT]; + aggr_lacp_mode_t lg_lacp_mode; /* off, active, or passive */ Agg_t aggr; /* 802.3ad data */ uint32_t lg_hcksum_txflags; @@ -213,7 +251,9 @@ typedef struct aggr_grp_s { kthread_t *lg_lacp_rx_thread; boolean_t lg_lacp_done; - aggr_pseudo_rx_group_t lg_rx_group; + uint_t lg_rx_group_count; + aggr_pseudo_rx_group_t lg_rx_groups[MAX_GROUPS_PER_PORT]; + aggr_pseudo_tx_group_t lg_tx_group; kmutex_t lg_tx_flowctl_lock; @@ -335,8 +375,11 @@ extern void aggr_grp_port_hold(aggr_port_t *); extern void aggr_grp_port_rele(aggr_port_t *); extern void aggr_grp_port_wait(aggr_grp_t *); -extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); -extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); +extern int aggr_port_addmac(aggr_port_t *, uint_t, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, uint_t, const uint8_t *); + +extern int aggr_port_addvlan(aggr_port_t *, uint_t, uint16_t); +extern int aggr_port_remvlan(aggr_port_t *, uint_t, uint16_t); extern mblk_t *aggr_ring_tx(void *, mblk_t *); extern mblk_t *aggr_find_tx_ring(void *, mblk_t *, diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 1fb5011970..b3b2898987 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -78,6 +78,9 @@ typedef struct { #define AT_FLAGS 8 /* processor flags */ #define AT_ENTRY 9 /* a.out entry point */ +/* First introduced on Linux */ +#define AT_RANDOM 25 /* address of 16 random bytes */ + /* * These relate to the original PPC ABI document; Linux reused * the values for other things (see below), so disambiguation of @@ -90,19 +93,18 @@ typedef struct { * These are the values from LSB 1.3, the first five are also described * in the draft amd64 ABI. * - * At the time of writing, Solaris doesn't place any of these values into - * the aux vector, except AT_CLKTCK which is placed on the aux vector for - * lx branded processes; also, we do similar things via AT_SUN_ values. + * At the time of writing, illumos doesn't place any of these values into the + * aux vector, except where noted. We do similar things via AT_SUN_ values. * * AT_NOTELF 10 program is not ELF? - * AT_UID 11 real user id - * AT_EUID 12 effective user id - * AT_GID 13 real group id - * AT_EGID 14 effective group id + * AT_UID 11 real user id (provided in LX) + * AT_EUID 12 effective user id (provided in LX) + * AT_GID 13 real group id (provided in LX) + * AT_EGID 14 effective group id (provided in LX) * * AT_PLATFORM 15 * AT_HWCAP 16 - * AT_CLKTCK 17 c.f. _SC_CLK_TCK + * AT_CLKTCK 17 c.f. _SC_CLK_TCK (provided in LX) * AT_FPUCW 18 * * AT_DCACHEBSIZE 19 (moved from 10) @@ -110,6 +112,16 @@ typedef struct { * AT_UCACHEBSIZE 21 (moved from 12) * * AT_IGNOREPPC 22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE 23 secure mode boolean (provided in LX) + * AT_BASE_PLATFORM 24 string identifying real platform, may + * differ from AT_PLATFORM. 
+ * AT_HWCAP2 26 extension of AT_HWCAP + * AT_EXECFN 31 filename of program + * AT_SYSINFO 32 + * AT_SYSINFO_EHDR 33 The vDSO location */ /* @@ -186,6 +198,8 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_AUX1 2020 #define AT_SUN_BRAND_AUX2 2021 #define AT_SUN_BRAND_AUX3 2022 +#define AT_SUN_BRAND_AUX4 2025 +#define AT_SUN_BRAND_NROOT 2024 /* * Aux vector for comm page diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index badc3faff8..df22f492bf 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_BRAND_H @@ -102,29 +103,106 @@ struct brand_mach_ops; struct intpdata; struct execa; +/* + * Common structure to define hooks for brand operation. + * + * Required Fields: + * b_init_brand_data - Setup zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock) + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maxiumum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp + * b_lwpdata_free - Free data from allocated by b_lwpdata_alloc if errors occur + * during lwp creation before b_initlwp could be called. + * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock) + * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + * b_clearbrand - Perform any actions necessary when clearing the brand. 
+ * b_rpc_statd - Upcall to rpc.statd running within the zone + * b_acct_out - Output properly formatted accounting record + */ struct brand_ops { - void (*b_init_brand_data)(zone_t *); + void (*b_init_brand_data)(zone_t *, kmutex_t *); void (*b_free_brand_data)(zone_t *); int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void (*b_setbrand)(struct proc *); int (*b_getattr)(zone_t *, int, void *, size_t *); int (*b_setattr)(zone_t *, int, void *, size_t); void (*b_copy_procdata)(struct proc *, struct proc *); - void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_proc_exit)(struct proc *); void (*b_exec)(); void (*b_lwp_setrval)(klwp_t *, int, int); - int (*b_initlwp)(klwp_t *); + void *(*b_lwpdata_alloc)(struct proc *); + void (*b_lwpdata_free)(void *); + void (*b_initlwp)(klwp_t *, void *); + void (*b_initlwp_post)(klwp_t *); void (*b_forklwp)(klwp_t *, klwp_t *); void (*b_freelwp)(klwp_t *); void (*b_lwpexit)(klwp_t *); - int (*b_elfexec)(struct vnode *vp, struct execa *uap, - struct uarg *args, struct intpdata *idata, int level, - long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + int (*b_elfexec)(struct vnode *, struct execa *, struct uarg *, + struct intpdata *, int, size_t *, int, caddr_t, struct cred *, + int *); void (*b_sigset_native_to_brand)(sigset_t *); void (*b_sigset_brand_to_native)(sigset_t *); + void (*b_sigfd_translate)(k_siginfo_t *); int b_nsig; + void (*b_exit_with_sig)(proc_t *, sigqueue_t *); + boolean_t (*b_wait_filter)(proc_t *, proc_t *); + boolean_t (*b_native_exec)(uint8_t, const char **); + uint32_t (*b_map32limit)(proc_t *); + void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); + int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, + boolean_t *, int *); + int (*b_sigcld_repost)(proc_t *, sigqueue_t *); + int (*b_issig_stop)(proc_t *, klwp_t *); + boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int); + void (*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void (*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); + int (*b_setid_clear)(vattr_t *vap, cred_t *cr); + int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); + boolean_t b_intp_parse_arg; + void (*b_clearbrand)(proc_t *, boolean_t); + void (*b_rpc_statd)(int, void *, void *); + void (*b_acct_out)(struct vnode *, int); }; /* @@ -135,6 +213,7 @@ typedef struct brand { char *b_name; struct brand_ops *b_ops; struct brand_mach_ops *b_machops; + size_t b_data_size; } brand_t; extern brand_t native_brand; @@ -165,7 +244,7 @@ extern brand_t *brand_register_zone(struct brand_attr *); extern brand_t *brand_find_name(char *); extern void brand_unregister_zone(brand_t *); extern int brand_zone_count(brand_t *); -extern void brand_setbrand(proc_t *); +extern int brand_setbrand(proc_t *, boolean_t); extern void brand_clearbrand(proc_t *, boolean_t); /* @@ -178,17 +257,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t, extern void brand_solaris_copy_procdata(proc_t *, proc_t *, struct brand *); extern int brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *, - intpdata_t *, int, long *, int, caddr_t, cred_t *, int, - struct brand *, char *, char *, char *, char *, char *); + intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int *, + struct brand *, char *, char *, char *); extern void brand_solaris_exec(struct brand *); extern int 
brand_solaris_fini(char **, struct modlinkage *, struct brand *); extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *); extern void brand_solaris_freelwp(klwp_t *, struct brand *); -extern int brand_solaris_initlwp(klwp_t *, struct brand *); +extern void brand_solaris_initlwp(klwp_t *, struct brand *); extern void brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void brand_solaris_proc_exit(struct proc *, klwp_t *, - struct brand *); +extern void brand_solaris_proc_exit(struct proc *, struct brand *); extern void brand_solaris_setbrand(proc_t *, struct brand *); #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index e20e0e0c35..b6b5c20e44 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. * * Copyright 2017 RackTop Systems. */ @@ -188,6 +189,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -200,6 +202,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h index 21cf94dcf9..2c70d7c9f1 100644 --- a/usr/src/uts/common/sys/contract/process.h +++ b/usr/src/uts/common/sys/contract/process.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_CONTRACT_PROCESS_H #define _SYS_CONTRACT_PROCESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/contract.h> #include <sys/time.h> @@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t; #define CT_PR_NOORPHAN 0x2 /* kill when contract is abandoned */ #define CT_PR_PGRPONLY 0x4 /* only kill process group on fatal errors */ #define CT_PR_REGENT 0x8 /* automatically detach inherited contracts */ -#define CT_PR_ALLPARAM 0xf +#define CT_PR_KEEP_EXEC 0x10 /* preserve template across exec */ +#define CT_PR_ALLPARAM 0x1f /* * ctr_ev_* flags diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 8565ca053e..7ac2fafe2f 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -23,6 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2019 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -98,11 +99,11 @@ typedef struct cpu { /* * Links to other CPUs. It is safe to walk these lists if * one of the following is true: - * - cpu_lock held - * - preemption disabled via kpreempt_disable - * - PIL >= DISP_LEVEL - * - acting thread is an interrupt thread - * - all other CPUs are paused + * - cpu_lock held + * - preemption disabled via kpreempt_disable + * - PIL >= DISP_LEVEL + * - acting thread is an interrupt thread + * - all other CPUs are paused */ struct cpu *cpu_next; /* next existing CPU */ struct cpu *cpu_prev; /* prev existing CPU */ @@ -130,7 +131,7 @@ typedef struct cpu { */ char cpu_runrun; /* scheduling flag - set to preempt */ char cpu_kprunrun; /* force kernel preemption */ - pri_t cpu_chosen_level; /* priority at which cpu */ + pri_t cpu_chosen_level; /* priority at which cpu */ /* was chosen for scheduling */ kthread_t *cpu_dispthread; /* thread selected for dispatch */ disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */ @@ -286,7 +287,7 @@ extern cpu_core_t cpu_core[]; * list in avintr.c. */ #define INTR_ACTIVE(cpup, level) \ - ((level) <= LOCK_LEVEL ? \ + ((level) <= LOCK_LEVEL ? 
\ ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) /* @@ -389,7 +390,6 @@ extern cpu_core_t cpu_core[]; #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ /* Note: inside ifdef: _KERNEL || _KMEMUSER || _BOOT */ -#if defined(_MACHDEP) /* * Macros for manipulating sets of CPUs as a bitmap. Note that this @@ -405,34 +405,60 @@ extern cpu_core_t cpu_core[]; #define CPUSET_WORDS BT_BITOUL(NCPU) #define CPUSET_NOTINSET ((uint_t)-1) -#if CPUSET_WORDS > 1 - -typedef struct cpuset { +#if defined(_MACHDEP) +struct cpuset { ulong_t cpub[CPUSET_WORDS]; -} cpuset_t; +}; +#else +struct cpuset; +#endif + +typedef struct cpuset cpuset_t; + +extern cpuset_t *cpuset_alloc(int); +extern void cpuset_free(cpuset_t *); + +/* + * Functions for manipulating cpusets. These were previously considered + * private when some cpuset_t handling was performed in the CPUSET_* macros. + * They are now acceptable to use in non-_MACHDEP code. + */ +extern void cpuset_all(cpuset_t *); +extern void cpuset_all_but(cpuset_t *, const uint_t); +extern int cpuset_isnull(const cpuset_t *); +extern int cpuset_isequal(const cpuset_t *, const cpuset_t *); +extern void cpuset_only(cpuset_t *, const uint_t); +extern long cpu_in_set(const cpuset_t *, const uint_t); +extern void cpuset_add(cpuset_t *, const uint_t); +extern void cpuset_del(cpuset_t *, const uint_t); +extern uint_t cpuset_find(const cpuset_t *); +extern void cpuset_bounds(const cpuset_t *, uint_t *, uint_t *); +extern void cpuset_atomic_del(cpuset_t *, const uint_t); +extern void cpuset_atomic_add(cpuset_t *, const uint_t); +extern long cpuset_atomic_xadd(cpuset_t *, const uint_t); +extern long cpuset_atomic_xdel(cpuset_t *, const uint_t); +extern void cpuset_or(cpuset_t *, cpuset_t *); +extern void cpuset_xor(cpuset_t *, cpuset_t *); +extern void cpuset_and(cpuset_t *, cpuset_t *); +extern void cpuset_zero(cpuset_t *); + + +#if defined(_MACHDEP) /* - * Private functions for manipulating cpusets that do not fit in a - * single word. These should not be used directly; instead the - * CPUSET_* macros should be used so the code will be portable - * across different definitions of NCPU. + * Prior to the cpuset_t restructuring, the CPUSET_* macros contained + * significant logic, rather than directly invoking the backend functions. + * They are maintained here so that existing _MACHDEP code can use them. */ -extern void cpuset_all(cpuset_t *); -extern void cpuset_all_but(cpuset_t *, uint_t); -extern int cpuset_isnull(cpuset_t *); -extern int cpuset_cmp(cpuset_t *, cpuset_t *); -extern void cpuset_only(cpuset_t *, uint_t); -extern uint_t cpuset_find(cpuset_t *); -extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); #define CPUSET_ALL(set) cpuset_all(&(set)) #define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu) #define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu) -#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu) -#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu) -#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu) +#define CPU_IN_SET(set, cpu) cpu_in_set(&(set), cpu) +#define CPUSET_ADD(set, cpu) cpuset_add(&(set), cpu) +#define CPUSET_DEL(set, cpu) cpuset_del(&(set), cpu) #define CPUSET_ISNULL(set) cpuset_isnull(&(set)) -#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2)) +#define CPUSET_ISEQUAL(set1, set2) cpuset_isequal(&(set1), &(set2)) /* * Find one CPU in the cpuset. 
@@ -460,86 +486,24 @@ extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); * deleting a cpu that's not in the cpuset) */ -#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu)) -#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu)) - -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) - -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) - - -#define CPUSET_OR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] |= (set2).cpub[_i]; \ -} - -#define CPUSET_XOR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] ^= (set2).cpub[_i]; \ -} - -#define CPUSET_AND(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] &= (set2).cpub[_i]; \ -} - -#define CPUSET_ZERO(set) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set).cpub[_i] = 0; \ -} - -#elif CPUSET_WORDS == 1 - -typedef ulong_t cpuset_t; /* a set of CPUs */ - -#define CPUSET(cpu) (1UL << (cpu)) - -#define CPUSET_ALL(set) ((void)((set) = ~0UL)) -#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu))) -#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu))) -#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu)) -#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu))) -#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu))) -#define CPUSET_ISNULL(set) ((set) == 0) -#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2)) -#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2))) -#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2))) -#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2))) -#define CPUSET_ZERO(set) ((void)((set) = 0)) - -#define CPUSET_FIND(set, cpu) { \ - cpu = (uint_t)(lowbit(set) - 1); \ -} - -#define CPUSET_BOUNDS(set, smallest, largest) { \ - smallest = (uint_t)(lowbit(set) - 1); \ - largest = (uint_t)(highbit(set) - 1); \ -} +#define CPUSET_ATOMIC_DEL(set, cpu) cpuset_atomic_del(&(set), cpu) +#define CPUSET_ATOMIC_ADD(set, cpu) cpuset_atomic_add(&(set), cpu) -#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu)) -#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu)) +#define CPUSET_ATOMIC_XADD(set, cpu, result) \ + (result) = cpuset_atomic_xadd(&(set), cpu) -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - { result = atomic_set_long_excl(&(set), (cpu)); } +#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ + (result) = cpuset_atomic_xdel(&(set), cpu) -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - { result = atomic_clear_long_excl(&(set), (cpu)); } +#define CPUSET_OR(set1, set2) cpuset_or(&(set1), &(set2)) -#else /* CPUSET_WORDS <= 0 */ +#define CPUSET_XOR(set1, set2) cpuset_xor(&(set1), &(set2)) -#error NCPU is undefined or invalid +#define CPUSET_AND(set1, set2) cpuset_and(&(set1), &(set2)) -#endif /* CPUSET_WORDS */ - -extern cpuset_t cpu_seqid_inuse; +#define CPUSET_ZERO(set) cpuset_zero(&(set)) -#endif /* _MACHDEP */ +#endif /* _MACHDEP */ #endif /* _KERNEL || _KMEMUSER || _BOOT */ #define CPU_CPR_OFFLINE 0x0 @@ -550,10 +514,14 @@ extern cpuset_t cpu_seqid_inuse; #if defined(_KERNEL) || defined(_KMEMUSER) +extern cpuset_t cpu_seqid_inuse; + extern struct cpu *cpu[]; /* indexed by CPU number */ extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ +extern cpuset_t cpu_active_set; /* cached set of active CPUs */ +extern 
cpuset_t cpu_available; /* cached set of available CPUs */ extern int ncpus; /* number of CPUs present */ extern int ncpus_online; /* number of CPUs not quiesced */ extern int max_ncpus; /* max present before ncpus is known */ @@ -572,13 +540,19 @@ extern struct cpu *curcpup(void); #endif /* - * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id - * as the target and to grab cpu_lock instead of requiring the caller - * to grab it. + * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's + * current CPU is; holding cpu_lock is not required. */ #define CPU_CURRENT -3 /* + * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a + * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock + * is not required. + */ +#define CPU_BEST -4 + +/* * Per-CPU statistics * * cpu_stats_t contains numerous system and VM-related statistics, in the form @@ -613,7 +587,7 @@ extern struct cpu *curcpup(void); */ #define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) -#endif /* _KERNEL || _KMEMUSER */ +#endif /* defined(_KERNEL) || defined(_KMEMUSER) */ /* * CPU support routines (not for genassym.c) diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index fb79dfecde..1f938132e0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/cyclic.h b/usr/src/uts/common/sys/cyclic.h index 5f28543f9f..270a09449f 100644 --- a/usr/src/uts/common/sys/cyclic.h +++ b/usr/src/uts/common/sys/cyclic.h @@ -23,6 +23,7 @@ * Use is subject to license terms. * * Copyright 2017 RackTop Systems. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_CYCLIC_H @@ -81,6 +82,7 @@ extern cyclic_id_t cyclic_add_omni(cyc_omni_handler_t *); extern void cyclic_remove(cyclic_id_t); extern void cyclic_bind(cyclic_id_t, cpu_t *, cpupart_t *); extern int cyclic_reprogram(cyclic_id_t, hrtime_t); +extern void cyclic_move_here(cyclic_id_t); extern hrtime_t cyclic_getres(); extern int cyclic_offline(cpu_t *cpu); diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h index b324f4d323..cb3711edcd 100644 --- a/usr/src/uts/common/sys/disp.h +++ b/usr/src/uts/common/sys/disp.h @@ -23,6 +23,8 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -63,11 +65,11 @@ typedef struct _disp { /* * Priorities: * disp_maxrunpri is the maximum run priority of runnable threads - * on this queue. It is -1 if nothing is runnable. + * on this queue. It is -1 if nothing is runnable. * * disp_max_unbound_pri is the maximum run priority of threads on * this dispatch queue but runnable by any CPU. This may be left - * artificially high, then corrected when some CPU tries to take + * artificially high, then corrected when some CPU tries to take * an unbound thread. It is -1 if nothing is runnable. 
*/ pri_t disp_maxrunpri; /* maximum run priority */ @@ -151,8 +153,7 @@ extern void dq_srundec(kthread_t *); extern void cpu_rechoose(kthread_t *); extern void cpu_surrender(kthread_t *); extern void kpreempt(int); -extern struct cpu *disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t, - struct cpu *); +extern struct cpu *disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t); extern int disp_bound_threads(struct cpu *, int); extern int disp_bound_anythreads(struct cpu *, int); extern int disp_bound_partition(struct cpu *, int); @@ -167,6 +168,8 @@ extern void resume_from_zombie(kthread_t *) extern void disp_swapped_enq(kthread_t *); extern int disp_anywork(void); +extern struct cpu *disp_choose_best_cpu(void); + #define KPREEMPT_SYNC (-1) #define kpreempt_disable() \ { \ @@ -183,6 +186,8 @@ extern int disp_anywork(void); #endif /* _KERNEL */ +#define CPU_IDLE_PRI (-1) + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 6449f39a35..5be223ce93 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -192,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -204,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* @@ -356,6 +358,7 @@ typedef struct dld_ioc_led { #define DLD_CAPAB_POLL 0x00000002 #define DLD_CAPAB_PERIM 0x00000003 #define DLD_CAPAB_LSO 0x00000004 +#define DLD_CAPAB_IPCHECK 0x00000005 #define DLD_ENABLE 0x00000001 #define DLD_DISABLE 0x00000002 @@ -382,6 +385,7 @@ typedef struct dld_ioc_led { */ typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define DI_DIRECT_RAW 0x1 /* * Direct Tx/Rx capability. */ @@ -406,8 +410,16 @@ typedef struct dld_capab_direct_s { /* flow control "can I put on a ring" callback */ uintptr_t di_tx_fctl_df; /* canput-like callback */ void *di_tx_fctl_dh; + + /* flags that control our behavior */ + uint_t di_flags; } dld_capab_direct_t; +typedef struct dld_capab_ipcheck_s { + uintptr_t ipc_allowed_df; + void *ipc_allowed_dh; +} dld_capab_ipcheck_t; + /* * Polling/softring capability. 
*/ diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 035eea893a..336fa9cb67 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum { typedef enum { DLD_UNINITIALIZED, DLD_PASSIVE, - DLD_ACTIVE + DLD_ACTIVE, + DLD_EXCLUSIVE } dld_passivestate_t; /* @@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, extern void dld_str_notify_ind(dld_str_t *); extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, uintptr_t, uint16_t); +extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" { #define IPTUN_IOC 0x454A #define BRIDGE_IOC 0xB81D #define IBPART_IOC 0x6171 +#define OVERLAY_IOC 0x2005 /* GLDv3 modules use these macros to generate unique ioctl commands */ #define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" { #define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid)) #define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid)) #define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid)) #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..d76daffeb7 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -107,6 +108,7 @@ typedef struct dl_ipnetinfo { #define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */ #define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */ #define DL_NOTIFY_CONF 0x116 /* Notification from upstream */ +#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */ /* * Primitives used for Connectionless Service @@ -388,6 +390,8 @@ typedef struct dl_ipnetinfo { #define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */ #define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */ #define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */ +#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */ +#define DL_PROMISC_FIXUPS 0x05 /* above will be fixed up */ /* * DLPI notification codes for DL_NOTIFY_REQ primitives. @@ -673,11 +677,11 @@ typedef struct { #define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */ /* capability */ #define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */ - /* ability */ + /* ability for TCP/UDP packets. */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ - /* ability for IPv4 packets. */ + /* ability for IPv4 TCP/UDP packets. */ #define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ - /* ability for IPv6 packets. */ + /* ability for IPv6 TCP/UDP packets. 
*/ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL @@ -1107,6 +1111,13 @@ typedef struct { } dl_intr_mode_req_t; /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { + t_uscalar_t dl_primitive; +} dl_exclusive_req_t; + +/* * CONNECTION-ORIENTED SERVICE PRIMITIVES */ @@ -1528,6 +1539,7 @@ union DL_primitives { dl_control_ack_t control_ack; dl_passive_req_t passive_req; dl_intr_mode_req_t intr_mode_req; + dl_exclusive_req_t exclusive_req; }; #define DL_INFO_REQ_SIZE sizeof (dl_info_req_t) @@ -1596,6 +1608,7 @@ union DL_primitives { #define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t) #define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t) #define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t) +#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t) #ifdef _KERNEL /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..81f9e2abac 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLS_H @@ -85,6 +86,8 @@ typedef struct dls_link_s dls_link_t; #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 +#define DLS_PROMISC_RX_ONLY 0x00000008 +#define DLS_PROMISC_FIXUPS 0x00000010 extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); extern void dls_close(dld_str_t *); @@ -106,11 +109,13 @@ extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); +extern int dls_devnet_open_in_zone(const char *, + dls_dl_handle_t *, dev_t *, zoneid_t); extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -122,12 +127,13 @@ extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *); extern void dls_devnet_rele(dls_dl_handle_t); extern void dls_devnet_prop_task_wait(dls_dl_handle_t); +extern const char *dls_devnet_link(dls_dl_handle_t); extern const char *dls_devnet_mac(dls_dl_handle_t); extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -141,6 +147,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t, extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, datalink_class_t *, uint32_t *, uint32_t *); extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int dls_mgmt_get_linkid_in_zone(const char *, + datalink_id_t *, zoneid_t); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); extern int dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..329f8dd08e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ 
b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLS_IMPL_H @@ -46,11 +47,12 @@ typedef struct dls_multicst_addr_s { } dls_multicst_addr_t; struct dls_link_s { /* Protected by */ - char dl_name[MAXNAMELEN]; /* SL */ + char dl_name[MAXNAMELEN]; /* RO */ uint_t dl_ddi_instance; /* SL */ mac_handle_t dl_mh; /* SL */ mac_client_handle_t dl_mch; /* SL */ mac_unicast_handle_t dl_mah; /* SL */ + mac_notify_handle_t dl_mnh; /* SL */ const mac_info_t *dl_mip; /* SL */ uint_t dl_ref; /* SL */ mod_hash_t *dl_str_hash; /* SL, modhash lock */ @@ -61,6 +63,7 @@ struct dls_link_s { /* Protected by */ uint_t dl_zone_ref; link_tagmode_t dl_tagmode; /* atomic */ uint_t dl_nonip_cnt; /* SL */ + uint_t dl_exclusive; /* SL */ }; typedef struct dls_head_s { @@ -96,13 +99,16 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *, dls_link_t **); extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *); +extern int dls_devnet_hold_tmp_by_link(dls_link_t *, + dls_dl_handle_t *); extern void dls_init(void); extern int dls_fini(void); @@ -126,6 +132,7 @@ extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int dls_exclusive_set(dld_str_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..6fec277991 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum { DATALINK_CLASS_SIMNET = 0x20, DATALINK_CLASS_BRIDGE = 0x40, DATALINK_CLASS_IPTUN = 0x80, - DATALINK_CLASS_PART = 0x100 + DATALINK_CLASS_PART = 0x100, + DATALINK_CLASS_OVERLAY = 0x200 } datalink_class_t; #define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ - DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) + DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ + DATALINK_CLASS_OVERLAY) /* * A combination of flags and media. @@ -111,10 +114,14 @@ typedef uint64_t datalink_media_t; #define DLMGMT_CMD_BASE 128 /* - * Indicate the link mapping is active or persistent + * Indicate if the link mapping is active, persistent, or transient. A + * transient link is an active link with a twist -- it is an active + * link which is destroyed along with the zone rather than reassigned + * to the GZ. 
*/ #define DLMGMT_ACTIVE 0x01 #define DLMGMT_PERSIST 0x02 +#define DLMGMT_TRANSIENT 0x04 /* upcall argument */ typedef struct dlmgmt_door_arg { @@ -165,6 +172,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { @@ -225,6 +233,7 @@ typedef struct dlmgmt_getattr_retval_s { char lr_attrval[MAXLINKATTRVALLEN]; } dlmgmt_getattr_retval_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index 4bd884e9c2..1a2ca397ef 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -500,6 +500,11 @@ typedef struct { #define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */ #define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */ +/* + * Linux specific program headers not even used by Linux (!!) + */ +#define PT_PAX_FLAGS 0x65041580 /* PaX flags (see below) */ + #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment (unused) */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ @@ -515,6 +520,45 @@ typedef struct { #define PF_W 0x2 #define PF_X 0x1 +/* + * PaX is a regrettable series of never-integrated Linux patches for a + * facility to provide additional protections on memory pages for purposes of + * increasing security, and for allowing binaries to demand (or refuse) those + * protections via the PT_PAX_FLAGS program header. (Portents of its + * rudderless existence, "PaX" is a term of indefinite origin written by an + * unknown group of people.) This facility is unfortunate in any number of + * ways, and was largely obviated by the broad adoption of non-executable + * stacks at any rate -- but it lives on in binaries that continue to mark + * themselves to explicitly refuse the (never-integrated, now-obviated) + * facility. One might cringe that PaX overloads the meaning of the p_flags + * to specify protections, but that is the least of its transgressions: + * instead of using one p_type constant to explicitly enable a series of + * protections and another to explicitly disable others, it insists on + * conflating both actions into PT_PAX_FLAGS. The resulting doubling of + * constant definitions (two constant definitions for every protection instead + * of merely one) assures that the values can't even fit in the eight + * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for + * such filthy functionality. And were all of this not enough, there is one + * final nomenclature insult to be added to this semantic injury: the + * constants for the p_flags don't even embed "_PAX_" in their name -- despite + * the fact that this is their only purpose! We resist the temptation to + * right this final wrong here; we grit our teeth and provide exactly the + * Linux definitions -- or rather, what would have been the Linux definitions + * had this belching jalopy ever been permitted to crash itself into mainline. 
+ */ +#define PF_PAGEEXEC 0x00000010 /* PaX: enable PAGEEXEC */ +#define PF_NOPAGEEXEC 0x00000020 /* PaX: disable PAGEEXEC */ +#define PF_SEGMEXEC 0x00000040 /* PaX: enable SEGMEXEC */ +#define PF_NOSEGMEXEC 0x00000080 /* PaX: disable SEGMEXEC */ +#define PF_MPROTECT 0x00000100 /* PaX: enable MPROTECT */ +#define PF_NOMPROTECT 0x00000200 /* PaX: disable MPROTECT */ +#define PF_RANDEXEC 0x00000400 /* PaX: enable RANDEXEC */ +#define PF_NORANDEXEC 0x00000800 /* PaX: disable RANDEXEC */ +#define PF_EMUTRAMP 0x00001000 /* PaX: enable EMUTRAMP */ +#define PF_NOEMUTRAMP 0x00002000 /* PaX: disable EMUTRAMP */ +#define PF_RANDMMAP 0x00004000 /* PaX: enable RANDMMAP */ +#define PF_NORANDMMAP 0x00008000 /* PaX: disable RANDMMAP */ + #define PF_MASKOS 0x0ff00000 /* OS specific values */ #define PF_MASKPROC 0xf0000000 /* processor specific values */ diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h index 1b0d961b0b..b64a101348 100644 --- a/usr/src/uts/common/sys/eventfd.h +++ b/usr/src/uts/common/sys/eventfd.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -47,6 +47,13 @@ typedef uint64_t eventfd_t; #define EVENTFDIOC (('e' << 24) | ('f' << 16) | ('d' << 8)) #define EVENTFDIOC_SEMAPHORE (EVENTFDIOC | 1) /* toggle sem state */ +/* + * Kernel-internal method to write to eventfd while bypassing overflow limits, + * therefore avoiding potential to block as well. This is used to fulfill AIO + * behavior in LX related to eventfd notification. + */ +#define EVENTFDIOC_POST (EVENTFDIOC | 2) + #ifndef _KERNEL extern int eventfd(unsigned int, int); @@ -58,6 +65,7 @@ extern int eventfd_write(int, eventfd_t); #define EVENTFDMNRN_EVENTFD 0 #define EVENTFDMNRN_CLONE 1 #define EVENTFD_VALMAX (ULLONG_MAX - 1ULL) +#define EVENTFD_VALOVERFLOW ULLONG_MAX #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index 8056f9a8e8..12115b7e27 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -26,6 +26,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2019 Joyent, Inc. 
+ */ + #ifndef _SYS_EXEC_H #define _SYS_EXEC_H @@ -79,7 +83,7 @@ typedef struct uarg { ssize_t arglen; char *fname; char *pathname; - ssize_t auxsize; + size_t auxsize; caddr_t stackend; size_t stk_align; size_t stk_size; @@ -102,10 +106,13 @@ typedef struct uarg { vnode_t *ex_vp; char *emulator; char *brandname; + const char *brand_nroot; char *auxp_auxflags; /* addr of auxflags auxv on the user stack */ char *auxp_brand; /* address of first brand auxv on user stack */ cred_t *pfcred; boolean_t scrubenv; + uintptr_t maxstack; + boolean_t stk_prot_override; uintptr_t commpage; } uarg_t; @@ -175,8 +182,8 @@ struct execsw { int exec_maglen; int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, - long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + size_t *execsz, int setid, caddr_t exec_file, + struct cred *cred, int *brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -213,8 +220,8 @@ extern int exece(const char *fname, const char **argp, const char **envp); extern int exec_common(const char *fname, const char **argp, const char **envp, int brand_action); extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args, - struct intpdata *idata, int level, long *execsz, caddr_t exec_file, - struct cred *cred, int brand_action); + struct intpdata *idata, int level, size_t *execsz, caddr_t exec_file, + struct cred *cred, int *brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); @@ -239,26 +246,32 @@ extern void exec_set_sp(size_t); * when compiling the 32-bit compatability elf code in the elfexec module. 
*/ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, uint_t *, caddr_t *, + size_t *); #endif /* !_ELF32_COMPAT */ #if defined(_LP64) extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, uint_t *, caddr_t *, + size_t *); #endif /* _LP64 */ /* * Utility functions for exec module core routines: */ -extern int core_seg(proc_t *, vnode_t *, offset_t, caddr_t, - size_t, rlim64_t, cred_t *); +extern int core_seg(proc_t *, vnode_t *, u_offset_t, caddr_t, size_t, + rlim64_t, cred_t *); -extern int core_write(vnode_t *, enum uio_seg, offset_t, - const void *, size_t, rlim64_t, cred_t *); +extern int core_write(vnode_t *, enum uio_seg, u_offset_t, const void *, + size_t, rlim64_t, cred_t *); /* a.out stuff */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index ec0741fe08..556a7ab2a1 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -27,13 +27,13 @@ /* All Rights Reserved */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ -/* Copyright 2015 Joyent, Inc. */ +/* Copyright 2017 Joyent, Inc. */ #ifndef _SYS_FILE_H #define _SYS_FILE_H #include <sys/t_lock.h> -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL) #include <sys/model.h> #include <sys/user.h> #endif @@ -122,11 +122,6 @@ typedef struct fpollinfo { #if defined(_KERNEL) || defined(_FAKE_KERNEL) /* - * This is a flag that is set on f_flag2, but is never user-visible - */ -#define FEPOLLED 0x8000 - -/* * Fake flags for driver ioctl calls to inform them of the originating * process' model. See <sys/model.h> * @@ -200,6 +195,7 @@ struct vattr; struct uf_info; extern file_t *getf(int); +extern file_t *getf_gen(int, uf_entry_gen_t *); extern void releasef(int); extern void areleasef(int, struct uf_info *); #ifndef _BOOT @@ -226,6 +222,7 @@ extern void fcnt_add(struct uf_info *, int); extern void close_exec(struct uf_info *); extern void clear_stale_fd(void); extern void clear_active_fd(int); +extern void set_active_fd(int); extern void free_afd(afd_t *afd); extern int fgetstartvp(int, char *, struct vnode **); extern int fsetattrat(int, char *, int, struct vattr *); diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FRAMEIO_H +#define _SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. + */ +typedef struct framevec { + void *fv_buf; /* Buffer with data */ + size_t fv_buflen; /* Size of the buffer */ + size_t fv_actlen; /* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { + uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */ + uint_t fio_nvpf; /* How many vectors make up one frame */ + uint_t fio_nvecs; /* The total number of vectors */ + framevec_t fio_vecs[]; /* C99 VLA */ +} frameio_t; + + +#define FRAMEIO_VERSION_ONE 1 +#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE + +#define FRAMEIO_NVECS_MAX 32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { + caddr32_t fv_buf; + size32_t fv_buflen; + size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { + uint_t fio_version; + uint_t fio_vecspframe; + uint_t fio_nvecs; + framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. + */ +typedef enum frameio_write_mblk_map { + MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, + int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/fifonode.h b/usr/src/uts/common/sys/fs/fifonode.h index d8b158ce3c..1ea8563e1c 100644 --- a/usr/src/uts/common/sys/fs/fifonode.h +++ b/usr/src/uts/common/sys/fs/fifonode.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -83,6 +84,7 @@ struct fifonode { struct msgb *fn_tail; /* last message to read */ fifolock_t *fn_lock; /* pointer to per fifo lock */ uint_t fn_count; /* Number of bytes on fn_mp */ + uint_t fn_hiwat; /* pipe (fifofast) high water */ kcondvar_t fn_wait_cv; /* fifo conditional variable */ ushort_t fn_wcnt; /* number of writers */ ushort_t fn_rcnt; /* number of readers */ @@ -135,6 +137,8 @@ typedef struct fifodata { #define FIFOPOLLRBAND 0x20000 #define FIFOSTAYFAST 0x40000 /* don't turn into stream mode */ #define FIFOWAITMODE 0x80000 /* waiting for the possibility to change mode */ +/* Data on loan, block reads. Use in conjunction with FIFOSTAYFAST. 
*/ +#define FIFORDBLOCK 0x100000 #define FIFOHIWAT (16 * 1024) #define FIFOLOWAT (0) @@ -147,16 +151,6 @@ typedef struct fifodata { #if defined(_KERNEL) -/* - * Fifohiwat defined as a variable is to allow tuning of the high - * water mark if needed. It is not meant to be released. - */ -#if FIFODEBUG -extern int Fifohiwat; -#else /* FIFODEBUG */ -#define Fifohiwat FIFOHIWAT -#endif /* FIFODEBUG */ - extern struct vnodeops *fifo_vnodeops; extern const struct fs_operation_def fifo_vnodeops_template[]; extern struct kmem_cache *fnode_cache; @@ -181,6 +175,8 @@ extern void fifo_fastoff(fifonode_t *); extern struct streamtab *fifo_getinfo(); extern void fifo_wakereader(fifonode_t *, fifolock_t *); extern void fifo_wakewriter(fifonode_t *, fifolock_t *); +extern boolean_t fifo_stayfast_enter(fifonode_t *); +extern void fifo_stayfast_exit(fifonode_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. 
+ */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..38389f77d9 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_INFO_H +#define _SYS_FS_HYPRLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks. + */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. 
+ */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of td_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid +#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. 
+ */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index 9f9ce5c8c1..d1c5f674f1 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -37,6 +37,7 @@ extern "C" { #include <sys/vfs_opreg.h> #include <sys/list.h> #include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h> #include <sys/sunddi.h> /* @@ -129,6 +130,21 @@ typedef struct sdev_local_data { struct sdev_dprof sdev_lprof; /* profile for multi-inst */ } sdev_local_data_t; +/* sdev_flags */ +typedef enum sdev_flags { + SDEV_BUILD = 0x0001, /* directory cache out-of-date */ + SDEV_GLOBAL = 0x0002, /* global /dev nodes */ + SDEV_PERSIST = 0x0004, /* backing store persisted node */ + SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */ + SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */ + /* (ex: pts) */ + SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */ + SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */ + /* need update */ + SDEV_SUBDIR = 0x0080, /* match all subdirs under here */ + SDEV_ZONED = 0x0100 /* zoned subdir */ +} sdev_flags_t; + /* * /dev filesystem sdev_node defines */ @@ -151,7 +167,7 @@ typedef struct sdev_node { ino64_t sdev_ino; /* inode */ uint_t sdev_nlink; /* link count */ int sdev_state; /* state of this node */ - int sdev_flags; /* flags bit */ + sdev_flags_t sdev_flags; /* flags bit */ kmutex_t sdev_lookup_lock; /* node creation synch lock */ kcondvar_t sdev_lookup_cv; /* node creation sync cv */ @@ -162,7 +178,7 @@ typedef struct sdev_node { struct sdev_global_data sdev_globaldata; struct sdev_local_data sdev_localdata; } sdev_instance_data; - + list_node_t sdev_plist; /* link on plugin list */ void *sdev_private; } sdev_node_t; @@ -193,29 +209,11 @@ typedef enum { SDEV_READY } sdev_node_state_t; -/* sdev_flags */ -#define SDEV_BUILD 0x0001 /* directory cache out-of-date */ -#define SDEV_GLOBAL 0x0002 /* global /dev nodes */ -#define SDEV_PERSIST 0x0004 /* backing store persisted node */ -#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. 
cache */ -#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */ - /* (ex: pts) */ -#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */ -#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */ - /* need update */ -#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */ -#define SDEV_ZONED 0x0100 /* zoned subdir */ - /* sdev_lookup_flags */ #define SDEV_LOOKUP 0x0001 /* node creation in progress */ #define SDEV_READDIR 0x0002 /* VDIR readdir in progress */ #define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */ -#define SDEV_VTOR_INVALID -1 -#define SDEV_VTOR_SKIP 0 -#define SDEV_VTOR_VALID 1 -#define SDEV_VTOR_STALE 2 - /* convenient macros */ #define SDEV_IS_GLOBAL(dv) \ (dv->sdev_flags & SDEV_GLOBAL) @@ -368,8 +366,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *, extern int devname_profile_update(char *, size_t); extern struct sdev_data *sdev_find_mntinfo(char *); void sdev_mntinfo_rele(struct sdev_data *); +typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *); +void sdev_mnt_walk(sdev_mnt_walk_f, void *); extern struct vnodeops *devpts_getvnodeops(void); extern struct vnodeops *devvt_getvnodeops(void); +extern void sdev_plugin_nodeready(struct sdev_node *); +extern int sdev_plugin_init(void); +extern int sdev_plugin_fini(void); /* * boot states - warning, the ordering here is significant @@ -515,6 +518,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *); extern void sdev_modctl_dump_files(void); /* + * plugin and legacy vtab stuff + */ +/* directory dependent vop table */ +typedef struct sdev_vop_table { + char *vt_name; /* subdirectory name */ + const fs_operation_def_t *vt_service; /* vnodeops table */ + struct vnodeops **vt_global_vops; /* global container for vop */ + int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ + int vt_flags; +} sdev_vop_table_t; + +extern struct sdev_vop_table vtab[]; +extern struct vnodeops *sdev_get_vop(struct sdev_node *); +extern void sdev_set_no_negcache(struct sdev_node *); +extern void *sdev_get_vtor(struct sdev_node *dv); + +/* * globals */ extern kmutex_t sdev_lock; @@ -527,6 +547,7 @@ extern struct vnodeops *devipnet_vnodeops; extern struct vnodeops *devvt_vnodeops; extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */ extern struct vnodeops *devzvol_vnodeops; +extern int sdev_vnodeops_tbl_size; extern const fs_operation_def_t sdev_vnodeops_tbl[]; extern const fs_operation_def_t devpts_vnodeops_tbl[]; diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h new file mode 100644 index 0000000000..f4ed813c1e --- /dev/null +++ b/usr/src/uts/common/sys/fs/sdev_plugin.h @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. 
+ */ + +#ifndef _SYS_SDEV_PLUGIN_H +#define _SYS_SDEV_PLUGIN_H + +/* + * Kernel sdev plugin interface + */ + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/vnode.h> + +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef uintptr_t sdev_plugin_hdl_t; +typedef uintptr_t sdev_ctx_t; + +/* + * Valid return values for sdev_plugin_validate_t. + */ +typedef enum sdev_plugin_validate { + SDEV_VTOR_INVALID = -1, + SDEV_VTOR_SKIP = 0, + SDEV_VTOR_VALID = 1, + SDEV_VTOR_STALE = 2 +} sdev_plugin_validate_t; + +/* + * Valid flags + */ +typedef enum sdev_plugin_flags { + SDEV_PLUGIN_NO_NCACHE = 0x1, + SDEV_PLUGIN_SUBDIR = 0x2 +} sdev_plugin_flags_t; + +#define SDEV_PLUGIN_FLAGS_MASK 0x3 + +/* + * Functions a module must implement + */ +typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t); +typedef int (*sp_filldir_f)(sdev_ctx_t); +typedef void (*sp_inactive_f)(sdev_ctx_t); + +#define SDEV_PLUGIN_VERSION 1 + +typedef struct sdev_plugin_ops { + int spo_version; + sdev_plugin_flags_t spo_flags; + sp_valid_f spo_validate; + sp_filldir_f spo_filldir; + sp_inactive_f spo_inactive; +} sdev_plugin_ops_t; + +extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *, + int *); +extern int sdev_plugin_unregister(sdev_plugin_hdl_t); + +typedef enum sdev_ctx_flags { + SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */ +} sdev_ctx_flags_t; + +/* + * Context helper functions + */ +extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t); +extern const char *sdev_ctx_name(sdev_ctx_t); +extern const char *sdev_ctx_path(sdev_ctx_t); +extern int sdev_ctx_minor(sdev_ctx_t, minor_t *); +extern enum vtype sdev_ctx_vtype(sdev_ctx_t); + +/* + * Callbacks to manipulate nodes + */ +extern int sdev_plugin_mkdir(sdev_ctx_t, char *); +extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDEV_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index fb07de6588..f4cee09244 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -23,7 +23,7 @@ * All rights reserved. Use is subject to license terms. */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_FS_TMP_H @@ -43,8 +43,10 @@ struct tmount { struct vfs *tm_vfsp; /* filesystem's vfs struct */ struct tmpnode *tm_rootnode; /* root tmpnode */ char *tm_mntpath; /* name of tmpfs mount point */ - ulong_t tm_anonmax; /* file system max anon reservation */ - pgcnt_t tm_anonmem; /* pages of reserved anon memory */ + size_t tm_anonmax; /* file system max anon reservation */ + size_t tm_anonmem; /* bytes of reserved anon memory */ + /* and allocated kmem for the fs */ + size_t tm_allocmem; /* bytes alloced from tmp_kmem_ funcs */ dev_t tm_dev; /* unique dev # of mounted `device' */ uint_t tm_gen; /* pseudo generation number for files */ kmutex_t tm_contents; /* lock for tmount structure */ @@ -58,6 +60,7 @@ struct tmount { #define VTOTM(vp) ((struct tmount *)(vp)->v_vfsp->vfs_data) #define VTOTN(vp) ((struct tmpnode *)(vp)->v_data) #define TNTOV(tp) ((tp)->tn_vnode) +#define TNTOTM(tp) (VTOTM(TNTOV(tp))) #define tmpnode_hold(tp) VN_HOLD(TNTOV(tp)) #define tmpnode_rele(tp) VN_RELE(TNTOV(tp)) @@ -69,41 +72,39 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ /* * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system. E.g. 
in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. - * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system. In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become a + * factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care + * should be exercised in changing this: tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will allow tmpfs consumption alone to induce application-level + * memory allocation failure. */ -#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */ +#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */ extern size_t tmpfs_minfree; /* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem.
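The sizing rule in the new comment can be made concrete with a little arithmetic. The sketch below is purely illustrative; it only restates the 128MB example from the comment and uses btopr(), the byte-to-page conversion the old default-value note referred to.

#include <sys/types.h>
#include <sys/param.h>          /* btopr() */

/*
 * With a 128MB memory + swap cap and tmpfs allowed up to seven-eighths
 * of it, the reserve left for the rest of the zone is
 * 128MB - (128MB / 8) * 7 = 16MB, i.e. the new TMPMINFREE.
 */
#define EXAMPLE_CAP             (128 * 1024 * 1024)
#define EXAMPLE_TMPFS_MAX       ((EXAMPLE_CAP / 8) * 7)
#define EXAMPLE_RESERVE         (EXAMPLE_CAP - EXAMPLE_TMPFS_MAX)

/* tmpfs_minfree itself is kept in pages, so the byte count is converted. */
static size_t
example_minfree_pages(void)
{
        return (btopr(EXAMPLE_RESERVE));
}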
- */ -#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */ - -extern size_t tmp_kmemspace; -extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */ - extern void tmpnode_init(struct tmount *, struct tmpnode *, struct vattr *, struct cred *); +extern void tmpnode_cleanup(struct tmpnode *tp); extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t); extern void tmpnode_growmap(struct tmpnode *, ulong_t); extern int tdirlookup(struct tmpnode *, char *, struct tmpnode **, struct cred *); extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *, enum dr_op, struct cred *); -extern void tdirinit(struct tmpnode *, struct tmpnode *); +extern int tdirinit(struct tmpnode *, struct tmpnode *); extern void tdirtrunc(struct tmpnode *); -extern void *tmp_memalloc(size_t, int); -extern void tmp_memfree(void *, size_t); extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int); extern int tmp_taccess(void *, int, struct cred *); extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, @@ -114,6 +115,9 @@ extern int tdirenter(struct tmount *, struct tmpnode *, char *, enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *, struct tmpnode **, struct cred *, caller_context_t *); +extern void *tmp_kmem_zalloc(struct tmount *, size_t, int); +extern void tmp_kmem_free(struct tmount *, void *, size_t); + #define TMP_MUSTHAVE 0x01 #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 2d4e1aa7fb..4a48af52a1 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -21,13 +21,12 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FX_H #define _SYS_FX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/thread.h> #include <sys/ddi.h> @@ -145,7 +144,14 @@ typedef struct fxkparms { uint_t fx_cflags; } fxkparms_t; +/* + * control flags (kparms->fx_cflags). + */ +#define FX_DOUPRILIM 0x01 /* change user priority limit */ +#define FX_DOUPRI 0x02 /* change user priority */ +#define FX_DOTQ 0x04 /* change FX time quantum */ + +#define FXMAXUPRI 60 /* maximum user priority setting */ /* * Interface for partner private code. This is not a public interface. diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..91ab46fc44 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,59 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_GSQUEUE_H +#define _SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP).
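The tmp_kmem_zalloc()/tmp_kmem_free() pair added above replaces tmp_memalloc()/tmp_memfree() and charges each allocation against the owning mount (the new tm_allocmem counter). A sketch of how a caller might use them, assuming the int argument takes the usual KM_SLEEP/KM_NOSLEEP kmem flags:

#include <sys/fs/tmp.h>
#include <sys/kmem.h>

/* Allocate a name buffer charged to the mount; the caller checks for NULL. */
static char *
example_alloc_name(struct tmount *tm, size_t len)
{
        return (tmp_kmem_zalloc(tm, len, KM_NOSLEEP));
}

/* The size must match the original request so tm_allocmem stays balanced. */
static void
example_free_name(struct tmount *tm, char *name, size_t len)
{
        tmp_kmem_free(tm, name, len);
}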
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define GSQUEUE_FILL 0x0001 +#define GSQUEUE_NODRAIN 0x0002 +#define GSQUEUE_PROCESS 0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, + int, uint8_t); + +#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */ diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018, Joyent, Inc. */ /* @@ -171,7 +172,7 @@ typedef struct hook_family_int { cvwaitlock_t hfi_lock; SLIST_ENTRY(hook_family_int) hfi_entry; hook_event_int_head_t hfi_head; - hook_family_t hfi_family; + hook_family_t hfi_family; kstat_t *hfi_kstat; struct hook_stack *hfi_stack; hook_notify_head_t hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t; #define Hn_ARP "arp" #define Hn_IPV4 "inet" #define Hn_IPV6 "inet6" +#define Hn_VIONA "viona_inet" extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t); extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h index d56fcceb5a..46d25f207f 100644 --- a/usr/src/uts/common/sys/id_space.h +++ b/usr/src/uts/common/sys/id_space.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All Rights reserved. */ #ifndef _ID_SPACE_H @@ -34,8 +35,6 @@ extern "C" { #include <sys/mutex.h> #include <sys/vmem.h> -#ifdef _KERNEL - typedef vmem_t id_space_t; id_space_t *id_space_create(const char *, id_t, id_t); @@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *); id_t id_alloc_specific_nosleep(id_space_t *, id_t); void id_free(id_space_t *, id_t); -#endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support for the inotify facility. 
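The gsqueue interfaces declared above can be exercised with a small sketch like the one below. The worker function, the use of a hash to pick a queue from the set, and the meaning of the final tag byte passed to gsqueue_enter_one() are assumptions here; only the prototypes come from the header.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/sunddi.h>
#include <sys/gsqueue.h>

static gsqueue_set_t *example_set;      /* created once, e.g. at attach */

/* Runs serialized with everything else queued on the same gsqueue. */
static void
example_worker(void *set_arg, mblk_t *mp, gsqueue_t *gsp, void *arg)
{
        freemsg(mp);
}

static void
example_init(void)
{
        example_set = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY);
}

static void
example_send(mblk_t *mp, uint_t hash)
{
        gsqueue_t *gsp = gsqueue_set_get(example_set, hash);

        gsqueue_enter_one(gsp, mp, example_worker, NULL, GSQUEUE_PROCESS, 0);
}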
Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define _SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. + */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ISDIR 0x40000000 +#define IN_ONESHOT 0x80000000 + +/* + * Helpful constants. + */ +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_ALL_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ + IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define IN_CHILD_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +struct inotify_event { + int32_t wd; /* watch descriptor */ + uint32_t mask; /* mask of events */ + uint32_t cookie; /* event association cookie, if any */ + uint32_t len; /* size of name field */ + char name[]; /* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
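Because the facility is meant to be binary compatible with Linux, consuming events follows the familiar Linux idiom: read(2) fills a buffer with variable-length records in the struct inotify_event layout shown above. A userland sketch (the buffer and its length are whatever the caller read):

#include <sys/types.h>
#include <sys/inotify.h>
#include <stdio.h>

static void
example_drain(const char *buf, ssize_t nread)
{
        const char *p = buf;

        while (p < buf + nread) {
                const struct inotify_event *ev =
                    (const struct inotify_event *)p;

                if (ev->mask & IN_Q_OVERFLOW)
                        (void) printf("event queue overflowed\n");
                else if (ev->len > 0)
                        (void) printf("wd %d: mask 0x%x on %s\n",
                            ev->wd, ev->mask, ev->name);

                /* ev->len counts the (padded) name bytes that follow. */
                p += sizeof (struct inotify_event) + ev->len;
        }
}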
+ */ +#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8)) +#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */ +#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */ +#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */ +#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name +#else +#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define INOTIFY_PTR(type, name) type *name +#endif + +typedef struct inotify_addwatch { + int inaw_fd; /* open fd for object */ + uint32_t inaw_mask; /* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { + INOTIFY_PTR(char, inac_name); /* pointer to name */ + int inac_fd; /* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define IN_UNMASKABLE \ + (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define IN_MODIFIERS \ + (IN_EXCL_UNLINK | IN_ONESHOT) + +#define IN_FLAGS \ + (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define IN_REMOVAL (1ULL << 32) +#define INOTIFYMNRN_INOTIFY 0 +#define INOTIFYMNRN_CLONE 1 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INOTIFY_H */ diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h index 0569c3e967..d7dc365c09 100644 --- a/usr/src/uts/common/sys/ipc_impl.h +++ b/usr/src/uts/common/sys/ipc_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. */ #ifndef _IPC_IMPL_H @@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *); kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *); void ipc_cleanup(ipc_service_t *, kipc_perm_t *); +void ipc_rmsvc(ipc_service_t *, kipc_perm_t *); int ipc_rmid(ipc_service_t *, int, cred_t *); int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h index bad74f8b81..f21c3fb5af 100644 --- a/usr/src/uts/common/sys/ipd.h +++ b/usr/src/uts/common/sys/ipd.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -35,7 +35,7 @@ extern "C" { #endif #define IPD_DEV_PATH "/dev/ipd" -#define IPD_MAX_DELAY 10000 /* 10 ms in us */ +#define IPD_MAX_DELAY 1000000 /* 1 second in microseconds */ typedef struct ipd_ioc_perturb { zoneid_t ipip_zoneid; diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index bf89ef0d33..0a76ee19a7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -95,7 +96,7 @@ extern "C" { /* insert new signals here, and move _SIGRTM* appropriately */ #define _SIGRTMIN 42 /* first (highest-priority) realtime signal */ -#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */ +#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */ extern long _sysconf(int); /* System Private interface to sysconf() */ #define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */ #define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..0ea1a396b9 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_KLWP_H @@ -191,7 +191,14 @@ typedef struct _klwp { struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */ struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ - void *lwp_brand; /* per-lwp brand data */ + /* + * Branding: + * lwp_brand - per-lwp brand data + * lwp_brand_syscall - brand syscall interposer + */ + void *lwp_brand; + int (*lwp_brand_syscall)(void); + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h index 2396ef4625..d52a54f6b7 100644 --- a/usr/src/uts/common/sys/kobj.h +++ b/usr/src/uts/common/sys/kobj.h @@ -24,6 +24,9 @@ * * Copyright 2017 RackTop Systems. */ +/* + * Copyright (c) 2017 Joyent, Inc. + */ #ifndef _SYS_KOBJ_H #define _SYS_KOBJ_H @@ -47,6 +50,12 @@ struct module_list { struct module *mp; }; +typedef struct hotinline_desc { + char *hid_symname; /* symbol name */ + uintptr_t hid_instr_offset; /* offset of call in text */ + struct hotinline_desc *hid_next; /* next hotinline */ +} hotinline_desc_t; + typedef unsigned short symid_t; /* symbol table index */ typedef unsigned char *reloc_dest_t; @@ -99,6 +108,8 @@ struct module { caddr_t textwin; caddr_t textwin_base; + hotinline_desc_t *hi_calls; + sdt_probedesc_t *sdt_probes; size_t sdt_nprobes; char *sdt_tab; @@ -187,6 +198,7 @@ extern int kobj_read_file(struct _buf *, char *, unsigned, unsigned); extern int kobj_get_filesize(struct _buf *, uint64_t *size); extern uintptr_t kobj_getelfsym(char *, void *, int *); extern void kobj_set_ctf(struct module *, caddr_t data, size_t size); +extern void do_hotinlines(struct module *); extern int kobj_filbuf(struct _buf *); extern void kobj_sync(void); diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index 5d8827f1ae..d720caa631 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -21,6 +21,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_KSOCKET_H_ @@ -122,6 +123,11 @@ extern int ksocket_close(ksocket_t, struct cred *); extern void ksocket_hold(ksocket_t); extern void ksocket_rele(ksocket_t); +typedef boolean_t (*ksocket_krecv_f)(ksocket_t, struct msgb *, size_t, int, + void *); +extern int ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *); +extern void ksocket_krecv_unblock(ksocket_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h new file mode 100644 index 0000000000..88625d1829 --- /dev/null +++ b/usr/src/uts/common/sys/limits.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LIMITS_H +#define _SYS_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IOV_MAX 1024 + +#ifdef _KERNEL +#define IOV_MAX_STACK 16 /* max. IOV on-stack allocation */ +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIMITS_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 0907d6deff..afe554ba03 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> */ @@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s { } mac_propval_uint32_range_t; /* + * Defines ranges which are a series of C style strings. + */ +typedef struct mac_propval_str_range_s { + uint32_t mpur_nextbyte; + char mpur_data[1]; +} mac_propval_str_range_t; + +/* * Data type of property values. */ typedef enum { @@ -120,6 +128,7 @@ typedef struct mac_propval_range_s { mac_propval_type_t mpr_type; /* type of value */ union { mac_propval_uint32_range_t mpr_uint32[1]; + mac_propval_str_range_t mpr_str; } u; } mac_propval_range_t; @@ -614,6 +623,36 @@ typedef struct mactype_register_s { } mactype_register_t; /* + * Flags to describe the hardware emulation desired from a client when + * calling mac_hw_emul(). + * + * MAC_HWCKSUM_EMUL + * + * If an mblk is marked with HCK_* flags, then calculate those + * checksums and update the checksum flags. + * + * MAC_IPCKSUM_EMUL + * + * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header + * checksum. We still update both the IPv4 and ULP checksum + * flags. + * + * MAC_LSO_EMUL + * + * If an mblk is marked with HW_LSO, then segment the LSO mblk + * into a new chain of mblks which reference the original data + * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the + * caller needs both then it must set both. + */ +typedef enum mac_emul { + MAC_HWCKSUM_EMUL = (1 << 0), + MAC_IPCKSUM_EMUL = (1 << 1), + MAC_LSO_EMUL = (1 << 2) +} mac_emul_t; + +#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL) + +/* * Driver interface functions. 
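IOV_MAX_STACK in the new sys/limits.h is sized for the common kernel idiom of keeping small iovec arrays on the stack and falling back to kmem for larger requests. A sketch of that idiom, under the assumption that this is the intended use (the copy routine itself is hypothetical):

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/limits.h>

static int
example_copy_iovecs(const iovec_t *src, int iovcnt)
{
        iovec_t small[IOV_MAX_STACK];
        iovec_t *aiov = small;
        size_t sz;

        if (iovcnt <= 0 || iovcnt > IOV_MAX)
                return (EINVAL);

        sz = iovcnt * sizeof (iovec_t);
        if (iovcnt > IOV_MAX_STACK)
                aiov = kmem_alloc(sz, KM_SLEEP);

        bcopy(src, aiov, sz);
        /* ... use aiov ... */

        if (aiov != small)
                kmem_free(aiov, sz);
        return (0);
}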
*/ extern int mac_open_by_linkid(datalink_id_t, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 0fc4939503..8fff314bfe 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* @@ -88,6 +88,7 @@ typedef enum { } mac_client_promisc_type_t; /* flags passed to mac_unicast_add() */ + #define MAC_UNICAST_NODUPCHECK 0x0001 #define MAC_UNICAST_PRIMARY 0x0002 #define MAC_UNICAST_HW 0x0004 @@ -115,6 +116,7 @@ typedef enum { #define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 #define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004 #define MAC_PROMISC_FLAGS_NO_COPY 0x0008 +#define MAC_PROMISC_FLAGS_DO_FIXUPS 0x0010 /* flags passed to mac_tx() */ #define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ @@ -136,6 +138,7 @@ extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *); extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *); extern void mac_rx_clear(mac_client_handle_t); +extern void mac_rx_barrier(mac_client_handle_t); extern void mac_secondary_dup(mac_client_handle_t, mac_client_handle_t); extern void mac_secondary_cleanup(mac_client_handle_t); extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *, @@ -198,6 +201,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); extern void mac_client_set_rings(mac_client_handle_t, int, int); +extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 9b3b4fe369..21e8620121 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -24,7 +24,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_MAC_CLIENT_IMPL_H @@ -57,7 +57,7 @@ typedef struct mac_unicast_impl_s { /* Protected by */ uint16_t mui_vid; /* SL */ } mac_unicast_impl_t; -#define MAC_CLIENT_FLAGS_PRIMARY 0X0001 +#define MAC_CLIENT_FLAGS_PRIMARY 0x0001 #define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002 #define MAC_CLIENT_FLAGS_MULTI_PRIMARY 0x0004 #define MAC_CLIENT_FLAGS_PASSIVE_PRIMARY 0x0008 @@ -83,6 +83,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */ boolean_t mpi_no_phys; /* WO */ boolean_t mpi_strip_vlan_tag; /* WO */ boolean_t mpi_no_copy; /* WO */ + boolean_t mpi_do_fixups; /* WO */ } mac_promisc_impl_t; typedef union mac_tx_percpu_s { @@ -131,12 +132,17 @@ struct mac_client_impl_s { /* Protected by */ uint32_t mci_flags; /* SL */ krwlock_t mci_rw_lock; mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */ + /* * The mac_client_impl_t may be shared by multiple clients, i.e * multiple VLANs sharing the same MAC client. In this case the - * address/vid tubles differ and are each associated with their + * address/vid tuples differ and are each associated with their * own flow entry, but the rest underlying components SRS, etc, * are common. + * + * This is only needed to support sun4v vsw. There are several + * places in MAC we could simplify the code if we removed + * sun4v support. 
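mac_hw_emul(), added just above, pairs with the mac_emul_t flags documented in mac.h: the caller asks MAC to perform checksum and/or LSO work in software when the path the chain is headed to cannot rely on hardware offload. A sketch; the head/tail/count roles of the three mblk arguments are assumed from their types.

#include <sys/stream.h>
#include <sys/mac.h>
#include <sys/mac_client.h>

static mblk_t *
example_soften(mblk_t *chain)
{
        mblk_t *tail = NULL;
        uint_t cnt = 0;

        /* LSO emulation does not imply checksum emulation; request both. */
        mac_hw_emul(&chain, &tail, &cnt, MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
        return (chain);         /* NULL if the whole chain was dropped */
}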
*/ flow_entry_t *mci_flent_list; /* mci_rw_lock */ uint_t mci_nflents; /* mci_rw_lock */ @@ -313,6 +319,74 @@ extern int mac_tx_percpu_cnt; (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \ (mcip)->mci_nvids == 1) \ +/* + * MAC Client Implementation State (mci_state_flags) + * + * MCIS_IS_VNIC + * + * The client is a VNIC. + * + * MCIS_EXCLUSIVE + * + * The client has exclusive control over the MAC, such that it is + * the sole client of the MAC. + * + * MCIS_TAG_DISABLE + * + * MAC will not add VLAN tags to outgoing traffic. If this flag + * is set it is up to the client to add the correct VLAN tag. + * + * MCIS_STRIP_DISABLE + * + * MAC will not strip the VLAN tags on incoming traffic before + * passing it to mci_rx_fn. This only applies to non-bypass + * traffic. + * + * MCIS_IS_AGGR_PORT + * + * The client represents a port on an aggr. + * + * MCIS_CLIENT_POLL_CAPABLE + * + * The client is capable of polling the Rx TCP/UDP softrings. + * + * MCIS_DESC_LOGGED + * + * This flag is set when the client's link info has been logged + * by the mac_log_linkinfo() timer. This ensures that the + * client's link info is only logged once. + * + * MCIS_SHARE_BOUND + * + * This client has an HIO share bound to it. + * + * MCIS_DISABLE_TX_VID_CHECK + * + * MAC will not check the VID of the client's Tx traffic. + * + * MCIS_USE_DATALINK_NAME + * + * The client is using the same name as its underlying MAC. This + * happens when dlmgmtd is unreachable during client creation. + * + * MCIS_UNICAST_HW + * + * The client requires MAC address hardware classification. This + * is only used by sun4v vsw. + * + * MCIS_IS_AGGR_CLIENT + * + * The client sits atop an aggr. + * + * MCIS_RX_BYPASS_DISABLE + * + * Do not allow the client to enable DLS bypass. + * + * MCIS_NO_UNICAST_ADDR + * + * This client has no MAC unicast addresss associated with it. + * + */ /* MCI state flags */ #define MCIS_IS_VNIC 0x0001 #define MCIS_EXCLUSIVE 0x0002 @@ -325,7 +399,7 @@ extern int mac_tx_percpu_cnt; #define MCIS_DISABLE_TX_VID_CHECK 0x0100 #define MCIS_USE_DATALINK_NAME 0x0200 #define MCIS_UNICAST_HW 0x0400 -#define MCIS_IS_AGGR 0x0800 +#define MCIS_IS_AGGR_CLIENT 0x0800 #define MCIS_RX_BYPASS_DISABLE 0x1000 #define MCIS_NO_UNICAST_ADDR 0x2000 @@ -337,8 +411,7 @@ extern int mac_tx_percpu_cnt; extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); extern void mac_client_init(void); extern void mac_client_fini(void); -extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, - mac_client_impl_t *); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *); extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 6b409513a6..97b3fd685a 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -58,6 +58,9 @@ extern const mac_info_t *mac_info(mac_handle_t); extern boolean_t mac_info_get(const char *, mac_info_t *); extern boolean_t mac_promisc_get(mac_handle_t); +extern boolean_t mac_protect_check_addr(mac_client_handle_t, boolean_t, + in6_addr_t *); + extern int mac_start(mac_handle_t); extern void mac_stop(mac_handle_t); @@ -121,9 +124,17 @@ extern void mac_tx_client_quiesce(mac_client_handle_t); extern void mac_tx_client_condemn(mac_client_handle_t); extern void mac_tx_client_restart(mac_client_handle_t); extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern uint_t mac_hwrings_idx_get(mac_handle_t, uint_t, mac_group_handle_t *, + mac_ring_handle_t *, mac_ring_type_t); extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, mac_ring_handle_t *, mac_ring_type_t); extern uint_t mac_hwring_getinfo(mac_ring_handle_t); +extern void mac_hwring_set_passthru(mac_ring_handle_t, mac_rx_t, void *, + mac_resource_handle_t); +extern void mac_hwring_clear_passthru(mac_ring_handle_t); +extern void mac_client_set_flow_cb(mac_client_handle_t, mac_rx_t, void *); +extern void mac_client_clear_flow_cb(mac_client_handle_t); + extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t, mac_ring_handle_t); extern void mac_hwring_teardown(mac_ring_handle_t); @@ -131,6 +142,8 @@ extern int mac_hwring_disable_intr(mac_ring_handle_t); extern int mac_hwring_enable_intr(mac_ring_handle_t); extern int mac_hwring_start(mac_ring_handle_t); extern void mac_hwring_stop(mac_ring_handle_t); +extern int mac_hwring_activate(mac_ring_handle_t); +extern void mac_hwring_quiesce(mac_ring_handle_t); extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int); extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); extern int mac_hwring_getstat(mac_ring_handle_t, uint_t, uint64_t *); @@ -144,6 +157,13 @@ extern void mac_hwring_set_default(mac_handle_t, mac_ring_handle_t); extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *); extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *); +extern int mac_hwgroup_addvlan(mac_group_handle_t, uint16_t); +extern int mac_hwgroup_remvlan(mac_group_handle_t, uint16_t); + +extern boolean_t mac_has_hw_vlan(mac_handle_t); + +extern uint_t mac_get_num_rx_groups(mac_handle_t); +extern int mac_set_promisc(mac_handle_t, boolean_t); extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t, mac_resource_props_t *); @@ -171,6 +191,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); extern void *mac_get_devinfo(mac_handle_t); extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t); extern uint32_t mac_no_notification(mac_handle_t); extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h index e290ba7dbe..d37752ec23 100644 --- a/usr/src/uts/common/sys/mac_flow.h +++ b/usr/src/uts/common/sys/mac_flow.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. 
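The new passthru hooks let a consumer such as link aggregation receive a ring's traffic directly, bypassing normal MAC Rx processing. A sketch of taking and releasing a ring; the callback and its arguments are hypothetical, and the NULL resource handle is only a placeholder.

#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/mac_client_priv.h>

static void
example_ring_rx(void *arg, mac_resource_handle_t mrh, mblk_t *chain,
    boolean_t loopback)
{
        /* Consume the chain on behalf of the ring's new owner. */
        freemsgchain(chain);
}

static void
example_take_ring(mac_ring_handle_t ring)
{
        mac_hwring_set_passthru(ring, example_ring_rx, NULL, NULL);
}

static void
example_release_ring(mac_ring_handle_t ring)
{
        mac_hwring_clear_passthru(ring);
}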
*/ #ifndef _MAC_FLOW_H @@ -155,6 +155,14 @@ typedef enum { #define MPT_MAXIPADDR MPT_MAXCNT #define MPT_MAXCID MPT_MAXCNT #define MPT_MAXCIDLEN 256 +#define MPT_FALSE 0x00000000 +#define MPT_TRUE 0x00000001 + +/* Dynamic address detection types */ +#define MPT_DYN_DHCPV4 0x00000001 +#define MPT_DYN_DHCPV6 0x00000002 +#define MPT_DYN_SLAAC 0x00000004 +#define MPT_DYN_ALL 0x00000007 typedef struct mac_ipaddr_s { uint32_t ip_version; @@ -175,11 +183,13 @@ typedef struct mac_dhcpcid_s { } mac_dhcpcid_t; typedef struct mac_protect_s { - uint32_t mp_types; - uint32_t mp_ipaddrcnt; - mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; - uint32_t mp_cidcnt; - mac_dhcpcid_t mp_cids[MPT_MAXCID]; + uint32_t mp_types; /* Enabled protection types */ + uint32_t mp_ipaddrcnt; /* Count of allowed IPs */ + mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; /* Allowed IPs */ + uint32_t mp_cidcnt; /* Count of allowed DHCP CIDs */ + mac_dhcpcid_t mp_cids[MPT_MAXCID]; /* Allowed DHCP CIDs */ + uint32_t mp_allcids; /* Whether to allow all CIDs through */ + uint32_t mp_dynamic; /* Enabled dynamic address methods */ } mac_protect_t; /* The default priority for links */ diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 774c4fad9a..ce09304699 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_MAC_IMPL_H @@ -108,6 +108,7 @@ typedef struct mac_cb_info_s { kcondvar_t mcbi_cv; uint_t mcbi_del_cnt; /* Deleted callback cnt */ uint_t mcbi_walker_cnt; /* List walker count */ + uint_t mcbi_barrier_cnt; /* Barrier waiter count */ } mac_cb_info_t; typedef struct mac_notify_cb_s { @@ -123,40 +124,18 @@ typedef struct mac_notify_cb_s { */ typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -#define MAC_CALLBACK_WALKER_INC(mcbi) { \ - mutex_enter((mcbi)->mcbi_lockp); \ - (mcbi)->mcbi_walker_cnt++; \ - mutex_exit((mcbi)->mcbi_lockp); \ -} +#define MAC_CALLBACK_WALKER_INC(mcbi) \ + mac_callback_walker_enter(mcbi) -#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++; - -#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \ - mac_cb_t *rmlist; \ - \ - mutex_enter((mcbi)->mcbi_lockp); \ - if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \ - rmlist = mac_callback_walker_cleanup((mcbi), headp); \ - mac_callback_free(rmlist); \ - cv_broadcast(&(mcbi)->mcbi_cv); \ - } \ - mutex_exit((mcbi)->mcbi_lockp); \ -} +#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) \ + mac_callback_walker_exit(mcbi, headp, B_FALSE) -#define MAC_PROMISC_WALKER_INC(mip) \ - MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info) - -#define MAC_PROMISC_WALKER_DCR(mip) { \ - mac_cb_info_t *mcbi; \ - \ - mcbi = &(mip)->mi_promisc_cb_info; \ - mutex_enter(mcbi->mcbi_lockp); \ - if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \ - i_mac_promisc_walker_cleanup(mip); \ - cv_broadcast(&mcbi->mcbi_cv); \ - } \ - mutex_exit(mcbi->mcbi_lockp); \ -} +#define MAC_PROMISC_WALKER_INC(mip) \ + mac_callback_walker_enter(&(mip)->mi_promisc_cb_info) + +#define MAC_PROMISC_WALKER_DCR(mip) \ + mac_callback_walker_exit(&(mip)->mi_promisc_cb_info, \ + &(mip)->mi_promisc_list, B_TRUE) typedef struct mactype_s { const char *mt_ident; @@ -208,9 +187,18 @@ struct mac_ring_s { mac_ring_t *mr_next; /* next ring in the chain */ mac_group_handle_t mr_gh; /* reference to group */ - mac_classify_type_t 
mr_classify_type; /* HW vs SW */ + mac_classify_type_t mr_classify_type; struct mac_soft_ring_set_s *mr_srs; /* associated SRS */ - mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + + /* + * Ring passthru callback and arguments. See the + * MAC_PASSTHRU_CLASSIFIER comment in mac_provider.h. + */ + mac_rx_t mr_pt_fn; + void *mr_pt_arg1; + mac_resource_handle_t mr_pt_arg2; + uint_t mr_refcnt; /* Ring references */ /* ring generation no. to guard against drivers using stale rings */ uint64_t mr_gen_num; @@ -244,7 +232,7 @@ struct mac_ring_s { (mr)->mr_refcnt++; \ } -#define MR_REFRELE(mr) { \ +#define MR_REFRELE(mr) { \ mutex_enter(&(mr)->mr_lock); \ ASSERT((mr)->mr_refcnt != 0); \ (mr)->mr_refcnt--; \ @@ -255,8 +243,8 @@ struct mac_ring_s { } /* - * Per mac client flow information associated with a RX group. - * The entire structure is SL protected. + * Used to attach MAC clients to an Rx group. The members are SL + * protected. */ typedef struct mac_grp_client { struct mac_grp_client *mgc_next; @@ -270,15 +258,20 @@ typedef struct mac_grp_client { ((g)->mrg_clients->mgc_next == NULL)) ? \ (g)->mrg_clients->mgc_client : NULL) +#define MAC_GROUP_HW_VLAN(g) \ + (((g) != NULL) && \ + ((g)->mrg_info.mgi_addvlan != NULL) && \ + ((g)->mrg_info.mgi_remvlan != NULL)) + /* * Common ring group data structure for ring control and management. - * The entire structure is SL protected + * The entire structure is SL protected. */ struct mac_group_s { int mrg_index; /* index in the list */ mac_ring_type_t mrg_type; /* ring type */ mac_group_state_t mrg_state; /* state of the group */ - mac_group_t *mrg_next; /* next ring in the chain */ + mac_group_t *mrg_next; /* next group in the chain */ mac_handle_t mrg_mh; /* reference to MAC */ mac_ring_t *mrg_rings; /* grouped rings */ uint_t mrg_cur_count; /* actual size of group */ @@ -300,7 +293,7 @@ struct mac_group_s { mac_ring_handle_t mrh = rh; \ mac_impl_t *mimpl = (mac_impl_t *)mhp; \ /* \ - * Send packets through a selected tx ring, or through the \ + * Send packets through a selected tx ring, or through the \ * default handler if there is no selected ring. \ */ \ if (mrh == NULL) \ @@ -322,9 +315,9 @@ struct mac_group_s { #define MAC_TX(mip, rh, mp, src_mcip) { \ mac_ring_handle_t rhandle = (rh); \ /* \ - * If there is a bound Hybrid I/O share, send packets through \ + * If there is a bound Hybrid I/O share, send packets through \ * the default tx ring. (When there's a bound Hybrid I/O share, \ - * the tx rings of this client are mapped in the guest domain \ + * the tx rings of this client are mapped in the guest domain \ * and not accessible from here.) \ */ \ _NOTE(CONSTANTCONDITION) \ @@ -333,7 +326,7 @@ struct mac_group_s { if (mip->mi_promisc_list != NULL) \ mac_promisc_dispatch(mip, mp, src_mcip); \ /* \ - * Grab the proper transmit pointer and handle. Special \ + * Grab the proper transmit pointer and handle. 
Special \ * optimization: we can test mi_bridge_link itself atomically, \ * and if that indicates no bridge send packets through tx ring.\ */ \ @@ -360,17 +353,23 @@ typedef struct mac_mcast_addrs_s { } mac_mcast_addrs_t; typedef enum { - MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */ + MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* HW classification */ MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */ } mac_address_type_t; +typedef struct mac_vlan_s { + struct mac_vlan_s *mv_next; + uint16_t mv_vid; +} mac_vlan_t; + typedef struct mac_address_s { mac_address_type_t ma_type; /* address type */ - int ma_nusers; /* number of users */ - /* of that address */ + int ma_nusers; /* num users of addr */ struct mac_address_s *ma_next; /* next address */ uint8_t ma_addr[MAXMACADDRLEN]; /* address value */ size_t ma_len; /* address length */ + mac_vlan_t *ma_vlans; /* VLANs on this addr */ + boolean_t ma_untagged; /* accept untagged? */ mac_group_t *ma_group; /* asscociated group */ mac_impl_t *ma_mip; /* MAC handle */ } mac_address_t; @@ -487,7 +486,7 @@ struct mac_impl_s { mac_capab_led_t mi_led; /* - * MAC address list. SL protected. + * MAC address and VLAN lists. SL protected. */ mac_address_t *mi_addresses; @@ -654,6 +653,7 @@ struct mac_impl_s { #define MIS_LEGACY 0x0040 #define MIS_NO_ACTIVE 0x0080 #define MIS_POLL_DISABLE 0x0100 +#define MIS_IS_OVERLAY 0x0200 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -722,23 +722,35 @@ typedef struct mac_client_impl_s mac_client_impl_t; extern void mac_init(void); extern int mac_fini(void); +/* + * MAC packet/chain drop functions to aggregate all dropped-packet + * debugging to a single surface. + */ +/*PRINTFLIKE2*/ +extern void mac_drop_pkt(mblk_t *, const char *, ...) + __KPRINTFLIKE(2); + +/*PRINTFLIKE2*/ +extern void mac_drop_chain(mblk_t *, const char *, ...) 
+ __KPRINTFLIKE(2); + extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *, uint8_t *, ip6_frag_t **); extern mblk_t *mac_copymsgchain_cksum(mblk_t *); -extern mblk_t *mac_fix_cksum(mblk_t *); extern void mac_packet_print(mac_handle_t, mblk_t *); extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); extern void mac_tx_notify(mac_impl_t *); -extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern void mac_callback_remove_wait(mac_cb_info_t *); -extern void mac_callback_free(mac_cb_t *); -extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **); +extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_remove_wait(mac_cb_info_t *); +extern void mac_callback_barrier(mac_cb_info_t *); +extern void mac_callback_free(mac_cb_t *); +extern void mac_callback_walker_enter(mac_cb_info_t *); +extern void mac_callback_walker_exit(mac_cb_info_t *, mac_cb_t **, boolean_t); /* in mac_bcast.c */ extern void mac_bcast_init(void); @@ -759,6 +771,8 @@ extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t, */ extern int mac_group_addmac(mac_group_t *, const uint8_t *); extern int mac_group_remmac(mac_group_t *, const uint8_t *); +extern int mac_group_addvlan(mac_group_t *, uint16_t); +extern int mac_group_remvlan(mac_group_t *, uint16_t); extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *, mac_group_t *); extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); @@ -779,6 +793,7 @@ extern void mac_rx_switch_grp_to_sw(mac_group_t *); * MAC address functions are used internally by MAC layer. 
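The two drop helpers declared above centralize dropped-packet debugging, so within the MAC module a drop site records its reason instead of calling freemsg() directly. A brief sketch; the reason strings are only examples.

#include <sys/mac_impl.h>

static void
example_drop(mblk_t *mp, mblk_t *chain, uint16_t vid)
{
        mac_drop_pkt(mp, "malformed L2 header");
        mac_drop_chain(chain, "no client for VID %u", vid);
}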
*/ extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *); +extern mac_address_t *mac_find_macaddr_vlan(mac_impl_t *, uint8_t *, uint16_t); extern boolean_t mac_check_macaddr_shared(mac_address_t *); extern int mac_update_macaddr(mac_address_t *, uint8_t *); extern void mac_freshen_macaddr(mac_address_t *, uint8_t *); @@ -829,7 +844,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *); extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); -extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); extern void i_mac_share_alloc(mac_client_impl_t *); @@ -849,7 +864,6 @@ extern void mac_tx_client_block(mac_client_impl_t *); extern void mac_tx_client_unblock(mac_client_impl_t *); extern void mac_tx_invoke_callbacks(mac_client_impl_t *, mac_tx_cookie_t); extern int i_mac_promisc_set(mac_impl_t *, boolean_t); -extern void i_mac_promisc_walker_cleanup(mac_impl_t *); extern mactype_t *mactype_getplugin(const char *); extern void mac_addr_factory_init(mac_impl_t *); extern void mac_addr_factory_fini(mac_impl_t *); @@ -863,8 +877,9 @@ extern int mac_start_group(mac_group_t *); extern void mac_stop_group(mac_group_t *); extern int mac_start_ring(mac_ring_t *); extern void mac_stop_ring(mac_ring_t *); -extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *, boolean_t); -extern int mac_remove_macaddr(mac_address_t *); +extern int mac_add_macaddr_vlan(mac_impl_t *, mac_group_t *, uint8_t *, + uint16_t, boolean_t); +extern int mac_remove_macaddr_vlan(mac_address_t *, uint16_t); extern void mac_set_group_state(mac_group_t *, mac_group_state_t); extern void mac_group_add_client(mac_group_t *, mac_client_impl_t *); diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 4c91c03967..2dea3a4758 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. */ #ifndef _SYS_MAC_PROVIDER_H @@ -108,6 +108,7 @@ typedef enum { MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */ MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */ MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */ + MAC_CAPAB_OVERLAY = 0x00800000, /* boolean only, no data */ MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transciever_t */ MAC_CAPAB_LED = 0x02000000 /* data is mac_capab_led_t */ } mac_capab_t; @@ -242,16 +243,59 @@ typedef struct mac_callbacks_s { /* * Virtualization Capabilities */ + /* - * The ordering of entries below is important. MAC_HW_CLASSIFIER - * is the cutoff below which are entries which don't depend on - * H/W. MAC_HW_CLASSIFIER and entries after that are cases where - * H/W has been updated through add/modify/delete APIs. + * The type of ring classification. This is used by MAC to determine + * what, if any, processing it has to do upon receiving traffic on a + * particular Rx ring. + * + * MAC_NO_CLASSIFIER + * + * No classification has been set. No traffic should cross an Rx + * ring in this state. + * + * MAC_SW_CLASSIFIER + * + * The driver delivers traffic for multiple clients to this ring. 
+ * All traffic must be software classified by MAC to guarantee + * delivery to the correct client. This classification type may + * be chosen for several reasons. + * + * o The driver provides only one group and there are multiple + * clients using the MAC. + * + * o The driver provides some hardware filtering but not enough + * to fully classify the traffic. E.g., a VLAN VNIC requires L2 + * unicast address filtering as well as VLAN filtering, but + * some drivers may only support the former. + * + * o The ring belongs to the default group. The default group + * acts as a spillover for all clients that can't reserve an + * exclusive group. It also handles multicast traffic for all + * clients. For these reasons, the default group's rings are + * always software classified. + * + * MAC_HW_CLASSIFIER + * + * The driver delivers traffic for a single MAC client across + * this ring. With this guarantee, MAC can simply pass the + * traffic up the stack or even allow polling of the ring. + * + * MAC_PASSTHRU_CLASSIFIER + * + * The ring is in "passthru" mode. In this mode we bypass all of + * the typical MAC processing and pass the traffic directly to + * the mr_pt_fn callback, see mac_rx_common(). This is used in + * cases where there is another module acting as MAC provider on + * behalf of the driver. E.g., link aggregations use this mode to + * take full control of the port's rings; allowing it to enforce + * LACP protocols and aggregate rings across discrete drivers. */ typedef enum { MAC_NO_CLASSIFIER = 0, MAC_SW_CLASSIFIER, - MAC_HW_CLASSIFIER + MAC_HW_CLASSIFIER, + MAC_PASSTHRU_CLASSIFIER } mac_classify_type_t; typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *, @@ -281,6 +325,28 @@ } mac_ring_type_t; /* + * The value VLAN_ID_NONE (VID 0) means a client does not have + * membership to any VLAN. However, this statement is true for both + * untagged packets and priority tagged packets, leading to confusion + * over what semantic is intended. To the provider, VID 0 is a valid + * VID when priority tagging is in play. To MAC and everything above + * VLAN_ID_NONE almost universally implies untagged traffic. Thus, we + * convert VLAN_ID_NONE to a sentinel value (MAC_VLAN_UNTAGGED) at the + * border between MAC and MAC provider. This informs the provider that + * the client is interested in untagged traffic and the provider + * should set any relevant bits to receive such traffic. + * + * Currently, the API between MAC and the provider passes the VID as a + * uint16_t. In the future this could actually be the entire TCI mask + * (PCP, DEI, and VID). This current scheme is safe in that potential + * future world as well, as 0xFFFF is not a valid TCI (the 0xFFF VID + * is reserved and never transmitted across networks). + */ +#define MAC_VLAN_UNTAGGED UINT16_MAX +#define MAC_VLAN_UNTAGGED_VID(vid) \ + (((vid) == VLAN_ID_NONE) ? MAC_VLAN_UNTAGGED : (vid)) + +/* + * Grouping type of a ring group * * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped. @@ -342,6 +408,7 @@ typedef struct mac_ring_info_s { mac_ring_poll_t poll; } mrfunion; mac_ring_stat_t mri_stat; + /* * mri_flags will have some bits set to indicate some special * property/feature of a ring like serialization needed for a @@ -358,6 +425,8 @@ * #defines for mri_flags. The flags are temporary flags that are provided * only to workaround issues in specific drivers, and they will be * removed in the future.
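The sentinel conversion described above is a one-liner on the MAC side and a simple comparison on the provider side. A sketch, with both helper functions being hypothetical:

#include <sys/vlan.h>
#include <sys/mac_provider.h>

/* MAC side: translate a client VID before handing it to the provider. */
static uint16_t
example_translate_vid(uint16_t client_vid)
{
        return (MAC_VLAN_UNTAGGED_VID(client_vid));
}

/* Provider side: decide whether the filter should accept untagged frames. */
static boolean_t
example_wants_untagged(uint16_t vid)
{
        return (vid == MAC_VLAN_UNTAGGED);
}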
+ * + * These are consumed only by sun4v and neptune (nxge). */ #define MAC_RING_TX_SERIALIZE 0x1 #define MAC_RING_RX_ENQUEUE 0x2 @@ -366,6 +435,8 @@ typedef int (*mac_group_start_t)(mac_group_driver_t); typedef void (*mac_group_stop_t)(mac_group_driver_t); typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *); typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *); +typedef int (*mac_add_vlan_filter_t)(mac_group_driver_t, uint16_t); +typedef int (*mac_rem_vlan_filter_t)(mac_group_driver_t, uint16_t); struct mac_group_info_s { mac_group_driver_t mgi_driver; /* Driver reference */ @@ -374,9 +445,11 @@ struct mac_group_info_s { uint_t mgi_count; /* Count of rings */ mac_intr_t mgi_intr; /* Optional per-group intr */ - /* Only used for rx groups */ + /* Only used for Rx groups */ mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */ mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */ + mac_add_vlan_filter_t mgi_addvlan; /* Add a VLAN filter */ + mac_rem_vlan_filter_t mgi_remvlan; /* Remove a VLAN filter */ }; /* @@ -558,11 +631,12 @@ extern void mac_prop_info_set_range_uint32( extern void mac_prop_info_set_perm(mac_prop_info_handle_t, uint8_t); -extern void mac_hcksum_get(mblk_t *, uint32_t *, +extern void mac_hcksum_get(const mblk_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); extern void mac_hcksum_set(mblk_t *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); +extern void mac_hcksum_clone(const mblk_t *, mblk_t *); extern void mac_lso_get(mblk_t *, uint32_t *, uint32_t *); diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 0d49a2ff4d..65819c1209 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -340,6 +340,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index 88c98dc5a4..7196f7b3ac 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 6407534a3b..658f9f3f6b 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@ * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
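A driver that supports hardware VLAN filtering advertises it by filling the new mgi_addvlan/mgi_remvlan entry points when it describes an Rx group. The mydrv_* functions and the fill routine below are hypothetical; only the structure members and callback signatures come from the header.

#include <sys/mac_provider.h>

static int
mydrv_group_add_vlan(mac_group_driver_t gh, uint16_t vid)
{
        /* Program the group's hardware VLAN filter table for vid. */
        return (0);
}

static int
mydrv_group_rem_vlan(mac_group_driver_t gh, uint16_t vid)
{
        return (0);
}

static void
mydrv_fill_group(void *gdriver, struct mac_group_info_s *infop, int nrings)
{
        infop->mgi_driver = (mac_group_driver_t)gdriver;
        infop->mgi_count = nrings;
        infop->mgi_addvlan = mydrv_group_add_vlan;
        infop->mgi_remvlan = mydrv_group_rem_vlan;
        /* mgi_addmac, mgi_remmac, mgi_start, etc. are filled in as before. */
}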
*/ #ifndef _SYS_NETCONFIG_H diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..92bd5b897d 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_NETI_H @@ -46,6 +48,9 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VND_INET "NHF_VND_INET" +#define NHF_VND_INET6 "NHF_VND_INET6" +#define NHF_VIONA "NHF_VIONA" /* * Event identification @@ -61,7 +66,7 @@ struct msgb; /* avoiding sys/stream.h here */ /* * Network NIC hardware checksum capability */ -#define NET_HCK_NONE 0x00 +#define NET_HCK_NONE 0x00 #define NET_HCK_L3_FULL 0x01 #define NET_HCK_L3_PART 0x02 #define NET_HCK_L4_FULL 0x10 diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 7ee33318cd..b327e69fad 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -88,7 +88,8 @@ typedef id_t netstackid_t; #define NS_IPSECESP 16 #define NS_IPNET 17 #define NS_ILB 18 -#define NS_MAX (NS_ILB+1) +#define NS_VND 19 +#define NS_MAX (NS_VND+1) /* * State maintained for each module which tracks the state of diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_H +#define _SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVERLAY_IOC_CREATE OVERLAYIOC(1) +#define OVERLAY_IOC_DELETE OVERLAYIOC(2) +#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3) +#define OVERLAY_IOC_GETPROP OVERLAYIOC(4) +#define OVERLAY_IOC_SETPROP OVERLAYIOC(5) +#define OVERLAY_IOC_NPROPS OVERLAYIOC(6) +#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7) +#define OVERLAY_IOC_STATUS OVERLAYIOC(8) + +typedef struct overlay_ioc_create { + datalink_id_t oic_linkid; + uint32_t oic_filler; + uint64_t oic_vnetid; + char oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { + datalink_id_t oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { + datalink_id_t oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { + datalink_id_t oipn_linkid; + int32_t oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { + datalink_id_t oipi_linkid; + int32_t oipi_id; + char oipi_name[OVERLAY_PROP_NAMELEN]; + uint_t oipi_type; + uint_t oipi_prot; + uint8_t oipi_default[OVERLAY_PROP_SIZEMAX]; + uint32_t oipi_defsize; + uint32_t oipi_posssize; + uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { + datalink_id_t oip_linkid; + int32_t oip_id; + char oip_name[OVERLAY_PROP_NAMELEN]; + uint8_t oip_value[OVERLAY_PROP_SIZEMAX]; + uint32_t oip_size; +} overlay_ioc_prop_t; + +typedef enum overlay_status { + OVERLAY_I_OK = 0x00, + OVERLAY_I_DEGRADED = 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { + datalink_id_t ois_linkid; + uint_t ois_status; + char ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..d638096006 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define _SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { + OVERLAY_TARGET_NONE = 0x0, + OVERLAY_TARGET_POINT, + OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { + OVERLAY_PLUGIN_D_INVALID = 0x0, + OVERLAY_PLUGIN_D_ETHERNET = 0x1, + OVERLAY_PLUGIN_D_IP = 0x2, + OVERLAY_PLUGIN_D_PORT = 0x4, + OVERLAY_PLUGIN_D_MASK = 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { + OVERLAY_PROP_T_INT = 0x1, /* signed int */ + OVERLAY_PROP_T_UINT, /* unsigned int */ + OVERLAY_PROP_T_IP, /* sinaddr6 */ + OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { + OVERLAY_PROP_PERM_REQ = 0x1, + OVERLAY_PROP_PERM_READ = 0x2, + OVERLAY_PROP_PERM_WRITE = 0x4, + OVERLAY_PROP_PERM_RW = 0x6, + OVERLAY_PROP_PERM_RRW = 0x7, + OVERLAY_PROP_PERM_MASK = 0x7 +} overlay_prop_prot_t; + +#define OVERLAY_PROP_NAMELEN 64 +#define OVERLAY_PROP_SIZEMAX 256 +#define OVERLAY_STATUS_BUFLEN 256 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..7fb8b8da1d --- /dev/null +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define _SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION_ONE 0x1 + +typedef struct overlay_plugin { + kmutex_t ovp_mutex; + list_node_t ovp_link; /* overlay_plugin_lock */ + uint_t ovp_active; /* ovp_mutex */ + const char *ovp_name; /* RO */ + const overlay_plugin_ops_t *ovp_ops; /* RO */ + const char *const *ovp_props; /* RO */ + uint_t ovp_nprops; /* RO */ + uint_t ovp_id_size; /* RO */ + overlay_plugin_flags_t ovp_flags; /* RO */ + overlay_plugin_dest_t ovp_dest; /* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { + list_node_t omux_lnode; + ksocket_t omux_ksock; /* RO */ + overlay_plugin_t *omux_plugin; /* RO: associated encap */ + int omux_domain; /* RO: socket domain */ + int omux_family; /* RO: socket family */ + int omux_protocol; /* RO: socket protocol */ + struct sockaddr *omux_addr; /* RO: socket address */ + socklen_t omux_alen; /* RO: sockaddr len */ + kmutex_t omux_lock; /* Protects everything below */ + uint_t omux_count; /* Active instances */ + avl_tree_t omux_devices; /* Tree of devices */ +} overlay_mux_t; + +typedef enum overlay_target_flag { + OVERLAY_T_TEARDOWN = 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { + kmutex_t ott_lock; + kcondvar_t ott_cond; + overlay_target_mode_t ott_mode; /* RO */ + overlay_plugin_dest_t ott_dest; /* RO */ + uint64_t ott_id; /* RO */ + overlay_target_flag_t ott_flags; /* ott_lock */ + uint_t ott_ocount; /* ott_lock */ + union { /* ott_lock */ + overlay_target_point_t ott_point; + struct overlay_target_dyn { + refhash_t *ott_dhash; + avl_tree_t ott_tree; + } ott_dyn; + } ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { + OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */ + OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */ + OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */ + OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */ + OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */ + OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */ + OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */ + OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */ + OVERLAY_F_DEGRADED = 0x40, /* device is degraded */ + OVERLAY_F_MASK = 0x7f /* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { + kmutex_t odd_lock; + kcondvar_t odd_iowait; + list_node_t odd_link; /* overlay_dev_lock */ + mac_handle_t odd_mh; /* RO */ + overlay_plugin_t *odd_plugin; /* RO */ + datalink_id_t odd_linkid; /* RO */ + void *odd_pvoid; /* RO -- only used by plugin */ + uint_t odd_ref; /* protected by odd_lock */ + uint_t odd_mtu; /* protected by odd_lock */ + overlay_dev_flag_t odd_flags; /* protected by odd_lock */ + uint_t odd_rxcount; /* protected by odd_lock */ + uint_t odd_txcount; /* protected by odd_lock */ + overlay_mux_t *odd_mux; /* protected by odd_lock */ + uint64_t odd_vid; /* RO if active else odd_lock */ + avl_node_t odd_muxnode; /* managed by mux */ + overlay_target_t *odd_target; /* See big theory statement */ + char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ +} overlay_dev_t; + +typedef enum overlay_target_entry_flags { + OVERLAY_ENTRY_F_PENDING = 
0x01, /* lookup in progress */ + OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ + OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ + OVERLAY_ENTRY_F_VALID_MASK = 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { + kmutex_t ote_lock; + refhash_link_t ote_reflink; /* hashtable link */ + avl_node_t ote_avllink; /* iteration link */ + list_node_t ote_qlink; + overlay_target_entry_flags_t ote_flags; /* RW: state flags */ + uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ + overlay_target_t *ote_ott; /* RO */ + overlay_dev_t *ote_odd; /* RO */ + overlay_target_point_t ote_dest; /* RW: destination */ + mblk_t *ote_chead; /* RW: blocked mb chain head */ + mblk_t *ote_ctail; /* RW: blocked mb chain tail */ + size_t ote_mbsize; /* RW: outstanding mblk size */ + hrtime_t ote_vtime; /* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define OVERLAY_CTL "overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, + struct sockaddr *, socklen_t, int *); +extern void overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + +#define OVERLAY_TARGET_OK 0 +#define OVERLAY_TARGET_DROP 1 +#define OVERLAY_TARGET_ASYNC 2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, + socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..07efaa05df --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define _SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, we can move to make this a more stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. + * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), which it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If the mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY. In such cases, it should + * be returned as the return status of _fini(9E). This is necessary: it + * ensures that the module does not get unloaded out from under + * the broader subsystem while it's still in use. A driver can use that to + * know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * information for the callback routines. Here are the routines and their + * purposes. The full signatures are available below. + * + * overlay_plugin_init_t + * + * This interface is used to create a new instance of a plugin. An instance + * of a plugin will be created for each overlay device that is created. For + * example, if a device is created with VXLAN ID 23 and ID 42, then there + * will be two different calls to this function. + * + * This function gives the plugin a chance to create a private data + * structure that will be returned on subsequent calls to the system. + * + * overlay_plugin_fini_t + * + * This is the opposite of overlay_plugin_init_t. It will be called when it + * is safe to remove any private data that is associated with this instance + * of the plugin. + * + * overlay_plugin_propinfo_t + * + * This is called with the name of a property that was registered when the + * plugin was created and about which information is being requested. The plugin is + * responsible for filling out information such as setting the name, the + * type of property it is, the protection of the property (can a user + * update it?), whether the property is required, an optional default value + * for the property, and an optional set of values or ranges that are + * allowed.
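As a concrete illustration of the propinfo contract just described, a minimal sketch of a plugin's propinfo callback follows. This is an assumption-laden example: the property name "example/listen_port", its default of 4789, and its range are invented for illustration; only the overlay_prop_set_*() helpers, the overlay_prop_handle_t type, and the OVERLAY_PROP_* constants come from these headers.

#include <sys/systm.h>          /* strcmp */
#include <sys/errno.h>
#include <sys/overlay_plugin.h>

static int
example_propinfo(const char *name, overlay_prop_handle_t phdl)
{
        uint32_t defport = 4789;        /* hypothetical default */

        /* A real plugin would walk its ovep_props array here. */
        if (strcmp(name, "example/listen_port") != 0)
                return (EINVAL);

        overlay_prop_set_name(phdl, name);
        overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
        overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
        (void) overlay_prop_set_default(phdl, &defport, sizeof (defport));
        overlay_prop_set_range_uint32(phdl, 1, 65535);

        return (0);
}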
+ * + * overlay_plugin_getprop_t + * + * Return the value of the named property from the current instance of the + * plugin. + * + * overlay_plugin_setprop_t + * + * Set the value of the named property to the specified value for the + * current instance of the plugin. Note, that it is the plugin's + * responsibility to ensure that the value of the property is valid and to + * update state as appropriate. + * + * overlay_plugin_socket_t + * + * Every overlay device has a corresponding socket that it uses to send and + * receive traffic. This routine is used to get the parameters that should + * be used to define such a socket. The actual socket may be multiplexed + * with other uses of it. + * + * overlay_plugin_sockopt_t + * + * Allow a plugin to set any necessary socket options that it needs on the + * kernel socket that is being used by a mux. This will only be called once + * for a given mux, if additional devices are added to a mux, it will not + * be called additional times. + * + * overlay_plugin_encap_t + * + * In this routine you're given a message block and information about the + * packet, such as the identifier and are asked to fill out a message block + * that represents the encapsulation header and optionally manipulate the + * input message if required. + * + * overlay_plugin_decap_t + * + * In this routine, you're given the encapsulated message block. The + * requirement is to decapsulate it and determine what is the correct + * overlay identifier for this network and to fill in the header size so + * the broader system knows how much of this data should be considered + * consumed. + * + * ovpo_callbacks + * + * This should be set to zero, it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + * ovep_version + * + * Should be set to the value of the macro OVEP_VERSION. + * + * ovep_name + * + * Should be set to a character string that has the name of the module. + * Generally this should match the name of the kernel module; however, this + * is the name that users will use to refer to this module when creating + * devices. + * + * overlay_plugin_ops_t + * + * Should be set to the functions as described above. + * + * ovep_props + * + * This is an array of character strings that holds the names of the + * properties of the encapsulation plugin. + * + * + * ovep_id_size + * + * This is the size in bytes of the valid range for the identifier. The + * valid identifier range is considered a ovep_id_size byte unsigned + * integer, [ 0, 1 << (ovep_id_size * 8) ). + * + * ovep_flags + * + * A series of flags that indicate optional features that are supported. + * Valid flags include: + * + * OVEP_F_VLAN_TAG + * + * The encapsulation format allows for the encapsulated + * packet to maintain a VLAN tag. + * + * ovep_dest + * + * Describes the kind of destination that the overlay plugin supports for + * sending traffic. For example, vxlan uses UDP, therefore it requires both + * an IP address and a port; however, nvgre uses the gre header and + * therefore only requires an IP address. The following flags may be + * combined: + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * Indicates that to send a packet to its destination, we + * require a link-layer ethernet address. + * + * OVERLAY_PLUGIN_D_IP + * + * Indicates that to send a packet to its destination, we + * require an IP address. 
Note that all IP addresses are + * transmitted as IPv6 addresses and for an IPv4 + * destination, using an IPv4-mapped IPv6 address is the + * expected way to transmit that. + * + * OVERLAY_PLUGIN_D_PORT + * + * Indicates that to send a packet to its destination, a + * port is required; this usually indicates that the + * protocol uses something like TCP or UDP. + * + * + * --------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * --------------------------------------------------- + * + * Every instance of a given module is independent. The kernel may perform + * downcalls into different instances in parallel + * at any point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it is up to the module + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make is that calls to getprop and setprop will be + * serialized by a caller holding the MAC perimeter. + * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, e.g. the property-related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only locks that are safe for it to use are its own. + * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context; the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION 0x1 + +typedef enum overlay_plugin_flags { + OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's a 24- to 64-bit value. How should we future-proof + * ourselves here? + */ +typedef struct ovep_encap_info { + uint64_t ovdi_id; + size_t ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise.
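Tying the registration flow described at the top of this header to the definitions that follow, a hedged sketch of a plugin's _init(9E) and _fini(9E) might look like the following. Everything named "example" (the module name, the ops vector, and the property list) is hypothetical, and the error code chosen when allocation fails is an assumption rather than a requirement.

#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/overlay_plugin.h>

static const char *example_props[] = { "example/listen_port", NULL };

/* Hypothetical ops vector; the entry points themselves are elided. */
extern const overlay_plugin_ops_t example_plugin_ops;

static struct modlmisc example_modlmisc = {
        &mod_miscops, "example overlay encapsulation plugin"
};

static struct modlinkage example_modlinkage = {
        MODREV_1, &example_modlmisc, NULL
};

int
_init(void)
{
        int err;
        overlay_plugin_register_t *ovrp;

        ovrp = overlay_plugin_alloc(OVEP_VERSION);
        if (ovrp == NULL)
                return (ENOTSUP);

        ovrp->ovep_name = "example";
        ovrp->ovep_ops = &example_plugin_ops;
        ovrp->ovep_props = example_props;
        ovrp->ovep_id_size = 3;         /* e.g. a 24-bit virtual network id */
        ovrp->ovep_flags = OVEP_F_VLAN_TAG;
        ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;

        err = overlay_plugin_register(ovrp);
        if (err == 0) {
                err = mod_install(&example_modlinkage);
                if (err != 0)
                        (void) overlay_plugin_unregister("example");
        }

        /* Free the registration handle regardless of success or failure. */
        overlay_plugin_free(ovrp);
        return (err);
}

int
_fini(void)
{
        int err;

        /* If the plugin still has active instances, refuse to unload. */
        if ((err = overlay_plugin_unregister("example")) != 0)
                return (err);

        return (mod_remove(&example_modlinkage));
}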
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, + ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, + ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, + struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, + uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, + uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { + uint_t ovpo_callbacks; + overlay_plugin_init_t ovpo_init; + overlay_plugin_fini_t ovpo_fini; + overlay_plugin_encap_t ovpo_encap; + overlay_plugin_decap_t ovpo_decap; + overlay_plugin_socket_t ovpo_socket; + overlay_plugin_sockopt_t ovpo_sockopt; + overlay_plugin_getprop_t ovpo_getprop; + overlay_plugin_setprop_t ovpo_setprop; + overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { + uint_t ovep_version; + const char *ovep_name; + const overlay_plugin_ops_t *ovep_ops; + const char **ovep_props; + uint_t ovep_id_size; + uint_t ovep_flags; + uint_t ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t); +extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, + uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..ae92ef3532 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,293 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _OVERLAY_TARGET_H +#define _OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { + uint8_t otp_mac[ETHERADDRL]; + struct in6_addr otp_ip; + uint16_t otp_port; +} overlay_target_point_t; + +#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { + OVERLAY_TARG_INFO_F_ACTIVE = 0x01, + OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { + datalink_id_t oti_linkid; + uint32_t oti_needs; + uint64_t oti_flags; + uint64_t oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { + datalink_id_t ota_linkid; + uint32_t ota_mode; + uint64_t ota_id; + uint32_t ota_provides; + overlay_target_point_t ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { + datalink_id_t otd_linkid; + uint32_t otd_pad; + char otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. + */ +#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { + datalink_id_t otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that user land will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second of time before they will + * be sent back out to user land to give user land a chance to clean itself up + * or more generally, come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + * OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + * The overlay_targ_resp_t has the appropriate information from + * which a reply can be generated. The information is filled into + * an overlay_targ_point_t as appropriate based on the + * overlay_plugin_dest_t type. + * + * + * OVERLAY_TARG_DROP - overlay_targ_resp_t + * + * The overlay_targ_resp_t should identify a request for which to + * drop a packet. + * + * + * OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * The overlay_targ_pkt_t injects a fully formed packet into the + * virtual network. It may either be identified by its data link id + * or by the request id. If both are specified, the + * datalink id will be used. 
Note that an injection is not + * considered a reply, and if this corresponds to a request, then + * that individual packet must still be dropped. + * + * + * OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * This ioctl can be used to copy data from a given request into a + * user buffer. This can be used in combination with + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. + * + * + * OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * This ioctl is similar to OVERLAY_TARG_INJECT, except instead + * of receiving it on the local mac handle, it queues it for + * retransmission again. This is useful if you have a packet that + * was originally destined for some broadcast or multicast address + * that you now want to send to a unicast address. + */ +#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10) +#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11) +#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12) +#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13) +#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) +#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { + uint64_t otl_dlid; + uint64_t otl_reqid; + uint64_t otl_varpdid; + uint64_t otl_vnetid; + uint64_t otl_hdrsize; + uint64_t otl_pktsize; + uint8_t otl_srcaddr[ETHERADDRL]; + uint8_t otl_dstaddr[ETHERADDRL]; + uint32_t otl_dsttype; + uint32_t otl_sap; + int32_t otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { + uint64_t otr_reqid; + overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + void *otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +typedef struct overlay_targ_pkt32 { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + caddr32_t otp_buf; +} overlay_targ_pkt32_t; + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd, which is an implementation of libdladm, not to end up + * needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. + */ + +#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { + uint32_t otl_nents; + uint32_t otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * The overlay_targ_cache_t should have its link identifier and + * the desired mac address filled in. On return, it will fill in + * the otc_dest member, if the entry exists in the table. + * + * + * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * The cache table entry of the mac address referred to by otc_mac + * and otc_linkid will be filled in with the details provided in + * the otc_dest member. + * + * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * Removes the cache entry identified by otc_mac from the table. + * Note that this does not stop any in-flight lookups or deal with + * any data that is awaiting a lookup. + * + * + * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * entire table identified by otc_linkid. All other parameters are + * ignored.
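The lookup/response protocol sketched in the comments above might be consumed from userland roughly as follows. This is a hedged illustration only: the open(2) path comes from the /dev/overlay note at the top of this header, lookup_mac() is a made-up placeholder for whatever MAC-to-destination mapping the daemon maintains, and the exact behaviour of the ioctl when it returns without work is an assumption.

#include <sys/types.h>
#include <sys/overlay_target.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>    /* ioctl */
#include <strings.h>    /* bzero */

/* Hypothetical: map a destination MAC address to a backend IP/port. */
extern int lookup_mac(const uint8_t *, overlay_target_point_t *);

static void
serve_lookups(void)
{
        int fd = open("/dev/overlay", O_RDWR);

        if (fd < 0)
                return;

        for (;;) {
                overlay_targ_lookup_t otl;
                overlay_targ_resp_t otr;

                /*
                 * Sit in the kernel waiting for a request; per the comment
                 * above, the thread is pushed back out to userland after
                 * roughly a second even when there is no work.
                 */
                if (ioctl(fd, OVERLAY_TARG_LOOKUP, &otl) != 0)
                        continue;

                bzero(&otr, sizeof (otr));
                otr.otr_reqid = otl.otl_reqid;

                if (lookup_mac(otl.otl_dstaddr, &otr.otr_answer) == 0)
                        (void) ioctl(fd, OVERLAY_TARG_RESPOND, &otr);
                else
                        (void) ioctl(fd, OVERLAY_TARG_DROP, &otr);
        }
}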
+ * + * + * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * Iterates over the contents of a target cache identified by + * otci_linkid. Iteration is guaranteed to be exactly once for + * items which are in the hashtable at the beginning and end of + * iteration. For items which are added or removed after iteration + * has begun, only at most once semantics are guaranteed. Consumers + * should ensure that otci_marker is zeroed before starting + * iteration and should preserve its contents across calls. + * + * Before calling in, otci_count should be set to the number of + * entries that space has been allocated for in otci_ents. The + * value will be updated to indicate the total number written out. + */ + +#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30) +#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31) +#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) +#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) +#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define OVERLAY_TARGET_ITER_MAX 500 + +#define OVERLAY_TARGET_CACHE_DROP 0x01 + +typedef struct overlay_targ_cache_entry { + uint8_t otce_mac[ETHERADDRL]; + uint16_t otce_flags; + overlay_target_point_t otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { + datalink_id_t otc_linkid; + overlay_targ_cache_entry_t otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { + datalink_id_t otci_linkid; + uint32_t otci_pad; + uint64_t otci_marker; + uint16_t otci_count; + uint8_t otci_pad2[3]; + overlay_targ_cache_entry_t otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index 282d84b912..66bd91f76f 100644 --- a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -116,7 +116,7 @@ extern "C" { #define DEFAULT_MAXPID 999999 #define DEFAULT_JUMPPID 100000 #else -#define DEFAULT_MAXPID 30000 +#define DEFAULT_MAXPID 99999 #define DEFAULT_JUMPPID 0 #endif diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index 1269aeca10..587a51f0aa 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_PATTR_H @@ -106,6 +107,25 @@ typedef struct pattr_hcksum_s { #define HW_LSO_FLAGS HW_LSO /* All LSO flags, currently only one */ /* + * The packet originates from a MAC on the same machine as the + * receiving MAC. There are two ways this can happen. + * + * 1. MAC loopback: When a packet is destined for a MAC client on the + * same MAC as the sender. This datapath is taken in + * max_tx_send(). + * + * 2. Bridge Fwd: When a packet is destined for a MAC client on the + * same bridge as the sender. This datapath is taken in + * bridge_forward(). + * + * Presented with this flag, a receiver can then decide whether or not + * it needs to emulate some or all of the HW offloads that the NIC + * would have performed otherwise -- or whether it should accept the + * packet as-is. 
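As a hedged sketch of how a receive path might consume the HW_LOCAL_MAC flag defined just below: assuming, as with the other offload flags in this header, that the flag travels in the mblk's checksum attributes and can be read with the DB_CKSUMFLAGS() accessor from <sys/strsubr.h>, a receiver might gate its offload-emulation decision like this. The emulate_offloads() call is a placeholder for the consumer's own software checksum/LSO handling, not a real DDI routine.

#include <sys/stream.h>
#include <sys/strsubr.h>        /* DB_CKSUMFLAGS() */
#include <sys/pattr.h>

/* Placeholder for whatever software checksum/LSO emulation the consumer has. */
extern void emulate_offloads(mblk_t *);

static void
maybe_emulate_offloads(mblk_t *mp)
{
        /*
         * If the packet came from a MAC client on this machine, the NIC never
         * saw it, so any checksum or LSO work it was tagged for must be done
         * in software before the payload is trusted.
         */
        if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0)
                emulate_offloads(mp);
}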
+ */ +#define HW_LOCAL_MAC 0x100 + +/* * Structure used for zerocopy attribute. */ typedef struct pattr_zcopy_s { diff --git a/usr/src/uts/common/sys/pci.h b/usr/src/uts/common/sys/pci.h index 66ce71bcc2..d62d19c3a5 100644 --- a/usr/src/uts/common/sys/pci.h +++ b/usr/src/uts/common/sys/pci.h @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_PCI_H @@ -168,6 +168,7 @@ extern "C" { /* * PCI status register bits */ +#define PCI_STAT_READY 0x1 /* Immediate Readiness */ #define PCI_STAT_INTR 0x8 /* Interrupt state */ #define PCI_STAT_CAP 0x10 /* Implements Capabilities */ #define PCI_STAT_66MHZ 0x20 /* 66 MHz capable */ @@ -928,6 +929,8 @@ typedef struct pcix_attr { #define PCI_MSI_MME_SHIFT 0x4 /* Shift for MME bits */ #define PCI_MSI_64BIT_MASK 0x0080 /* 64bit support mask in MSI ctrl reg */ #define PCI_MSI_PVM_MASK 0x0100 /* PVM support mask in MSI ctrl reg */ +#define PCI_MSI_EMD_MASK 0x0200 /* EMD Capable Mask */ +#define PCI_MSI_EMD_ENABLE 0x0400 /* EMD Enable bit */ /* * PCI Extended Message Signalled Interrupts (MSI-X) capability entry offsets diff --git a/usr/src/uts/common/sys/pci_cap.h b/usr/src/uts/common/sys/pci_cap.h index 730e10d77b..9804913241 100644 --- a/usr/src/uts/common/sys/pci_cap.h +++ b/usr/src/uts/common/sys/pci_cap.h @@ -82,12 +82,12 @@ typedef enum { #define PCI_CAP_GET32(h, i, b, o) ((uint32_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_32, i, b, o)) -#define PCI_CAP_PUT8(h, i, b, o, d) ((uint8_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d)) -#define PCI_CAP_PUT16(h, i, b, o, d) ((uint16_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d)) -#define PCI_CAP_PUT32(h, i, b, o, d) ((uint32_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d)) +#define PCI_CAP_PUT8(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d) +#define PCI_CAP_PUT16(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d) +#define PCI_CAP_PUT32(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d) #define PCI_XCAP_GET8(h, i, b, o) ((uint8_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o)) @@ -96,12 +96,12 @@ typedef enum { #define PCI_XCAP_GET32(h, i, b, o) ((uint32_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o)) -#define PCI_XCAP_PUT8(h, i, b, o, d) ((uint8_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define PCI_XCAP_PUT16(h, i, b, o, d) ((uint16_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define PCI_XCAP_PUT32(h, i, b, o, d) ((uint32_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d)) +#define PCI_XCAP_PUT8(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d) +#define PCI_XCAP_PUT16(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d) +#define PCI_XCAP_PUT32(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d) extern int pci_cap_probe(ddi_acc_handle_t h, uint16_t index, diff --git a/usr/src/uts/common/sys/pcie.h b/usr/src/uts/common/sys/pcie.h index 05b70a56fa..a26729c523 100644 --- a/usr/src/uts/common/sys/pcie.h +++ b/usr/src/uts/common/sys/pcie.h @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2019, Joyent, Inc. 
+ */ #ifndef _SYS_PCIE_H #define _SYS_PCIE_H @@ -49,6 +52,7 @@ extern "C" { #define PCIE_SLOTCTL 0x18 /* Slot Control */ #define PCIE_SLOTSTS 0x1A /* Slot Status */ #define PCIE_ROOTCTL 0x1C /* Root Control */ +#define PCIE_ROOTCAP 0x1E /* Root Capabilities */ #define PCIE_ROOTSTS 0x20 /* Root Status */ #define PCIE_DEVCAP2 0x24 /* Device Capability 2 */ #define PCIE_DEVCTL2 0x28 /* Device Control 2 */ @@ -125,6 +129,9 @@ extern "C" { #define PCIE_DEVCAP_EP_L1_LAT_MAX 0x1C0 /* > 64 us */ #define PCIE_DEVCAP_EP_L1_LAT_MASK 0x700 /* EP L1 Accetable Latency */ +/* + * As of PCIe 2.x these three bits are now undefined. + */ #define PCIE_DEVCAP_ATTN_BUTTON 0x1000 /* Attention Button Present */ #define PCIE_DEVCAP_ATTN_INDICATOR 0x2000 /* Attn Indicator Present */ #define PCIE_DEVCAP_PWR_INDICATOR 0x4000 /* Power Indicator Present */ @@ -140,6 +147,8 @@ extern "C" { #define PCIE_DEVCAP_PLMT_SCL_1_BY_1000 0xC000000 /* 0.001x Scale */ #define PCIE_DEVCAP_PLMT_SCL_MASK 0xC000000 /* Power Limit Scale */ +#define PCIE_DEVCAP_FLR 0x10000000 /* Function Level Reset */ + /* * Device Control Register (2 bytes) */ @@ -174,6 +183,9 @@ extern "C" { #define PCIE_DEVCTL_MAX_READ_REQ_MASK 0x7000 /* Max_Read_Request_Size */ #define PCIE_DEVCTL_MAX_READ_REQ_SHIFT 0xC +#define PCIE_DEVCTL_BRIDGE_RETRY 0x8000 /* Bridge can return CRS */ +#define PCIE_DEVCTL_INITIATE_FLR 0x8000 /* Start Function Level Reset */ + /* * Device Status Register (2 bytes) */ @@ -183,11 +195,20 @@ extern "C" { #define PCIE_DEVSTS_UR_DETECTED 0x8 /* Unsupported Req Detected */ #define PCIE_DEVSTS_AUX_POWER 0x10 /* AUX Power Detected */ #define PCIE_DEVSTS_TRANS_PENDING 0x20 /* Transactions Pending */ +#define PCIE_DEVSTS_EPR_DETECTED 0x40 /* Emergency Power Reduction */ /* * Link Capability Register (4 bytes) */ -#define PCIE_LINKCAP_MAX_SPEED_2_5 0x1 /* 2.5 Gb/s Speed */ +#define PCIE_LINKCAP_MAX_SPEED_2_5 0x1 /* 2.5 GT/s Speed */ +/* + * In version 2 of PCI express, this indicated that both 5.0 GT/s and 2.5 GT/s + * speeds were supported. The use of this as the maximum link speed was added + * with PCIex v3. + */ +#define PCIE_LINKCAP_MAX_SPEED_5 0x2 /* 5.0 GT/s Speed */ +#define PCIE_LINKCAP_MAX_SPEED_8 0x3 /* 8.0 GT/s Speed */ +#define PCIE_LINKCAP_MAX_SPEED_16 0x4 /* 16.0 GT/s Speed */ #define PCIE_LINKCAP_MAX_SPEED_MASK 0xF /* Maximum Link Speed */ #define PCIE_LINKCAP_MAX_WIDTH_X1 0x010 #define PCIE_LINKCAP_MAX_WIDTH_X2 0x020 @@ -199,6 +220,7 @@ extern "C" { #define PCIE_LINKCAP_MAX_WIDTH_MASK 0x3f0 /* Maximum Link Width */ #define PCIE_LINKCAP_ASPM_SUP_L0S 0x400 /* L0s Entry Supported */ +#define PCIE_LINKCAP_ASPM_SUP_L1 0x800 /* L1 Entry Supported */ #define PCIE_LINKCAP_ASPM_SUP_L0S_L1 0xC00 /* L0s abd L1 Supported */ #define PCIE_LINKCAP_ASPM_SUP_MASK 0xC00 /* ASPM Support */ @@ -222,9 +244,12 @@ extern "C" { #define PCIE_LINKCAP_L1_EXIT_LAT_MAX 0x38000 /* > 64 us */ #define PCIE_LINKCAP_L1_EXIT_LAT_MASK 0x38000 /* L1 Exit Latency */ -/* PCIe v1.1 spec based */ +#define PCIE_LINKCAP_CLOCK_POWER_MGMT 0x40000 /* Clock Power Management */ +#define PCIE_LINKCAP_SDER_CAP 0x80000 /* Surprise Down Err report */ #define PCIE_LINKCAP_DLL_ACTIVE_REP_CAPABLE 0x100000 /* DLL Active */ /* Capable bit */ +#define PCIE_LINKCAP_LINK_BW_NOTIFY_CAP 0x200000 /* Link Bandwidth Notify Cap */ +#define PCIE_LINKCAP_ASPM_OPTIONAL 0x400000 /* ASPM Opt. Comp. 
*/ #define PCIE_LINKCAP_PORT_NUMBER 0xFF000000 /* Port Number */ #define PCIE_LINKCAP_PORT_NUMBER_SHIFT 24 /* Port Number Shift */ @@ -247,11 +272,23 @@ extern "C" { #define PCIE_LINKCTL_RETRAIN_LINK 0x20 /* Retrain Link */ #define PCIE_LINKCTL_COMMON_CLK_CFG 0x40 /* Common Clock Configuration */ #define PCIE_LINKCTL_EXT_SYNCH 0x80 /* Extended Synch */ +#define PCIE_LINKCTL_CLOCK_POWER_MGMT 0x100 /* Enable Clock Power Mgmt. */ +#define PCIE_LINKCTL_HW_WIDTH_DISABLE 0x200 /* hw auto width disable */ +#define PCIE_LINKCTL_LINK_BW_INTR_EN 0x400 /* Link bw mgmt intr */ +#define PCIE_LINKCTL_LINK_AUTO_BW_INTR_EN 0x800 /* Auto bw intr */ + +#define PCI_LINKCTRL_DRS_SIG_CTRL_NO_REP 0x00 +#define PCI_LINKCTRL_DRS_SIG_CTRL_IE 0x4000 +#define PCI_LINKCTRL_DRS_SIG_CTRL_DRS_FRS 0x8000 +#define PCIE_LINKCTL_DRS_SIG_CTRL_MASK 0xC000 /* DRS Signaling Control */ /* * Link Status Register (2 bytes) */ -#define PCIE_LINKSTS_SPEED_2_5 0x1 /* Link Speed */ +#define PCIE_LINKSTS_SPEED_2_5 0x1 /* 2.5 GT/s Link Speed */ +#define PCIE_LINKSTS_SPEED_5 0x2 /* 5.0 GT/s Link Speed */ +#define PCIE_LINKSTS_SPEED_8 0x3 /* 8.0 GT/s Link Speed */ +#define PCIE_LINKSTS_SPEED_16 0x4 /* 16.0 GT/s Link Speed */ #define PCIE_LINKSTS_SPEED_MASK 0xF /* Link Speed */ #define PCIE_LINKSTS_NEG_WIDTH_X1 0x010 @@ -263,12 +300,13 @@ extern "C" { #define PCIE_LINKSTS_NEG_WIDTH_X32 0x200 #define PCIE_LINKSTS_NEG_WIDTH_MASK 0x3F0 /* Negotiated Link Width */ +/* This bit is undefined as of PCIe 2.x */ #define PCIE_LINKSTS_TRAINING_ERROR 0x400 /* Training Error */ #define PCIE_LINKSTS_LINK_TRAINING 0x800 /* Link Training */ #define PCIE_LINKSTS_SLOT_CLK_CFG 0x1000 /* Slot Clock Configuration */ - -/* PCIe v1.1 spec based */ #define PCIE_LINKSTS_DLL_LINK_ACTIVE 0x2000 /* DLL Link Active */ +#define PCIE_LINKSTS_LINK_BW_MGMT 0x4000 /* Link bw mgmt status */ +#define PCIE_LINKSTS_AUTO_BW 0x8000 /* Link auto BW status */ /* * Slot Capability Register (4 bytes) @@ -311,6 +349,7 @@ extern "C" { #define PCIE_SLOTCTL_PWR_CONTROL 0x0400 /* Power controller Control */ #define PCIE_SLOTCTL_EMI_LOCK_CONTROL 0x0800 /* EMI Lock control */ #define PCIE_SLOTCTL_DLL_STATE_EN 0x1000 /* DLL State Changed En */ +#define PCIE_SLOTCTL_AUTO_SLOT_PL_DIS 0x2000 /* Auto Slot Power Limit Dis */ #define PCIE_SLOTCTL_ATTN_INDICATOR_MASK 0x00C0 /* Attn Indicator mask */ #define PCIE_SLOTCTL_PWR_INDICATOR_MASK 0x0300 /* Power Indicator mask */ #define PCIE_SLOTCTL_INTR_MASK 0x103f /* Supported intr mask */ @@ -354,6 +393,12 @@ extern "C" { #define PCIE_ROOTCTL_SYS_ERR_ON_NFE_EN 0x2 /* Sys Err on NF Err Enable */ #define PCIE_ROOTCTL_SYS_ERR_ON_FE_EN 0x4 /* Sys Err on Fatal Err En */ #define PCIE_ROOTCTL_PME_INTERRUPT_EN 0x8 /* PME Interrupt Enable */ +#define PCIE_ROOTCTL_CRS_SW_VIS_EN 0x10 /* CRS SW Visibility EN */ + +/* + * Root Capabilities register (2 bytes) + */ +#define PCIE_ROOTCAP_CRS_SW_VIS 0x01 /* CRS SW Visible */ /* * Root Status Register (4 bytes) @@ -378,14 +423,25 @@ extern "C" { #define PCIE_DEVCAP2_LTR_MECH 0x800 #define PCIE_DEVCAP2_TPH_COMP_SHIFT 12 #define PCIE_DEVCAP2_TPH_COMP_MASK 0x3 +#define PCIE_DEVCAP2_LNSYS_CLS_SHIFT 14 +#define PCIE_DEVCAP2_LNSYS_CLS_MASK 0x3 +#define PCIE_DEVCAP2_10B_TAG_COMP_SUP 0x10000 +#define PCIE_DEVCAP2_10B_TAG_REQ_SUP 0x20000 +#define PCIE_DEVCAP2_OBFF_SHIFT 18 +#define PCIE_DEVCAP2_OBFF_MASK 0x3 #define PCIE_DEVCAP2_EXT_FMT_FIELD 0x100000 #define PCIE_DEVCAP2_END_END_TLP_PREFIX 0x200000 #define PCIE_DEVCAP2_MAX_END_END_SHIFT 22 #define PCIE_DEVCAP2_MAX_END_END_MASK 0x3 +#define PCIE_DEVCAP2_EPR_SUP_SHIFT 24 
+#define PCIE_DEVCAP2_EPR_SUP_MASK 0x3 +#define PCIE_DEVCAP2_EPR_INIT_REQ 0x4000000 +#define PCIE_DEVCAP2_FRS_SUP 0x80000000 /* * Device Control 2 Register (2 bytes) */ +#define PCIE_DEVCTL2_COM_TO_RANGE_MASK 0xf #define PCIE_DEVCTL2_COM_TO_RANGE_0 0x0 #define PCIE_DEVCTL2_COM_TO_RANGE_1 0x1 #define PCIE_DEVCTL2_COM_TO_RANGE_2 0x2 @@ -402,11 +458,65 @@ extern "C" { #define PCIE_DEVCTL2_IDO_REQ_EN 0x100 #define PCIE_DEVCTL2_IDO_COMPL_EN 0x200 #define PCIE_DEVCTL2_LTR_MECH_EN 0x400 +#define PCIE_DEVCTL2_EPR_REQ 0x800 +#define PCIE_DEVCTL2_10BTAG_REQ_EN 0x1000 +#define PCIE_DEVCTL2_OBFF_MASK 0x6000 +#define PCIE_DEVCTL2_OBFF_DISABLE 0x0000 +#define PCIE_DEVCTL2_OBFF_EN_VARA 0x2000 +#define PCIE_DEVCTL2_OBFF_EN_VARB 0x4000 +#define PCIE_DEVCTL2_OBFF_EN_WAKE 0x6000 #define PCIE_DEVCTL2_END_END_TLP_PREFIX 0x8000 - - +/* + * Link Capability 2 Register (4 bytes) + */ +#define PCIE_LINKCAP2_SPEED_2_5 0x02 +#define PCIE_LINKCAP2_SPEED_5 0x04 +#define PCIE_LINKCAP2_SPEED_8 0x08 +#define PCIE_LINKCAP2_SPEED_16 0x10 +#define PCIE_LINKCAP2_SPEED_MASK 0xfe +#define PCIE_LINKCAP2_CROSSLINK 0x100 +#define PCIE_LINKCAP2_LSKP_OSGSS_MASK 0xfe00 +#define PCIE_LINKCAP2_LKSP_OSGSS_2_5 0x0200 +#define PCIE_LINKCAP2_LKSP_OSGSS_5 0x0400 +#define PCIE_LINKCAP2_LKSP_OSGSS_8 0x0800 +#define PCIE_LINKCAP2_LKSP_OSGSS_16 0x1000 +#define PCIE_LINKCAP2_LKSP_OSRSS_MASK 0x7f0000 +#define PCIE_LINKCAP2_LKSP_OSRSS_2_5 0x010000 +#define PCIE_LINKCAP2_LKSP_OSRSS_5 0x020000 +#define PCIE_LINKCAP2_LKSP_OSRSS_8 0x040000 +#define PCIE_LINKCAP2_LKSP_OSRSS_16 0x080000 +#define PCIE_LINKCAP2_RTPD_SUP 0x800000 +#define PCIE_LINKCAP2_TRTPD_SUP 0x01000000 +#define PCIE_LINKCAP2_DRS 0x80000000 + +/* + * Link Control 2 Register (2 bytes) + */ +#define PCIE_LINKCTL2_TARGET_SPEED_MASK 0x000f +#define PICE_LINKCTL2_ENTER_COMPLIANCE 0x0010 +#define PCIE_LINKCTL2_HW_AUTO_SPEED_DIS 0x0020 +#define PCIE_LINKCTL2_SELECT_DEEMPH 0x0040 +#define PCIE_LINKCTL2_TX_MARGIN_MASK 0x0380 +#define PCIE_LINKCTL2_ENTER_MOD_COMP 0x0400 +#define PCIE_LINKCTL2_COMP_SOS 0x0800 +#define PCIE_LINKCTL2_COMP_DEEMPM_MASK 0xf000 + +/* + * Link Status 2 Register (2 bytes) + */ +#define PCIE_LINKSTS2_CUR_DEEMPH 0x0001 +#define PCIE_LINKSTS2_EQ8GT_COMP 0x0002 +#define PCIE_LINKSTS2_EQ8GT_P1_SUC 0x0004 +#define PCIE_LINKSTS2_EQ8GT_P2_SUC 0x0008 +#define PCIE_LINKSTS2_EQ8GT_P3_SUC 0x0010 +#define PCIE_LINKSTS2_LINK_EQ_REQ 0x0020 +#define PCIE_LINKSTS2_RETIMER_PRES_DET 0x0040 +#define PCIE_LINKSTS2_2RETIMER_PRES_DET 0x0080 +#define PCIE_LINKSTS2_XLINK_RES 0x0300 +#define PCIE_LINKSTS2_DS_COMP_PRES_MASK 0x7000 +#define PCIE_LINKSTS2_DRS_MSG_RX 0x8000 /* * PCI-Express Enhanced Capabilities Link Entry Bit Offsets @@ -441,6 +551,28 @@ extern "C" { #define PCIE_EXT_CAP_ID_ACS 0xD /* Access Control Services */ #define PCIE_EXT_CAP_ID_ARI 0xE /* Alternative Routing ID */ #define PCIE_EXT_CAP_ID_ATS 0xF /* Address Translation Svcs */ +#define PCIE_EXT_CAP_ID_SRIOV 0x10 /* Single Root I/O Virt. */ +#define PCIE_EXT_CAP_ID_MRIOV 0x11 /* Multi Root I/O Virt. 
*/ +#define PCIE_EXT_CAP_ID_MULTICAST 0x12 /* Multicast Services */ +#define PCIE_EXT_CAP_ID_EA 0x14 /* Enhanced Allocation */ +#define PCIE_EXT_CAP_ID_RESIZE_BAR 0x15 /* Resizable BAR */ +#define PCIE_EXT_CAP_ID_DPA 0x16 /* Dynamic Power Allocation */ +#define PCIE_EXT_CAP_ID_TPH_REQ 0x17 /* TPH Requester */ +#define PCIE_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Report */ +#define PCIE_EXT_CAP_ID_PCIE2 0x19 /* PCI Express Capability 2 */ +#define PCIE_EXT_CAP_ID_PASID 0x1B /* PASID */ +#define PCIE_EXT_CAP_ID_LNR 0x1C /* LNR */ +#define PCIE_EXT_CAP_ID_DPC 0x1D /* DPC */ +#define PCIE_EXT_CAP_ID_L1PM 0x1E /* L1 PM Substrates */ +#define PCIE_EXT_CAP_ID_PTM 0x1F /* Precision Time Management */ +#define PCIE_EXT_CAP_ID_FRS 0x21 /* Function Ready Stat. Queue */ +#define PCIE_EXT_CAP_ID_RTR 0x22 /* Readiness Time Reporting */ +#define PCIE_EXT_CAP_ID_DVS 0x23 /* Designated Vendor-Specific */ +#define PCIE_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ +#define PCIE_EXT_CAP_ID_PL16GTE 0x26 /* Physical Layer 16.0 GT/s */ +#define PCIE_EXT_CAP_ID_LANE_MARGIN 0x27 /* Lane Margining */ +#define PCIE_EXT_CAP_ID_HIEARCHY_ID 0x28 /* Hierarchy ID */ +#define PCIE_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Mgmt */ /* * PCI-Express Advanced Error Reporting Extended Capability Offsets @@ -545,10 +677,10 @@ extern "C" { * AER Secondary Uncorrectable Error Register */ #define PCIE_AER_SUCE_TA_ON_SC 0x1 /* Target Abort on Split Comp */ -#define PCIE_AER_SUCE_MA_ON_SC 0x2 /* Master Abort on Split Comp */ +#define PCIE_AER_SUCE_MA_ON_SC 0x2 /* Master Abort on Split Comp */ #define PCIE_AER_SUCE_RCVD_TA 0x4 /* Received Target Abort */ -#define PCIE_AER_SUCE_RCVD_MA 0x8 /* Received Master Abort */ -#define PCIE_AER_SUCE_USC_ERR 0x20 /* Unexpected Split Comp Err */ +#define PCIE_AER_SUCE_RCVD_MA 0x8 /* Received Master Abort */ +#define PCIE_AER_SUCE_USC_ERR 0x20 /* Unexpected Split Comp Err */ #define PCIE_AER_SUCE_USC_MSG_DATA_ERR 0x40 /* USC Message Data Error */ #define PCIE_AER_SUCE_UC_DATA_ERR 0x80 /* Uncorrectable Data Error */ #define PCIE_AER_SUCE_UC_ATTR_ERR 0x100 /* UC Attribute Err */ diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index 1f08fad51d..faebc9d020 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019, Joyent, Inc. */ /* @@ -285,6 +286,29 @@ typedef struct pf_root_fault { typedef struct pf_data pf_data_t; +typedef enum pcie_link_width { + PCIE_LINK_WIDTH_UNKNOWN, + PCIE_LINK_WIDTH_X1, + PCIE_LINK_WIDTH_X2, + PCIE_LINK_WIDTH_X4, + PCIE_LINK_WIDTH_X8, + PCIE_LINK_WIDTH_X12, + PCIE_LINK_WIDTH_X16, + PCIE_LINK_WIDTH_X32 +} pcie_link_width_t; + +/* + * Note, this member should always be treated as a bit field, as a device may + * support multiple speeds. + */ +typedef enum pcie_link_speed { + PCIE_LINK_SPEED_UNKNOWN = 0x00, + PCIE_LINK_SPEED_2_5 = 0x01, + PCIE_LINK_SPEED_5 = 0x02, + PCIE_LINK_SPEED_8 = 0x04, + PCIE_LINK_SPEED_16 = 0x08 +} pcie_link_speed_t; + /* * For hot plugged device, these data are init'ed during during probe * For non-hotplugged device, these data are init'ed in pci_autoconfig (on x86), @@ -339,6 +363,15 @@ typedef struct pcie_bus { /* workaround for PCI/PCI-X devs behind PCIe2PCI Bridge */ pcie_req_id_t bus_pcie2pci_secbus; + + /* + * Link speed specific fields. 
+ */ + pcie_link_width_t bus_max_width; + pcie_link_width_t bus_cur_width; + pcie_link_speed_t bus_sup_speed; + pcie_link_speed_t bus_max_speed; + pcie_link_speed_t bus_cur_speed; } pcie_bus_t; /* @@ -365,6 +398,7 @@ struct pf_data { boolean_t pe_lock; boolean_t pe_valid; uint32_t pe_severity_flags; /* Severity of error */ + uint32_t pe_severity_mask; uint32_t pe_orig_severity_flags; /* Original severity */ pf_affected_dev_t *pe_affected_dev; pcie_bus_t *pe_bus_p; @@ -393,6 +427,7 @@ typedef struct pf_impl { /* bus_fm_flags field */ #define PF_FM_READY (1 << 0) /* bus_fm_lock initialized */ #define PF_FM_IS_NH (1 << 1) /* known as non-hardened */ +#define PF_FM_IS_PASSTHRU (1 << 2) /* device is controlled by VM */ /* * PCIe fabric handle lookup address flags. Used to define what type of @@ -421,11 +456,10 @@ typedef struct pf_impl { #define PF_ERR_MATCHED_PARENT (1 << 5) /* Error Handled By Parent */ #define PF_ERR_PANIC (1 << 6) /* Error should panic system */ #define PF_ERR_PANIC_DEADLOCK (1 << 7) /* deadlock detected */ -#define PF_ERR_PANIC_BAD_RESPONSE (1 << 8) /* Device no response */ +#define PF_ERR_BAD_RESPONSE (1 << 8) /* Device bad/no response */ #define PF_ERR_MATCH_DOM (1 << 9) /* Error Handled By IO domain */ -#define PF_ERR_FATAL_FLAGS \ - (PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK | PF_ERR_PANIC_BAD_RESPONSE) +#define PF_ERR_FATAL_FLAGS (PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK) #define PF_HDL_FOUND 1 #define PF_HDL_NOTFOUND 2 @@ -529,6 +563,7 @@ extern void pcie_enable_errors(dev_info_t *dip); extern void pcie_disable_errors(dev_info_t *dip); extern int pcie_enable_ce(dev_info_t *dip); extern boolean_t pcie_bridge_is_link_disabled(dev_info_t *); +extern boolean_t pcie_is_pci_device(dev_info_t *dip); extern pcie_bus_t *pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags); @@ -587,6 +622,7 @@ extern void pf_eh_enter(pcie_bus_t *bus_p); extern void pf_eh_exit(pcie_bus_t *bus_p); extern int pf_scan_fabric(dev_info_t *rpdip, ddi_fm_error_t *derr, pf_data_t *root_pfd_p); +extern void pf_set_passthru(dev_info_t *, boolean_t); extern void pf_init(dev_info_t *, ddi_iblock_cookie_t, ddi_attach_cmd_t); extern void pf_fini(dev_info_t *, ddi_detach_cmd_t); extern int pf_hdl_lookup(dev_info_t *, uint64_t, uint32_t, uint64_t, diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index de15be4d60..816d6995cf 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *); int secpolicy_kmdb(const cred_t *); int secpolicy_lock_memory(const cred_t *); int secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *); int secpolicy_modctl(const cred_t *, int); int secpolicy_net(const cred_t *, int, boolean_t); int secpolicy_net_bindmlp(const cred_t *); @@ -176,6 +177,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index 67b47f9a1e..3e0eb3b21f 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _SYS_POLL_IMPL_H @@ -140,6 +140,7 @@ struct pollstate { pollstate_t *ps_contend_nextp; /* next in contender list */ pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */ int ps_flags; /* state flags */ + short ps_implicit_ev; /* implicit poll event interest */ }; /* pollstate flags */ @@ -225,6 +226,7 @@ struct polldat { int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ port_kevent_t *pd_portev; /* associated port event struct */ + uf_entry_gen_t pd_gen; /* fd generation at cache time */ uint64_t pd_epolldata; /* epoll data, if any */ }; @@ -256,6 +258,7 @@ struct pollcache { /* pc_flag */ #define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_EPOLL 0x04 /* pollcache is epoll-enabled */ #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 712bd7cb24..7d2209132d 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -315,6 +315,7 @@ typedef struct proc { size_t p_swrss; /* resident set size before last swap */ struct aio *p_aio; /* pointer to async I/O struct */ struct itimer **p_itimer; /* interval timers */ + uint_t p_itimer_sz; /* max allocated interval timers */ timeout_id_t p_alarmid; /* alarm's timeout id */ caddr_t p_usrstack; /* top of the process stack */ uint_t p_stkprot; /* stack memory protection */ @@ -358,6 +359,7 @@ typedef struct proc { struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that p_exec came from */ struct brand *p_brand; /* process's brand */ + void *p_brand_data; /* per-process brand state */ psecflags_t p_secflags; /* per-process security flags */ @@ -374,7 +376,6 @@ typedef struct proc { */ struct user p_user; /* (see sys/user.h) */ } proc_t; - #define PROC_T /* headers relying on proc_t are OK */ #ifdef _KERNEL @@ -640,6 +641,7 @@ extern int signal_is_blocked(kthread_t *, int); extern int sigcheck(proc_t *, kthread_t *); extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid); extern void pid_setmin(void); extern pid_t pid_allocate(proc_t *, pid_t, int); extern int pid_rele(struct pid *); @@ -655,6 +657,7 @@ extern int sprtrylock_proc(proc_t *); extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); +extern void sprunprlock(proc_t *); extern void pid_init(void); extern proc_t *pid_entry(int); extern int pid_slot(proc_t *); @@ -729,6 +732,10 @@ extern kthread_t *thread_unpin(void); extern void thread_init(void); extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern void thread_splitstack(void (*)(void *), void *, size_t); +extern void thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern void thread_splitstack_cleanup(void); + extern void tsd_create(uint_t *, void (*)(void *)); extern void tsd_destroy(uint_t *); extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); @@ -770,7 +777,7 @@ extern void pokelwps(proc_t *); extern void continuelwps(proc_t *); extern int exitlwps(int); extern void lwp_ctmpl_copy(klwp_t *, klwp_t *); -extern void lwp_ctmpl_clear(klwp_t *); +extern void lwp_ctmpl_clear(klwp_t *, boolean_t); extern klwp_t *forklwp(klwp_t *, proc_t *, id_t); extern void lwp_load(klwp_t *, gregset_t, uintptr_t); extern void lwp_setrval(klwp_t *, int, int); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index dfb54eaef5..99da92ab79 100644 --- a/usr/src/uts/common/sys/procfs.h +++ 
b/usr/src/uts/common/sys/procfs.h @@ -236,6 +236,7 @@ typedef struct pstatus { #define PR_FAULTED 6 #define PR_SUSPENDED 7 #define PR_CHECKPOINT 8 +#define PR_BRAND 9 /* * lwp ps(1) information file. /proc/<pid>/lwp/<lwpid>/lwpsinfo diff --git a/usr/src/uts/common/sys/prsystm.h b/usr/src/uts/common/sys/prsystm.h index 7adc920da2..75259dc421 100644 --- a/usr/src/uts/common/sys/prsystm.h +++ b/usr/src/uts/common/sys/prsystm.h @@ -28,7 +28,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_PRSYSTM_H @@ -86,7 +86,7 @@ extern void prgetcred(proc_t *, struct prcred *); extern void prgetpriv(proc_t *, struct prpriv *); extern size_t prgetprivsize(void); extern void prgetsecflags(proc_t *, struct prsecflags *); -extern int prnsegs(struct as *, int); +extern uint_t prnsegs(struct as *, int); extern void prexit(proc_t *); extern void prfree(proc_t *); extern void prlwpexit(kthread_t *); diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 55987fe6d7..8b97fd7e3b 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ typedef struct pt_own { @@ -157,6 +163,19 @@ typedef struct pt_own { #define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ #define OWNERPT (('P'<<8)|5) /* set owner/group for slave device */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + * The return value of the callback function when it's invoked + * with the opaque argument passed to it will indicate if the + * pts slave device is currently open. + */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h index 2069e6d3f1..b7427a454d 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -10,11 +10,11 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ -#ifndef _SYS_SCSI_ADAPTERS_MPTHASH_H -#define _SYS_SCSI_ADAPTERS_MPTHASH_H +#ifndef _SYS_REFHASH_H +#define _SYS_REFHASH_H #include <sys/types.h> #include <sys/list.h> @@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *); extern void *refhash_next(refhash_t *, void *); extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); -#endif /* _SYS_SCSI_ADAPTERS_MPTHASH_H */ +#endif /* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 13166f378d..d65ca00f69 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h index d4233aecb5..2ed7320a09 100644 --- a/usr/src/uts/common/sys/rt.h +++ b/usr/src/uts/common/sys/rt.h @@ -22,6 +22,7 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -75,6 +76,16 @@ typedef struct rtkparms { int rt_tqsig; /* real-time time quantum signal */ uint_t rt_cflags; /* real-time control flags */ } rtkparms_t; + +#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ + +/* + * control flags (kparms->rt_cflags). + */ +#define RT_DOPRI 0x01 /* change priority */ +#define RT_DOTQ 0x02 /* change RT time quantum */ +#define RT_DOSIG 0x04 /* change RT time quantum signal */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h new file mode 100644 index 0000000000..afb7a94c58 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2012-2015 LSI Corp. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2000-2015 LSI Corporation. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * + * Name: mpi2_pci.h + * Title: MPI PCIe Attached Devices structures and definitions. + * Creation Date: October 9, 2012 + * + * mpi2_pci.h Version: 02.00.02 + * + * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25 + * prefix are for use only on MPI v2.5 products, and must not be used + * with MPI v2.0 products. Unless otherwise noted, names beginning with + * MPI2 or Mpi2 are for use with both MPI v2.0 and MPI v2.5 products. + * + * Version History + * --------------- + * + * Date Version Description + * -------- -------- ------------------------------------------------------ + * 03-16-15 02.00.00 Initial version. + * 02-17-16 02.00.01 Removed AHCI support. + * Removed SOP support. + * 07-01-16 02.00.02 Added MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP to + * NVME Encapsulated Request. 
+ * -------------------------------------------------------------------------- + */ + +#ifndef MPI2_PCI_H +#define MPI2_PCI_H + + +/* + * Values for the PCIe DeviceInfo field used in PCIe Device Status Change Event + * data and PCIe Configuration pages. + */ +#define MPI26_PCIE_DEVINFO_DIRECT_ATTACH (0x00000010) + +#define MPI26_PCIE_DEVINFO_MASK_DEVICE_TYPE (0x0000000F) +#define MPI26_PCIE_DEVINFO_NO_DEVICE (0x00000000) +#define MPI26_PCIE_DEVINFO_PCI_SWITCH (0x00000001) +#define MPI26_PCIE_DEVINFO_NVME (0x00000003) + + +/**************************************************************************** +* NVMe Encapsulated message +****************************************************************************/ + +/* NVME Encapsulated Request Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_REQUEST +{ + U16 DevHandle; /* 0x00 */ + U8 ChainOffset; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U32 Reserved3; /* 0x0C */ + U64 ErrorResponseBaseAddress; /* 0x10 */ + U16 ErrorResponseAllocationLength; /* 0x18 */ + U16 Flags; /* 0x1A */ + U32 DataLength; /* 0x1C */ + U8 NVMe_Command[4]; /* 0x20 */ /* variable length */ + +} MPI26_NVME_ENCAPSULATED_REQUEST, MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_REQUEST, + Mpi26NVMeEncapsulatedRequest_t, MPI2_POINTER pMpi26NVMeEncapsulatedRequest_t; + +/* defines for the Flags field */ +#define MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP (0x0020) +/* Submission Queue Type*/ +#define MPI26_NVME_FLAGS_SUBMISSIONQ_MASK (0x0010) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0010) +/* Error Response Address Space */ +#define MPI26_NVME_FLAGS_MASK_ERROR_RSP_ADDR (0x000C) +#define MPI26_NVME_FLAGS_SYSTEM_RSP_ADDR (0x0000) +#define MPI26_NVME_FLAGS_IOCPLB_RSP_ADDR (0x0008) +#define MPI26_NVME_FLAGS_IOCPLBNTA_RSP_ADDR (0x000C) +/* Data Direction*/ +#define MPI26_NVME_FLAGS_DATADIRECTION_MASK (0x0003) +#define MPI26_NVME_FLAGS_NODATATRANSFER (0x0000) +#define MPI26_NVME_FLAGS_WRITE (0x0001) +#define MPI26_NVME_FLAGS_READ (0x0002) +#define MPI26_NVME_FLAGS_BIDIRECTIONAL (0x0003) + + +/* NVMe Encapuslated Reply Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_ERROR_REPLY +{ + U16 DevHandle; /* 0x00 */ + U8 MsgLength; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U16 Reserved3; /* 0x0C */ + U16 IOCStatus; /* 0x0E */ + U32 IOCLogInfo; /* 0x10 */ + U16 ErrorResponseCount; /* 0x14 */ + U16 Reserved4; /* 0x16 */ +} MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + Mpi26NVMeEncapsulatedErrorReply_t, + MPI2_POINTER pMpi26NVMeEncapsulatedErrorReply_t; + + +#endif + + diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h index 0050c8c00f..be8bf675b8 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h @@ -58,10 +58,10 @@ #include <sys/byteorder.h> #include <sys/queue.h> +#include <sys/refhash.h> #include <sys/isa_defs.h> #include <sys/sunmdi.h> #include <sys/mdi_impldefs.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include 
<sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h> diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h new file mode 100644 index 0000000000..5aba743834 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h @@ -0,0 +1,750 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#ifndef _SMRT_H +#define _SMRT_H + +#include <sys/types.h> +#include <sys/pci.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/conf.h> +#include <sys/map.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/scsi/scsi.h> +#include <sys/scsi/impl/spc3_types.h> +#include <sys/devops.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sdt.h> +#include <sys/policy.h> +#include <sys/ctype.h> + +#if !defined(_LITTLE_ENDIAN) || !defined(_BIT_FIELDS_LTOH) +/* + * This driver contains a number of multi-byte bit fields and other structs + * that are only correct on a system with the same ordering as x86. + */ +#error "smrt: driver works only on little endian systems" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some structures are statically sized based on the expected number of logical + * drives and controllers in the system. These definitions are used throughout + * other driver-specific header files, and must appear prior to their + * inclusion. + */ +#define SMRT_MAX_LOGDRV 64 /* Maximum number of logical drives */ +#define SMRT_MAX_PHYSDEV 128 /* Maximum number of physical devices */ + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> +#include <sys/scsi/adapters/smrt/smrt_scsi.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern ddi_device_acc_attr_t smrt_dev_attributes; + +typedef enum smrt_init_level { + SMRT_INITLEVEL_BASIC = (0x1 << 0), + SMRT_INITLEVEL_I2O_MAPPED = (0x1 << 1), + SMRT_INITLEVEL_CFGTBL_MAPPED = (0x1 << 2), + SMRT_INITLEVEL_PERIODIC = (0x1 << 3), + SMRT_INITLEVEL_INT_ALLOC = (0x1 << 4), + SMRT_INITLEVEL_INT_ADDED = (0x1 << 5), + SMRT_INITLEVEL_INT_ENABLED = (0x1 << 6), + SMRT_INITLEVEL_SCSA = (0x1 << 7), + SMRT_INITLEVEL_MUTEX = (0x1 << 8), + SMRT_INITLEVEL_TASKQ = (0x1 << 9), + SMRT_INITLEVEL_ASYNC_EVENT = (0x1 << 10), +} smrt_init_level_t; + +/* + * Commands issued to the controller carry a (generally 32-bit, though with + * two reserved signalling bits) identifying tag number. In order to avoid + * having the controller confuse us by double-reporting the completion of a + * particular tag, we try to reuse them as infrequently as possible. In + * practice, this means looping through a range of values. The minimum and + * maximum value are defined below. A single command tag value is set aside + * for polled commands sent prior to full initialisation of the driver. + */ +#define SMRT_PRE_TAG_NUMBER 0x00000bad +#define SMRT_MIN_TAG_NUMBER 0x00001000 +#define SMRT_MAX_TAG_NUMBER 0x0fffffff + +/* + * Character strings that represent the names of the iports used for both + * physical and virtual volumes. 
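
The tag range defined above reads most naturally as a cycling counter: hand out SMRT_MIN_TAG_NUMBER through SMRT_MAX_TAG_NUMBER in order and wrap, so that a recently completed tag is reused as late as possible and the pre-init tag is never reissued. The sketch below is illustrative only; the real allocator lives in the smrt .c files, and the smrt_next_tag field and smrt_mutex it leans on come from the per-controller structure defined later in this header.

/*
 * Sketch of the cycling-tag scheme described above (not the driver's
 * actual allocation code).
 */
static uint32_t
smrt_next_tag_sketch(smrt_t *smrt)
{
	uint32_t tag;

	ASSERT(MUTEX_HELD(&smrt->smrt_mutex));

	tag = smrt->smrt_next_tag;
	if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) {
		/* Wrap to the bottom of the range. */
		smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
	}
	return (tag);
}
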
+ */ +#define SMRT_IPORT_PHYS "p0" +#define SMRT_IPORT_VIRT "v0" + +/* + * Definitions to support waiting for the controller to converge on a + * particular state: ready or not ready. These are used with + * smrt_ctlr_wait_for_state(). + */ +#define SMRT_WAIT_DELAY_SECONDS 120 +typedef enum smrt_wait_state { + SMRT_WAIT_STATE_READY = 1, + SMRT_WAIT_STATE_UNREADY +} smrt_wait_state_t; + +typedef enum smrt_ctlr_mode { + SMRT_CTLR_MODE_UNKNOWN = 0, + SMRT_CTLR_MODE_SIMPLE +} smrt_ctlr_mode_t; + +/* + * In addition to Logical Volumes, we also expose the controller at a + * pseudo target address on the SCSI bus we are essentially pretending to be. + */ +#define SMRT_CONTROLLER_TARGET 128 + +/* + * When waiting for volume discovery to complete, we wait for a maximum + * duration (in seconds) before giving up. + */ +#define SMRT_DISCOVER_TIMEOUT 30 + +/* + * The maintenance routine which checks for controller lockup and aborts + * commands that have passed their timeout runs periodically. The time is + * expressed in seconds. + */ +#define SMRT_PERIODIC_RATE 5 + +/* + * At times, we need to check if the controller is still responding. To do + * that, we send a Nop message to the controller and make sure it completes + * successfully. So that we don't wait forever, we set a timeout (in seconds). + */ +#define SMRT_PING_CHECK_TIMEOUT 60 + +/* + * When detaching the device, we may need to have an asynchronous event + * cancellation be issued. While this should be relatively smooth, we don't + * want to wait forever for it. As such we set a timeout in seconds. + */ +#define SMRT_ASYNC_CANCEL_TIMEOUT 60 + +/* + * HP PCI vendor ID and Generation 9 device ID. Used to identify generations of + * supported controllers. + */ +#define SMRT_VENDOR_HP 0x103c +#define SMRT_DEVICE_GEN9 0x3238 + +typedef enum smrt_controller_status { + /* + * An attempt is being made to detach the controller instance. + */ + SMRT_CTLR_STATUS_DETACHING = (0x1 << 0), + + /* + * The controller is believed to be functioning correctly. The driver + * is to allow command submission, process interrupts, and perform + * periodic background maintenance. + */ + SMRT_CTLR_STATUS_RUNNING = (0x1 << 1), + + /* + * The controller is currently being reset. + */ + SMRT_CTLR_STATUS_RESETTING = (0x1 << 2), + + /* + * Our async event notification command is currently in need of help + * from the broader driver. This will be set by smrt_event_complete() + * to indicate that the command is not being processed due to a + * controller reset or because another fatal error occurred. The + * periodic will have to pick up and recover this for us. It is only + * safe for the driver to manipulate the event command outside of + * smrt_event_complete() if this flag is set. + */ + SMRT_CTLR_ASYNC_INTERVENTION = (0x1 << 3), + + /* + * See the theory statement on discovery and resets in smrt_ciss.c for + * an explanation of these values. 
+ */ + SMRT_CTLR_DISCOVERY_REQUESTED = (0x1 << 4), + SMRT_CTLR_DISCOVERY_RUNNING = (0x1 << 5), + SMRT_CTLR_DISCOVERY_PERIODIC = (0x1 << 6), + SMRT_CTLR_DISCOVERY_REQUIRED = (0x1 << 7), +} smrt_controller_status_t; + +#define SMRT_CTLR_DISCOVERY_MASK (SMRT_CTLR_DISCOVERY_REQUESTED | \ + SMRT_CTLR_DISCOVERY_RUNNING | SMRT_CTLR_DISCOVERY_PERIODIC) + +typedef struct smrt_stats { + uint64_t smrts_tran_aborts; + uint64_t smrts_tran_resets; + uint64_t smrts_tran_starts; + uint64_t smrts_ctlr_resets; + unsigned smrts_max_inflight; + uint64_t smrts_unclaimed_interrupts; + uint64_t smrts_claimed_interrupts; + uint64_t smrts_ignored_scsi_cmds; + uint64_t smrts_events_received; + uint64_t smrts_events_errors; + uint64_t smrts_events_intervened; + uint64_t smrts_discovery_tq_errors; +} smrt_stats_t; + +typedef struct smrt_versions { + uint8_t smrtv_hardware_version; + + /* + * These strings must be large enough to hold the 4 byte version string + * retrieved from an IDENTIFY CONTROLLER response, as well as the + * terminating NUL byte: + */ + char smrtv_firmware_rev[5]; + char smrtv_recovery_rev[5]; + char smrtv_bootblock_rev[5]; +} smrt_versions_t; + +typedef struct smrt smrt_t; +typedef struct smrt_command smrt_command_t; +typedef struct smrt_command_internal smrt_command_internal_t; +typedef struct smrt_command_scsa smrt_command_scsa_t; +typedef struct smrt_pkt smrt_pkt_t; + +/* + * Per-Controller Structure + */ +struct smrt { + dev_info_t *smrt_dip; + int smrt_instance; + smrt_controller_status_t smrt_status; + smrt_stats_t smrt_stats; + + /* + * Controller configuration discovered during initialisation. + */ + uint32_t smrt_host_support; + uint32_t smrt_bus_support; + uint32_t smrt_maxcmds; + uint32_t smrt_sg_cnt; + smrt_versions_t smrt_versions; + uint16_t smrt_pci_vendor; + uint16_t smrt_pci_device; + + /* + * iport specific data + */ + dev_info_t *smrt_virt_iport; + dev_info_t *smrt_phys_iport; + scsi_hba_tgtmap_t *smrt_virt_tgtmap; + scsi_hba_tgtmap_t *smrt_phys_tgtmap; + + /* + * The transport mode of the controller. + */ + smrt_ctlr_mode_t smrt_ctlr_mode; + + /* + * The current initialisation level of the driver. Bits in this field + * are set during initialisation and unset during cleanup of the + * allocated resources. + */ + smrt_init_level_t smrt_init_level; + + /* + * Essentially everything is protected by "smrt_mutex". When the + * completion queue is updated, threads sleeping on "smrt_cv_finishq" + * are awoken. + */ + kmutex_t smrt_mutex; + kcondvar_t smrt_cv_finishq; + + /* + * List of enumerated logical volumes (smrt_volume_t). + */ + list_t smrt_volumes; + + /* + * List of enumerated physical devices (smrt_physical_t). + */ + list_t smrt_physicals; + + /* + * List of attached SCSA target drivers (smrt_target_t). + */ + list_t smrt_targets; + + /* + * Controller Heartbeat Tracking + */ + uint32_t smrt_last_heartbeat; + hrtime_t smrt_last_heartbeat_time; + + hrtime_t smrt_last_interrupt_claimed; + hrtime_t smrt_last_interrupt_unclaimed; + hrtime_t smrt_last_reset_start; + hrtime_t smrt_last_reset_finish; + + /* + * Command object tracking. These lists, and all commands within the + * lists, are protected by "smrt_mutex". + */ + uint32_t smrt_next_tag; + avl_tree_t smrt_inflight; + list_t smrt_commands; /* List of all commands. */ + list_t smrt_finishq; /* List of completed commands. */ + list_t smrt_abortq; /* List of commands to abort. 
*/ + + /* + * Discovery coordination + */ + ddi_taskq_t *smrt_discover_taskq; + hrtime_t smrt_last_phys_discovery; + hrtime_t smrt_last_log_discovery; + uint64_t smrt_discover_gen; + + /* + * Controller interrupt handler registration. + */ + int smrt_interrupt_type; + int smrt_interrupt_cap; + uint_t smrt_interrupt_pri; + ddi_intr_handle_t smrt_interrupts[1]; + int smrt_ninterrupts; + + ddi_periodic_t smrt_periodic; + + scsi_hba_tran_t *smrt_hba_tran; + + ddi_dma_attr_t smrt_dma_attr; + + /* + * Access to the I2O Registers: + */ + unsigned smrt_i2o_bar; + caddr_t smrt_i2o_space; + ddi_acc_handle_t smrt_i2o_handle; + + /* + * Access to the Configuration Table: + */ + unsigned smrt_ct_bar; + uint32_t smrt_ct_baseaddr; + CfgTable_t *smrt_ct; + ddi_acc_handle_t smrt_ct_handle; + + /* + * Asynchronous Event State + */ + uint32_t smrt_event_count; + smrt_command_t *smrt_event_cmd; + smrt_command_t *smrt_event_cancel_cmd; + kcondvar_t smrt_event_queue; +}; + +/* + * Logical Volume Structure + */ +typedef enum smrt_volume_flags { + SMRT_VOL_FLAG_WWN = (0x1 << 0), +} smrt_volume_flags_t; + +typedef struct smrt_volume { + LUNAddr_t smlv_addr; + smrt_volume_flags_t smlv_flags; + + uint8_t smlv_wwn[16]; + uint64_t smlv_gen; + + smrt_t *smlv_ctlr; + list_node_t smlv_link; + + /* + * List of SCSA targets currently attached to this Logical Volume: + */ + list_t smlv_targets; +} smrt_volume_t; + +typedef struct smrt_physical { + LUNAddr_t smpt_addr; + uint64_t smpt_wwn; + uint8_t smpt_dtype; + uint16_t smpt_bmic; + uint64_t smpt_gen; + boolean_t smpt_supported; + boolean_t smpt_visible; + boolean_t smpt_unsup_warn; + list_node_t smpt_link; + list_t smpt_targets; + smrt_t *smpt_ctlr; + smrt_identify_physical_drive_t *smpt_info; +} smrt_physical_t; + +/* + * Per-Target Structure + */ +typedef struct smrt_target { + struct scsi_device *smtg_scsi_dev; + + boolean_t smtg_physical; + + /* + * This is only used when performing discovery during panic, as we need + * a mechanism to determine if the set of drives has shifted. + */ + boolean_t smtg_gone; + + /* + * Linkage back to the device that this target represents. This may be + * either a smrt_volume_t or a smrt_physical_t. We keep a pointer to the + * address, as that's the one thing we generally care about. + */ + union { + smrt_physical_t *smtg_phys; + smrt_volume_t *smtg_vol; + } smtg_lun; + list_node_t smtg_link_lun; + LUNAddr_t *smtg_addr; + + /* + * Linkage back to the controller: + */ + smrt_t *smtg_ctlr; + list_node_t smtg_link_ctlr; +} smrt_target_t; + +/* + * DMA Resource Tracking Structure + */ +typedef enum smrt_dma_level { + SMRT_DMALEVEL_HANDLE_ALLOC = (0x1 << 0), + SMRT_DMALEVEL_MEMORY_ALLOC = (0x1 << 1), + SMRT_DMALEVEL_HANDLE_BOUND = (0x1 << 2), +} smrt_dma_level_t; + +typedef struct smrt_dma { + smrt_dma_level_t smdma_level; + size_t smdma_real_size; + ddi_dma_handle_t smdma_dma_handle; + ddi_acc_handle_t smdma_acc_handle; + ddi_dma_cookie_t smdma_dma_cookies[1]; + uint_t smdma_dma_ncookies; +} smrt_dma_t; + + +typedef enum smrt_command_status { + /* + * When a command is submitted to the controller, it is marked USED + * to avoid accidental reuse of the command without reinitialising + * critical fields. The submitted command is also marked INFLIGHT + * to reflect its inclusion in the "smrt_inflight" AVL tree. When + * the command is completed by the controller, INFLIGHT is unset. 
+ */ + SMRT_CMD_STATUS_USED = (0x1 << 0), + SMRT_CMD_STATUS_INFLIGHT = (0x1 << 1), + + /* + * This flag is set during abort queue processing to record that this + * command was aborted in response to an expired timeout, and not some + * other cancellation. If the controller is able to abort the command, + * we use this flag to let the SCSI framework know that the command + * timed out. + */ + SMRT_CMD_STATUS_TIMEOUT = (0x1 << 2), + + /* + * The controller set the error bit when completing this command. + * Details of the particular fault may be read from the error + * information written by the controller. + */ + SMRT_CMD_STATUS_ERROR = (0x1 << 3), + + /* + * This command has been abandoned by the original submitter. This + * could happen if the command did not complete in a timely fashion. + * When it reaches the finish queue it will be freed without further + * processing. + */ + SMRT_CMD_STATUS_ABANDONED = (0x1 << 4), + + /* + * This command has made it through the completion queue and had final + * processing performed. + */ + SMRT_CMD_STATUS_COMPLETE = (0x1 << 5), + + /* + * A polled message will be ignored by the regular processing of the + * completion queue. The blocking function doing the polling is + * responsible for watching the command on which it has set the POLLED + * flag. Regular completion queue processing (which might happen in + * the polling function, or it might happen in the interrupt handler) + * will set POLL_COMPLETE once it is out of the finish queue + * altogether. + */ + SMRT_CMD_STATUS_POLLED = (0x1 << 6), + SMRT_CMD_STATUS_POLL_COMPLETE = (0x1 << 7), + + /* + * An abort message has been sent to the controller in an attempt to + * cancel this command. + */ + SMRT_CMD_STATUS_ABORT_SENT = (0x1 << 8), + + /* + * This command has been passed to our tran_start(9E) handler. + */ + SMRT_CMD_STATUS_TRAN_START = (0x1 << 9), + + /* + * This command was for a SCSI command that we are explicitly avoiding + * sending to the controller. + */ + SMRT_CMD_STATUS_TRAN_IGNORED = (0x1 << 10), + + /* + * This command has been submitted once, and subsequently passed to + * smrt_command_reuse(). + */ + SMRT_CMD_STATUS_REUSED = (0x1 << 11), + + /* + * A controller reset has been issued, so a response for this command + * is not expected. If one arrives before the controller reset has + * taken effect, it likely cannot be trusted. + */ + SMRT_CMD_STATUS_RESET_SENT = (0x1 << 12), + + /* + * Certain commands related to discovery and pinging need to be run + * during the context after a reset has occurred, but before the + * controller is considered. Such commands can use this flag to bypass + * the normal smrt_submit() check. + */ + SMRT_CMD_IGNORE_RUNNING = (0x1 << 13), +} smrt_command_status_t; + +typedef enum smrt_command_type { + SMRT_CMDTYPE_INTERNAL = 1, + SMRT_CMDTYPE_EVENT, + SMRT_CMDTYPE_ABORTQ, + SMRT_CMDTYPE_SCSA, + SMRT_CMDTYPE_PREINIT, +} smrt_command_type_t; + +struct smrt_command { + uint32_t smcm_tag; + smrt_command_type_t smcm_type; + smrt_command_status_t smcm_status; + + smrt_t *smcm_ctlr; + smrt_target_t *smcm_target; + + list_node_t smcm_link; /* Linkage for allocated list. */ + list_node_t smcm_link_finish; /* Linkage for completion list. */ + list_node_t smcm_link_abort; /* Linkage for abort list. */ + avl_node_t smcm_node; /* Inflight AVL membership. 
*/ + + hrtime_t smcm_time_submit; + hrtime_t smcm_time_complete; + + hrtime_t smcm_expiry; + + /* + * The time at which an abort message was sent to try and terminate + * this command, as well as the tag of the abort message itself: + */ + hrtime_t smcm_abort_time; + uint32_t smcm_abort_tag; + + /* + * Ancillary data objects. Only one of these will be allocated for any + * given command, but we nonetheless resist the temptation to use a + * union of pointers in order to make incorrect usage obvious. + */ + smrt_command_scsa_t *smcm_scsa; + smrt_command_internal_t *smcm_internal; + + /* + * Physical allocation tracking for the actual command to send to the + * controller. + */ + smrt_dma_t smcm_contig; + + CommandList_t *smcm_va_cmd; + uint32_t smcm_pa_cmd; + + ErrorInfo_t *smcm_va_err; + uint32_t smcm_pa_err; +}; + +/* + * Commands issued internally to the driver (as opposed to by the HBA + * framework) generally require a buffer in which to assemble the command body, + * and for receiving the response from the controller. The following object + * tracks this (optional) extra buffer. + */ +struct smrt_command_internal { + smrt_dma_t smcmi_contig; + + void *smcmi_va; + uint32_t smcmi_pa; + size_t smcmi_len; +}; + +/* + * Commands issued via the SCSI framework have a number of additional + * properties. + */ +struct smrt_command_scsa { + struct scsi_pkt *smcms_pkt; + smrt_command_t *smcms_command; +}; + + +/* + * CISS transport routines. + */ +void smrt_periodic(void *); +void smrt_lockup_check(smrt_t *); +int smrt_submit(smrt_t *, smrt_command_t *); +void smrt_submit_simple(smrt_t *, smrt_command_t *); +int smrt_retrieve(smrt_t *); +void smrt_retrieve_simple(smrt_t *); +int smrt_poll_for(smrt_t *, smrt_command_t *); +int smrt_preinit_command_simple(smrt_t *, smrt_command_t *); + +/* + * Interrupt service routines. + */ +int smrt_interrupts_setup(smrt_t *); +int smrt_interrupts_enable(smrt_t *); +void smrt_interrupts_teardown(smrt_t *); +uint32_t smrt_isr_hw_simple(caddr_t, caddr_t); + +/* + * Interrupt enable/disable routines. + */ +void smrt_intr_set(smrt_t *, boolean_t); + +/* + * Controller initialisation routines. + */ +int smrt_ctlr_init(smrt_t *); +void smrt_ctlr_teardown(smrt_t *); +int smrt_ctlr_reset(smrt_t *); +int smrt_ctlr_wait_for_state(smrt_t *, smrt_wait_state_t); +int smrt_ctlr_init_simple(smrt_t *); +void smrt_ctlr_teardown_simple(smrt_t *); +int smrt_cfgtbl_flush(smrt_t *); +int smrt_cfgtbl_transport_has_support(smrt_t *, int); +void smrt_cfgtbl_transport_set(smrt_t *, int); +int smrt_cfgtbl_transport_confirm(smrt_t *, int); +uint32_t smrt_ctlr_get_cmdsoutmax(smrt_t *); +uint32_t smrt_ctlr_get_maxsgelements(smrt_t *); + +/* + * Device enumeration and lookup routines. + */ +void smrt_discover_request(smrt_t *); + +int smrt_logvol_discover(smrt_t *, uint16_t, uint64_t); +void smrt_logvol_teardown(smrt_t *); +smrt_volume_t *smrt_logvol_lookup_by_id(smrt_t *, unsigned long); +void smrt_logvol_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, + void **); +boolean_t smrt_logvol_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, + void *, scsi_tgtmap_deact_rsn_t); + +int smrt_phys_discover(smrt_t *, uint16_t, uint64_t); +smrt_physical_t *smrt_phys_lookup_by_ua(smrt_t *, const char *); +void smrt_phys_teardown(smrt_t *); +void smrt_phys_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, + void **); +boolean_t smrt_phys_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, + void *, scsi_tgtmap_deact_rsn_t); + +/* + * SCSI framework routines. 
+ */ +int smrt_ctrl_hba_setup(smrt_t *); +void smrt_ctrl_hba_teardown(smrt_t *); + +int smrt_logvol_hba_setup(smrt_t *, dev_info_t *); +void smrt_logvol_hba_teardown(smrt_t *, dev_info_t *); +int smrt_phys_hba_setup(smrt_t *, dev_info_t *); +void smrt_phys_hba_teardown(smrt_t *, dev_info_t *); + +void smrt_hba_complete(smrt_command_t *); + +void smrt_process_finishq(smrt_t *); +void smrt_process_abortq(smrt_t *); + +/* + * Command block management. + */ +smrt_command_t *smrt_command_alloc(smrt_t *, smrt_command_type_t, + int); +smrt_command_t *smrt_command_alloc_preinit(smrt_t *, size_t, int); +int smrt_command_attach_internal(smrt_t *, smrt_command_t *, size_t, + int); +void smrt_command_free(smrt_command_t *); +smrt_command_t *smrt_lookup_inflight(smrt_t *, uint32_t); +void smrt_command_reuse(smrt_command_t *); + +/* + * Device message construction routines. + */ +void smrt_write_lun_addr_phys(LUNAddr_t *, boolean_t, unsigned, unsigned); +void smrt_write_controller_lun_addr(LUNAddr_t *); +uint16_t smrt_lun_addr_to_bmic(PhysDevAddr_t *); +void smrt_write_message_abort_one(smrt_command_t *, uint32_t); +void smrt_write_message_abort_all(smrt_command_t *, LUNAddr_t *); +void smrt_write_message_nop(smrt_command_t *, int); +void smrt_write_message_event_notify(smrt_command_t *); + +/* + * Device management routines. + */ +int smrt_device_setup(smrt_t *); +void smrt_device_teardown(smrt_t *); +uint32_t smrt_get32(smrt_t *, offset_t); +void smrt_put32(smrt_t *, offset_t, uint32_t); + +/* + * SATA related routines. + */ +int smrt_sata_determine_wwn(smrt_t *, PhysDevAddr_t *, uint64_t *, uint16_t); + +/* + * Asynchronous Event Notification + */ +int smrt_event_init(smrt_t *); +void smrt_event_fini(smrt_t *); +void smrt_event_complete(smrt_command_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h new file mode 100644 index 0000000000..e1f1db68b3 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h @@ -0,0 +1,345 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017, Joyent, Inc. + */ + +#ifndef _SMRT_CISS_H +#define _SMRT_CISS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Maximum number of Scatter/Gather List entries. These entries are statically + * allocated for all commands. + */ +#define CISS_MAXSGENTRIES 64 + +/* + * If the controller advertises a value of 0 for the maximum S/G list length it + * supports, the specification states that we should assume a value of 31. + */ +#define CISS_SGCNT_FALLBACK 31 + +/* + * The CDB field in the request block is fixed at 16 bytes in length. (See + * "3.2. Request Block" in the CISS specification.) + */ +#define CISS_CDBLEN 16 + +/* + * Command Status Values. These are listed in "Table 2 Command Status" in "3.3 + * Error Info" of the CISS specification. 
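
Taken together, the command-block and transport routines declared in smrt.h above suggest the shape of an internally generated, polled request. The sketch below strings them together for a Nop message; the KM_SLEEP reading of the allocation flag, the meaning of smrt_write_message_nop()'s integer argument, the caller setting SMRT_CMD_STATUS_POLLED, and the omission of locking are all assumptions made for illustration rather than behaviour the header guarantees.

/*
 * Illustrative flow of a polled internal command; the authoritative
 * version is in the smrt .c files.
 */
static int
smrt_ping_sketch(smrt_t *smrt)
{
	smrt_command_t *smcm;
	int r;

	smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_SLEEP);
	if (smcm == NULL)
		return (ENOMEM);

	/* Build a Nop message; the second argument is assumed to be a timeout. */
	smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT);

	/* Mark the command as polled so the completion path leaves it to us. */
	smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;

	if ((r = smrt_submit(smrt, smcm)) == 0)
		r = smrt_poll_for(smrt, smcm);

	smrt_command_free(smcm);
	return (r);
}
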
+ */ +#define CISS_CMD_SUCCESS 0x00 +#define CISS_CMD_TARGET_STATUS 0x01 +#define CISS_CMD_DATA_UNDERRUN 0x02 +#define CISS_CMD_DATA_OVERRUN 0x03 +#define CISS_CMD_INVALID 0x04 +#define CISS_CMD_PROTOCOL_ERR 0x05 +#define CISS_CMD_HARDWARE_ERR 0x06 +#define CISS_CMD_CONNECTION_LOST 0x07 +#define CISS_CMD_ABORTED 0x08 +#define CISS_CMD_ABORT_FAILED 0x09 +#define CISS_CMD_UNSOLICITED_ABORT 0x0a +#define CISS_CMD_TIMEOUT 0x0b +#define CISS_CMD_UNABORTABLE 0x0c + +/* + * Request Transfer Directions, used in "RequestBlock.Type.Direction": + */ +#define CISS_XFER_NONE 0x00 +#define CISS_XFER_WRITE 0x01 +#define CISS_XFER_READ 0x02 +#define CISS_XFER_RSVD 0x03 + +/* + * Request Attributes, used in "RequestBlock.Type.Attribute": + */ +#define CISS_ATTR_UNTAGGED 0x00 +#define CISS_ATTR_SIMPLE 0x04 +#define CISS_ATTR_HEADOFQUEUE 0x05 +#define CISS_ATTR_ORDERED 0x06 + +/* + * Request Type, used in "RequestBlock.Type.Type": + */ +#define CISS_TYPE_CMD 0x00 +#define CISS_TYPE_MSG 0x01 + +/* + * I2O Space Register Offsets + * + * The name "I2O", and these register offsets, appear to be amongst the last + * vestiges of a long-defunct attempt at standardising mainframe-style I/O + * channels in the Intel server space: the Intelligent Input/Output (I2O) + * Architecture Specification. + * + * The draft of version 1.5 of this specification, in section "4.2.1.5.1 + * Extensions for PCI", suggests that the following are memory offsets into + * "the memory region specified by the first base address configuration + * register indicating memory space (offset 10h, 14h, and so forth)". These + * match up with the offsets of the first two BARs in a PCI configuration space + * type 0 header. + * + * The specification also calls out the Inbound Post List FIFO, write-only at + * offset 40h; the Outbound Post List FIFO, read-only at offset 44h; the + * Interrupt Status Register, at offset 30h; and the Interrupt Mask Register, + * at offset 34h. + * + * This ill-fated attempt to increase the proprietary complexity of (and + * presumably, thus, the gross margin on) computer systems is all but extinct. + * The transport layer of this storage controller is all that's left of their + * religion. + */ +#define CISS_I2O_INBOUND_DOORBELL 0x20 +#define CISS_I2O_INTERRUPT_STATUS 0x30 +#define CISS_I2O_INTERRUPT_MASK 0x34 +#define CISS_I2O_INBOUND_POST_Q 0x40 +#define CISS_I2O_OUTBOUND_POST_Q 0x44 +#define CISS_I2O_OUTBOUND_DOORBELL_STATUS 0x9c +#define CISS_I2O_OUTBOUND_DOORBELL_CLEAR 0xa0 +#define CISS_I2O_SCRATCHPAD 0xb0 +#define CISS_I2O_CFGTBL_CFG_OFFSET 0xb4 +#define CISS_I2O_CFGTBL_MEM_OFFSET 0xb8 + +/* + * Rather than make a lot of small mappings for each part of the address + * space we wish to access, we will make one large mapping. If more + * offsets are added to the I2O list above, this space should be extended + * appropriately. + */ +#define CISS_I2O_MAP_BASE 0x20 +#define CISS_I2O_MAP_LIMIT 0x100 + +/* + * The Scratchpad Register (I2O_SCRATCHPAD) is not mentioned in the CISS + * specification. It serves at least two known functions: + * - Signalling controller readiness + * - Exposing a debugging code when the controller firmware locks up + */ +#define CISS_SCRATCHPAD_INITIALISED 0xffff0000 + +/* + * Outbound Doorbell Register Values. + * + * These are read from the Outbound Doorbell Set/Status Register + * (CISS_I2O_OUTBOUND_DOORBELL_STATUS), but cleared by writing to the Clear + * Register (CISS_I2O_OUTBOUND_DOORBELL_CLEAR). 
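
One concrete use of the Scratchpad register defined above is a readiness poll: spin on it until CISS_SCRATCHPAD_INITIALISED appears or a deadline passes. The real wait sits behind smrt_ctlr_wait_for_state(); the delay(9F)-based loop, the one-second step, and the reuse of SMRT_WAIT_DELAY_SECONDS below are illustrative assumptions.

/*
 * Sketch: poll the Scratchpad register for the "initialised" signature.
 */
static int
smrt_wait_ready_sketch(smrt_t *smrt)
{
	unsigned i;

	for (i = 0; i < SMRT_WAIT_DELAY_SECONDS; i++) {
		if (smrt_get32(smrt, CISS_I2O_SCRATCHPAD) ==
		    CISS_SCRATCHPAD_INITIALISED)
			return (0);
		delay(drv_usectohz(MICROSEC));	/* roughly one second */
	}
	return (ETIMEDOUT);
}
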
+ */ +#define CISS_ODR_BIT_INTERRUPT (1UL << 0) +#define CISS_ODR_BIT_LOCKUP (1UL << 1) + +/* + * Inbound Doorbell Register Values. + * + * These are written to and read from the Inbound Doorbell Register + * (CISS_I2O_INBOUND_DOORBELL). + */ +#define CISS_IDR_BIT_CFGTBL_CHANGE (1UL << 0) + +/* + * Interrupt Mask Register Values. + * + * These are written to and read from the Interrupt Mask Register + * (CISS_I2O_INTERRUPT_MASK). Note that a 1 bit in this register masks or + * disables the interrupt in question; to enable the interrupt the bit must be + * set to 0. + */ +#define CISS_IMR_BIT_SIMPLE_INTR_DISABLE (1UL << 3) + +/* + * Interrupt Status Register Values. + * + * These are read from the Interrupt Status Register + * (CISS_I2O_INTERRUPT_STATUS). + */ +#define CISS_ISR_BIT_SIMPLE_INTR (1UL << 3) + +/* + * Transport Methods. + * + * These bit positions are used in the Configuration Table to detect controller + * support for a particular method, via "TransportSupport"; to request that the + * controller enable a particular method, via "TransportRequest"; and to detect + * whether the controller has acknowledged the request and enabled the desired + * method, via "TransportActive". + * + * See: "9.1 Configuration Table" in the CISS Specification. + */ +#define CISS_CFGTBL_READY_FOR_COMMANDS (1UL << 0) +#define CISS_CFGTBL_XPORT_SIMPLE (1UL << 1) +#define CISS_CFGTBL_XPORT_PERFORMANT (1UL << 2) +#define CISS_CFGTBL_XPORT_MEMQ (1UL << 4) + +/* + * In the Simple Transport Method, when the appropriate interrupt status bit is + * set (CISS_ISR_BIT_SIMPLE_INTR), the Outbound Post Queue register is + * repeatedly read for notifications of the completion of commands previously + * submitted to the controller. These macros help break up the read value into + * its component fields: the tag number, and whether or not the command + * completed in error. + */ +#define CISS_OPQ_READ_TAG(x) ((x) >> 2) +#define CISS_OPQ_READ_ERROR(x) ((x) & (1UL << 1)) + +/* + * Physical devices that are reported may be marked as 'masked'. A masked device + * is one that the driver can see, but must not perform any I/O to. + */ +#define SMRT_CISS_MODE_MASKED 3 + +/* + * The following packed structures are used to ease the manipulation of + * requests and responses from the controller. 
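
In the Simple Transport Method the completion path amounts to draining the Outbound Post Queue and splitting each value with the macros above. The sketch below shows only that decode step; the 0xffffffff "queue empty" sentinel, the direct status-flag manipulation, and the hand-off through smrt_lookup_inflight() (declared in smrt.h) are assumptions for illustration, with smrt_retrieve() and the interrupt handler being the real implementation.

/*
 * Sketch of Simple Transport completion processing.
 */
static void
smrt_drain_outbound_sketch(smrt_t *smrt)
{
	uint32_t opq;

	while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) !=
	    0xffffffff) {
		uint32_t tag = CISS_OPQ_READ_TAG(opq);
		smrt_command_t *smcm = smrt_lookup_inflight(smrt, tag);

		if (smcm == NULL)
			continue;		/* stale or unknown tag */

		if (CISS_OPQ_READ_ERROR(opq) != 0) {
			/* Details are in the command's ErrorInfo_t block. */
			smcm->smcm_status |= SMRT_CMD_STATUS_ERROR;
		}
		smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
	}
}
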
+ */ +#pragma pack(1) + +typedef struct smrt_tag { + uint32_t reserved:1; + uint32_t error:1; + uint32_t tag_value:30; + uint32_t unused; +} smrt_tag_t; + +typedef union SCSI3Addr { + struct { + uint8_t Dev; + uint8_t Bus:6; + uint8_t Mode:2; + } PeripDev; + struct { + uint8_t DevLSB; + uint8_t DevMSB:6; + uint8_t Mode:2; + } LogDev; + struct { + uint8_t Dev:5; + uint8_t Bus:3; + uint8_t Targ:6; + uint8_t Mode:2; + } LogUnit; +} SCSI3Addr_t; + +typedef struct PhysDevAddr { + uint32_t TargetId:24; + uint32_t Bus:6; + uint32_t Mode:2; + SCSI3Addr_t Target[2]; +} PhysDevAddr_t; + +typedef struct LogDevAddr { + uint32_t VolId:30; + uint32_t Mode:2; + uint8_t reserved[4]; +} LogDevAddr_t; + +typedef union LUNAddr { + uint8_t LunAddrBytes[8]; + SCSI3Addr_t SCSI3Lun[4]; + PhysDevAddr_t PhysDev; + LogDevAddr_t LogDev; +} LUNAddr_t; + +typedef struct CommandListHeader { + uint8_t ReplyQueue; + uint8_t SGList; + uint16_t SGTotal; + smrt_tag_t Tag; + LUNAddr_t LUN; +} CommandListHeader_t; + +typedef struct RequestBlock { + uint8_t CDBLen; + struct { + uint8_t Type:3; + uint8_t Attribute:3; + uint8_t Direction:2; + } Type; + uint16_t Timeout; + uint8_t CDB[CISS_CDBLEN]; +} RequestBlock_t; + +typedef struct ErrDescriptor { + uint64_t Addr; + uint32_t Len; +} ErrDescriptor_t; + +typedef struct SGDescriptor { + uint64_t Addr; + uint32_t Len; + uint32_t Ext; +} SGDescriptor_t; + +typedef struct CommandList { + CommandListHeader_t Header; + RequestBlock_t Request; + ErrDescriptor_t ErrDesc; + SGDescriptor_t SG[CISS_MAXSGENTRIES]; +} CommandList_t; + +typedef union MoreErrInfo { + struct { + uint8_t Reserved[3]; + uint8_t Type; + uint32_t ErrorInfo; + } Common_Info; + struct { + uint8_t Reserved[2]; + uint8_t offense_size; + uint8_t offense_num; + uint32_t offense_value; + } Invalid_Cmd; +} MoreErrInfo_t; + +typedef struct ErrorInfo { + uint8_t ScsiStatus; + uint8_t SenseLen; + uint16_t CommandStatus; + uint32_t ResidualCnt; + MoreErrInfo_t MoreErrInfo; + uint8_t SenseInfo[MAX_SENSE_LENGTH]; +} ErrorInfo_t; + +typedef struct CfgTable { + uint8_t Signature[4]; + uint32_t SpecValence; + uint32_t TransportSupport; + uint32_t TransportActive; + uint32_t TransportRequest; + uint32_t Upper32Addr; + uint32_t CoalIntDelay; + uint32_t CoalIntCount; + uint32_t CmdsOutMax; + uint32_t BusTypes; + uint32_t TransportMethodOffset; + uint8_t ServerName[16]; + uint32_t HeartBeat; + uint32_t HostDrvrSupport; + uint32_t MaxSGElements; + uint32_t MaxLunSupport; + uint32_t MaxPhyDevSupport; + uint32_t MaxPhyDrvPerLun; + uint32_t MaxPerfModeCmdsOutMax; + uint32_t MaxBlockFetchCount; +} CfgTable_t; + +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_CISS_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h new file mode 100644 index 0000000000..47ef99b2e0 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h @@ -0,0 +1,371 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017 Joyent, Inc. 
+ */ + +#ifndef _SMRT_SCSI_H +#define _SMRT_SCSI_H + +#include <sys/types.h> + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* CISS LUN Addressing MODEs */ +#define PERIPHERIAL_DEV_ADDR 0x0 +#define LOGICAL_VOL_ADDR 0x1 +#define MASK_PERIPHERIAL_DEV_ADDR 0x3 +#define CISS_PHYS_MODE 0x0 + +/* + * Vendor-specific SCSI Commands + * + * These command opcodes are for use in the opcode byte of the CDB in a request + * of type CISS_TYPE_CMD. They are custom SCSI commands, using the + * vendor-specific part of the opcode space; i.e., 0xC0 through 0xFF. + */ +#define CISS_SCMD_READ 0xC0 +#define CISS_SCMD_WRITE 0xC1 +#define CISS_SCMD_REPORT_LOGICAL_LUNS 0xC2 +#define CISS_SCMD_REPORT_PHYSICAL_LUNS 0xC3 + +/* + * These command opcodes are _not_ in the usual vendor-specific space, but are + * nonetheless vendor-specific. They allow BMIC commands to be written to and + * read from the controller. If a command transfers no data, the specification + * suggests that BMIC_WRITE (0x27) is appropriate. + */ +#define CISS_SCMD_BMIC_READ 0x26 +#define CISS_SCMD_BMIC_WRITE 0x27 + +/* + * CISS Messages + * + * The CISS specification describes several directives that do not behave like + * SCSI commands. They are sent in requests of type CISS_TYPE_MSG. + * + * The Abort, Reset, and Nop, messages are defined in "8. Messages" in the CISS + * Specification. + */ +#define CISS_MSG_ABORT 0x0 +#define CISS_ABORT_TASK 0x0 +#define CISS_ABORT_TASKSET 0x1 + +#define CISS_MSG_RESET 0x1 +#define CISS_RESET_CTLR 0x0 +#define CISS_RESET_BUS 0x1 +#define CISS_RESET_TGT 0x3 +#define CISS_RESET_LUN 0x4 + +#define CISS_MSG_NOP 0x3 + +/* + * BMIC Commands + * + * These commands allow for the use of non-standard facilities specific to the + * Smart Array firmware. They are sent to the controller through a specially + * constructed CDB with the CISS_SCMD_BMIC_READ or CISS_SCMD_BMIC_WRITE opcode. + */ +#define CISS_BMIC_IDENTIFY_CONTROLLER 0x11 +#define CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE 0x15 +#define CISS_BMIC_NOTIFY_ON_EVENT 0xD0 +#define CISS_BMIC_NOTIFY_ON_EVENT_CANCEL 0xD1 + +/* + * Device and Phy type codes. These are used across many commands, including + * IDENTIFY PHYSICAL DEVICE and the REPORT PHYSICAL LUN extended reporting. + */ +#define SMRT_DTYPE_PSCSI 0x00 +#define SMRT_DTYPE_SATA 0x01 +#define SMRT_DTYPE_SAS 0x02 +#define SMRT_DTYPE_SATA_BW 0x03 +#define SMRT_DTYPE_SAS_BW 0x04 +#define SMRT_DTYPE_EXPANDER 0x05 +#define SMRT_DTYPE_SES 0x06 +#define SMRT_DTYPE_CONTROLLER 0x07 +#define SMRT_DTYPE_SGPIO 0x08 +#define SMRT_DTYPE_NVME 0x09 +#define SMRT_DTYPE_NOPHY 0xFF + +/* + * The following packed structures are used to ease the manipulation of SCSI + * and BMIC commands sent to, and status information returned from, the + * controller. 
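
The addressing-mode values above plug into the LUNAddr_t union from smrt_ciss.h. As a minimal sketch, a logical-volume address might be encoded like this; the helper name is ours (compare the smrt_write_lun_addr_*() routines declared in smrt.h).

static void
smrt_write_lun_addr_logvol_sketch(LUNAddr_t *lun, unsigned long id)
{
	bzero(lun, sizeof (*lun));
	lun->LogDev.Mode = LOGICAL_VOL_ADDR;
	lun->LogDev.VolId = (uint32_t)id;
}
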
+ */ +#pragma pack(1) + +typedef struct smrt_report_logical_lun_ent { + LogDevAddr_t smrle_addr; +} smrt_report_logical_lun_ent_t; + +typedef struct smrt_report_logical_lun_extent { + LogDevAddr_t smrle_addr; + uint8_t smrle_wwn[16]; +} smrt_report_logical_lun_extent_t; + +typedef struct smrt_report_logical_lun { + uint32_t smrll_datasize; /* Big Endian */ + uint8_t smrll_extflag; + uint8_t smrll_reserved1[3]; + union { + smrt_report_logical_lun_ent_t ents[SMRT_MAX_LOGDRV]; + smrt_report_logical_lun_extent_t extents[SMRT_MAX_LOGDRV]; + } smrll_data; +} smrt_report_logical_lun_t; + +typedef struct smrt_report_logical_lun_req { + uint8_t smrllr_opcode; + uint8_t smrllr_extflag; + uint8_t smrllr_reserved1[4]; + uint32_t smrllr_datasize; /* Big Endian */ + uint8_t smrllr_reserved2; + uint8_t smrllr_control; +} smrt_report_logical_lun_req_t; + +typedef struct smrt_report_physical_lun_ent { + PhysDevAddr_t srple_addr; +} smrt_report_physical_lun_ent_t; + +/* + * This structure represents the 'physical node identifier' extended option for + * REPORT PHYSICAL LUNS. This is triggered when the extended flags is set to + * 0x1. Note that for SAS the other structure should always be used. + */ +typedef struct smrt_report_physical_pnid { + uint8_t srpp_node[8]; + uint8_t srpp_port[8]; +} smrt_report_physical_pnid_t; + +/* + * This structure represents the 'other physical device info' extended option + * for report physical luns. This is triggered when the extended flags is set + * to 0x2. + */ +typedef struct smrt_report_physical_opdi { + uint8_t srpo_wwid[8]; + uint8_t srpo_dtype; + uint8_t srpo_flags; + uint8_t srpo_multilun; + uint8_t srpo_paths; + uint32_t srpo_iohdl; +} smrt_report_physical_opdi_t; + +typedef struct smrt_report_physical_lun_extent { + PhysDevAddr_t srple_addr; + union { + smrt_report_physical_pnid_t srple_pnid; + smrt_report_physical_opdi_t srple_opdi; + } srple_extdata; +} smrt_report_physical_lun_extent_t; + +/* + * Values that can be ORed together into smrllr_extflag. smprl_extflag indicates + * if any extended processing was done or not. + */ +#define SMRT_REPORT_PHYSICAL_LUN_EXT_NONE 0x00 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_PNID 0x01 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI 0x02 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_MASK 0x0f +#define SMRT_REPORT_PHYSICAL_LUN_CTRL_ONLY (1 << 6) +#define SMRT_REPORT_PHYSICAL_LUN_ALL_PATHS (1 << 7) + +typedef struct smrt_report_physical_lun { + uint32_t smrpl_datasize; /* Big Endian */ + uint8_t smrpl_extflag; + uint8_t smrpl_reserved1[3]; + union { + smrt_report_physical_lun_ent_t ents[SMRT_MAX_PHYSDEV]; + smrt_report_physical_lun_extent_t extents[SMRT_MAX_PHYSDEV]; + } smrpl_data; +} smrt_report_physical_lun_t; + + +typedef struct smrt_report_physical_lun_req { + uint8_t smrplr_opcode; + uint8_t smrplr_extflag; + uint8_t smrplr_reserved[1]; + uint32_t smrplr_datasize; /* Big Endian */ + uint8_t smrplr_reserved2; + uint8_t smrplr_control; +} smrt_report_physical_lun_req_t; + +/* + * Request structure for the BMIC command IDENTIFY CONTROLLER. This structure + * is written into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved + * fields should be filled with zeroes. + */ +typedef struct smrt_identify_controller_req { + uint8_t smicr_opcode; + uint8_t smicr_lun; + uint8_t smicr_reserved1[4]; + uint8_t smicr_command; + uint8_t smicr_reserved2[2]; + uint8_t smicr_reserved3[1]; + uint8_t smicr_reserved4[6]; +} smrt_identify_controller_req_t; + +/* + * Response structure for IDENTIFY CONTROLLER. 
This structure is used to + * interpret the response the controller will write into the data buffer. + */ +typedef struct smrt_identify_controller { + uint8_t smic_logical_drive_count; + uint32_t smic_config_signature; + uint8_t smic_firmware_rev[4]; + uint8_t smic_recovery_rev[4]; + uint8_t smic_hardware_version; + uint8_t smic_bootblock_rev[4]; + + /* + * These are obsolete for SAS controllers: + */ + uint32_t smic_drive_present_map; + uint32_t smic_external_drive_map; + + uint32_t smic_board_id; +} smrt_identify_controller_t; + +/* + * Request structure for IDENTIFY PHYSICAL DEVICE. This structure is written + * into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved fields + * should be filled with zeroes. Note, the lower 8 bits of the BMIC ID are in + * index1, whereas the upper 8 bites are in index2; however, the controller may + * only support 8 bits worth of devices (and this driver does not support that + * many devices). + */ +typedef struct smrt_identify_physical_drive_req { + uint8_t sipdr_opcode; + uint8_t sipdr_lun; + uint8_t sipdr_bmic_index1; + uint8_t sipdr_reserved1[3]; + uint8_t sipdr_command; + uint8_t sipdr_reserved2[2]; + uint8_t sipdr_bmic_index2; + uint8_t sipdr_reserved4[6]; +} smrt_identify_physical_drive_req_t; + +/* + * Relevant values for the sipd_more_flags member. + */ +#define SMRT_MORE_FLAGS_LOGVOL (1 << 5) +#define SMRT_MORE_FLAGS_SPARE (1 << 6) + +/* + * Response structure for IDENTIFY PHYSICAL DEVICE. This structure is used to + * describe aspects of a physical drive. Note, not all fields are valid in all + * firmware revisions. + */ +typedef struct smrt_identify_physical_drive { + uint8_t sipd_scsi_bus; /* Invalid for SAS */ + uint8_t sipd_scsi_id; /* Invalid for SAS */ + uint16_t sipd_lblk_size; + uint32_t sipd_nblocks; + uint32_t sipd_rsrvd_blocsk; + uint8_t sipd_model[40]; + uint8_t sipd_serial[40]; + uint8_t sipd_firmware[8]; + uint8_t sipd_scsi_inquiry; + uint8_t sipd_compaq_stamp; + uint8_t sipd_last_failure; + uint8_t sipd_flags; + uint8_t sipd_more_flags; + uint8_t sipd_scsi_lun; /* Invalid for SAS */ + uint8_t sipd_yet_more_flags; + uint8_t sipd_even_more_flags; + uint32_t sipd_spi_speed_rules; + uint8_t sipd_phys_connector[2]; + uint8_t sipd_phys_box_on_bus; + uint8_t sipd_phys_bay_in_box; + uint32_t sipd_rpm; + uint8_t sipd_device_type; + uint8_t sipd_sata_version; + uint64_t sipd_big_nblocks; + uint64_t sipd_ris_slba; + uint32_t sipd_ris_size; + uint8_t sipd_wwid[20]; + uint8_t sipd_controller_phy_map[32]; + uint16_t sipd_phy_count; + uint8_t sipd_phy_connected_dev_type[256]; + uint8_t sipd_phy_to_drive_bay[256]; + uint16_t sipd_phy_to_attached_dev[256]; + uint8_t sipd_box_index; + uint8_t sipd_drive_support; + uint16_t sipd_extra_flags; + uint8_t sipd_neogiated_link_rate[256]; + uint8_t sipd_phy_to_phy_map[256]; + uint8_t sipd_pad[312]; +} smrt_identify_physical_drive_t; + +/* + * Note that this structure describes the CISS version of the command. There + * also exists a BMIC version, but it has a slightly different structure. This + * structure is also used for the cancellation request; however, in that case, + * the senr_flags field is reserved. + */ +typedef struct smrt_event_notify_req { + uint8_t senr_opcode; + uint8_t senr_subcode; + uint8_t senr_reserved1[2]; + uint32_t senr_flags; /* Big Endian */ + uint32_t senr_size; /* Big Endian */ + uint8_t senr_control; +} smrt_event_notify_req_t; + +/* + * When receiving event notifications, the buffer size must be 512 bytes large. 
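
The index split called out above for IDENTIFY PHYSICAL DEVICE is mechanical: the low byte of the BMIC identifier goes into sipdr_bmic_index1 and the high byte into sipdr_bmic_index2. A sketch follows, with the helper name and the placement of the opcode and BMIC command bytes as illustrative assumptions.

static void
smrt_fill_identify_phys_cdb_sketch(smrt_identify_physical_drive_req_t *req,
    uint16_t bmic)
{
	bzero(req, sizeof (*req));
	req->sipdr_opcode = CISS_SCMD_BMIC_READ;
	req->sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE;
	req->sipdr_bmic_index1 = bmic & 0xff;		/* low 8 bits */
	req->sipdr_bmic_index2 = (bmic >> 8) & 0xff;	/* high 8 bits */
}
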
+ * We make sure that we always allocate a buffer of this size, even though we + * define a structure that is much shorter and only uses the fields that we end + * up caring about. This size requirement comes from the specification. + */ +#define SMRT_EVENT_NOTIFY_BUFLEN 512 + +#define SMRT_EVENT_CLASS_PROTOCOL 0 +#define SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR 1 + +#define SMRT_EVENT_CLASS_HOTPLUG 1 +#define SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE 0 + +#define SMRT_EVENT_CLASS_HWERROR 2 +#define SMRT_EVENT_CLASS_ENVIRONMENT 3 + +#define SMRT_EVENT_CLASS_PHYS 4 +#define SMRT_EVENT_PHYS_SUBCLASS_STATE 0 + +#define SMRT_EVENT_CLASS_LOGVOL 5 + +typedef struct smrt_event_notify { + uint32_t sen_timestamp; + uint16_t sen_class; + uint16_t sen_subclass; + uint16_t sen_detail; + uint8_t sen_data[64]; + char sen_message[80]; + uint32_t sen_tag; + uint16_t sen_date; + uint16_t sen_year; + uint32_t sen_time; + uint16_t sen_pre_power_time; + LUNAddr_t sen_addr; +} smrt_event_notify_t; + +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_SCSI_H */ diff --git a/usr/src/uts/common/sys/scsi/generic/inquiry.h b/usr/src/uts/common/sys/scsi/generic/inquiry.h index ddfd683169..fcbf00d5dc 100644 --- a/usr/src/uts/common/sys/scsi/generic/inquiry.h +++ b/usr/src/uts/common/sys/scsi/generic/inquiry.h @@ -362,7 +362,8 @@ struct scsi_inquiry { #define DTYPE_NOTPRESENT (DPQ_NEVER | DTYPE_UNKNOWN) /* - * Defined Response Data Formats: + * Defined Versions for inquiry data. These represent the base version that a + * device supports. */ #define RDF_LEVEL0 0x00 /* no conformance claim (SCSI-1) */ #define RDF_CCS 0x01 /* Obsolete (pseudo-spec) */ @@ -370,7 +371,8 @@ struct scsi_inquiry { #define RDF_SCSI_SPC 0x03 /* ANSI INCITS 301-1997 (SPC) */ #define RDF_SCSI_SPC2 0x04 /* ANSI INCITS 351-2001 (SPC-2) */ #define RDF_SCSI_SPC3 0x05 /* ANSI INCITS 408-2005 (SPC-3) */ -#define RDF_SCSI_SPC4 0x06 /* t10 (SPC-4) */ +#define RDF_SCSI_SPC4 0x06 /* ANSI INCITS 513-2015 (SPC-4) */ +#define RDF_SCSI_SPC5 0x07 /* t10 (SPC-5) */ /* * Defined Target Port Group Select values: @@ -436,6 +438,7 @@ struct vpd_desc { #define PM_CAPABLE_SPC2 RDF_SCSI_SPC2 #define PM_CAPABLE_SPC3 RDF_SCSI_SPC3 #define PM_CAPABLE_SPC4 RDF_SCSI_SPC4 +#define PM_CAPABLE_SPC5 RDF_SCSI_SPC5 #define PM_CAPABLE_LOG_MASK 0xffff0000 /* use upper 16 bit to */ /* indicate log specifics */ #define PM_CAPABLE_LOG_SUPPORTED 0x10000 /* Log page 0xE might be */ diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h index 57e1e01aec..c4af129a32 100644 --- a/usr/src/uts/common/sys/scsi/targets/sddef.h +++ b/usr/src/uts/common/sys/scsi/targets/sddef.h @@ -775,6 +775,12 @@ _NOTE(MUTEX_PROTECTS_DATA(sd_lun::un_fi_mutex, #define SD_FM_LOG(un) (((struct sd_fm_internal *)\ ((un)->un_fm_private))->fm_log_level) +/* + * Version Related Macros + */ +#define SD_SCSI_VERS_IS_GE_SPC_4(un) \ + (SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC4 || \ + SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC5) /* * Values for un_ctype @@ -1862,6 +1868,10 @@ struct sd_fm_internal { #define SD_PM_CAPABLE_IS_SPC_4(pm_cap) \ ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) +#define SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap) \ + (((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) || \ + ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC5)) + #define SD_PM_CAP_LOG_SUPPORTED(pm_cap) \ ((pm_cap & PM_CAPABLE_LOG_SUPPORTED) ? 
TRUE : FALSE) diff --git a/usr/src/uts/common/sys/sensors.h b/usr/src/uts/common/sys/sensors.h new file mode 100644 index 0000000000..b9ca9f1f3f --- /dev/null +++ b/usr/src/uts/common/sys/sensors.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +#ifndef _SYS_SENSORS_H +#define _SYS_SENSORS_H + +/* + * Consolidated sensor ioctls for various parts of the operating system. These + * interfaces should not be relied on at all. They are evolving and will change + * as we add more to the system for this. This may eventually become a larger + * framework, though it's more likely we'll consolidate that in userland. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * List of different possible kinds of sensors. + */ +#define SENSOR_KIND_UNKNOWN 0x00 +#define SENSOR_KIND_TEMPERATURE 0x01 + +/* + * List of units that sensors may have. + */ +#define SENSOR_UNIT_UNKNOWN 0x00 +#define SENSOR_UNIT_CELSIUS 0x01 +#define SENSOR_UNIT_FAHRENHEIT 0x02 +#define SENSOR_UNIT_KELVIN 0x03 + +#define SENSOR_IOCTL (('s' << 24) | ('e' << 16) | ('n' << 8)) + +/* + * Ask the sensor what kind of sensor it is. + */ +#define SENSOR_IOCTL_TYPE (SENSOR_IOCTL | 0x01) + +typedef struct sensor_ioctl_kind { + uint64_t sik_kind; +} sensor_ioctl_kind_t; + +/* + * Ask the sensor for a temperature measurement. The sensor is responsible for + * returning the units it's in. A temperature measurement is broken down into a + * signed value and a notion of its granularity. The sit_gran member indicates + * the granularity: the number of increments per degree in the temperature + * measurement (the sit_temp member). sit_gran is signed and the sign indicates + * whether one needs to multiply or divide by the granularity. For example, a + * value that sets sit_gran to 10 would mean that the value in sit_temp was in + * 10ths of a degree and that to get the actual value in degrees, one would + * divide by 10. On the other hand, a negative value means that we effectively + * have to multiply to get there. For example, a value of -2 would indicate that + * each value in sit_temp indicated two degrees and to get the temperature in + * degrees you would multiply sit_temp by two. + */ +#define SENSOR_IOCTL_TEMPERATURE (SENSOR_IOCTL | 0x02) + +typedef struct sensor_ioctl_temperature { + uint32_t sit_unit; + int32_t sit_gran; + int64_t sit_temp; +} sensor_ioctl_temperature_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SENSORS_H */ diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h index 0219fc2cf7..8f530afda2 100644 --- a/usr/src/uts/common/sys/shm.h +++ b/usr/src/uts/common/sys/shm.h @@ -21,6 +21,7 @@ */ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2016 Joyent, Inc. * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
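
The sit_gran convention spelled out in sensors.h above translates directly into a conversion helper. This is a sketch only: the function name is ours, the use of floating point implies a userland consumer of the ioctl rather than kernel code, and the treatment of a zero granularity as 1:1 is an assumption the header does not make.

static double
sensor_temp_to_degrees(const sensor_ioctl_temperature_t *sit)
{
	if (sit->sit_gran > 0) {
		/* sit_temp counts 1/sit_gran-degree increments: divide. */
		return ((double)sit->sit_temp / sit->sit_gran);
	} else if (sit->sit_gran < 0) {
		/* Each increment of sit_temp is -sit_gran whole degrees. */
		return ((double)sit->sit_temp * -sit->sit_gran);
	}
	return ((double)sit->sit_temp);	/* assumed 1:1 when unspecified */
}
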
@@ -120,6 +121,10 @@ struct shmid_ds { #define SHM_LOCK 3 /* Lock segment in core */ #define SHM_UNLOCK 4 /* Unlock segment */ +#if defined(_KERNEL) +#define SHM_RMID 5 /* Private RMID for lx support */ +#endif + #if !defined(_KERNEL) int shmget(key_t, size_t, int); int shmids(int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h index 4d8cdcede5..1eae2ca0a4 100644 --- a/usr/src/uts/common/sys/shm_impl.h +++ b/usr/src/uts/common/sys/shm_impl.h @@ -21,13 +21,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SHM_IMPL_H #define _SYS_SHM_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ipc_impl.h> #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/shm.h> @@ -70,7 +69,11 @@ typedef struct kshmid { time_t shm_ctime; /* last change time */ struct sptinfo *shm_sptinfo; /* info about ISM segment */ struct seg *shm_sptseg; /* pointer to ISM segment */ - long shm_sptprot; /* was reserved (still a "long") */ + ulong_t shm_opts; + /* + * Composed of: sptprot (uchar_t) and + * RM_PENDING flag (1 bit). + */ } kshmid_t; /* @@ -78,6 +81,14 @@ typedef struct kshmid { */ #define SHMSA_ISM 1 /* uses shared page table */ +/* + * shm_opts definitions + * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are + * used for additional options. + */ +#define SHM_PROT_MASK 0xff +#define SHM_RM_PENDING 0x100 + typedef struct sptinfo { struct as *sptas; /* dummy as ptr. for spt segment */ } sptinfo_t; diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index aece147bec..b12dff6034 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
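
The shm_opts packing described above (ISM protection bits in the low byte, flags such as SHM_RM_PENDING in the bits above SHM_PROT_MASK) can be read back with a couple of one-liners. The accessor names below are illustrative only.

static uint_t
kshm_sptprot_sketch(const kshmid_t *sp)
{
	/* Low byte carries the sptprot value. */
	return ((uint_t)(sp->shm_opts & SHM_PROT_MASK));
}

static boolean_t
kshm_rm_pending_sketch(const kshmid_t *sp)
{
	return ((sp->shm_opts & SHM_RM_PENDING) != 0 ? B_TRUE : B_FALSE);
}
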
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ struct sigaction32 { * use of these symbols by applications is injurious * to binary compatibility */ -#define NSIG 74 /* valid signals range from 1 to NSIG-1 */ -#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */ +#define NSIG 75 /* valid signals range from 1 to NSIG-1 */ +#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */ #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */ #define MINSIGSTKSZ 2048 diff --git a/usr/src/uts/common/sys/smbios.h b/usr/src/uts/common/sys/smbios.h index d28141e668..43163a7507 100644 --- a/usr/src/uts/common/sys/smbios.h +++ b/usr/src/uts/common/sys/smbios.h @@ -526,6 +526,10 @@ typedef struct smbios_processor { #define SMB_PRU_LGA36471 0x36 /* LGA3647-1 */ #define SMB_PRU_SP3 0x37 /* socket SP3 */ #define SMB_PRU_SP3r2 0x38 /* socket SP3r2 */ +#define SMB_PRU_LGA2066 0x39 /* Socket LGA2066 */ +#define SMB_PRU_BGA1392 0x3A /* Socket BGA1392 */ +#define SMB_PRU_BGA1510 0x3B /* Socket BGA1510 */ +#define SMB_PRU_BGA1528 0x3C /* Socket BGA1528 */ #define SMB_PRC_RESERVED 0x0001 /* reserved */ #define SMB_PRC_UNKNOWN 0x0002 /* unknown */ @@ -707,6 +711,7 @@ typedef struct smbios_processor { #define SMB_PRF_ZARCH 0xCC /* z/Architecture */ #define SMB_PRF_CORE_I5 0xCD /* Intel Core i5 */ #define SMB_PRF_CORE_I3 0xCE /* Intel Core i3 */ +#define SMB_PRF_CORE_I9 0xCF /* Intel Core i9 */ #define SMB_PRF_C7M 0xD2 /* VIA C7-M */ #define SMB_PRF_C7D 0xD3 /* VIA C7-D */ #define SMB_PRF_C7 0xD4 /* VIA C7 */ @@ -872,6 +877,7 @@ typedef struct smbios_port { #define SMB_POC_BNC 0x20 /* BNC */ #define SMB_POC_1394 0x21 /* 1394 */ #define SMB_POC_SATA 0x22 /* SAS/SATA plug receptacle */ +#define SMB_POC_USB_C 0x23 /* USB Type-C receptacle */ #define SMB_POC_PC98 0xA0 /* PC-98 */ #define SMB_POC_PC98HR 0xA1 /* PC-98Hireso */ #define SMB_POC_PCH98 0xA2 /* PC-H98 */ @@ -913,6 +919,8 @@ typedef struct smbios_port { #define SMB_POT_NETWORK 0x1F /* Network port */ #define SMB_POT_SATA 0x20 /* SATA */ #define SMB_POT_SAS 0x21 /* SAS */ +#define SMB_POT_MFDP 0x22 /* MFDP (Multi-Function Display Port) */ +#define SMB_POT_THUNDERBOLT 0x23 /* Thunderbolt */ #define SMB_POT_8251 0xA0 /* 8251 compatible */ #define SMB_POT_8251F 0xA1 /* 8251 FIFO compatible */ #define SMB_POT_OTHER 0xFF /* other */ @@ -933,6 +941,8 @@ typedef struct smbios_slot { uint16_t smbl_sg; /* segment group number */ uint8_t smbl_bus; /* bus number */ uint8_t smbl_df; /* device/function number */ + uint8_t smbl_dbw; /* data bus width */ + uint8_t smbl_npeers; /* PCIe bifurcation peers */ } smbios_slot_t; #define SMB_SLT_OTHER 0x01 /* other */ @@ -1036,6 +1046,21 @@ typedef struct smbios_slot { #define SMB_SLCH2_PME 0x01 /* slot supports PME# signal */ #define SMB_SLCH2_HOTPLUG 0x02 /* slot supports hot-plug devices */ #define SMB_SLCH2_SMBUS 0x04 /* slot supports SMBus signal */ +#define SMB_SLCH2_BIFUR 0x08 /* slot supports PCIe bifurcation */ + +/* + * SMBIOS 7.10.9 Slot Peer Devices + * + * This structure represents an optional peer device that may be part of an + * SMBIOS 3.2 slot. + */ +typedef struct smbios_slot_peer { + uint16_t smblp_group; /* peer segment group number */ + uint8_t smblp_bus; /* peer bus number */ + uint8_t smblp_device; /* peer device number */ + uint8_t smblp_function; /* peer function number */ + uint8_t smblp_data_width; /* peer data bus width */ +} smbios_slot_peer_t; /* * SMBIOS On-Board Device Information. 
See DSP0134 Section 7.11 for more @@ -1189,6 +1214,17 @@ typedef struct smbios_memdevice { uint16_t smbmd_minvolt; /* minimum voltage */ uint16_t smbmd_maxvolt; /* maximum voltage */ uint16_t smbmd_confvolt; /* configured voltage */ + uint8_t smbmd_memtech; /* memory technology */ + uint32_t smbmd_opcap_flags; /* operating mode capability */ + const char *smbmd_firmware_rev; /* firmware rev */ + uint16_t smbmd_modmfg_id; /* JEDEC module mfg id */ + uint16_t smbmd_modprod_id; /* JEDEC module product id */ + uint16_t smbmd_cntrlmfg_id; /* JEDEC controller mfg id */ + uint16_t smbmd_cntrlprod_id; /* JEDEC controller prod id */ + uint64_t smbmd_nvsize; /* non-volatile size in bytes */ + uint64_t smbmd_volatile_size; /* volatile size in bytes */ + uint64_t smbmd_cache_size; /* cache size in bytes */ + uint64_t smbmd_logical_size; /* logical size in bytes */ } smbios_memdevice_t; #define SMB_MDFF_OTHER 0x01 /* other */ @@ -1234,6 +1270,7 @@ typedef struct smbios_memdevice { #define SMB_MDT_LPDDR2 0x1C /* LPDDR2 */ #define SMB_MDT_LPDDR3 0x1D /* LPDDR3 */ #define SMB_MDT_LPDDR4 0x1E /* LPDDR4 */ +#define SMB_MDT_LOGNV 0x1F /* Logical non-volatile device */ #define SMB_MDF_OTHER 0x0002 /* other */ #define SMB_MDF_UNKNOWN 0x0004 /* unknown */ @@ -1256,6 +1293,20 @@ typedef struct smbios_memdevice { #define SMB_MDR_QUAD 0x04 /* quad */ #define SMB_MDR_OCTAL 0x08 /* octal */ +#define SMB_MTECH_OTHER 0x01 /* other */ +#define SMB_MTECH_UNKNOWN 0x02 /* unknown */ +#define SMB_MTECH_DRAM 0x03 /* DRAM */ +#define SMB_MTECH_NVDIMM_N 0x04 /* NVDIMM-N */ +#define SMB_MTECH_NVDIMM_F 0x05 /* NVDIMM-F */ +#define SMB_MTECH_NVDIMM_P 0x06 /* NVDIMM-P */ +#define SMB_MTECH_INTCPM 0x07 /* Intel persistent memory */ + +#define SMB_MOMC_OTHER 0x01 /* other */ +#define SMB_MOMC_UNKNOWN 0x02 /* unknown */ +#define SMB_MOMC_VOLATILE 0x04 /* Volatile memory */ +#define SMB_MOMC_BYTE_PM 0x08 /* Byte-accessible persistent memory */ +#define SMB_MOMC_BLOCK_PM 0x10 /* Block-accessible persistent memory */ + /* * SMBIOS Memory Array Mapped Address. See DSP0134 Section 7.20 for more * information. We convert start/end addresses into addr/size for convenience. 
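To show how the new SMBIOS 3.2 memory-device fields above might be consumed, here is a hedged libsmbios sketch. The classification function is hypothetical; smbios_info_memdevice() is the existing lookup routine and the SMB_MTECH_* values are the constants added above.

#include <smbios.h>

static const char *
memdev_kind(smbios_hdl_t *shp, id_t id)
{
	smbios_memdevice_t md;

	if (smbios_info_memdevice(shp, id, &md) == -1)
		return ("unknown");

	switch (md.smbmd_memtech) {
	case SMB_MTECH_DRAM:
		return ("DRAM");
	case SMB_MTECH_NVDIMM_N:
	case SMB_MTECH_NVDIMM_F:
	case SMB_MTECH_NVDIMM_P:
		return ("NVDIMM");
	case SMB_MTECH_INTCPM:
		return ("Intel persistent memory");
	default:
		/* pre-3.2 records leave the field zeroed */
		return ("other/unknown");
	}
}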
@@ -1626,7 +1677,8 @@ typedef struct smbios_memdevice_ext { #define SMB_VERSION_28 0x0208 /* SMBIOS encoding for DMTF spec 2.8 */ #define SMB_VERSION_30 0x0300 /* SMBIOS encoding for DMTF spec 3.0 */ #define SMB_VERSION_31 0x0301 /* SMBIOS encoding for DMTF spec 3.1 */ -#define SMB_VERSION SMB_VERSION_31 /* SMBIOS latest version definitions */ +#define SMB_VERSION_32 0x0302 /* SMBIOS encoding for DMTF spec 3.2 */ +#define SMB_VERSION SMB_VERSION_32 /* SMBIOS latest version definitions */ #define SMB_O_NOCKSUM 0x1 /* do not verify header checksums */ #define SMB_O_NOVERS 0x2 /* do not verify header versions */ @@ -1686,6 +1738,10 @@ extern int smbios_info_cache(smbios_hdl_t *, id_t, smbios_cache_t *); extern int smbios_info_port(smbios_hdl_t *, id_t, smbios_port_t *); extern int smbios_info_extport(smbios_hdl_t *, id_t, smbios_port_ext_t *); extern int smbios_info_slot(smbios_hdl_t *, id_t, smbios_slot_t *); +extern int smbios_info_slot_peers(smbios_hdl_t *, id_t, uint_t *, + smbios_slot_peer_t **); +extern void smbios_info_slot_peers_free(smbios_hdl_t *, uint_t, + smbios_slot_peer_t *); extern int smbios_info_obdevs(smbios_hdl_t *, id_t, int, smbios_obdev_t *); extern int smbios_info_obdevs_ext(smbios_hdl_t *, id_t, smbios_obdev_ext_t *); extern int smbios_info_strtab(smbios_hdl_t *, id_t, int, const char *[]); @@ -1785,6 +1841,9 @@ extern const char *smbios_memdevice_type_desc(uint_t); extern const char *smbios_memdevice_flag_name(uint_t); extern const char *smbios_memdevice_flag_desc(uint_t); extern const char *smbios_memdevice_rank_desc(uint_t); +extern const char *smbios_memdevice_memtech_desc(uint_t); +extern const char *smbios_memdevice_op_capab_name(uint_t); +extern const char *smbios_memdevice_op_capab_desc(uint_t); extern const char *smbios_onboard_type_desc(uint_t); diff --git a/usr/src/uts/common/sys/smbios_impl.h b/usr/src/uts/common/sys/smbios_impl.h index 66edfb027a..df61892a82 100644 --- a/usr/src/uts/common/sys/smbios_impl.h +++ b/usr/src/uts/common/sys/smbios_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -201,8 +201,8 @@ typedef struct smb_cache { #define SMB_CACHE_SIZE(s) (((s) & 0x8000) ? \ ((uint32_t)((s) & 0x7FFF) * 64 * 1024) : ((uint32_t)(s) * 1024)) -#define SMB_CACHE_EXT_SIZE(s) (((s) & 0x80000000U) ? \ - ((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) : \ +#define SMB_CACHE_EXT_SIZE(s) (((s) & 0x80000000U) ? \ + ((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) : \ ((uint64_t)(s) * 1024ULL)) #define SMB_CACHE_CFG_MODE(c) (((c) >> 8) & 3) @@ -226,6 +226,13 @@ typedef struct smb_port { /* * SMBIOS implementation structure for SMB_TYPE_SLOT. 
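The new slot-peer accessors declared above pair an allocation with an explicit free. A minimal consumer sketch follows; the printing and the function name are illustrative only.

#include <smbios.h>
#include <stdio.h>

static void
print_slot_peers(smbios_hdl_t *shp, id_t id)
{
	smbios_slot_peer_t *peers;
	uint_t npeers, i;

	if (smbios_info_slot_peers(shp, id, &npeers, &peers) != 0)
		return;

	for (i = 0; i < npeers; i++) {
		(void) printf("peer %u: bus %u dev %u func %u width %u\n", i,
		    peers[i].smblp_bus, peers[i].smblp_device,
		    peers[i].smblp_function, peers[i].smblp_data_width);
	}
	smbios_info_slot_peers_free(shp, npeers, peers);
}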
*/ +typedef struct smb_slot_peer { + uint16_t smbspb_group_no; /* segment group number */ + uint8_t smbspb_bus; /* bus number */ + uint8_t smbspb_df; /* device/function number */ + uint8_t smbspb_width; /* electrical width */ +} smb_slot_peer_t; + typedef struct smb_slot { smb_header_t smbsl_hdr; /* structure header */ uint8_t smbsl_name; /* reference designation */ @@ -239,6 +246,10 @@ typedef struct smb_slot { uint16_t smbsl_sg; /* segment group number */ uint8_t smbsl_bus; /* bus number */ uint8_t smbsl_df; /* device/function number */ + /* Added in SMBIOS 3.2+ */ + uint8_t smbsl_dbw; /* Data bus width */ + uint8_t smbsl_npeers; /* Peer bdf groups */ + smb_slot_peer_t smbsl_peers[]; /* bifurcation peers */ } smb_slot_t; /* @@ -343,6 +354,18 @@ typedef struct smb_memdevice { uint16_t smbmdev_minvolt; /* minimum voltage */ uint16_t smbmdev_maxvolt; /* maximum voltage */ uint16_t smbmdev_confvolt; /* configured voltage */ + /* Added in SMBIOS 3.2 */ + uint8_t smbmdev_memtech; /* memory technology */ + uint16_t smbmdev_opmode; /* memory operating mode capability */ + uint8_t smbmdev_fwver; /* firmware version */ + uint16_t smbmdev_modulemfgid; /* module manufacturer ID */ + uint16_t smbmdev_moduleprodid; /* module product ID */ + uint16_t smbmdev_memsysmfgid; /* memory controller manufacturer id */ + uint16_t smbmdev_memsysprodid; /* memory controller product id */ + uint64_t smbmdev_nvsize; /* non-volatile memory size */ + uint64_t smbmdev_volsize; /* volatile memory size */ + uint64_t smbmdev_cachesize; /* cache size */ + uint64_t smbmdev_logicalsize; /* logical size */ } smb_memdevice_t; #define SMB_MDS_KBYTES 0x8000 /* size in specified in kilobytes */ @@ -627,7 +650,7 @@ typedef struct smb_struct { const smb_header_t *smbst_hdr; /* address of raw structure data */ const uchar_t *smbst_str; /* address of string data (if any) */ const uchar_t *smbst_end; /* address of 0x0000 ending tag */ - struct smb_struct *smbst_next; /* next structure in hash chain */ + struct smb_struct *smbst_next; /* next structure in hash chain */ uint16_t *smbst_strtab; /* string index -> offset table */ uint_t smbst_strtablen; /* length of smbst_strtab */ } smb_struct_t; @@ -788,6 +811,20 @@ typedef struct smb_base_cache { uint8_t smbba_flags; /* cache flags (SMB_CAF_*) */ } smb_base_cache_t; +typedef struct smb_base_slot { + const char *smbbl_name; /* reference designation */ + uint8_t smbbl_type; /* slot type */ + uint8_t smbbl_width; /* slot data bus width */ + uint8_t smbbl_usage; /* current usage */ + uint8_t smbbl_length; /* slot length */ + uint16_t smbbl_id; /* slot ID */ + uint8_t smbbl_ch1; /* slot characteristics 1 */ + uint8_t smbbl_ch2; /* slot characteristics 2 */ + uint16_t smbbl_sg; /* segment group number */ + uint8_t smbbl_bus; /* bus number */ + uint8_t smbbl_df; /* device/function number */ +} smb_base_slot_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 93b0af97e8..d6e13d4823 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -22,6 +22,7 @@ * Copyright 2014 Garrett D'Amore <garrett@damore.org> * * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -39,6 +40,9 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ #ifndef _SYS_SOCKET_H #define _SYS_SOCKET_H @@ -204,6 +208,7 @@ struct so_snd_bufinfo { #define SO_SRCADDR 0x2001 /* Internal: AF_UNIX source address */ #define SO_FILEP 0x2002 /* Internal: AF_UNIX file pointer */ #define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */ +#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */ #endif /* _KERNEL */ /* @@ -303,8 +308,9 @@ struct linger { #define AF_INET_OFFLOAD 30 /* Sun private; do not use */ #define AF_TRILL 31 /* TRILL interface */ #define AF_PACKET 32 /* PF_PACKET Linux socket interface */ +#define AF_LX_NETLINK 33 /* Linux-compatible netlink */ -#define AF_MAX 32 +#define AF_MAX 33 /* * Protocol families, same as address families for now. @@ -344,6 +350,7 @@ struct linger { #define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */ #define PF_TRILL AF_TRILL #define PF_PACKET AF_PACKET +#define PF_LX_NETLINK AF_LX_NETLINK #define PF_MAX AF_MAX @@ -429,6 +436,7 @@ struct msghdr32 { /* with left over data */ #define MSG_XPG4_2 0x8000 /* Private: XPG4.2 flag */ +/* Obsolete but kept for compilation compatability. Use IOV_MAX. */ #define MSG_MAXIOVLEN 16 #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index ac07bad909..6794b5687b 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -102,6 +103,7 @@ struct sockaddr_ux { typedef struct sonodeops sonodeops_t; typedef struct sonode sonode_t; +typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *); struct sodirect_s; @@ -244,6 +246,10 @@ struct sonode { struct sof_instance *so_filter_top; /* top of stack */ struct sof_instance *so_filter_bottom; /* bottom of stack */ clock_t so_filter_defertime; /* time when deferred */ + + /* Kernel direct receive callbacks */ + so_krecv_f so_krecv_cb; /* recv callback */ + void *so_krecv_arg; /* recv cb arg */ }; #define SO_HAVE_DATA(so) \ @@ -297,15 +303,16 @@ struct sonode { #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ -#define SS_CLOSING 0x00010000 /* in process of closing */ +#define SS_CLOSING 0x00010000 /* in process of closing */ #define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ #define SS_FILOP_OK 0x00040000 /* socket can attach filters */ #define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ + #define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ #define SS_FIL_STOP 0x00200000 /* no more filter actions */ - #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ +#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */ #define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ #define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ @@ -321,7 +328,8 @@ struct sonode { /* * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment. 
*/ #define SO_BLOCK_FALLBACK(so, fn) \ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ @@ -337,6 +345,24 @@ struct sonode { } \ } +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. Safe filters can still attach after + * we execute the function in which this macro is used. + */ +#define SO_BLOCK_FALLBACK_SAFE(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state |= SS_FILOP_UNSF; \ + mutex_exit(&(so)->so_lock); \ + } + #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } @@ -368,6 +394,7 @@ struct sonode { /* The modes below are only for non-streams sockets */ #define SM_ACCEPTSUPP 0x400 /* can handle accept() */ #define SM_SENDFILESUPP 0x800 /* Private: proto supp sendfile */ +#define SM_DEFERERR 0x1000 /* Private: defer so_error delivery */ /* * Socket versions. Used by the socket library when calling _so_socket(). @@ -946,6 +973,15 @@ extern struct sonode *socreate(struct sockparams *, int, int, int, int, extern int so_copyin(const void *, void *, size_t, int); extern int so_copyout(const void *, void *, size_t, int); +/* + * Functions to manipulate the use of direct receive callbacks. This should not + * be used outside of sockfs and ksocket. These are generally considered a use + * once interface for a socket and will cause all outstanding data on the socket + * to be flushed. + */ +extern int so_krecv_set(sonode_t *, so_krecv_f, void *); +extern void so_krecv_unblock(sonode_t *); + #endif /* diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops { #define SOF_VERSION 1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * so may not be attached after any socket operation. However, a safe filter + * can still be attached after one of the above calls. This makes attaching + * the filter less dependent on the initial socket setup order. + */ +#define SOF_ATT_SAFE 0x1 + extern int sof_register(int, const char *, const sof_ops_t *, int); extern int sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index f1bd429815..89b355970e 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_SQUEUE_H @@ -29,6 +30,17 @@ extern "C" { #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it which makes it + * unsuitable for other consumers. 
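To make the kernel direct-receive hooks above concrete, here is a kernel-side sketch. The consumer type my_conn_t, the my_conn_input() routine, and the meaning attached to the boolean_t return value (assumed here to mean the consumer accepted the data and can take more) are assumptions, not part of this header.

#include <sys/socketvar.h>
#include <sys/stream.h>

typedef struct my_conn my_conn_t;			/* hypothetical consumer state */
extern void my_conn_input(my_conn_t *, mblk_t *, size_t, int);

static boolean_t
my_krecv(sonode_t *so, mblk_t *mp, size_t len, int flags, void *arg)
{
	my_conn_t *conn = arg;

	my_conn_input(conn, mp, len, flags);
	return (B_TRUE);				/* assumed: ready for more data */
}

static int
my_attach(sonode_t *so, my_conn_t *conn)
{
	/* installing the callback flushes anything already queued on the socket */
	return (so_krecv_set(so, my_krecv, conn));
}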
To enable them, we created another interface, + * but opted not to port all of the functionality that IP uses in the form of + * ip_squeue.c. As other consumers need the functionality that IP has in + * squeues, we'll come up with more generic methods and add that functionality + * to <sys/gsqueue.h>. Please do not continue to use this header. + */ + #include <sys/types.h> #include <sys/processor.h> #include <sys/stream.h> @@ -76,16 +88,17 @@ typedef enum { struct ip_recv_attr_s; extern void squeue_init(void); -extern squeue_t *squeue_create(clock_t, pri_t); +extern squeue_t *squeue_create(pri_t, boolean_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *); struct conn_s; extern int squeue_synch_enter(struct conn_s *, mblk_t *); -extern void squeue_synch_exit(struct conn_s *); +extern void squeue_synch_exit(struct conn_s *, int); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 22550886eb..2bb717fb52 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_SQUEUE_IMPL_H @@ -84,7 +85,6 @@ typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t); -extern void squeue_worker_wakeup(squeue_t *); extern int ip_squeue_flag; struct squeue_s { @@ -99,14 +99,11 @@ struct squeue_s { ill_rx_ring_t *sq_rx_ring; /* The Rx ring tied to this sq */ ill_t *sq_ill; /* The ill this squeue is tied to */ - clock_t sq_curr_time; /* Current tick (lbolt) */ + hrtime_t sq_awoken; /* time of worker wake req */ kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */ kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */ kcondvar_t sq_synch_cv; /* cond var.
synch thread waits on */ kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */ - clock_t sq_wait; /* lbolts to wait after a fill() */ - timeout_id_t sq_tid; /* timer id of pending timeout() */ - clock_t sq_awaken; /* time async thread was awakened */ processorid_t sq_bind; /* processor to bind to */ kthread_t *sq_worker; /* kernel thread id */ @@ -117,6 +114,7 @@ struct squeue_s { squeue_set_t *sq_set; /* managed by squeue creator */ pri_t sq_priority; /* squeue thread priority */ + boolean_t sq_isip; /* use IP-centric features */ /* Keep the debug-only fields at the end of the structure */ #ifdef DEBUG @@ -140,7 +138,6 @@ struct squeue_s { #define SQS_USER 0x00000010 /* A non interrupt user */ #define SQS_BOUND 0x00000020 /* Worker thread is bound */ #define SQS_REENTER 0x00000040 /* Re entered thread */ -#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */ #define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */ #define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */ @@ -165,6 +162,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 #define SQS_PAUSE 0x04000000 /* The squeue has been paused */ +#define SQS_EXIT 0x08000000 /* squeue is being torn down */ #define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 4be8d794fc..7488d3dee8 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -644,16 +645,13 @@ struct stroptions { /* * Structure for rw (read/write) procedure calls. A pointer * to a struiod_t is passed as a parameter to the rwnext() call. - * - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? */ #define DEF_IOV_MAX 16 struct struiod { mblk_t *d_mp; /* pointer to mblk (chain) */ uio_t d_uio; /* uio info */ - iovec_t d_iov[DEF_IOV_MAX]; /* iov referenced by uio */ + iovec_t *d_iov; /* iov referenced by uio */ }; /* diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index ce86badfc1..f3bc1ed407 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -25,6 +25,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_STRSUBR_H @@ -1239,10 +1240,17 @@ extern void strsignal_nolock(stdata_t *, int, uchar_t); struct multidata_s; struct pdesc_s; + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. 
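The deprecation note above points drivers at the mac_hcksum_* replacements. A hedged driver-side sketch of the preferred pattern follows; the surrounding transmit routine and its name are hypothetical.

#include <sys/mac_provider.h>
#include <sys/pattr.h>

static void
xx_tx_fill_cksum(mblk_t *mp)
{
	uint32_t start, stuff, end, value, flags;

	mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);

	if (flags & HCK_PARTIALCKSUM) {
		/* hand the start/stuff offsets to the hardware descriptor */
	} else if (flags & HCK_FULLCKSUM) {
		/* ask the hardware to compute the full checksum */
	}
}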
+ */ extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int); extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); + extern void lso_info_set(mblk_t *, uint32_t, uint32_t); extern void lso_info_cleanup(mblk_t *); extern unsigned int bcksum(uchar_t *, int, unsigned int); diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index 1d94c8fd2c..3026dc961a 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_SUNDDI_H @@ -202,13 +203,13 @@ extern "C" { #define DDI_NT_KEYBOARD "ddi_keyboard" /* keyboard device */ -#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */ +#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */ #define DDI_NT_PRINTER "ddi_printer" /* printer device */ #define DDI_NT_UGEN "ddi_generic:usb" /* USB generic drv */ -#define DDI_NT_SMP "ddi_sas_smp" /* smp devcies */ +#define DDI_NT_SMP "ddi_sas_smp" /* smp devcies */ #define DDI_NT_NEXUS "ddi_ctl:devctl" /* nexus drivers */ @@ -260,6 +261,11 @@ extern "C" { #define DDI_NT_INTRCTL "ddi_tool_intr" /* tool intr access */ /* + * Various device types used for sensors. + */ +#define DDI_NT_SENSOR_TEMP_CPU "ddi_sensor:temperature:cpu" + +/* * DDI event definitions */ #define EC_DEVFS "EC_devfs" /* Event class devfs */ @@ -839,7 +845,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, * allocated by property provider via kmem_alloc. Requester * is responsible for freeing returned property via kmem_free. * - * Arguments: + * Arguments: * * dev: Input: dev_t of property. * dip: Input: dev_info_t pointer of child. @@ -850,7 +856,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, * valuep: Output: Addr of callers buffer pointer. * lengthp:Output: *lengthp will contain prop length on exit. * - * Possible Returns: + * Possible Returns: * * DDI_PROP_SUCCESS: Prop found and returned. * DDI_PROP_NOT_FOUND: Prop not found @@ -1585,8 +1591,14 @@ int ddi_ffs(long mask); int +ddi_ffsll(long long mask); + +int ddi_fls(long mask); +int +ddi_flsll(long long mask); + /* * The ddi_soft_state* routines comprise generic storage management utilities * for driver soft state structures. Two types of soft_state indexes are diff --git a/usr/src/uts/common/sys/sysconfig.h b/usr/src/uts/common/sys/sysconfig.h index 3a68d76ebe..d5b65ef78c 100644 --- a/usr/src/uts/common/sys/sysconfig.h +++ b/usr/src/uts/common/sys/sysconfig.h @@ -25,6 +25,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SYSCONFIG_H @@ -101,6 +102,8 @@ extern int mach_sysconfig(int); #define _CONFIG_EPHID_MAX 47 /* maximum ephemeral uid */ +#define _CONFIG_NPROC_NCPU 48 /* NCPU (sometimes > NPROC_MAX) */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/sysevent.h b/usr/src/uts/common/sys/sysevent.h index 304745ed08..c2be00ad27 100644 --- a/usr/src/uts/common/sys/sysevent.h +++ b/usr/src/uts/common/sys/sysevent.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
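The new ddi_ffsll()/ddi_flsll() declarations follow the same 1-based convention as ddi_ffs()/ddi_fls(); a small, hypothetical use is sketched below.

#include <sys/sunddi.h>

static int
first_pending_bit(uint64_t pending)
{
	int bit = ddi_ffsll((long long)pending);

	if (bit == 0)
		return (-1);		/* no bits set */
	return (bit - 1);		/* convert the 1-based result to a bit index */
}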
*/ #ifndef _SYS_SYSEVENT_H @@ -67,10 +68,12 @@ extern "C" { #define SE_KERN_PID 0 #define SUNW_VENDOR "SUNW" +#define ILLUMOS_VENDOR "ILLUMOS" #define SE_USR_PUB "usr:" #define SE_KERN_PUB "kern:" #define SUNW_KERN_PUB SUNW_VENDOR ":" SE_KERN_PUB #define SUNW_USR_PUB SUNW_VENDOR ":" SE_USR_PUB +#define ILLUMOS_KERN_PUB ILLUMOS_VENDOR ":" SE_KERN_PUB /* * Event header and attribute value limits diff --git a/usr/src/uts/common/sys/sysevent/datalink.h b/usr/src/uts/common/sys/sysevent/datalink.h new file mode 100644 index 0000000000..592ef5bdde --- /dev/null +++ b/usr/src/uts/common/sys/sysevent/datalink.h @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_SYSEVENT_DATALINK_H +#define _SYS_SYSEVENT_DATALINK_H + +/* + * Datalink System Event payloads + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DATALINK_LINK_STATE + * + * Event Class - EC_DATALINK + * Event Sub-Class - EC_DATALINK_LINK_STATE + * + * Attribute Name - DATALINK_EV_LINK_NAME + * Attribute Type - SE_DATA_TYPE_STRING + * Attribute Value - [Name of the datalink] + * + * Attribute Name - DATALINK_EV_LINK_ID + * Attribute Type - SE_DATA_TYPE_INT32 + * Attribute Value - [datalink_id_t for the device] + * + * Attribute Name - DATALINK_EV_ZONE_ID + * Attribute Type - SE_DATA_TYPE_INT32 + * Attribute Value - [zoneid_t of the zone the datalink is in] + */ + +#define DATALINK_EV_LINK_NAME "link" +#define DATALINK_EV_LINK_ID "linkid" +#define DATALINK_EV_ZONE_ID "zone" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DATALINK_H */ diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index cf6e040ee9..8995ba4aa0 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -212,9 +212,11 @@ extern "C" { #define ESC_ZFS_HISTORY_EVENT "ESC_ZFS_history_event" /* - * datalink subclass definitions. + * datalink subclass definitions. Supporting attributes for datalink state found + * in sys/sysevent/datalink.h. */ #define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ +#define ESC_DATALINK_LINK_STATE "ESC_datalink_link_state" /* link state */ /* * VRRP subclass definitions. Supporting attributes (name/value paris) are diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
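The EC_DATALINK link-state payload described in the new datalink.h can be consumed from userland with libsysevent and libnvpair. The sketch below omits handler registration, and the handler name is hypothetical.

#include <libsysevent.h>
#include <libnvpair.h>
#include <sys/sysevent/datalink.h>

static void
link_state_handler(sysevent_t *ev)
{
	nvlist_t *nvl = NULL;
	char *link;
	int32_t linkid, zoneid;

	if (sysevent_get_attr_list(ev, &nvl) != 0)
		return;

	if (nvlist_lookup_string(nvl, DATALINK_EV_LINK_NAME, &link) == 0 &&
	    nvlist_lookup_int32(nvl, DATALINK_EV_LINK_ID, &linkid) == 0 &&
	    nvlist_lookup_int32(nvl, DATALINK_EV_ZONE_ID, &zoneid) == 0) {
		/* react to the state change for (link, linkid, zoneid) */
	}
	nvlist_free(nvl);
}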
*/ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dtrace.h> #ifdef __cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent; extern systrace_sysent_t *systrace_sysent32; extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #ifdef _SYSCALL32_IMPL extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #endif #endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 0c07623ce6..b955e5f3f2 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -363,6 +363,24 @@ extern pid_t tcgetsid(int); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of + */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * The VMIN and VTIME and solaris overlap with VEOF and VEOL - This is + * perfectly legal except, linux expects them to be separate. So we keep + * them separately. + */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index f9f1d6462b..6cc474f864 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -71,7 +71,10 @@ typedef struct ctxop { void (*exit_op)(void *); /* invoked during {thread,lwp}_exit() */ void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ - struct ctxop *next; /* next context ops */ + struct ctxop *next; /* next context ops */ + struct ctxop *prev; /* previous context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* @@ -351,6 +354,8 @@ typedef struct _kthread { kmutex_t t_wait_mutex; /* used in CV wait functions */ char *t_name; /* thread name */ + + uint64_t t_unsafe; /* unsafe to run with HT VCPU thread */ } kthread_t; /* @@ -372,7 +377,7 @@ typedef struct _kthread { #define T_WOULDBLOCK 0x0020 /* for lockfs */ #define T_DONTBLOCK 0x0040 /* for lockfs */ #define T_DONTPEND 0x0080 /* for lockfs */ -#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */ +#define T_SPLITSTK 0x0100 /* kernel stack is currently split */ #define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */ #define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */ #define T_PANIC 0x0800 /* thread initiated a system panic */ @@ -401,6 +406,7 @@ typedef struct _kthread { #define TP_CHANGEBIND 0x1000 /* thread has a new cpu/cpupart binding */ #define TP_ZTHREAD 0x2000 /* this 
is a kernel thread for a zone */ #define TP_WATCHSTOP 0x4000 /* thread is stopping via holdwatch() */ +#define TP_KTHREAD 0x8000 /* in-kernel worker thread for a process */ /* * Thread scheduler flag (t_schedflag) definitions. @@ -413,6 +419,7 @@ typedef struct _kthread { #define TS_SIGNALLED 0x0010 /* thread was awakened by cv_signal() */ #define TS_PROJWAITQ 0x0020 /* thread is on its project's waitq */ #define TS_ZONEWAITQ 0x0040 /* thread is on its zone's waitq */ +#define TS_VCPU 0x0080 /* thread will enter guest context */ #define TS_CSTART 0x0100 /* setrun() by continuelwps() */ #define TS_UNPAUSE 0x0200 /* setrun() by unpauselwps() */ #define TS_XSTART 0x0400 /* setrun() by SIGCONT */ @@ -420,8 +427,9 @@ typedef struct _kthread { #define TS_RESUME 0x1000 /* setrun() by CPR resume process */ #define TS_CREATE 0x2000 /* setrun() by syslwp_create() */ #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ +#define TS_BSTART 0x8000 /* setrun() by brand */ #define TS_ALLSTART \ - (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) + (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART) #define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* @@ -449,6 +457,10 @@ typedef struct _kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \ + !((t)->t_schedflag & TS_BSTART)) + /* True if thread is asleep and wakeable */ #define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ ((t)->t_flag & T_WAKEABLE))) @@ -599,6 +611,7 @@ int thread_setname(kthread_t *, const char *); int thread_vsetname(kthread_t *, const char *, ...); extern int default_binding_mode; +extern int default_stksize; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 81b4753049..a69bf4dd63 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -15,10 +15,11 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_TIME_H @@ -247,8 +248,8 @@ struct itimerval32 { #define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) #define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) -#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) -#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) #define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) #define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) @@ -264,6 +265,14 @@ typedef longlong_t hrtime_t; #if defined(_KERNEL) || defined(_FAKE_KERNEL) +/* + * Unsigned counterpart to hrtime_t + */ +typedef u_longlong_t uhrtime_t; + +#define HRTIME_MAX LLONG_MAX +#define UHRTIME_MAX ULLONG_MAX + #include <sys/time_impl.h> #include <sys/mutex.h> diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h index ec349c962f..748e0c0627 100644 --- a/usr/src/uts/common/sys/timer.h +++ b/usr/src/uts/common/sys/timer.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _SYS_TIMER_H @@ -34,6 +34,9 @@ #include <sys/types.h> #include <sys/proc.h> #include <sys/thread.h> +#include <sys/param.h> +#include <sys/siginfo.h> +#include <sys/port.h> #ifdef __cplusplus extern "C" { @@ -42,7 +45,13 @@ extern "C" { #ifdef _KERNEL #define _TIMER_MAX 32 -extern int timer_max; /* patchable via /etc/system */ +/* + * Max timers per process. This is patchable via /etc/system and can be + * updated via kmdb. Sticking to positive powers of 2 is recommended. + */ +extern int timer_max; + +#define _TIMER_ALLOC_INIT 8 /* initial size for p_itimer array */ /* * Bit values for the it_lock field. @@ -56,6 +65,7 @@ extern int timer_max; /* patchable via /etc/system */ */ #define IT_SIGNAL 0x01 #define IT_PORT 0x02 /* use event port notification */ +#define IT_CALLBACK 0x04 /* custom callback function */ struct clock_backend; @@ -83,14 +93,27 @@ struct itimer { struct clock_backend *it_backend; void (*it_fire)(itimer_t *); kmutex_t it_mutex; - void *it_portev; /* port_kevent_t pointer */ - void *it_portsrc; /* port_source_t pointer */ - int it_portfd; /* port file descriptor */ + union { + struct { + void *_it_portev; /* port_kevent_t pointer */ + void *_it_portsrc; /* port_source_t pointer */ + int _it_portfd; /* port file descriptor */ + } _it_ev_port; + struct { + void (*_it_cb_func)(itimer_t *); + uintptr_t _it_cb_data[2]; + } _it_ev_cb; + } _it_ev_data; }; #define it_sigq __data.__proc.__it_sigq #define it_lwp __data.__proc.__it_lwp #define it_frontend __data.__it_frontend +#define it_portev _it_ev_data._it_ev_port._it_portev +#define it_portsrc _it_ev_data._it_ev_port._it_portsrc +#define it_portfd _it_ev_data._it_ev_port._it_portfd +#define it_cb_func _it_ev_data._it_ev_cb._it_cb_func +#define it_cb_data _it_ev_data._it_ev_cb._it_cb_data typedef struct clock_backend { struct sigevent clk_default; @@ -107,7 +130,11 @@ typedef struct clock_backend { extern void clock_add_backend(clockid_t clock, clock_backend_t *backend); extern clock_backend_t *clock_get_backend(clockid_t clock); +extern void timer_release(struct proc *, itimer_t *); +extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it); extern void timer_lwpbind(); +extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *, + itimer_t **, timer_t *); extern void timer_func(sigqueue_t *); extern void timer_exit(void); diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index 904b52cac4..75d000b831 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -23,6 +23,7 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
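Since the timer_max comment above documents the tunable as patchable via /etc/system, a minimal, hypothetical tuning entry that keeps to the recommended powers of two would be:

set timer_max = 64

The _TIMER_ALLOC_INIT value is only the initial size of the per-process p_itimer array; the array grows on demand up to the timer_max cap.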
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -159,7 +160,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index bca1ed1fa3..9584be559f 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -145,7 +145,8 @@ typedef struct uioa_s { */ typedef enum xuio_type { UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY + UIOTYPE_ZEROCOPY, + UIOTYPE_PEEKSIZE } xuio_type_t; typedef struct xuio { @@ -175,6 +176,15 @@ typedef struct xuio { int xu_zc_rw; /* read or write buffer */ void *xu_zc_priv; /* fs specific */ } xu_zc; + + /* + * Peek Size Support -- facilitate peeking at the size of a + * waiting message on a socket. + */ + struct { + ssize_t xu_ps_size; /* size of waiting msg */ + boolean_t xu_ps_set; /* was size calculated? */ + } xu_ps; } xu_ext; } xuio_t; diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h index c96f914a70..f1b209faad 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h @@ -20,7 +20,7 @@ */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_USB_HIDMINOR_H @@ -44,21 +44,28 @@ extern "C" { * transparent. * * So we change minor node numbering scheme to be: - * external node minor num == instance << 1 - * internal node minor num == instance << 1 | 0x1 + * external node minor num == instance << 9 + * internal node minor num == instance << 9 | 0x100 * (There are only internal nodes for keyboard/mouse now.) + * + * The 8 bits of the LSB are used for ugen minor numbering (hence the use + * of the first bit of the next byte for the "internal" flag) */ -#define HID_MINOR_BITS_MASK 0x1 +#define HID_MINOR_BITS_MASK 0x1ff +#define HID_MINOR_UGEN_BITS_MASK 0xff #define HID_MINOR_INSTANCE_MASK ~HID_MINOR_BITS_MASK -#define HID_MINOR_INSTANCE_SHIFT 1 +#define HID_MINOR_INSTANCE_SHIFT 9 -#define HID_MINOR_INTERNAL 0x1 +#define HID_MINOR_INTERNAL 0x100 #define HID_MINOR_MAKE_INTERNAL(minor) \ ((minor) | HID_MINOR_INTERNAL) #define HID_IS_INTERNAL_OPEN(minor) \ (((minor) & HID_MINOR_INTERNAL)) +#define HID_IS_UGEN_OPEN(minor) \ + (((minor) & HID_MINOR_UGEN_BITS_MASK)) + #define HID_MINOR_TO_INSTANCE(minor) \ (((minor) & HID_MINOR_INSTANCE_MASK) >> \ HID_MINOR_INSTANCE_SHIFT) diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h index e9a25ea894..ee68f0088a 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
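To illustrate the revised hid minor-number layout above, here is a decode sketch using the macros from hidminor.h. The function itself is hypothetical and not part of the hid driver.

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/usb/clients/hid/hidminor.h>

static void
hid_describe_minor(minor_t minor)
{
	int inst = HID_MINOR_TO_INSTANCE(minor);

	if (HID_IS_INTERNAL_OPEN(minor)) {
		cmn_err(CE_CONT, "hid%d: internal (kbd/mouse) node\n", inst);
	} else if (HID_IS_UGEN_OPEN(minor)) {
		cmn_err(CE_CONT, "hid%d: ugen minor 0x%x\n", inst,
		    (uint_t)(minor & HID_MINOR_UGEN_BITS_MASK));
	} else {
		cmn_err(CE_CONT, "hid%d: external node\n", inst);
	}
}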
*/ #ifndef _SYS_USB_HIDVAR_H @@ -33,6 +33,7 @@ extern "C" { #endif #include <sys/usb/usba/usbai_private.h> +#include <sys/usb/usba/usba_ugen.h> /* * HID : This header file contains the internal structures @@ -222,6 +223,8 @@ typedef struct hid_state { queue_t *hid_inuse_rq; int hid_internal_flag; /* see below */ int hid_external_flag; /* see below */ + + usb_ugen_hdl_t hid_ugen_hdl; /* ugen support */ } hid_state_t; /* warlock directives, stable data */ diff --git a/usr/src/uts/common/sys/usb/usba/bos.h b/usr/src/uts/common/sys/usb/usba/bos.h new file mode 100644 index 0000000000..417dd1e60c --- /dev/null +++ b/usr/src/uts/common/sys/usb/usba/bos.h @@ -0,0 +1,242 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS_USB_BOS_H +#define _SYS_USB_BOS_H + +/* + * This header contains definitions that relate to the USB Binary Object Store. + * While this functionality was originally introduced with WUSB, it was used in + * USB 3.x as a way to provide additional device related information. This is + * currently separate from the primary usbai headers as this functionality is + * not currently used by client device drivers themselves, but only by the hub + * driver for private functionality. + * + * This data is all derived from the USB 3.1 specification, Chapter 9.6.2 Binary + * Device Object Store (BOS). + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Capability list, see USB 3.1 r1.0, Table 9-14. + */ +#define USB_BOS_TYPE_INVALID 0x00 /* Internal, synthetic value */ +#define USB_BOS_TYPE_WUSB 0x01 +#define USB_BOS_TYPE_USB2_EXT 0x02 +#define USB_BOS_TYPE_SUPERSPEED 0x03 +#define USB_BOS_TYPE_CONTAINER 0x04 +#define USB_BOS_TYPE_PLATFORM 0x05 +#define USB_BOS_TYPE_PD_CAP 0x06 +#define USB_BOS_TYPE_BATTERY_INFO 0x07 +#define USB_BOS_TYPE_PD_CONSUMER_CAP 0x08 +#define USB_BOS_TYPE_PD_PRODUCER_CAP 0x09 +#define USB_BOS_TYPE_SUPERSPEED_PLUS 0x0a +#define USB_BOS_TYPE_PRECISION_TIME 0x0b +#define USB_BOS_TYPE_WUSB_EXT 0x0c + +/* + * General Binary Object Store (BOS) descriptor. This is returned at the start + * of the BOS tree. See USB 3.1/Table 9-12. + */ +typedef struct usb_bos_descr { + uint8_t bLength; /* Descriptor size */ + uint8_t bDescriptorType; /* Set to USB_DESCR_TYPE_BOS */ + uint16_t wTotalLength; /* Total length */ + uint8_t bNumDeviceCaps; /* Number of caps that follow */ +} usb_bos_descr_t; + +/* + * This is the size of the usb_bos_descr_t in terms of packed bytes. + */ +#define USB_BOS_PACKED_SIZE 5 + +/* + * This represents a Device Capability Descriptor. bNumDeviceCaps of these + * follow the usb_bos_descr_t. This structure is the generic header of each + * device capability. Capability specific ones follow this. See USB 3.1/Table + * 9-14. + */ +typedef struct usb_dev_cap_descr { + uint8_t bLength; /* Descriptor size */ + uint8_t bDescriptorType; /* USB_TYPE_DEV_CAPABILITY */ + uint8_t bDevCapabilityType; /* USB_BOS_TYPE_* value */ +} usb_dev_cap_descr_t; + +#define USB_DEV_CAP_PACKED_SIZE 3 + +/* + * SuperSpeed devices include this descriptor to describe additional + * capabilities that they have when operating in USB 2.0 High-Speed mode. 
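The BOS layout introduced above is essentially a length-prefixed list of capability descriptors. The following walk over the raw bytes is illustrative only (usba does the real parsing, and the bDescriptorType check is elided here).

#include <sys/types.h>
#include <sys/usb/usba/bos.h>

static void
walk_bos(const uchar_t *buf, size_t len)
{
	size_t off = USB_BOS_PACKED_SIZE;	/* skip the BOS header itself */

	while (off + USB_DEV_CAP_PACKED_SIZE <= len) {
		uint8_t blen = buf[off];	/* bLength */
		uint8_t captype = buf[off + 2];	/* bDevCapabilityType */

		if (blen < USB_DEV_CAP_PACKED_SIZE || off + blen > len)
			break;
		if (captype == USB_BOS_TYPE_SUPERSPEED) {
			/* found the SuperSpeed device capability */
		}
		off += blen;
	}
}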
See + * USB 3.1/9.6.2.1 USB 2.0 Extension. + */ +typedef struct usb_bos_usb2ext { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bDevCapabilityType; + uint32_t bmAttributes; /* Bitfield defined below */ +} usb_bos_usb2ext_t; + +#define USB_BOS_USB2EXT_PACKED_SIZE 7 + +#define USB_BOS_USB2EXT_LPM 0x02 + +/* + * SuperSpeed devices include this descriptor to describe various hardware + * attributes related to basic USB 3.0 SuperSpeed functionality. See USB + * 3.1/9.6.2.2 SuperSpeed USB Device Capability. + */ +typedef struct usb_bos_ssusb { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bDevCapabilityType; + uint8_t bmAttributes; /* Capability bitfield */ + uint16_t wSpeedsSupported; /* speed bitmap defined below */ + uint8_t bFunctionalitySupport; /* Minimum supported speed */ + uint8_t bU1DevExitLat; /* Exit latency in us */ + uint16_t bU2DevExitLat; /* Exit latency in us */ +} usb_bos_ssusb_t; + +#define USB_BOS_SSUSB_PACKED_SIZE 10 + +#define USB_BOS_SSUB_CAP_LTM 0x02 + +#define USB_BOS_SSUSB_SPEED_LOW (1 << 0) +#define USB_BOS_SSUSB_SPEED_FULL (1 << 1) +#define USB_BOS_SSUSB_SPEED_HIGH (1 << 2) +#define USB_BOS_SSUSB_SPEED_SUPER (1 << 3) + +/* + * This structure is used to indicate a UUID for a given device that could + * register on multiple ports. For example, a hub that appears on both a USB 2.x + * and USB 3.x port like a hub. This UUID allows one to know that the device is + * the same. See USB 3.1/9.6.2.3 Container ID. + */ +typedef struct usb_bos_container { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bDevCapabilityType; + uint8_t bReserved; + uint8_t ContainerId[16]; +} usb_bos_container_t; + +#define USB_BOS_CONTAINER_PACKED_SIZE 20 + +/* + * This structure is used to indicate a platform-specific capability. For more + * information, see USB 3.1/9.6.2.4 Platform Descriptor. + */ +typedef struct usb_bos_platform { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bDevCapabilityType; + uint8_t bReserved; + uint8_t PlatformCapabilityUUID[16]; + uint8_t CapabilityData[]; +} usb_bos_platform_t; + +#define USB_BOS_PLATFORM_MIN_PACKED_SIZE 20 + +/* + * This structure is used to indicate capabilities and attributes of a + * SuperSpeedPlus link. This describes the USB 3.1+ speed needs and minimum + * attributes of the device. See USB 3.1/9.6.2.5 SuperSpeedPlus USB Device + * Capability. + */ +typedef struct usb_bos_ssplus { + uint8_t bLength; + uint8_t bDescriptortype; + uint8_t bDevCapabilityType; + uint8_t bReserved; + uint32_t bmAttributes; + uint16_t wFunctionalitySupport; + uint16_t wReserved; + uint32_t bmSublinkSpeedAttr[]; +} usb_bos_ssplus_t; + +#define USB_BOS_SSPLUS_MIN_PACKED_SIZE 16 + +/* + * These macros take apart the bmAttributes fields. + */ +#define USB_BOS_SSPLUS_NSSAC(x) (((x) & 0xf) + 1) +#define USB_BOS_SSPLUS_NSSIC(x) ((((x) & 0xf0) >> 4) + 1) + +/* + * These macros take apart the wFunctionalitySupport member. + */ +#define USB_BOS_SSPLUS_MIN_SSAI(x) ((x) & 0x0f) +#define USB_BOS_SSPLUS_MIN_RX_LANE(x) (((x) >> 8) & 0xf) +#define USB_BOS_SSPLUS_MIN_TX_LANE(x) (((x) >> 12) & 0xf) + +/* + * These macros are used to take apart the bmSublinkSpeedAttr members. There is + * always at least one of them that exist in each attribute; however, there + * could be more based on the value in NSSAC. 
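A short example of the SuperSpeedPlus attribute macros above in use; the summary function is hypothetical, and note that the NSSAC/NSSIC macros already apply the +1 the spec encodes.

#include <sys/cmn_err.h>
#include <sys/usb/usba/bos.h>

static void
ssplus_summary(const usb_bos_ssplus_t *sp)
{
	uint_t nattr = USB_BOS_SSPLUS_NSSAC(sp->bmAttributes);
	uint_t nids = USB_BOS_SSPLUS_NSSIC(sp->bmAttributes);
	uint_t rxl = USB_BOS_SSPLUS_MIN_RX_LANE(sp->wFunctionalitySupport);
	uint_t txl = USB_BOS_SSPLUS_MIN_TX_LANE(sp->wFunctionalitySupport);

	/* sp->bmSublinkSpeedAttr[0 .. nattr - 1] hold the per-sublink speeds */
	cmn_err(CE_CONT, "?%u sublink speed attrs, %u speed IDs, "
	    "min lanes rx/tx %u/%u\n", nattr, nids, rxl, txl);
}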
+ */ +#define USB_BOS_SSPLUS_ATTR_SSID(x) ((x) & 0xf) +#define USB_BOS_SSPLUS_ATTR_LSE(x) (((x) >> 4) & 0x3) +#define USB_BOS_SSPLUS_ATTR_LSE_BITPS 0 +#define USB_BOS_SSPLUS_ATTR_LSE_KBITPS 1 +#define USB_BOS_SSPLUS_ATTR_LSE_GBITPS 2 + +/* + * These two macros take apart the sublink type. bit 6 indicates whether or not + * the links are symmetric or asymmetric. It is asymmetric if the value is set + * to one (USB_BOS_SSPLUS_ATTR_ST_ASYM), symmetric otherwise. If it is + * asymmetric, then bit 7 indicates whether or not it's a tx or rx link. + */ +#define USB_BOS_SSPLUS_ATTR_ST_ASYM (1 << 6) +#define USB_BOS_SSPLUS_ATTR_ST_TX (1 << 7) + +#define USB_BOS_SSPLUS_ATTR_LP(x) (((x) >> 14) & 0x3) +#define USB_BOS_SSPLUS_ATTR_LP_SS 0x0 +#define USB_BOS_SSPLUS_ATTR_LP_SSPLUS 0x1 + +#define USB_BOS_SSPLUS_ATTR_LSM(x) ((x) >> 16) + +typedef struct usb_bos_precision_time { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bDevCapabilityType; +} usb_bos_precision_time_t; + +#define USB_BOS_PRECISION_TIME_PACKED_SIZE 3 + +/* + * This structure serves as an internal, parsed representation of a USB bos + * descriptor. + */ +typedef struct usb_bos { + uint8_t ubos_length; + uint8_t ubos_type; + union { + usb_bos_usb2ext_t ubos_usb2; + usb_bos_ssusb_t ubos_ssusb; + usb_bos_container_t ubos_container; + usb_bos_platform_t ubos_platform; + usb_bos_ssplus_t ubos_ssplus; + usb_bos_precision_time_t ubos_time; + uint8_t ubos_raw[256]; + } ubos_caps; +} usb_bos_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_USB_BOS_H */ diff --git a/usr/src/uts/common/sys/usb/usba/usba10.h b/usr/src/uts/common/sys/usb/usba/usba10.h index 947dd65379..42e78cd35e 100644 --- a/usr/src/uts/common/sys/usb/usba/usba10.h +++ b/usr/src/uts/common/sys/usb/usba/usba10.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_USB_USBA10_H @@ -139,7 +140,7 @@ usba10_usb_free_descr_tree( size_t usba10_usb_parse_data( char *format, - uchar_t *data, + const uchar_t *data, size_t datalen, void *structure, size_t structlen); diff --git a/usr/src/uts/common/sys/usb/usba/usba_impl.h b/usr/src/uts/common/sys/usb/usba/usba_impl.h index 784bb32d44..ddb6f7346d 100644 --- a/usr/src/uts/common/sys/usb/usba/usba_impl.h +++ b/usr/src/uts/common/sys/usb/usba/usba_impl.h @@ -23,6 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_USB_USBA_USBA_IMPL_H @@ -34,6 +35,7 @@ #include <sys/usb/usba/hubdi.h> #include <sys/usb/usba/usba_private.h> #include <sys/usb/usba/usba_types.h> +#include <sys/usb/usba/bos.h> #include <sys/taskq.h> #include <sys/disp.h> @@ -301,6 +303,13 @@ void usba_rem_root_hub(dev_info_t *dip); void usba_get_dev_string_descrs(dev_info_t *, usba_device_t *); /* + * Retrieve the binary object store for the device. + */ +void usba_get_binary_object_store(dev_info_t *, usba_device_t *); +void usba_add_binary_object_store_props(dev_info_t *, usba_device_t *); +void usba_free_binary_object_store(usba_device_t *); + +/* * Check if we are not in interrupt context and have * USB_FLAGS_SLEEP flags set. */ diff --git a/usr/src/uts/common/sys/usb/usba/usba_private.h b/usr/src/uts/common/sys/usb/usba/usba_private.h index 4e56e4aa47..406ee3824c 100644 --- a/usr/src/uts/common/sys/usb/usba/usba_private.h +++ b/usr/src/uts/common/sys/usb/usba/usba_private.h @@ -23,6 +23,7 @@ * Use is subject to license terms. 
* * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_USB_USBA_USBA_PRIVATE_H @@ -88,21 +89,21 @@ extern "C" { * extended in a later rev of the spec. */ size_t usb_parse_dev_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(DEVICE) */ size_t buflen, usb_dev_descr_t *ret_descr, size_t ret_buf_len); size_t usb_parse_cfg_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, usb_cfg_descr_t *ret_descr, size_t ret_buf_len); size_t usb_parse_ia_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, size_t first_if, usb_ia_descr_t *ret_descr, @@ -110,7 +111,7 @@ size_t usb_parse_ia_descr( size_t usb_parse_if_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, uint_t if_index, uint_t alt_if_setting, @@ -123,7 +124,7 @@ size_t usb_parse_if_descr( * the first endpoint */ size_t usb_parse_ep_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, uint_t if_index, uint_t alt_if_setting, @@ -160,7 +161,7 @@ size_t usb_parse_ep_descr( #define USB_DESCR_TYPE_ANY -1 /* Wild card */ size_t usb_parse_CV_cfg_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, char *fmt, uint_t descr_type, @@ -170,7 +171,7 @@ size_t usb_parse_CV_cfg_descr( size_t usb_parse_CV_if_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, char *fmt, uint_t if_index, @@ -182,7 +183,7 @@ size_t usb_parse_CV_if_descr( size_t usb_parse_CV_ep_descr( - uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ + const uchar_t *buf, /* from GET_DESCRIPTOR(CONFIGURATION) */ size_t buflen, char *fmt, uint_t if_index, @@ -199,7 +200,7 @@ size_t usb_parse_CV_ep_descr( */ size_t usb_parse_CV_descr( char *format, - uchar_t *data, + const uchar_t *data, size_t datalen, void *structure, size_t structlen); @@ -270,8 +271,7 @@ typedef enum usba_event { USBA_EVENT_TAG_HOT_REMOVAL = 0, USBA_EVENT_TAG_HOT_INSERTION = 1, USBA_EVENT_TAG_PRE_SUSPEND = 2, - USBA_EVENT_TAG_POST_RESUME = 3, - USBA_EVENT_TAG_CPR = -1 + USBA_EVENT_TAG_POST_RESUME = 3 } usba_event_t; #define USBA_PRE_SUSPEND_EVENT "SUNW,USBA:USBA_PRE_SUSPEND" @@ -409,11 +409,11 @@ typedef struct usba_if_pwr_descr { uint16_t TransitionTimeFromD3; /* D3 -> D0 transition time */ } usba_if_pwr_descr_t; -size_t usba_parse_cfg_pwr_descr(uchar_t *, size_t, usba_cfg_pwr_descr_t *, - size_t); +size_t usba_parse_cfg_pwr_descr(const uchar_t *, size_t, usba_cfg_pwr_descr_t *, + size_t); -size_t usba_parse_if_pwr_descr(uchar_t *, size_t buflen, uint_t, - uint_t, usba_if_pwr_descr_t *, size_t); +size_t usba_parse_if_pwr_descr(const uchar_t *, size_t buflen, uint_t, + uint_t, usba_if_pwr_descr_t *, size_t); /* * Returns (at ret_descr) a null-terminated string. Null termination is @@ -423,7 +423,7 @@ size_t usba_parse_if_pwr_descr(uchar_t *, size_t buflen, uint_t, * XXX is this needed when there is usb_get_string_descriptor * If so, then more comments about how it differs? 
*/ -size_t usba_ascii_string_descr(uchar_t *, size_t, char *, size_t); +size_t usba_ascii_string_descr(const uchar_t *, size_t, char *, size_t); /* diff --git a/usr/src/uts/common/sys/usb/usba/usba_types.h b/usr/src/uts/common/sys/usb/usba/usba_types.h index c99a958c1a..e09bacb860 100644 --- a/usr/src/uts/common/sys/usb/usba/usba_types.h +++ b/usr/src/uts/common/sys/usb/usba/usba_types.h @@ -22,6 +22,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2019, Joyent, Inc. */ #ifndef _SYS_USB_USBA_USBA_TYPES_H @@ -31,6 +32,7 @@ #include <sys/taskq.h> #include <sys/usb/usba/usba_private.h> #include <sys/usb/usba/usbai_private.h> +#include <sys/usb/usba/bos.h> #ifdef __cplusplus extern "C" { @@ -241,7 +243,7 @@ typedef struct usb_client_dev_data_list { } usb_client_dev_data_list_t; /* - * This structure uniquely identifies a USB device + * This structure uniquely identifies a USB device * with all interfaces, or just one interface of a USB device. * usba_device is associated with a devinfo node * @@ -363,6 +365,14 @@ typedef struct usba_device { * Private data for HCD drivers */ void *usb_hcd_private; + + /* + * Binary Object Store data + */ + mblk_t *usb_bos_mp; + uint_t usb_bos_nalloc; + uint_t usb_bos_nents; + usb_bos_t *usb_bos; } usba_device_t; #define USBA_CLIENT_FLAG_SIZE 1 diff --git a/usr/src/uts/common/sys/usb/usbai.h b/usr/src/uts/common/sys/usb/usbai.h index b37d8f230f..6c90a50b81 100644 --- a/usr/src/uts/common/sys/usb/usbai.h +++ b/usr/src/uts/common/sys/usb/usbai.h @@ -23,7 +23,7 @@ * Use is subject to license terms. * * Copyright 2014 Garrett D'Amore <garrett@damore.org> - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_USB_USBAI_H @@ -789,7 +789,7 @@ void usb_client_detach( size_t usb_parse_data( char *format, - uchar_t *data, + const uchar_t *data, size_t datalen, void *structure, size_t structlen); diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index 0b997c518c..15b4d0b247 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -82,6 +82,21 @@ extern "C" { #endif /* + * File Descriptor assignment generation. + * + * Certain file descriptor consumers (namely epoll) need to be able to detect + * when the resource underlying an fd change due to (re)assignment. Checks + * comparing old and new file_t pointers work OK, but could easily be fooled by + * an entry freed-to and reused-from the cache. To better detect such + * assingments, a generation number is kept in the uf_entry. Whenever a + * non-NULL file_t is assigned to the entry, the generation is incremented, + * indicating the change. There is a minute possibility that a rollover of the + * value could cause assigments to evade detection by consumers, but it is + * considered acceptably small. + */ +typedef uint_t uf_entry_gen_t; + +/* * Entry in the per-process list of open files. * Note: only certain fields are copied in flist_grow() and flist_fork(). * This is indicated in brackets in the structure member comments. 
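/*
 * A hypothetical illustration (names invented for this sketch) of how a
 * consumer such as epoll could use the generation number described above:
 * cache both the file_t pointer and uf_gen at registration time, and treat
 * any later mismatch as a (re)assignment, even when a recycled file_t ends
 * up at the same address.
 */
#include <sys/types.h>

typedef struct fd_snapshot {
	struct file	*fds_fp;	/* file_t observed at registration */
	uf_entry_gen_t	fds_gen;	/* uf_gen observed at registration */
} fd_snapshot_t;

static boolean_t
fd_was_reassigned(const fd_snapshot_t *then, const fd_snapshot_t *now)
{
	return (then->fds_fp != now->fds_fp || then->fds_gen != now->fds_gen);
}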
@@ -96,11 +111,13 @@ typedef struct uf_entry { short uf_busy; /* file is allocated [grow, fork] */ kcondvar_t uf_wanted_cv; /* waiting for setf() [never copied] */ kcondvar_t uf_closing_cv; /* waiting for close() [never copied] */ - struct portfd *uf_portfd; /* associated with port [grow] */ + struct portfd *uf_portfd; /* associated with port [grow] */ + uf_entry_gen_t uf_gen; /* assigned fd generation [grow,fork] */ /* Avoid false sharing - pad to coherency granularity (64 bytes) */ char uf_pad[64 - sizeof (kmutex_t) - 2 * sizeof (void*) - 2 * sizeof (int) - 2 * sizeof (short) - - 2 * sizeof (kcondvar_t) - sizeof (struct portfd *)]; + 2 * sizeof (kcondvar_t) - sizeof (struct portfd *) - + sizeof (uf_entry_gen_t)]; } uf_entry_t; /* @@ -185,9 +202,9 @@ typedef struct { /* kernel syscall set type */ * This value should not be changed in a patch. */ #if defined(__sparc) -#define __KERN_NAUXV_IMPL 20 +#define __KERN_NAUXV_IMPL 24 #elif defined(__i386) || defined(__amd64) -#define __KERN_NAUXV_IMPL 25 +#define __KERN_NAUXV_IMPL 28 #endif struct execsw; diff --git a/usr/src/uts/common/sys/vm.h b/usr/src/uts/common/sys/vm.h index a8ca2ad377..0f7dfa9fd0 100644 --- a/usr/src/uts/common/sys/vm.h +++ b/usr/src/uts/common/sys/vm.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -57,6 +58,8 @@ int queue_io_request(struct vnode *, u_offset_t); extern kmutex_t memavail_lock; extern kcondvar_t memavail_cv; +#define WAKE_PAGEOUT_SCANNER() cv_broadcast(&proc_pageout->p_cv) + #endif /* defined(_KERNEL) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..afbf438eff 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,12 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ + +#define VMUSAGE_ZONE_FLAGS (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | \ + VMUSAGE_A_ZONE) typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index c274bae805..2292310bda 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright (c) 2017, Joyent, Inc. All rights reserved. + */ +/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -58,6 +61,9 @@ extern pgcnt_t desscan; /* desired pages scanned per second */ extern pgcnt_t slowscan; extern pgcnt_t fastscan; extern pgcnt_t pushes; /* number of pages pushed to swap device */ +extern uint64_t low_mem_scan; /* num times page scan due to low memory */ +extern uint64_t zone_cap_scan; /* num times page scan due to zone cap */ +extern uint64_t n_throttle; /* num times page create throttled */ /* writable copies of tunables */ extern pgcnt_t maxpgio; /* max paging i/o per sec before start swaps */ @@ -159,6 +165,8 @@ extern void *boot_virt_alloc(void *addr, size_t size); extern size_t exec_get_spslew(void); +extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_H +#define _SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We distinguish between normal ioctls and private ioctls we issues to out + * streams version. Streams ioctls have the upper bit set in the lowest byte. + * Note that there are no STREAMs ioctls for userland and all definitions + * related to them are not present in this file. + */ +#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note that + * unlike the other ioctls, this is passed directly as opposed to every other + * function which is passed as a pointer to the value. + */ +#define VND_IOC_ATTACH (VND_IOC | 0x1) + +#define VND_NAMELEN 32 + +typedef struct vnd_ioc_attach { + char via_name[VND_NAMELEN]; + zoneid_t via_zoneid; + uint32_t via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * At this time, a given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in. + */ +#define VND_IOC_LINK (VND_IOC | 0x2) + +typedef struct vnd_ioc_link { + char vil_name[VND_NAMELEN]; + uint32_t vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may use + * this to unlink an extent entry in /dev; however, they will not be able to + * link it in again. + */ +#define VND_IOC_UNLINK (VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { + uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current buffer recieve buffer size. 
+ */ +typedef struct vnd_ioc_buf { + uint64_t vib_size; + uint32_t vib_filler; + uint32_t vib_errno; +} vnd_ioc_buf_t; + +#define VND_IOC_GETRXBUF (VND_IOC | 0x04) +#define VND_IOC_SETRXBUF (VND_IOC | 0x05) +#define VND_IOC_GETMAXBUF (VND_IOC | 0x06) +#define VND_IOC_GETTXBUF (VND_IOC | 0x07) +#define VND_IOC_SETTXBUF (VND_IOC | 0x08) +#define VND_IOC_GETMINTU (VND_IOC | 0x09) +#define VND_IOC_GETMAXTU (VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances. vl_actents is + * always updated to the number around and vl_nents is the number of + * vnd_ioc_info_t elements are allocated in vl_ents. + */ +typedef struct vnd_ioc_info { + uint32_t vii_version; + zoneid_t vii_zone; + char vii_name[VND_NAMELEN]; + char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { + uint_t vl_nents; + uint_t vl_actents; + vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { + uint_t vl_nents; + uint_t vl_actents; + caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif /* _KERNEL */ + +#define VND_IOC_LIST (VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending futher stability in the form of + * specific system work. + */ +#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30) +#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define _SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors. 
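/*
 * An illustrative userland fragment (not from the change) that populates the
 * structures above to attach a minor instance to a datalink and link it into
 * /dev. The pointer-argument ioctl convention and error handling here are
 * assumptions; per the comment on VND_IOC_ATTACH, its argument passing
 * differs from the other ioctls, so consult the driver for the exact calling
 * convention.
 */
#include <string.h>
#include <unistd.h>
#include <stropts.h>
#include <sys/types.h>
#include <sys/vnd.h>

static int
vnd_attach_and_link(int fd, const char *datalink, zoneid_t zid)
{
	vnd_ioc_attach_t via;
	vnd_ioc_link_t vil;

	(void) memset(&via, 0, sizeof (via));
	(void) strlcpy(via.via_name, datalink, sizeof (via.via_name));
	via.via_zoneid = zid;
	if (ioctl(fd, VND_IOC_ATTACH, &via) != 0)
		return (via.via_errno);		/* a vnd_errno_t value */

	(void) memset(&vil, 0, sizeof (vil));
	(void) strlcpy(vil.vil_name, datalink, sizeof (vil.vil_name));
	if (ioctl(fd, VND_IOC_LINK, &vil) != 0)
		return (vil.vil_errno);

	return (VND_E_SUCCESS);
}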
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vnd_errno { + VND_E_SUCCESS = 0, /* no error */ + VND_E_NOMEM, /* no memory */ + VND_E_NODATALINK, /* no such datalink */ + VND_E_NOTETHER, /* not DL_ETHER */ + VND_E_DLPIINVAL, /* Unknown DLPI failures */ + VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */ + VND_E_BINDFAIL, /* DL_BIND_REQ failed */ + VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */ + VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */ + VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */ + VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */ + VND_E_DLDBADVERS, /* bad dld version */ + VND_E_KSTATCREATE, /* failed to create kstats */ + VND_E_NODEV, /* no such vnd link */ + VND_E_NONETSTACK, /* netstack doesn't exist */ + VND_E_ASSOCIATED, /* device already associated */ + VND_E_ATTACHED, /* device already attached */ + VND_E_LINKED, /* device already linked */ + VND_E_BADNAME, /* invalid name */ + VND_E_PERM, /* can't touch this */ + VND_E_NOZONE, /* no such zone */ + VND_E_STRINIT, /* failed to initialize vnd stream module */ + VND_E_NOTATTACHED, /* device not attached */ + VND_E_NOTLINKED, /* device not linked */ + VND_E_LINKEXISTS, /* another device has the same link name */ + VND_E_MINORNODE, /* failed to create minor node */ + VND_E_BUFTOOBIG, /* requested buffer size is too large */ + VND_E_BUFTOOSMALL, /* requested buffer size is too small */ + VND_E_DLEXCL, /* unable to get dlpi excl access */ + VND_E_DIRECTNOTSUP, + /* DLD direct capability not suported over data link */ + VND_E_BADPROPSIZE, /* invalid property size */ + VND_E_BADPROP, /* invalid property */ + VND_E_PROPRDONLY, /* property is read only */ + VND_E_SYS, /* unexpected system error */ + VND_E_CAPABPASS, + /* capabilities invalid, pass-through module detected */ + VND_E_UNKNOWN /* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 7e50091347..4c8d49c621 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_VNIC_IMPL_H @@ -64,7 +64,9 @@ typedef struct vnic_s { mac_notify_handle_t vn_mnh; uint32_t vn_hcksum_txflags; + mac_capab_lso_t vn_cap_lso; uint32_t vn_mtu; + link_state_t vn_ls; } vnic_t; #define vn_mch vn_mc_handles[0] diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 51b4f7af18..b527558895 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. 
*/ @@ -805,12 +805,14 @@ typedef enum vnevent { VE_RMDIR = 4, /* Remove of directory vnode's name */ VE_CREATE = 5, /* Create with vnode's name which exists */ VE_LINK = 6, /* Link with vnode's name as source */ - VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ + VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ VE_TRUNCATE = 9, /* Truncate */ VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */ VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */ - VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */ + VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */ + VE_RENAME_SRC_DIR = 13, /* Rename with vnode as source dir */ + VE_RESIZE = 14 /* Resize/truncate to non-zero offset */ } vnevent_t; /* @@ -1370,7 +1372,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_create(vnode_t *, caller_context_t *); void vnevent_link(vnode_t *, caller_context_t *); -void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, + caller_context_t *ct); void vnevent_mountedover(vnode_t *, caller_context_t *); void vnevent_truncate(vnode_t *, caller_context_t *); int vnevent_support(vnode_t *, caller_context_t *); @@ -1380,6 +1383,7 @@ void vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *, caller_context_t *); +void vnevent_resize(vnode_t *, caller_context_t *); /* Vnode specific data */ void vsd_create(uint_t *, void (*)(void *)); @@ -1482,6 +1486,7 @@ extern struct vnode kvps[]; typedef enum { KV_KVP, /* vnode for all segkmem pages */ KV_ZVP, /* vnode for all ZFS pages */ + KV_VVP, /* vnode for all VMM pages */ #if defined(__sparc) KV_MPVP, /* vnode for all page_t meta-pages */ KV_PROMVP, /* vnode for all PROM pages */ diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h new file mode 100644 index 0000000000..d87786b507 --- /dev/null +++ b/usr/src/uts/common/sys/vxlan.h @@ -0,0 +1,47 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_VXLAN_H +#define _SYS_VXLAN_H + +/* + * Common VXLAN information + */ + +#include <sys/inttypes.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Sizes in bytes */ +#define VXLAN_HDR_LEN 8 +#define VXLAN_ID_LEN 3 + +#define VXLAN_F_VDI 0x08000000 +#define VXLAN_ID_SHIFT 8 + +#pragma pack(1) +typedef struct vxlan_hdr { + uint32_t vxlan_flags; + uint32_t vxlan_id; +} vxlan_hdr_t; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VXLAN_H */ diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..e08d75ecba --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,78 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
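/*
 * A minimal sketch (assuming the header arrives in network byte order and
 * that the caller has already checked the buffer against VXLAN_HDR_LEN) of
 * extracting the 24-bit VXLAN network identifier from the header defined
 * above; the function name is invented for this example.
 */
#include <errno.h>
#include <netinet/in.h>
#include <sys/vxlan.h>

static int
vxlan_decode_vni(const vxlan_hdr_t *hdr, uint32_t *vnip)
{
	/* The VNI is only meaningful when the VNI-valid flag is set. */
	if ((ntohl(hdr->vxlan_flags) & VXLAN_F_VDI) == 0)
		return (EINVAL);

	*vnip = ntohl(hdr->vxlan_id) >> VXLAN_ID_SHIFT;
	return (0);
}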
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_ZFD_H +#define _SYS_ZFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define ZFD_MASTER_NAME "master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define ZFD_SLAVE_NAME "slave" + +#define ZFD_NAME_LEN 16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define ZFD_MAKETTY (ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define ZFD_EOF (ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define ZFD_HAS_SLAVE (ZFD_IOC | 2) + +/* + * This ioctl links two streams into a multiplexer configuration for in-zone + * logging. + */ +#define ZFD_MUX (ZFD_IOC | 3) + +/* + * This ioctl controls the flow control setting for the log multiplexer stream + * (1 = true, 0 = false). The default is false which implies teeing into the + * log stream is "best-effort" but data will be discarded if the stream + * becomes full. If set and the log stream begins to fill up, the primary + * stream will stop flowing. + */ +#define ZFD_MUX_FLOWCON (ZFD_IOC | 4) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 56fa4b8d87..a4ec347ce4 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -20,9 +20,9 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -43,6 +43,7 @@ #include <sys/secflags.h> #include <netinet/in.h> #include <sys/cpu_uarray.h> +#include <sys/nvpair.h> #ifdef __cplusplus extern "C" { @@ -52,15 +53,27 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define MAX_ZONEID 9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
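/*
 * Illustrative userland usage (not from the change) of two of the zfd ioctls
 * above. Which side of the device issues which ioctl is an assumption here:
 * the sketch has the in-zone consumer make its slave fd look like a tty, and
 * the log consumer toggle flow control on the multiplexer stream.
 */
#include <unistd.h>
#include <stropts.h>
#include <sys/types.h>
#include <sys/zfd.h>

static int
zfd_slave_make_tty(int slave_fd)
{
	/* Push the TTY stream modules so the fd behaves like a tty. */
	return (ioctl(slave_fd, ZFD_MAKETTY, 0));
}

static int
zfd_log_set_flow_control(int fd, boolean_t strict)
{
	/* 1 = stall the primary stream when the log stream fills up. */
	return (ioctl(fd, ZFD_MUX_FLOWCON, strict ? 1 : 0));
}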
+ */ +#define MAX_ZONES 8192 +#define MAX_ZONEID (MAX_ZONES - 1) #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -99,14 +112,18 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 -#define ZONE_ATTR_SCHED_CLASS 13 -#define ZONE_ATTR_FLAGS 14 -#define ZONE_ATTR_HOSTID 15 -#define ZONE_ATTR_FS_ALLOWED 16 -#define ZONE_ATTR_NETWORK 17 -#define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_SCHED_CLASS 12 +#define ZONE_ATTR_FLAGS 13 +#define ZONE_ATTR_HOSTID 14 +#define ZONE_ATTR_FS_ALLOWED 15 +#define ZONE_ATTR_NETWORK 16 +#define ZONE_ATTR_DID 17 +#define ZONE_ATTR_INITNORESTART 18 +#define ZONE_ATTR_APP_SVC_CT 19 +#define ZONE_ATTR_SCHED_FIXEDHI 20 #define ZONE_ATTR_SECFLAGS 21 +#define ZONE_ATTR_INITRESTART0 22 +#define ZONE_ATTR_INITREBOOT 23 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -122,13 +139,18 @@ extern "C" { #define ZONE_EVENT_READY "ready" #define ZONE_EVENT_RUNNING "running" #define ZONE_EVENT_SHUTTING_DOWN "shutting_down" +#define ZONE_EVENT_FREE "free" #define ZONE_CB_NAME "zonename" #define ZONE_CB_NEWSTATE "newstate" #define ZONE_CB_OLDSTATE "oldstate" +#define ZONE_CB_RESTARTS "restarts" #define ZONE_CB_TIMESTAMP "when" #define ZONE_CB_ZONEID "zoneid" +#define ZONE_EVENT_INIT_CLASS "init" +#define ZONE_EVENT_INIT_RESTART_SC "restart" + /* * Exit values that may be returned by scripts or programs invoked by various * zone commands. @@ -187,6 +209,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -203,6 +226,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -227,7 +251,8 @@ typedef enum { ZONE_IS_EMPTY, ZONE_IS_DOWN, ZONE_IS_DYING, - ZONE_IS_DEAD + ZONE_IS_DEAD, + ZONE_IS_FREE /* transient state for zone sysevent */ } zone_status_t; #define ZONE_MIN_STATE ZONE_IS_UNINITIALIZED #define ZONE_MAX_STATE ZONE_IS_DEAD @@ -247,9 +272,12 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + int status; /* init status on shutdown */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ + /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. 
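/*
 * A small illustrative check (not part of the change) of the relationships
 * between the limits defined above: zone IDs run from MIN_ZONEID through
 * MAX_ZONEID inclusive, the global zone is ID 0, and MAX_ZONES counts the
 * global zone, so MAX_ZONEID is MAX_ZONES - 1.
 */
#include <sys/types.h>
#include <sys/debug.h>

CTASSERT(MAX_ZONEID == MAX_ZONES - 1);
CTASSERT(GLOBAL_ZONEID == MIN_ZONEID);

static boolean_t
zoneid_in_range(zoneid_t zid)
{
	return (zid >= MIN_ZONEID && zid <= MAX_ZONEID);
}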
*/ + int pad; } zone_cmd_arg_t; /* @@ -384,7 +412,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -395,12 +423,57 @@ typedef struct zone_kstat { struct cpucap; typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_rcnt; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_wcnt; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; kstat_named_t zm_pgpgin; kstat_named_t zm_anonpgin; kstat_named_t zm_execpgin; kstat_named_t zm_fspgin; kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; } zone_mcap_kstat_t; typedef struct { @@ -415,8 +488,10 @@ typedef struct { kstat_named_t zm_ffnoproc; kstat_named_t zm_ffnomem; kstat_named_t zm_ffmisc; + kstat_named_t zm_mfseglim; kstat_named_t zm_nested_intp; kstat_named_t zm_init_pid; + kstat_named_t zm_init_restarts; kstat_named_t zm_boot_time; } zone_misc_kstat_t; @@ -459,6 +534,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -511,10 +587,11 @@ typedef struct zone { kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ pid_t zone_proc_initpid; /* pid of "init" for this zone */ - char *zone_initname; /* fs path to 'init' */ + uint_t zone_proc_init_restarts; /* times init restarted */ + char *zone_initname; /* fs path to 'init' */ + int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -552,9 +629,13 @@ typedef struct zone { tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */ boolean_t zone_restart_init; /* Restart init if it dies? */ + boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */ + boolean_t zone_restart_init_0; /* Restart only if it exits 0 */ + boolean_t zone_setup_app_contract; /* setup contract? */ struct brand *zone_brand; /* zone's brand */ void *zone_brand_data; /* store brand specific data */ id_t zone_defaultcid; /* dflt scheduling class id */ + boolean_t zone_fixed_hipri; /* fixed sched. 
hi prio */ kstat_t *zone_swapresv_kstat; kstat_t *zone_lockedmem_kstat; /* @@ -563,8 +644,24 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. + */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + zone_zfs_kstat_t *zone_zfs_stats; + /* - * Solaris Auditing per-zone audit context + * illumos Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -581,7 +678,11 @@ typedef struct zone { /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; - kmutex_t zone_mcap_lock; /* protects mcap statistics */ + /* + * kstats and counters for physical memory capping. + */ + kstat_t *zone_physmem_kstat; + kmutex_t zone_mcap_lock; /* protects mcap statistics */ kstat_t *zone_mcap_ksp; zone_mcap_kstat_t *zone_mcap_stats; uint64_t zone_pgpgin; /* pages paged in */ @@ -606,6 +707,8 @@ typedef struct zone { uint32_t zone_ffnomem; /* as_dup/memory error */ uint32_t zone_ffmisc; /* misc. other error */ + uint32_t zone_mfseglim; /* map failure (# segs limit) */ + uint32_t zone_nested_intp; /* nested interp. kstat */ struct loadavg_s zone_loadavg; /* loadavg for this zone */ @@ -633,6 +736,53 @@ typedef struct zone { } zone_t; /* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { + uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */ + uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */ + sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zpers_wr_ops; /* writes, and */ + sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */ + kstat_io_t zpers_zfs_rwstats; + uint64_t zpers_io_util; /* IO utilization metric */ + uint64_t zpers_zfs_rd_waittime; + uint8_t zpers_io_delay; /* IO delay on logical r/w */ + uint8_t zpers_zfs_weight; /* used to prevent starvation */ + uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed idependently of the zone_t. + */ +typedef struct zone_persist { + kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */ + zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */ + uint8_t zpers_over; /* currently over cap */ + uint32_t zpers_pg_cnt; /* current RSS in pages */ + uint32_t zpers_pg_limit; /* current RRS limit in pages */ + uint32_t zpers_nover; /* # of times over phys. cap */ +#ifndef DEBUG + uint64_t zpers_pg_out; /* # pages flushed */ +#else + /* + * To conserve memory, some detailed kstats are only kept for DEBUG + * builds. + */ + uint64_t zpers_zfs_rd_waittime; + + uint64_t zpers_pg_anon; /* # clean anon pages flushed */ + uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpers_pg_fs; /* # clean fs pages flushed */ + uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */ +#endif +} zone_persist_t; + +typedef enum zone_pageout_op { + ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY +} zone_pageout_op_t; + +/* * Special value of zone_psetid to indicate that pools are disabled. 
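/*
 * A simplified sketch (not the actual accounting code) of how the per-zone
 * physical memory cap fields above might be consulted, e.g. by a page
 * scanner, via the zone_pdata[] array declared further below. The real code
 * presumably uses atomics and the appropriate locks; a limit of 0 is assumed
 * here to mean "uncapped".
 */
#include <sys/types.h>

static boolean_t
zone_over_phys_cap(zoneid_t zid)
{
	zone_persist_t *zp = &zone_pdata[zid];

	if (zp->zpers_pg_limit == 0)
		return (B_FALSE);

	return (zp->zpers_pg_cnt >= zp->zpers_pg_limit);
}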
*/ #define ZONE_PS_INVAL PS_MYID @@ -662,6 +812,7 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -842,6 +993,7 @@ extern int zone_ncpus_online_get(zone_t *); * Returns true if the named pool/dataset is visible in the current zone. */ extern int zone_dataset_visible(const char *, int *); +extern int zone_dataset_visible_inzone(zone_t *, const char *, int *); /* * zone version of kadmin() @@ -854,10 +1006,25 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); +extern void zone_pageout_stat(int, zone_pageout_op_t); +extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern zone_persist_t zone_pdata[MAX_ZONES]; + extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; +/* For publishing sysevents related to a particular zone */ +extern void zone_sysevent_publish(zone_t *, const char *, const char *, + nvlist_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c index 9b4bd38baa..245ef9f14f 100644 --- a/usr/src/uts/common/syscall/brandsys.c +++ b/usr/src/uts/common/syscall/brandsys.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2016 Joyent, Inc. + */ #include <sys/brand.h> #include <sys/systm.h> @@ -35,7 +37,7 @@ */ int64_t brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg4) { struct proc *p = curthread->t_procp; int64_t rval = 0; @@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (set_errno(ENOSYS)); if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, - arg4, arg5, arg6)) != 0) + arg4)) != 0) return (set_errno(err)); return (rval); diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c index 84c924f570..deb5532b50 100644 --- a/usr/src/uts/common/syscall/chdir.c +++ b/usr/src/uts/common/syscall/chdir.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -62,7 +63,7 @@ /* * Change current working directory ("."). 
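/*
 * A hypothetical helper (invented for this sketch; the class/subclass
 * argument pairing is assumed from the declaration above) showing one way
 * the sysevent interface and the ZONE_EVENT_INIT_* / ZONE_CB_* names could
 * fit together: publish an init-restart event carrying the restart count.
 */
#include <sys/nvpair.h>
#include <sys/kmem.h>

static void
zone_publish_init_restart(zone_t *zp)
{
	nvlist_t *nvl = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) != 0)
		return;
	(void) nvlist_add_uint32(nvl, ZONE_CB_RESTARTS,
	    zp->zone_proc_init_restarts);
	zone_sysevent_publish(zp, ZONE_EVENT_INIT_CLASS,
	    ZONE_EVENT_INIT_RESTART_SC, nvl);
	nvlist_free(nvl);
}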
*/ -static int chdirec(vnode_t *, int ischroot, int do_traverse); +static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse); int chdir(char *fname) @@ -78,7 +79,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 0, 1); + error = chdirec(vp, B_FALSE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -102,7 +103,7 @@ fchdir(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 0, 0); + error = chdirec(vp, B_FALSE, B_FALSE); if (error) return (set_errno(error)); return (0); @@ -125,7 +126,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 1, 1); + error = chdirec(vp, B_TRUE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -152,18 +153,18 @@ fchroot(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 1, 0); + error = chdirec(vp, B_TRUE, B_FALSE); if (error) return (set_errno(error)); return (0); } static int -chdirec(vnode_t *vp, int ischroot, int do_traverse) +chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot, + boolean_t do_traverse) { int error; vnode_t *oldvp; - proc_t *pp = curproc; vnode_t **vpp; refstr_t *cwd; int newcwd = 1; @@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse) if (ischroot) { struct vattr tattr; struct vattr rattr; - vnode_t *zonevp = curproc->p_zone->zone_rootvp; + vnode_t *zonevp = pp->p_zone->zone_rootvp; tattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL)) @@ -243,3 +244,15 @@ bad: VN_RELE(vp); return (error); } + +int +chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(pp, vp, ischroot, do_traverse)); +} + +static int +chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(curproc, vp, ischroot, do_traverse)); +} diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c index 7b787a4acb..b029d92f1b 100644 --- a/usr/src/uts/common/syscall/fcntl.c +++ b/usr/src/uts/common/syscall/fcntl.c @@ -54,7 +54,8 @@ #include <sys/cmn_err.h> -static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +/* This is global so that it can be used by brand emulation. */ +int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); static void fd_too_big(proc_t *); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 1ee4b6a395..721f884a7e 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. 
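/*
 * A condensed, illustrative restatement of the flag validation performed in
 * the memcntl() hunk just below: zero still means MS_SYNC, MS_SYNC and
 * MS_ASYNC are mutually exclusive, and so are MS_INVALIDATE and the new
 * MS_INVALCURPROC. The helper name is invented for this sketch.
 */
#include <sys/types.h>
#include <sys/mman.h>

static boolean_t
msync_flags_valid(uintptr_t iarg)
{
	/* For binary compatibility, the absence of MS_ASYNC means MS_SYNC. */
	if ((iarg & ~MS_INVALIDATE) == 0)
		iarg |= MS_SYNC;

	if ((iarg & ~(MS_SYNC | MS_ASYNC | MS_INVALIDATE |
	    MS_INVALCURPROC)) != 0)
		return (B_FALSE);		/* unknown flag bits */
	if ((iarg & (MS_SYNC | MS_ASYNC)) == (MS_SYNC | MS_ASYNC))
		return (B_FALSE);		/* sync and async conflict */
	if ((iarg & (MS_INVALIDATE | MS_INVALCURPROC)) ==
	    (MS_INVALIDATE | MS_INVALCURPROC))
		return (B_FALSE);		/* both invalidate flavors */
	return (B_TRUE);
}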
*/ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c index edb04c824b..874e31869c 100644 --- a/usr/src/uts/common/syscall/open.c +++ b/usr/src/uts/common/syscall/open.c @@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode) if (filemode & (FSEARCH|FEXEC)) { /* - * Must be one or the other and neither FREAD nor FWRITE + * Must be one or the other. * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN - * XXX: Should these just be silently ignored? + * XXX: Should these just be silently ignored like we + * silently ignore FREAD|FWRITE? */ - if ((filemode & (FREAD|FWRITE)) || - (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || + if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index ae34556f14..2214bacaf8 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -317,20 +317,57 @@ polllock(pollhead_t *php, kmutex_t *lp) return (0); } -static int -poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +int +poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds) +{ + pollfd_t *pollfdp; + nfds_t old_nfds; + + /* + * NOTE: for performance, buffers are saved across poll() calls. + * The theory is that if a process polls heavily, it tends to poll + * on the same set of descriptors. Therefore, we only reallocate + * buffers when nfds changes. There is no hysteresis control, + * because there is no data to suggest that this is necessary; + * the penalty of reallocating is not *that* great in any event. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pollfdp; + ps->ps_nfds = nfds; + } + + pollfdp = ps->ps_pollfd; + if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { + return (EFAULT); + } + + if (fds == NULL) { + /* + * If the process has page 0 mapped, then the copyin() above + * will succeed even if fds is NULL. However, our cached + * poll lists are keyed by the address of the passed-in fds + * structure, and we use the value NULL to indicate an unused + * poll cache list entry. As such, we elect not to support + * NULL as a valid (user) memory address and fail the poll() + * call. 
+ */ + return (EFAULT); + } + return (0); +} + +int +poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp, + int *fdcnt) { kthread_t *t = curthread; - klwp_t *lwp = ttolwp(t); - proc_t *p = ttoproc(t); - int fdcnt = 0; - int i; hrtime_t deadline; /* hrtime value when we want to return */ pollfd_t *pollfdp; - pollstate_t *ps; pollcache_t *pcp; int error = 0; - nfds_t old_nfds; int cacheindex = 0; /* which cache set is used */ /* @@ -340,33 +377,34 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) deadline = -1; } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { deadline = 0; + } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) { + /* Use an indefinite timeout if tv_sec would cause overflow */ + deadline = -1; } else { + /* + * The above check, when combined with the protections offered + * by itimerspecfix (ensuring that neither field is negative + * and that tv_nsec represents less than a whole second), will + * prevent overflow during the conversion from timespec_t to + * uhrtime_t. + */ + uhrtime_t utime = tsp->tv_sec * NANOSEC; + utime += tsp->tv_nsec; + /* They must wait at least a tick. */ - deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; - deadline = MAX(deadline, nsec_per_tick); - deadline += gethrtime(); - } + utime = MAX(utime, nsec_per_tick); - /* - * Reset our signal mask, if requested. - */ - if (ksetp != NULL) { - mutex_enter(&p->p_lock); - schedctl_finish_sigblock(t); - lwp->lwp_sigoldmask = t->t_hold; - t->t_hold = *ksetp; - t->t_flag |= T_TOMASK; /* - * Call cv_reltimedwait_sig() just to check for signals. - * We will return immediately with either 0 or -1. + * Since utime has an upper bound of HRTIME_MAX, adding the + * gethrtime() result cannot incur an overflow as the unsigned + * type has an adequate bound. */ - if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, - TR_CLOCK_TICK)) { - mutex_exit(&p->p_lock); - error = EINTR; - goto pollout; + utime += (uhrtime_t)gethrtime(); + if (utime > HRTIME_MAX) { + deadline = -1; + } else { + deadline = (hrtime_t)utime; } - mutex_exit(&p->p_lock); } /* @@ -374,6 +412,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * If yes then bypass all the other stuff and make it sleep. */ if (nfds == 0) { + *fdcnt = 0; /* * Sleep until we have passed the requested future * time or until interrupted by a signal. @@ -385,66 +424,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) &t->t_delay_lock, deadline)) > 0) continue; mutex_exit(&t->t_delay_lock); - error = (error == 0) ? EINTR : 0; + return ((error == 0) ? EINTR : 0); } - goto pollout; - } - - if (nfds > p->p_fno_ctl) { - mutex_enter(&p->p_lock); - (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], - p->p_rctls, p, RCA_SAFE); - mutex_exit(&p->p_lock); - error = EINVAL; - goto pollout; - } - - /* - * Need to allocate memory for pollstate before anything because - * the mutex and cv are created in this space - */ - ps = pollstate_create(); - - if (ps->ps_pcache == NULL) - ps->ps_pcache = pcache_alloc(); - pcp = ps->ps_pcache; - - /* - * NOTE: for performance, buffers are saved across poll() calls. - * The theory is that if a process polls heavily, it tends to poll - * on the same set of descriptors. Therefore, we only reallocate - * buffers when nfds changes. There is no hysteresis control, - * because there is no data to suggest that this is necessary; - * the penalty of reallocating is not *that* great in any event. 
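/*
 * A standalone restatement (mirroring the deadline logic above, illustrative
 * only) of the timespec-to-deadline conversion: NULL means wait forever, an
 * all-zero timespec means poll and return immediately, and any value whose
 * conversion would overflow hrtime_t is treated as an indefinite wait.
 */
#include <sys/time.h>
#include <sys/sysmacros.h>

extern int nsec_per_tick;	/* provided by the clock subsystem */

static hrtime_t
poll_deadline(const timespec_t *tsp)
{
	uhrtime_t utime;

	if (tsp == NULL)
		return (-1);
	if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
		return (0);
	if (tsp->tv_sec >= HRTIME_MAX / NANOSEC)
		return (-1);

	utime = (uhrtime_t)tsp->tv_sec * NANOSEC + tsp->tv_nsec;
	utime = MAX(utime, nsec_per_tick);	/* wait at least one tick */
	utime += (uhrtime_t)gethrtime();

	return (utime > HRTIME_MAX ? -1 : (hrtime_t)utime);
}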
- */ - old_nfds = ps->ps_nfds; - if (nfds != old_nfds) { - - kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); - pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); - ps->ps_pollfd = pollfdp; - ps->ps_nfds = nfds; + return (0); } + VERIFY(ps != NULL); pollfdp = ps->ps_pollfd; - if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { - error = EFAULT; - goto pollout; - } - - if (fds == NULL) { - /* - * If the process has page 0 mapped, then the copyin() above - * will succeed even if fds is NULL. However, our cached - * poll lists are keyed by the address of the passed-in fds - * structure, and we use the value NULL to indicate an unused - * poll cache list entry. As such, we elect not to support - * NULL as a valid (user) memory address and fail the poll() - * call. - */ - error = EINVAL; - goto pollout; - } + VERIFY(pollfdp != NULL); /* * If this thread polls for the first time, allocate ALL poll @@ -460,10 +447,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) /* * poll and cache this poll fd list in ps_pcacheset[0]. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } else { pollcacheset_t *pcset = ps->ps_pcacheset; @@ -488,11 +475,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * the callee will guarantee the consistency * of cached poll list and cache content. */ - error = pcacheset_resolve(ps, nfds, &fdcnt, + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -509,11 +496,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * found an unused entry. Use it to cache * this poll list. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); - if (fdcnt || error) { + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -527,10 +514,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) cacheindex = pcacheset_replace(ps); ASSERT(cacheindex < ps->ps_nsets); pcset[cacheindex].pcs_usradr = (uintptr_t)fds; - error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } } @@ -548,8 +535,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); for (;;) { pcp->pc_flag = 0; - error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&pcp->pc_lock); mutex_exit(&ps->ps_lock); break; @@ -595,13 +582,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); } + return (error); +} + +/* + * This is the system call trap that poll(), + * select() and pselect() are built upon. + * It is a private interface between libc and the kernel. 
+ */ +int +pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + timespec_t ts; + timespec_t *tsp; + k_sigset_t kset; + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + int error = 0, fdcnt = 0; + + /* + * Copy in timeout + */ + if (timeoutp == NULL) { + tsp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + /* + * Copy in and reset signal mask, if requested. + */ + if (setp != NULL) { + sigset_t set; + + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = kset; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + * If nfds == 0, we will skip all of the copying and check steps and + * proceed directly into poll_common to process the supplied timeout. + */ + if (nfds != 0) { + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + if ((error = poll_copyin(ps, fds, nfds)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + pollout: /* - * If we changed the signal mask but we received - * no signal then restore the signal mask. - * Otherwise psig() will deal with the signal mask. + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. */ - if (ksetp != NULL) { + if (setp != NULL) { mutex_enter(&p->p_lock); if (lwp->lwp_cursig == 0) { t->t_hold = lwp->lwp_sigoldmask; @@ -612,12 +702,10 @@ pollout: if (error) return (set_errno(error)); - /* * Copy out the events and return the fdcnt to the user. */ - if (nfds != 0 && - copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) + if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) return (set_errno(EFAULT)); #ifdef DEBUG @@ -625,7 +713,7 @@ pollout: * Another sanity check: */ if (fdcnt) { - int reventcnt = 0; + int i, reventcnt = 0; for (i = 0; i < nfds; i++) { if (pollfdp[i].fd < 0) { @@ -638,6 +726,8 @@ pollout: } ASSERT(fdcnt == reventcnt); } else { + int i; + for (i = 0; i < nfds; i++) { ASSERT(pollfdp[i].revents == 0); } @@ -648,52 +738,6 @@ pollout: } /* - * This is the system call trap that poll(), - * select() and pselect() are built upon. - * It is a private interface between libc and the kernel. 
- */ -int -pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) -{ - timespec_t ts; - timespec_t *tsp; - sigset_t set; - k_sigset_t kset; - k_sigset_t *ksetp; - model_t datamodel = get_udatamodel(); - - if (timeoutp == NULL) - tsp = NULL; - else { - if (datamodel == DATAMODEL_NATIVE) { - if (copyin(timeoutp, &ts, sizeof (ts))) - return (set_errno(EFAULT)); - } else { - timespec32_t ts32; - - if (copyin(timeoutp, &ts32, sizeof (ts32))) - return (set_errno(EFAULT)); - TIMESPEC32_TO_TIMESPEC(&ts, &ts32) - } - - if (itimerspecfix(&ts)) - return (set_errno(EINVAL)); - tsp = &ts; - } - - if (setp == NULL) - ksetp = NULL; - else { - if (copyin(setp, &set, sizeof (set))) - return (set_errno(EFAULT)); - sigutok(&set, &kset); - ksetp = &kset; - } - - return (poll_common(fds, nfds, tsp, ksetp)); -} - -/* * Clean up any state left around by poll(2). Called when a thread exits. */ void @@ -1277,8 +1321,8 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, * be OK too. */ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, - &memphp, NULL); + error = VOP_POLL(fp->f_vnode, pollfdp->events | ps->ps_implicit_ev, 0, + &pollfdp->revents, &memphp, NULL); if (error) { return (error); } @@ -1992,7 +2036,8 @@ retry: * flag. */ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, + error = VOP_POLL(fp->f_vnode, + pollfdp[entry].events | ps->ps_implicit_ev, 0, &pollfdp[entry].revents, &php, NULL); /* * releasef after completely done with this cached @@ -2291,6 +2336,7 @@ pollstate_create() } else { ASSERT(ps->ps_depth == 0); ASSERT(ps->ps_flags == 0); + ASSERT(ps->ps_implicit_ev == 0); ASSERT(ps->ps_pc_stack[0] == 0); } return (ps); @@ -3025,7 +3071,7 @@ plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, php = NULL; ASSERT(curthread->t_pollcache == NULL); error = VOP_POLL(fp->f_vnode, - pollfdp[i].events, 0, + pollfdp[i].events | psp->ps_implicit_ev, 0, &pollfdp[i].revents, &php, NULL); if (error) { return (error); diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e0e63f4c0..09f3266ab4 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c index a28894b2c9..23f03e841d 100644 --- a/usr/src/uts/common/syscall/rw.c +++ b/usr/src/uts/common/syscall/rw.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -50,6 +50,7 @@ #include <sys/debug.h> #include <sys/rctl.h> #include <sys/nbmlock.h> +#include <sys/limits.h> #define COPYOUT_MAX_CACHE (1<<17) /* 128K */ @@ -607,19 +608,12 @@ out: return (bcount); } -/* - * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr.... - * XXX -- However, SVVS expects readv() and writev() to fail if - * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source), - * XXX -- so I guess that's the "interface". 
- */ -#define DEF_IOV_MAX 16 - ssize_t readv(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -768,6 +794,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -777,7 +805,8 @@ ssize_t writev(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. 
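/*
 * The allocation pattern adopted above, shown in isolation (illustrative;
 * the helper and its interface are invented for this sketch): vectors of up
 * to IOV_MAX_STACK entries use a caller-provided on-stack array, larger ones
 * fall back to kmem_alloc(), and the allocation must be released on every
 * error path as well as after the I/O completes.
 */
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/limits.h>

static int
iov_fetch(const struct iovec *uiovp, int iovcnt, struct iovec *stackbuf,
    struct iovec **aiovp, int *aiovlenp)
{
	struct iovec *aiov = stackbuf;
	int aiovlen = 0;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (EINVAL);

	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}

	if (copyin(uiovp, aiov, iovcnt * sizeof (iovec_t)) != 0) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);	/* free on error too */
		return (EFAULT);
	}

	*aiovp = aiov;
	*aiovlenp = aiovlen;
	return (0);
}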
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen = aiov32[i].iov_len; count32 += iovlen; - if (iovlen < 0 || count32 < 0) + if (iovlen < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -917,6 +977,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -936,25 +999,35 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int error = 0; int i; + /* + * In a 64-bit kernel, this interface supports native 64-bit + * applications as well as 32-bit applications using both standard and + * large-file access. For 32-bit large-file aware applications, the + * offset is passed as two parameters which are joined into the actual + * offset used. The 64-bit libc always passes 0 for the extended_offset. + * Note that off_t is a signed value, but the preadv/pwritev API treats + * the offset as a position in the file for the operation, so passing + * a negative value will likely fail the maximum offset checks below + * because we convert it to an unsigned value which will be larger than + * the maximum valid offset. + */ #if defined(_SYSCALL32_IMPL) || defined(_ILP32) u_offset_t fileoff = ((u_offset_t)extended_offset << 32) | (u_offset_t)offset; #else /* _SYSCALL32_IMPL || _ILP32 */ u_offset_t fileoff = (u_offset_t)(ulong_t)offset; #endif /* _SYSCALL32_IMPR || _ILP32 */ -#ifdef _SYSCALL32_IMPL - const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 && - extended_offset == 0? 
- MAXOFF32_T : MAXOFFSET_T; -#else /* _SYSCALL32_IMPL */ - const u_offset_t maxoff = MAXOFF32_T; -#endif /* _SYSCALL32_IMPL */ int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -962,61 +1035,104 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; } vp = fp->f_vnode; rwflag = 0; - if (vp->v_type == VREG) { + /* + * Behaviour is same as read(2). Please see comments in read(2). + */ + if (vp->v_type == VREG) { if (bcount == 0) goto out; - /* - * return EINVAL for offsets that cannot be - * represented in an off_t. - */ - if (fileoff > maxoff) { - error = EINVAL; + /* Handle offset past maximum offset allowed for file. */ + if (fileoff >= OFFSET_MAX(fp)) { + struct vattr va; + va.va_mask = AT_SIZE; + + error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL); + if (error == 0) { + if (fileoff >= va.va_size) { + count = 0; + } else { + error = EOVERFLOW; + } + } goto out; } - if (fileoff + bcount > maxoff) - bcount = (ssize_t)((u_offset_t)maxoff - fileoff); + ASSERT(bcount == count); + + /* Note: modified count used in nbl_conflict() call below. 
*/ + if ((fileoff + count) > OFFSET_MAX(fp)) + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } else if (vp->v_type == VFIFO) { error = ESPIPE; goto out; @@ -1033,8 +1149,7 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = nbl_svmand(vp, fp->f_cred, &svmand); if (error != 0) goto out; - if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, - NULL)) { + if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) { error = EACCES; goto out; } @@ -1042,33 +1157,6 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, (void) VOP_RWLOCK(vp, rwflag, NULL); - /* - * Behaviour is same as read(2). Please see comments in - * read(2). - */ - - if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) { - struct vattr va; - va.va_mask = AT_SIZE; - if ((error = - VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) { - VOP_RWUNLOCK(vp, rwflag, NULL); - goto out; - } - if (fileoff >= va.va_size) { - VOP_RWUNLOCK(vp, rwflag, NULL); - count = 0; - goto out; - } else { - VOP_RWUNLOCK(vp, rwflag, NULL); - error = EOVERFLOW; - goto out; - } - } - if ((vp->v_type == VREG) && - (fileoff + count > OFFSET_MAX(fp))) { - count = (ssize_t)(OFFSET_MAX(fp) - fileoff); - } auio.uio_loffset = fileoff; auio.uio_iov = aiov; auio.uio_iovcnt = iovcnt; @@ -1099,6 +1187,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -1109,7 +1199,8 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -1118,25 +1209,26 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int error = 0; int i; + /* + * See the comment in preadv for how the offset is handled. + */ #if defined(_SYSCALL32_IMPL) || defined(_ILP32) u_offset_t fileoff = ((u_offset_t)extended_offset << 32) | (u_offset_t)offset; #else /* _SYSCALL32_IMPL || _ILP32 */ u_offset_t fileoff = (u_offset_t)(ulong_t)offset; #endif /* _SYSCALL32_IMPR || _ILP32 */ -#ifdef _SYSCALL32_IMPL - const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 && - extended_offset == 0? - MAXOFF32_T : MAXOFFSET_T; -#else /* _SYSCALL32_IMPL */ - const u_offset_t maxoff = MAXOFF32_T; -#endif /* _SYSCALL32_IMPL */ int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -1144,58 +1236,92 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. 
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; } vp = fp->f_vnode; rwflag = 1; - if (vp->v_type == VREG) { + /* + * The kernel's write(2) code checks the rctl & OFFSET_MAX and returns + * EFBIG when fileoff exceeds either limit. We do the same. + */ + if (vp->v_type == VREG) { if (bcount == 0) goto out; /* - * return EINVAL for offsets that cannot be - * represented in an off_t. + * Don't allow pwritev to cause file size to exceed the proper + * offset limit. */ - if (fileoff > maxoff) { - error = EINVAL; + if (fileoff >= OFFSET_MAX(fp)) { + error = EFBIG; goto out; } + /* * Take appropriate action if we are trying * to write above the resource limit. @@ -1218,17 +1344,13 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = EFBIG; goto out; } - /* - * Don't allow pwritev to cause file sizes to exceed - * maxoff. - */ - if (fileoff == maxoff) { - error = EFBIG; - goto out; - } - if (fileoff + bcount > maxoff) - bcount = (ssize_t)((u_offset_t)maxoff - fileoff); + ASSERT(bcount == count); + + /* Note: modified count used in nbl_conflict() call below. */ + if ((fileoff + count) > OFFSET_MAX(fp)) + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + } else if (vp->v_type == VFIFO) { error = ESPIPE; goto out; @@ -1245,8 +1367,7 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, error = nbl_svmand(vp, fp->f_cred, &svmand); if (error != 0) goto out; - if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, - NULL)) { + if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) { error = EACCES; goto out; } @@ -1254,34 +1375,6 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, (void) VOP_RWLOCK(vp, rwflag, NULL); - - /* - * Behaviour is same as write(2). Please see comments for - * write(2). 
- */ - - if (vp->v_type == VREG) { - if (fileoff >= curproc->p_fsz_ctl) { - VOP_RWUNLOCK(vp, rwflag, NULL); - mutex_enter(&curproc->p_lock); - /* see above rctl_action comment */ - (void) rctl_action( - rctlproc_legacy[RLIMIT_FSIZE], - curproc->p_rctls, - curproc, RCA_UNSAFE_SIGINFO); - mutex_exit(&curproc->p_lock); - error = EFBIG; - goto out; - } - if (fileoff >= OFFSET_MAX(fp)) { - VOP_RWUNLOCK(vp, rwflag, NULL); - error = EFBIG; - goto out; - } - if (fileoff + count > OFFSET_MAX(fp)) - count = (ssize_t)(OFFSET_MAX(fp) - fileoff); - } - auio.uio_loffset = fileoff; auio.uio_iov = aiov; auio.uio_iovcnt = iovcnt; @@ -1308,6 +1401,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index 0cfafbf13f..16c6fdd27e 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *); * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. */ -int +static int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { @@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, return (0); } -ssize32_t +static ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { @@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, } #endif -int +static int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { @@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, } -int +static int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { @@ -1174,6 +1174,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, } else { maxblk = (int)vp->v_stream->sd_maxblk; } + + /* + * We need to make sure that the socket that we're sending on + * supports sendfile behavior. sockfs doesn't know that the APIs + * we want to use are coming from sendfile, so we can't rely on + * it to check for us. + */ + if ((so->so_mode & SM_SENDFILESUPP) == 0) { + error = EOPNOTSUPP; + goto err; + } break; case VREG: break; diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c index 4085104cc7..93f26121bc 100644 --- a/usr/src/uts/common/syscall/stat.c +++ b/usr/src/uts/common/syscall/stat.c @@ -61,7 +61,7 @@ * to VOP_GETATTR */ -static int +int cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred) { vnode_t *startvp; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 03f2fabe13..e09f4e85a2 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -111,6 +112,9 @@ sysconfig(int which) case _CONFIG_NPROC_MAX: return (max_ncpus); + case _CONFIG_NPROC_NCPU: + return (NCPU); /* Private sysconfig for direct NCPU access */ + case _CONFIG_STACK_PROT: return (curproc->p_stkprot & ~PROT_USER); @@ -167,44 +171,29 @@ sysconfig(int which) /* * If the non-global zone has a phys. memory cap, use that. * We always report the system-wide value for the global zone, - * even though rcapd can be used on the global zone too. + * even though memory capping can be used on the global zone + * too. */ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), - physinstalled)); + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); + return (MIN(cap, physinstalled)); + } return (physinstalled); case _CONFIG_AVPHYS_PAGES: /* - * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always - * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * If the non-global zone has a phys. memory cap, use its + * free value. We always report the system-wide value for the + * global zone, even though memory capping can be used on the + * global zone too. */ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { - pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - - cap = btop(curproc->p_zone->zone_phys_mcap); - if (cap > physinstalled) - return (freemem); - - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); - /* - * Because rcapd implements a soft cap, it is possible - * for rss to be temporarily over the cap. - */ - if (cap > rss) - free = cap - rss; - else - free = 0; + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); return (MIN(free, freemem)); } diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 858305504d..dfe7f22d44 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. */ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -108,7 +108,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -245,12 +245,13 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. 
*/ if (ttoproc(curthread) != &p0) { + mutex_enter(&curproc->p_lock); VN_RELE(PTOU(curproc)->u_cdir); if (PTOU(curproc)->u_rdir) VN_RELE(PTOU(curproc)->u_rdir); @@ -260,6 +261,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) PTOU(curproc)->u_cdir = rootdir; PTOU(curproc)->u_rdir = NULL; PTOU(curproc)->u_cwd = NULL; + mutex_exit(&curproc->p_lock); } /* diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c index a2deedb163..b25f89b6d5 100644 --- a/usr/src/uts/common/syscall/umount.c +++ b/usr/src/uts/common/syscall/umount.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -125,6 +126,7 @@ umount2(char *pathp, int flag) struct pathname pn; struct vfs *vfsp; int error; + boolean_t altroot; /* * Some flags are disallowed through the system call interface. @@ -154,9 +156,12 @@ umount2(char *pathp, int flag) * isn't in an environment with an alternate root (to the zone's root) * directory, i.e. chroot(2). */ - if (secpolicy_fs_unmount(CRED(), NULL) != 0 || - (PTOU(curproc)->u_rdir != NULL && - PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) || + mutex_enter(&curproc->p_lock); + altroot = (PTOU(curproc)->u_rdir != NULL && + PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp); + mutex_exit(&curproc->p_lock); + + if (secpolicy_fs_unmount(CRED(), NULL) != 0 || altroot || (vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) { vnode_t *fsrootvp; diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index a2509e7bb6..3735139068 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. * * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 8747b96acc..ae9b0be758 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -229,6 +230,7 @@ struct as; * p_nrm * p_mapping * p_share + * p_zoneid * * The following field is file system dependent. How it is used and * the locking strategies applied are up to the individual file system @@ -527,9 +529,8 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ -#if defined(_LP64) - uint_t p_sharepad; /* pad for growing p_share */ -#endif + short p_zoneid; /* zone page use tracking */ + short p_pad1; /* TBD */ uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c index 7e48602189..7305c9c85a 100644 --- a/usr/src/uts/common/vm/page_lock.c +++ b/usr/src/uts/common/vm/page_lock.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ @@ -140,9 +141,8 @@ static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE]; & (VPH_TABLE_SIZE - 1)) /* - * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes. - * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is - * VPH_TABLE_SIZE + 1. + * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes, + * one for kvps[KV_ZVP], and one for other kvps[] users. */ kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; @@ -888,10 +888,10 @@ static int page_vnode_mutex_stress = 0; kmutex_t * page_vnode_mutex(vnode_t *vp) { - if (vp == &kvp) + if (vp == &kvp || vp == &kvps[KV_VVP]) return (&vph_mutex[VPH_TABLE_SIZE + 0]); - if (vp == &zvp) + if (vp == &kvps[KV_ZVP]) return (&vph_mutex[VPH_TABLE_SIZE + 1]); #ifdef DEBUG if (page_vnode_mutex_stress != 0) diff --git a/usr/src/uts/common/vm/page_retire.c b/usr/src/uts/common/vm/page_retire.c index 76be970a45..f4e8d0737f 100644 --- a/usr/src/uts/common/vm/page_retire.c +++ b/usr/src/uts/common/vm/page_retire.c @@ -22,6 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -851,9 +852,8 @@ page_retire_incr_pend_count(void *datap) { PR_INCR_KSTAT(pr_pending); - if ((datap == &kvp) || (datap == &zvp)) { + if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP]) PR_INCR_KSTAT(pr_pending_kas); - } } void @@ -861,9 +861,8 @@ page_retire_decr_pend_count(void *datap) { PR_DECR_KSTAT(pr_pending); - if ((datap == &kvp) || (datap == &zvp)) { + if (datap == &kvp || datap == &kvps[KV_ZVP] || datap == &kvps[KV_VVP]) PR_DECR_KSTAT(pr_pending_kas); - } } /* diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index 439c859d96..0b116d6eba 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -122,6 +122,11 @@ vmem_t *static_alloc_arena; /* arena for allocating static memory */ vmem_t *zio_arena = NULL; /* arena for allocating zio memory */ vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */ +#if defined(__amd64) +vmem_t *kvmm_arena; /* arena for vmm VA */ +struct seg kvmmseg; /* Segment for vmm memory */ +#endif + /* * seg_kmem driver can map part of the kernel heap with large pages. 
* Currently this functionality is implemented for sparc platforms only. @@ -440,7 +445,7 @@ segkmem_badop() /*ARGSUSED*/ static faultcode_t segkmem_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t size, - enum fault_type type, enum seg_rw rw) + enum fault_type type, enum seg_rw rw) { pgcnt_t npages; spgcnt_t pg; @@ -655,13 +660,19 @@ segkmem_dump(struct seg *seg) segkmem_dump_range, seg->s_as); vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT, segkmem_dump_range, seg->s_as); + /* + * We don't want to dump pages attached to kzioseg since they + * contain file data from ZFS. If this page's segment is + * kzioseg return instead of writing it to the dump device. + * + * Same applies to VM memory allocations. + */ } else if (seg == &kzioseg) { - /* - * We don't want to dump pages attached to kzioseg since they - * contain file data from ZFS. If this page's segment is - * kzioseg return instead of writing it to the dump device. - */ return; +#if defined(__amd64) + } else if (seg == &kvmmseg) { + return; +#endif } else { segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size); } @@ -677,7 +688,7 @@ segkmem_dump(struct seg *seg) /*ARGSUSED*/ static int segkmem_pagelock(struct seg *seg, caddr_t addr, size_t len, - page_t ***ppp, enum lock_type type, enum seg_rw rw) + page_t ***ppp, enum lock_type type, enum seg_rw rw) { page_t **pplist, *pp; pgcnt_t npages; @@ -802,21 +813,18 @@ struct seg_ops segkmem_ops = { }; int -segkmem_zio_create(struct seg *seg) -{ - ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); - seg->s_ops = &segkmem_ops; - seg->s_data = &zvp; - kas.a_size += seg->s_size; - return (0); -} - -int segkmem_create(struct seg *seg) { ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock)); seg->s_ops = &segkmem_ops; - seg->s_data = &kvp; + if (seg == &kzioseg) + seg->s_data = &kvps[KV_ZVP]; +#if defined(__amd64) + else if (seg == &kvmmseg) + seg->s_data = &kvps[KV_VVP]; +#endif + else + seg->s_data = &kvps[KV_KVP]; kas.a_size += seg->s_size; return (0); } @@ -858,7 +866,7 @@ segkmem_page_create(void *addr, size_t size, int vmflag, void *arg) */ void * segkmem_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag, uint_t attr, - page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg) + page_t *(*page_create_func)(void *, size_t, int, void *), void *pcarg) { page_t *ppl; caddr_t addr = inaddr; @@ -968,10 +976,10 @@ segkmem_alloc(vmem_t *vmp, size_t size, int vmflag) return (segkmem_alloc_vn(vmp, size, vmflag, &kvp)); } -void * +static void * segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag) { - return (segkmem_alloc_vn(vmp, size, vmflag, &zvp)); + return (segkmem_alloc_vn(vmp, size, vmflag, &kvps[KV_ZVP])); } /* @@ -980,8 +988,8 @@ segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag) * we currently don't have a special kernel segment for non-paged * kernel memory that is exported by drivers to user space. 
*/ -static void -segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, +void +segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, void (*func)(page_t *)) { page_t *pp; @@ -1038,21 +1046,15 @@ segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp, } void -segkmem_xfree(vmem_t *vmp, void *inaddr, size_t size, void (*func)(page_t *)) -{ - segkmem_free_vn(vmp, inaddr, size, &kvp, func); -} - -void segkmem_free(vmem_t *vmp, void *inaddr, size_t size) { - segkmem_free_vn(vmp, inaddr, size, &kvp, NULL); + segkmem_xfree(vmp, inaddr, size, &kvp, NULL); } -void +static void segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size) { - segkmem_free_vn(vmp, inaddr, size, &zvp, NULL); + segkmem_xfree(vmp, inaddr, size, &kvps[KV_ZVP], NULL); } void @@ -1534,8 +1536,21 @@ segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size) ASSERT(zio_alloc_arena != NULL); } -#ifdef __sparc +#if defined(__amd64) + +void +segkmem_kvmm_init(void *base, size_t size) +{ + ASSERT(base != NULL); + ASSERT(size != 0); + + kvmm_arena = vmem_create("kvmm_arena", base, size, 1024 * 1024, + NULL, NULL, NULL, 0, VM_SLEEP); + + ASSERT(kvmm_arena != NULL); +} +#elif defined(__sparc) static void * segkmem_alloc_ppa(vmem_t *vmp, size_t size, int vmflag) diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h index 1db85826b1..9a20101670 100644 --- a/usr/src/uts/common/vm/seg_kmem.h +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -65,12 +65,18 @@ extern vmem_t *static_arena; /* arena for caches to import static memory */ extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ extern vmem_t *zio_arena; /* arena for zio caches */ extern vmem_t *zio_alloc_arena; /* arena for zio caches */ + +#if defined(__amd64) +extern struct seg kvmmseg; /* Segment for vmm mappings */ +extern vmem_t *kvmm_arena; /* arena for vmm VA */ +extern void segkmem_kvmm_init(void *, size_t); +#endif + extern struct vnode kvps[]; /* - * segkmem page vnodes + * segkmem page vnodes (please don't add more defines here...) 
*/ #define kvp (kvps[KV_KVP]) -#define zvp (kvps[KV_ZVP]) #if defined(__sparc) #define mpvp (kvps[KV_MPVP]) #define promvp (kvps[KV_PROMVP]) @@ -83,16 +89,14 @@ extern void *segkmem_xalloc(vmem_t *, void *, size_t, int, uint_t, extern void *segkmem_alloc(vmem_t *, size_t, int); extern void *segkmem_alloc_permanent(vmem_t *, size_t, int); extern void segkmem_free(vmem_t *, void *, size_t); -extern void segkmem_xfree(vmem_t *, void *, size_t, void (*)(page_t *)); +extern void segkmem_xfree(vmem_t *, void *, size_t, + struct vnode *, void (*)(page_t *)); extern void *boot_alloc(void *, size_t, uint_t); extern void boot_mapin(caddr_t addr, size_t size); extern void kernelheap_init(void *, void *, char *, void *, void *); extern void segkmem_gc(void); -extern void *segkmem_zio_alloc(vmem_t *, size_t, int); -extern int segkmem_zio_create(struct seg *); -extern void segkmem_zio_free(vmem_t *, void *, size_t); extern void segkmem_zio_init(void *, size_t); /* diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 8046d10212..da6393f792 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7313,7 +7313,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7338,11 +7339,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7394,7 +7395,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. 
*/ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index 853b092e6d..ec6d2b8920 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -58,6 +58,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/as.h> @@ -72,6 +73,8 @@ clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ +ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */ + static struct kmem_cache *as_cache; static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); @@ -853,8 +856,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, int as_lock_held; klwp_t *lwp = ttolwp(curthread); - - retry: /* * Indicate that the lwp is not to be stopped while waiting for a @@ -1724,6 +1725,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, p->p_rctls, p, RCA_UNSAFE_ALL); return (ENOMEM); } + + /* + * Keep the number of segments in a userspace AS constrained to + * a reasonable limit. Linux enforces a value slightly less + * than 64k in order to avoid ELF limits if/when a process + * dumps core. While SunOS avoids that specific problem with + * other tricks, the limit is still valuable to keep kernel + * memory consumption in check. + */ + if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) { + AS_LOCK_EXIT(as); + atomic_inc_32(&p->p_zone->zone_mfseglim); + return (ENOMEM); + } } if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 78d1cb1a58..abccf82057 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -22,6 +22,7 @@ * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net> * Copyright (c) 2015, 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -440,10 +441,26 @@ init_pages_pp_maximum() } } +/* + * In the past, we limited the maximum pages that could be gotten to essentially + * 1/2 of the total pages on the system. However, this is too conservative for + * some cases. For example, if we want to host a large virtual machine which + * needs to use a significant portion of the system's memory. In practice, + * allowing more than 1/2 of the total pages is fine, but becomes problematic + * as we approach or exceed 75% of the pages on the system. Thus, we limit the + * maximum to 23/32 of the total pages, which is ~72%. + */ void set_max_page_get(pgcnt_t target_total_pages) { - max_page_get = target_total_pages / 2; + max_page_get = (target_total_pages >> 5) * 23; + ASSERT3U(max_page_get, >, 0); +} + +pgcnt_t +get_max_page_get() +{ + return (max_page_get); } static pgcnt_t pending_delete; @@ -1460,6 +1477,8 @@ page_create_throttle(pgcnt_t npages, int flags) uint_t i; pgcnt_t tf; /* effective value of throttlefree */ + atomic_inc_64(&n_throttle); + /* * Normal priority allocations. */ @@ -1492,7 +1511,7 @@ page_create_throttle(pgcnt_t npages, int flags) tf = throttlefree - ((flags & PG_PUSHPAGE) ? 
pageout_reserve : 0); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); for (;;) { fm = 0; @@ -1579,7 +1598,7 @@ checkagain: } ASSERT(proc_pageout != NULL); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, "page_create_sleep_start: freemem %ld needfree %ld", @@ -2226,7 +2245,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } pp = rootpp; @@ -2355,7 +2374,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } /* diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 1b8d12eb8d..a206320a30 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -573,8 +581,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. + * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. 
*/ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -682,7 +698,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index e542e8e479..01c2666e91 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2018, Joyent, Inc. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. 
*/ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ 
-1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1595,8 +1648,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1617,13 +1669,42 @@ vmu_free_extra() extern kcondvar_t *pr_pid_cv; +static void +vmu_get_zone_rss(zoneid_t zid) +{ + vmu_zone_t *zone; + zone_t *zp; + int ret; + uint_t pgcnt; + + if ((zp = zone_find_by_id(zid)) == NULL) + return; + + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(zid); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + pgcnt = zone_pdata[zid].zpers_pg_cnt; + zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt); + zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap; + + zone_rele(zp); +} + /* * Determine which entity types are relevant and allocate the hashes to - * track them. Then walk the process table and count rss and swap - * for each process'es address space. Address space object such as - * vnodes, amps and anons are tracked per entity, so that they are - * not double counted in the results. - * + * track them. First get the zone rss using the data we already have. Then, + * if necessary, walk the process table and count rss and swap for each + * process'es address space. 
Address space object such as vnodes, amps and + * anons are tracked per entity, so that they are not double counted in the + * results. */ static void vmu_calculate() @@ -1631,6 +1712,7 @@ vmu_calculate() int i = 0; int ret; proc_t *p; + uint_t zone_flags = 0; vmu_clear_calc(); @@ -1638,9 +1720,34 @@ vmu_calculate() vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, ALL_ZONES); + zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS; + if (zone_flags != 0) { + /* + * Use the accurate zone RSS data we already keep track of. + */ + int i; + + for (i = 0; i <= MAX_ZONEID; i++) { + if (zone_pdata[i].zpers_pg_cnt > 0) { + vmu_get_zone_rss(i); + } + } + } + + /* If only neeeded zone data, we're done. */ + if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) { + return; + } + + DTRACE_PROBE(vmu__calculate__all); + vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS; + /* * Walk process table and calculate rss of each proc. * + * Since we already obtained all zone rss above, the following loop + * executes with the VMUSAGE_ZONE_FLAGS cleared. + * * Pidlock and p_lock cannot be held while doing the rss calculation. * This is because: * 1. The calculation allocates using KM_SLEEP. @@ -1695,6 +1802,12 @@ again: mutex_exit(&pidlock); vmu_free_extra(); + + /* + * Restore any caller-supplied zone flags we blocked during + * the process-table walk. + */ + vmu_data.vmu_calc_flags |= zone_flags; } /* @@ -1745,7 +1858,7 @@ vmu_cache_rele(vmu_cache_t *cache) */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1764,7 +1877,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. 
*/ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1827,26 +1940,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1902,10 +2022,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. 
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1925,6 +2047,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1935,6 +2061,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1954,7 +2095,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2011,7 +2152,8 @@ start: mutex_exit(&vmu_data.vmu_lock); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index 761597653b..c21476df89 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #ifdef DEBUG @@ -251,8 +252,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp) * because it doesn't cover all of the interesting cases :-( */ mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM); - - return (mac_fix_cksum(mp)); + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + return (mp); } mblk_t * |