diff options
Diffstat (limited to 'usr/src/uts/common/sys')
121 files changed, 5338 insertions, 395 deletions
| diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 8d26a71342..5a6d7a204c 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -23,6 +23,7 @@  # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.  # Copyright (c) 2018, Joyent, Inc.  # Copyright 2013 Garrett D'Amore <garrett@damore.org> +# Copyright 2015, Joyent, Inc. All rights reserved.  # Copyright 2013 Saso Kiselkov. All rights reserved.  # Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com>  # Copyright 2017 Nexenta Systems, Inc. @@ -258,6 +259,7 @@ CHKHDRS=			\  	flock.h			\  	flock_impl.h		\  	fork.h			\ +	frameio.h		\  	fss.h			\  	fsspriocntl.h		\  	fsid.h			\ @@ -283,6 +285,7 @@ CHKHDRS=			\  	idmap.h			\  	ieeefp.h		\  	id_space.h		\ +	inotify.h		\  	instance.h		\  	int_const.h		\  	int_fmtio.h		\ @@ -351,6 +354,7 @@ CHKHDRS=			\  	lgrp.h			\  	lgrp_user.h		\  	libc_kernel.h		\ +	limits.h		\  	link.h			\  	list.h			\  	list_impl.h		\ @@ -435,6 +439,9 @@ CHKHDRS=			\  	ontrap.h		\  	open.h			\  	openpromio.h		\ +	overlay.h		\ +	overlay_common.h	\ +	overlay_target.h	\  	panic.h			\  	param.h			\  	pathconf.h		\ @@ -659,6 +666,8 @@ CHKHDRS=			\  	vmem.h			\  	vmem_impl.h		\  	vmsystm.h		\ +	vnd.h			\ +	vnd_errno.h		\  	vnic.h			\  	vnic_impl.h		\  	vnode.h			\ @@ -670,11 +679,13 @@ CHKHDRS=			\  	vuid_queue.h		\  	vuid_state.h		\  	vuid_store.h		\ +	vxlan.h			\  	wait.h			\  	waitq.h			\  	watchpoint.h		\  	winlockio.h		\  	zcons.h			\ +	zfd.h			\  	zone.h			\  	xti_inet.h		\  	xti_osi.h		\ @@ -840,13 +851,14 @@ FSHDRS=				\  	autofs.h		\  	decomp.h		\  	dv_node.h		\ -	sdev_impl.h		\  	fifonode.h		\  	hsfs_isospec.h		\  	hsfs_node.h		\  	hsfs_rrip.h		\  	hsfs_spec.h		\  	hsfs_susp.h		\ +	hyprlofs.h		\ +	hyprlofs_info.h		\  	lofs_info.h		\  	lofs_node.h		\  	mntdata.h		\ @@ -856,6 +868,8 @@ FSHDRS=				\  	pc_label.h		\  	pc_node.h		\  	pxfs_ki.h		\ +	sdev_impl.h		\ +	sdev_plugin.h		\  	snode.h			\  	swapnode.h		\  	
tmp.h			\ @@ -980,6 +994,7 @@ SATAGENHDRS=		\  SYSEVENTHDRS=		\  	ap_driver.h     \ +	datalink.h	\  	dev.h		\  	domain.h        \  	dr.h            \ diff --git a/usr/src/uts/common/sys/acct.h b/usr/src/uts/common/sys/acct.h index f00884681b..e01ad61025 100644 --- a/usr/src/uts/common/sys/acct.h +++ b/usr/src/uts/common/sys/acct.h @@ -22,6 +22,7 @@  /*   * Copyright 2014 Garrett D'Amore <garrett@damore.org>   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -88,7 +89,7 @@ extern int acct(const char *);  #if defined(_KERNEL) -void	acct(char); +void	acct(int);  int	sysacct(char *);  struct vnode; diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 547c9cc241..80733aa31e 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,8 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc  All rights reserved. + * Copyright 2018 Joyent, Inc.   */  #ifndef	_SYS_AGGR_IMPL_H @@ -54,25 +56,47 @@ extern "C" {   */  #define	MAC_PSEUDO_RING_INUSE	0x01 +#define	MAX_GROUPS_PER_PORT	128 + +/* + * VLAN filters placed on the Rx pseudo group. 
+ */ +typedef struct aggr_vlan { +	list_node_t	av_link; +	uint16_t	av_vid;		/* VLAN ID */ +	uint_t		av_refs;	/* num aggr clients using this VID */ +} aggr_vlan_t; +  typedef struct aggr_unicst_addr_s {  	uint8_t				aua_addr[ETHERADDRL];  	struct aggr_unicst_addr_s	*aua_next;  } aggr_unicst_addr_t;  typedef struct aggr_pseudo_rx_ring_s { -	mac_ring_handle_t	arr_rh;	/* filled in by aggr_fill_ring() */ -	struct aggr_port_s	*arr_port; -	mac_ring_handle_t	arr_hw_rh; -	uint_t			arr_flags; -	uint64_t		arr_gen; +	mac_ring_handle_t		arr_rh;	/* set by aggr_fill_ring() */ +	struct aggr_port_s		*arr_port; +	struct aggr_pseudo_rx_group_s	*arr_grp; +	mac_ring_handle_t		arr_hw_rh; +	uint_t				arr_flags; +	uint64_t			arr_gen;  } aggr_pseudo_rx_ring_t; +/* + * An aggr pseudo group abstracts the underlying ports' HW groups. For + * example, if each port has 8 groups (mac_group_t), then the aggr + * will create 8 pseudo groups. Each pseudo group represents a + * collection of HW groups: one group from each port. If you have + * three ports then the pseudo group stands in for three HW groups. 
+ */  typedef struct aggr_pseudo_rx_group_s { +	uint_t			arg_index;  	struct aggr_grp_s	*arg_grp; /* filled in by aggr_fill_group() */  	mac_group_handle_t	arg_gh;   /* filled in by aggr_fill_group() */  	aggr_unicst_addr_t	*arg_macaddr;  	aggr_pseudo_rx_ring_t	arg_rings[MAX_RINGS_PER_GROUP];  	uint_t			arg_ring_cnt; +	uint_t			arg_untagged; /* num clients untagged */ +	list_t			arg_vlans;    /* VLANs on this group */  } aggr_pseudo_rx_group_t;  typedef struct aggr_pseudo_tx_ring_s { @@ -106,12 +130,13 @@ typedef struct aggr_port_s {  			lp_collector_enabled : 1,  			lp_promisc_on : 1,  			lp_no_link_update : 1, -			lp_rx_grp_added : 1,  			lp_tx_grp_added : 1,  			lp_closing : 1, -			lp_pad_bits : 24; +			lp_pad_bits : 25;  	mac_handle_t	lp_mh; -	mac_client_handle_t lp_mch; + +	mac_client_handle_t	lp_mch; +  	const mac_info_t *lp_mip;  	mac_notify_handle_t lp_mnh;  	uint_t		lp_tx_idx;		/* idx in group's tx array */ @@ -123,13 +148,19 @@ typedef struct aggr_port_s {  	aggr_lacp_port_t lp_lacp;		/* LACP state */  	lacp_stats_t	lp_lacp_stats;  	uint32_t	lp_margin; -	mac_promisc_handle_t lp_mphp; +  	mac_unicast_handle_t lp_mah;  	/* List of non-primary addresses that requires promiscous mode set */  	aggr_unicst_addr_t	*lp_prom_addr; -	/* handle of the underlying HW RX group */ -	mac_group_handle_t	lp_hwgh; + +	/* +	 * References to the underlying HW Rx groups of this port. +	 * Used by aggr to program HW classification for the pseudo +	 * groups. 
+	 */ +	mac_group_handle_t	lp_hwghs[MAX_GROUPS_PER_PORT]; +  	int			lp_tx_ring_cnt;  	/* handles of the underlying HW TX rings */  	mac_ring_handle_t	*lp_tx_rings; @@ -176,7 +207,7 @@ typedef struct aggr_grp_s {  			lg_lso : 1,  			lg_pad_bits : 8;  	aggr_port_t	*lg_ports;		/* list of configured ports */ -	aggr_port_t	*lg_mac_addr_port; +	aggr_port_t	*lg_mac_addr_port;	/* using address of this port */  	mac_handle_t	lg_mh;  	zoneid_t	lg_zoneid;  	uint_t		lg_nattached_ports; @@ -186,11 +217,18 @@ typedef struct aggr_grp_s {  	uint_t		lg_tx_ports_size;	/* size of lg_tx_ports */  	uint32_t	lg_tx_policy;		/* outbound policy */  	uint8_t		lg_mac_tx_policy; -	uint64_t	lg_ifspeed;  	link_state_t	lg_link_state; + + +	/* +	 * The lg_stat_lock must be held when accessing these fields. +	 */ +	kmutex_t	lg_stat_lock; +	uint64_t	lg_ifspeed;  	link_duplex_t	lg_link_duplex;  	uint64_t	lg_stat[MAC_NSTAT];  	uint64_t	lg_ether_stat[ETHER_NSTAT]; +  	aggr_lacp_mode_t lg_lacp_mode;		/* off, active, or passive */  	Agg_t		aggr;			/* 802.3ad data */  	uint32_t	lg_hcksum_txflags; @@ -213,7 +251,9 @@ typedef struct aggr_grp_s {  	kthread_t	*lg_lacp_rx_thread;  	boolean_t	lg_lacp_done; -	aggr_pseudo_rx_group_t	lg_rx_group; +	uint_t			lg_rx_group_count; +	aggr_pseudo_rx_group_t	lg_rx_groups[MAX_GROUPS_PER_PORT]; +  	aggr_pseudo_tx_group_t	lg_tx_group;  	kmutex_t	lg_tx_flowctl_lock; @@ -335,8 +375,11 @@ extern void aggr_grp_port_hold(aggr_port_t *);  extern void aggr_grp_port_rele(aggr_port_t *);  extern void aggr_grp_port_wait(aggr_grp_t *); -extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); -extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); +extern int aggr_port_addmac(aggr_port_t *, uint_t, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, uint_t, const uint8_t *); + +extern int aggr_port_addvlan(aggr_port_t *, uint_t, uint16_t); +extern int aggr_port_remvlan(aggr_port_t *, uint_t, uint16_t);  extern mblk_t *aggr_ring_tx(void *, mblk_t *);  extern 
mblk_t *aggr_find_tx_ring(void *, mblk_t *, diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 1fb5011970..b3b2898987 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -78,6 +78,9 @@ typedef struct {  #define	AT_FLAGS	8	/* processor flags */  #define	AT_ENTRY	9	/* a.out entry point */ +/* First introduced on Linux */ +#define	AT_RANDOM	25	/* address of 16 random bytes */ +  /*   * These relate to the original PPC ABI document; Linux reused   * the values for other things (see below), so disambiguation of @@ -90,19 +93,18 @@ typedef struct {   * These are the values from LSB 1.3, the first five are also described   * in the draft amd64 ABI.   * - * At the time of writing, Solaris doesn't place any of these values into - * the aux vector, except AT_CLKTCK which is placed on the aux vector for - * lx branded processes; also, we do similar things via AT_SUN_ values. + * At the time of writing, illumos doesn't place any of these values into the + * aux vector, except where noted. We do similar things via AT_SUN_ values.   *   * AT_NOTELF		10	program is not ELF? - * AT_UID		11	real user id - * AT_EUID		12	effective user id - * AT_GID		13	real group id - * AT_EGID		14	effective group id + * AT_UID		11	real user id (provided in LX) + * AT_EUID		12	effective user id (provided in LX) + * AT_GID		13	real group id (provided in LX) + * AT_EGID		14	effective group id (provided in LX)   *   * AT_PLATFORM		15   * AT_HWCAP		16 - * AT_CLKTCK		17	c.f. _SC_CLK_TCK + * AT_CLKTCK		17	c.f. _SC_CLK_TCK (provided in LX)   * AT_FPUCW		18   *   * AT_DCACHEBSIZE	19	(moved from 10) @@ -110,6 +112,16 @@ typedef struct {   * AT_UCACHEBSIZE	21	(moved from 12)   *   * AT_IGNOREPPC		22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE		23	secure mode boolean (provided in LX) + * AT_BASE_PLATFORM	24	string identifying real platform, may + *				differ from AT_PLATFORM. 
+ * AT_HWCAP2		26	extension of AT_HWCAP + * AT_EXECFN		31	filename of program + * AT_SYSINFO		32 + * AT_SYSINFO_EHDR	33	The vDSO location   */  /* @@ -186,6 +198,8 @@ extern uint_t getisax(uint32_t *, uint_t);  #define	AT_SUN_BRAND_AUX1	2020  #define	AT_SUN_BRAND_AUX2	2021  #define	AT_SUN_BRAND_AUX3	2022 +#define	AT_SUN_BRAND_AUX4	2025 +#define	AT_SUN_BRAND_NROOT	2024  /*   * Aux vector for comm page diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index badc3faff8..df22f492bf 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -21,6 +21,7 @@  /*   * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #ifndef _SYS_BRAND_H @@ -102,29 +103,106 @@ struct brand_mach_ops;  struct intpdata;  struct execa; +/* + * Common structure to define hooks for brand operation. + * + * Required Fields: + * b_init_brand_data - Setup zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock) + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maximum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use 
in b_initlwp + * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur + *                  during lwp creation before b_initlwp could be called. + * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock) + * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + * b_clearbrand - Perform any actions necessary when clearing the brand. 
+ * b_rpc_statd - Upcall to rpc.statd running within the zone + * b_acct_out - Output properly formatted accounting record + */  struct brand_ops { -	void	(*b_init_brand_data)(zone_t *); +	void	(*b_init_brand_data)(zone_t *, kmutex_t *);  	void	(*b_free_brand_data)(zone_t *);  	int	(*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, -		uintptr_t, uintptr_t, uintptr_t); +		uintptr_t);  	void	(*b_setbrand)(struct proc *);  	int	(*b_getattr)(zone_t *, int, void *, size_t *);  	int	(*b_setattr)(zone_t *, int, void *, size_t);  	void	(*b_copy_procdata)(struct proc *, struct proc *); -	void	(*b_proc_exit)(struct proc *, klwp_t *); +	void	(*b_proc_exit)(struct proc *);  	void	(*b_exec)();  	void	(*b_lwp_setrval)(klwp_t *, int, int); -	int	(*b_initlwp)(klwp_t *); +	void	*(*b_lwpdata_alloc)(struct proc *); +	void	(*b_lwpdata_free)(void *); +	void	(*b_initlwp)(klwp_t *, void *); +	void	(*b_initlwp_post)(klwp_t *);  	void	(*b_forklwp)(klwp_t *, klwp_t *);  	void	(*b_freelwp)(klwp_t *);  	void	(*b_lwpexit)(klwp_t *); -	int	(*b_elfexec)(struct vnode *vp, struct execa *uap, -	    struct uarg *args, struct intpdata *idata, int level, -	    long *execsz, int setid, caddr_t exec_file, -	    struct cred *cred, int brand_action); +	int	(*b_elfexec)(struct vnode *, struct execa *, struct uarg *, +	    struct intpdata *, int, size_t *, int, caddr_t, struct cred *, +	    int *);  	void	(*b_sigset_native_to_brand)(sigset_t *);  	void	(*b_sigset_brand_to_native)(sigset_t *); +	void	(*b_sigfd_translate)(k_siginfo_t *);  	int	b_nsig; +	void	(*b_exit_with_sig)(proc_t *, sigqueue_t *); +	boolean_t (*b_wait_filter)(proc_t *, proc_t *); +	boolean_t (*b_native_exec)(uint8_t, const char **); +	uint32_t (*b_map32limit)(proc_t *); +	void	(*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); +	int	(*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, +	    boolean_t *, int *); +	int	(*b_sigcld_repost)(proc_t *, sigqueue_t *); +	int	(*b_issig_stop)(proc_t *, klwp_t *); +	boolean_t 
(*b_sig_ignorable)(proc_t *, klwp_t *, int); +	void	(*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) +	void	(*b_savecontext32)(ucontext32_t *); +#endif +	void	(*b_restorecontext)(ucontext_t *); +	caddr_t	(*b_sendsig_stack)(int); +	void	(*b_sendsig)(int); +	int	(*b_setid_clear)(vattr_t *vap, cred_t *cr); +	int	(*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, +	    enum seg_rw); +	boolean_t b_intp_parse_arg; +	void	(*b_clearbrand)(proc_t *, boolean_t); +	void	(*b_rpc_statd)(int, void *, void *); +	void	(*b_acct_out)(struct vnode *, int);  };  /* @@ -135,6 +213,7 @@ typedef struct brand {  	char    		*b_name;  	struct brand_ops	*b_ops;  	struct brand_mach_ops	*b_machops; +	size_t			b_data_size;  } brand_t;  extern brand_t native_brand; @@ -165,7 +244,7 @@ extern brand_t	*brand_register_zone(struct brand_attr *);  extern brand_t	*brand_find_name(char *);  extern void	brand_unregister_zone(brand_t *);  extern int	brand_zone_count(brand_t *); -extern void	brand_setbrand(proc_t *); +extern int	brand_setbrand(proc_t *, boolean_t);  extern void	brand_clearbrand(proc_t *, boolean_t);  /* @@ -178,17 +257,16 @@ extern int	brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t,  extern void	brand_solaris_copy_procdata(proc_t *, proc_t *,  		    struct brand *);  extern int	brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *, -		    intpdata_t *, int, long *, int, caddr_t, cred_t *, int, -		    struct brand *, char *, char *, char *, char *, char *); +		    intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int *, +		    struct brand *, char *, char *, char *);  extern void	brand_solaris_exec(struct brand *);  extern int	brand_solaris_fini(char **, struct modlinkage *,  		    struct brand *);  extern void	brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *);  extern void	brand_solaris_freelwp(klwp_t *, struct brand *); -extern int	brand_solaris_initlwp(klwp_t *, struct brand *); +extern void	brand_solaris_initlwp(klwp_t *, struct brand *);  
extern void	brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void	brand_solaris_proc_exit(struct proc *, klwp_t *, -		    struct brand *); +extern void	brand_solaris_proc_exit(struct proc *, struct brand *);  extern void	brand_solaris_setbrand(proc_t *, struct brand *);  #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index e20e0e0c35..b6b5c20e44 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2012 Joyent, Inc.  All rights reserved.   *   * Copyright 2017 RackTop Systems.   */ @@ -188,6 +189,7 @@ struct biostats {  #define	B_STARTED	0x2000000	/* io:::start probe called for buf */  #define	B_ABRWRITE	0x4000000	/* Application based recovery active */  #define	B_PAGE_NOWAIT	0x8000000	/* Skip the page if it is locked */ +#define	B_INVALCURONLY	0x10000000	/* invalidate only for curproc */  /*   * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -200,6 +202,12 @@ struct biostats {   * between the sole use of these two flags.  In both cases, IO will be done   * if the page is not yet committed to storage.   * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL.  B_INVALCURONLY has no + * meaning on its own.  When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + *   * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)   * should be used.   
* diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h index 21cf94dcf9..2c70d7c9f1 100644 --- a/usr/src/uts/common/sys/contract/process.h +++ b/usr/src/uts/common/sys/contract/process.h @@ -21,13 +21,12 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   */  #ifndef	_SYS_CONTRACT_PROCESS_H  #define	_SYS_CONTRACT_PROCESS_H -#pragma ident	"%Z%%M%	%I%	%E% SMI" -  #include <sys/contract.h>  #include <sys/time.h> @@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t;  #define	CT_PR_NOORPHAN	0x2	/* kill when contract is abandoned */  #define	CT_PR_PGRPONLY	0x4	/* only kill process group on fatal errors */  #define	CT_PR_REGENT	0x8	/* automatically detach inherited contracts */ -#define	CT_PR_ALLPARAM	0xf +#define	CT_PR_KEEP_EXEC	0x10	/* preserve template across exec */ +#define	CT_PR_ALLPARAM	0x1f  /*   * ctr_ev_* flags diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@  /*   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc.  All rights reserved.   */  #ifndef	_SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *);   */  extern int cpucaps_project_set(kproject_t *, rctl_qty_t);  extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t);  /*   * Get current CPU usage for a project/zone.   */  extern rctl_qty_t cpucaps_project_get(kproject_t *);  extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);  /*   * Scheduling class hooks into CPU caps framework. 
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc.  All rights reserved.   */  #ifndef	_SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap {  	waitq_t		cap_waitq;	/* waitq for capped threads	*/  	kstat_t		*cap_kstat;	/* cpucaps specific kstat	*/  	int64_t		cap_gen;	/* zone cap specific 		*/ +	hrtime_t	cap_chk_value;	/* effective CPU usage cap	*/  	hrtime_t	cap_value;	/* scaled CPU usage cap		*/  	hrtime_t	cap_usage;	/* current CPU usage		*/ +	hrtime_t	cap_base;	/* base CPU for burst		*/ +	u_longlong_t	cap_burst_limit; /* max secs (in tics) for a burst */ +	u_longlong_t	cap_bursting;	/* # of ticks currently bursting */  	disp_lock_t	cap_usagelock;	/* protects cap_usage above	*/  	/*  	 * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap {  	hrtime_t	cap_maxusage;	/* maximum cap usage		*/  	u_longlong_t	cap_below;	/* # of ticks spend below the cap */  	u_longlong_t	cap_above;	/* # of ticks spend above the cap */ +	u_longlong_t	cap_above_base;	/* # of ticks spent above the base */  } cpucap_t;  /* diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 8565ca053e..2cfe5116d9 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -23,6 +23,7 @@   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright (c) 2012 by Delphix. All rights reserved.   * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2018 Joyent, Inc.   * Copyright 2017 RackTop Systems.   
*/ @@ -389,7 +390,6 @@ extern cpu_core_t cpu_core[];  #define	CPU_DISP_HALTED		0x02	/* CPU halted waiting for interrupt */  /* Note: inside ifdef: _KERNEL || _KMEMUSER || _BOOT */ -#if defined(_MACHDEP)  /*   * Macros for manipulating sets of CPUs as a bitmap.  Note that this @@ -405,34 +405,60 @@ extern cpu_core_t cpu_core[];  #define	CPUSET_WORDS	BT_BITOUL(NCPU)  #define	CPUSET_NOTINSET	((uint_t)-1) -#if	CPUSET_WORDS > 1 - -typedef struct cpuset { +#if defined(_MACHDEP) +struct cpuset {  	ulong_t	cpub[CPUSET_WORDS]; -} cpuset_t; +}; +#else +struct cpuset; +#endif + +typedef struct cpuset cpuset_t; + +extern cpuset_t	*cpuset_alloc(int); +extern void	cpuset_free(cpuset_t *);  /* - * Private functions for manipulating cpusets that do not fit in a - * single word.  These should not be used directly; instead the - * CPUSET_* macros should be used so the code will be portable - * across different definitions of NCPU. + * Functions for manipulating cpusets.  These were previously considered + * private when some cpuset_t handling was performed in the CPUSET_* macros. + * They are now acceptable to use in non-_MACHDEP code.   
*/  extern	void	cpuset_all(cpuset_t *); -extern	void	cpuset_all_but(cpuset_t *, uint_t); +extern	void	cpuset_all_but(cpuset_t *, const uint_t);  extern	int	cpuset_isnull(cpuset_t *); -extern	int	cpuset_cmp(cpuset_t *, cpuset_t *); -extern	void	cpuset_only(cpuset_t *, uint_t); +extern	int	cpuset_isequal(cpuset_t *, cpuset_t *); +extern	void	cpuset_only(cpuset_t *, const uint_t); +extern	long	cpu_in_set(cpuset_t *, const uint_t); +extern	void	cpuset_add(cpuset_t *, const uint_t); +extern	void	cpuset_del(cpuset_t *, const uint_t);  extern	uint_t	cpuset_find(cpuset_t *);  extern	void	cpuset_bounds(cpuset_t *, uint_t *, uint_t *); +extern	void	cpuset_atomic_del(cpuset_t *, const uint_t); +extern	void	cpuset_atomic_add(cpuset_t *, const uint_t); +extern	long	cpuset_atomic_xadd(cpuset_t *, const uint_t); +extern	long	cpuset_atomic_xdel(cpuset_t *, const uint_t); +extern	void	cpuset_or(cpuset_t *, cpuset_t *); +extern	void	cpuset_xor(cpuset_t *, cpuset_t *); +extern	void	cpuset_and(cpuset_t *, cpuset_t *); +extern	void	cpuset_zero(cpuset_t *); + + +#if defined(_MACHDEP) + +/* + * Prior to the cpuset_t restructuring, the CPUSET_* macros contained + * significant logic, rather than directly invoking the backend functions. + * They are maintained here so that existing _MACHDEP code can use them. 
+ */  #define	CPUSET_ALL(set)			cpuset_all(&(set))  #define	CPUSET_ALL_BUT(set, cpu)	cpuset_all_but(&(set), cpu)  #define	CPUSET_ONLY(set, cpu)		cpuset_only(&(set), cpu) -#define	CPU_IN_SET(set, cpu)		BT_TEST((set).cpub, cpu) -#define	CPUSET_ADD(set, cpu)		BT_SET((set).cpub, cpu) -#define	CPUSET_DEL(set, cpu)		BT_CLEAR((set).cpub, cpu) +#define	CPU_IN_SET(set, cpu)		cpu_in_set(&(set), cpu) +#define	CPUSET_ADD(set, cpu)		cpuset_add(&(set), cpu) +#define	CPUSET_DEL(set, cpu)		cpuset_del(&(set), cpu)  #define	CPUSET_ISNULL(set)		cpuset_isnull(&(set)) -#define	CPUSET_ISEQUAL(set1, set2)	cpuset_cmp(&(set1), &(set2)) +#define	CPUSET_ISEQUAL(set1, set2)	cpuset_isequal(&(set1), &(set2))  /*   * Find one CPU in the cpuset. @@ -460,86 +486,24 @@ extern	void	cpuset_bounds(cpuset_t *, uint_t *, uint_t *);   * deleting a cpu that's not in the cpuset)   */ -#define	CPUSET_ATOMIC_DEL(set, cpu)	BT_ATOMIC_CLEAR((set).cpub, (cpu)) -#define	CPUSET_ATOMIC_ADD(set, cpu)	BT_ATOMIC_SET((set).cpub, (cpu)) - -#define	CPUSET_ATOMIC_XADD(set, cpu, result) \ -	BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) - -#define	CPUSET_ATOMIC_XDEL(set, cpu, result) \ -	BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) - - -#define	CPUSET_OR(set1, set2)		{		\ -	int _i;						\ -	for (_i = 0; _i < CPUSET_WORDS; _i++)		\ -		(set1).cpub[_i] |= (set2).cpub[_i];	\ -} - -#define	CPUSET_XOR(set1, set2)		{		\ -	int _i;						\ -	for (_i = 0; _i < CPUSET_WORDS; _i++)		\ -		(set1).cpub[_i] ^= (set2).cpub[_i];	\ -} - -#define	CPUSET_AND(set1, set2)		{		\ -	int _i;						\ -	for (_i = 0; _i < CPUSET_WORDS; _i++)		\ -		(set1).cpub[_i] &= (set2).cpub[_i];	\ -} - -#define	CPUSET_ZERO(set)		{		\ -	int _i;						\ -	for (_i = 0; _i < CPUSET_WORDS; _i++)		\ -		(set).cpub[_i] = 0;			\ -} - -#elif	CPUSET_WORDS == 1 - -typedef	ulong_t	cpuset_t;	/* a set of CPUs */ - -#define	CPUSET(cpu)			(1UL << (cpu)) - -#define	CPUSET_ALL(set)			((void)((set) = ~0UL)) -#define	CPUSET_ALL_BUT(set, cpu)	((void)((set) = ~CPUSET(cpu))) -#define	
CPUSET_ONLY(set, cpu)		((void)((set) = CPUSET(cpu))) -#define	CPU_IN_SET(set, cpu)		((set) & CPUSET(cpu)) -#define	CPUSET_ADD(set, cpu)		((void)((set) |= CPUSET(cpu))) -#define	CPUSET_DEL(set, cpu)		((void)((set) &= ~CPUSET(cpu))) -#define	CPUSET_ISNULL(set)		((set) == 0) -#define	CPUSET_ISEQUAL(set1, set2)	((set1) == (set2)) -#define	CPUSET_OR(set1, set2)		((void)((set1) |= (set2))) -#define	CPUSET_XOR(set1, set2)		((void)((set1) ^= (set2))) -#define	CPUSET_AND(set1, set2)		((void)((set1) &= (set2))) -#define	CPUSET_ZERO(set)		((void)((set) = 0)) - -#define	CPUSET_FIND(set, cpu)		{		\ -	cpu = (uint_t)(lowbit(set) - 1);				\ -} - -#define	CPUSET_BOUNDS(set, smallest, largest)	{	\ -	smallest = (uint_t)(lowbit(set) - 1);		\ -	largest = (uint_t)(highbit(set) - 1);		\ -} +#define	CPUSET_ATOMIC_DEL(set, cpu)	cpuset_atomic_del(&(set), cpu) +#define	CPUSET_ATOMIC_ADD(set, cpu)	cpuset_atomic_add(&(set), cpu) -#define	CPUSET_ATOMIC_DEL(set, cpu)	atomic_and_ulong(&(set), ~CPUSET(cpu)) -#define	CPUSET_ATOMIC_ADD(set, cpu)	atomic_or_ulong(&(set), CPUSET(cpu)) +#define	CPUSET_ATOMIC_XADD(set, cpu, result)	\ +	(result) = cpuset_atomic_xadd(&(set), cpu) -#define	CPUSET_ATOMIC_XADD(set, cpu, result) \ -	{ result = atomic_set_long_excl(&(set), (cpu)); } +#define	CPUSET_ATOMIC_XDEL(set, cpu, result)	\ +	(result) = cpuset_atomic_xdel(&(set), cpu) -#define	CPUSET_ATOMIC_XDEL(set, cpu, result) \ -	{ result = atomic_clear_long_excl(&(set), (cpu)); } +#define	CPUSET_OR(set1, set2)	cpuset_or(&(set1), &(set2)) -#else	/* CPUSET_WORDS <= 0 */ +#define	CPUSET_XOR(set1, set2)	cpuset_xor(&(set1), &(set2)) -#error NCPU is undefined or invalid +#define	CPUSET_AND(set1, set2)	cpuset_and(&(set1), &(set2)) -#endif	/* CPUSET_WORDS	*/ - -extern cpuset_t cpu_seqid_inuse; +#define	CPUSET_ZERO(set)	cpuset_zero(&(set)) -#endif	/* _MACHDEP */ +#endif /* _MACHDEP */  #endif /* _KERNEL || _KMEMUSER || _BOOT */  #define	CPU_CPR_OFFLINE		0x0 @@ -550,10 +514,14 @@ extern cpuset_t cpu_seqid_inuse;  #if 
defined(_KERNEL) || defined(_KMEMUSER) +extern cpuset_t cpu_seqid_inuse; +  extern struct cpu	*cpu[];		/* indexed by CPU number */  extern struct cpu	**cpu_seq;	/* indexed by sequential CPU id */  extern cpu_t		*cpu_list;	/* list of CPUs */  extern cpu_t		*cpu_active;	/* list of active CPUs */ +extern cpuset_t		cpu_active_set;	/* cached set of active CPUs */ +extern cpuset_t		cpu_available;	/* cached set of available CPUs */  extern int		ncpus;		/* number of CPUs present */  extern int		ncpus_online;	/* number of CPUs not quiesced */  extern int		max_ncpus;	/* max present before ncpus is known */ @@ -572,13 +540,19 @@ extern struct cpu *curcpup(void);  #endif  /* - * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id - * as the target and to grab cpu_lock instead of requiring the caller - * to grab it. + * CPU_CURRENT indicates to thread_affinity_set() to use whatever curthread's + * current CPU is; holding cpu_lock is not required.   */  #define	CPU_CURRENT	-3  /* + * CPU_BEST can be used by thread_affinity_set() callers to set affinity to a + * good CPU (in particular, an ht_acquire()-friendly choice); holding cpu_lock + * is not required. 
+ */ +#define	CPU_BEST	-4 + +/*   * Per-CPU statistics   *   * cpu_stats_t contains numerous system and VM-related statistics, in the form @@ -613,7 +587,7 @@ extern struct cpu *curcpup(void);   */  #define	CPU_NEW_GENERATION(cp)	((cp)->cpu_generation++) -#endif /* _KERNEL || _KMEMUSER */ +#endif /* defined(_KERNEL) || defined(_KMEMUSER) */  /*   * CPU support routines (not for genassym.c) diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index fb79dfecde..1f938132e0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *);  extern gid_t crgetrgid(const cred_t *);  extern gid_t crgetsgid(const cred_t *);  extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *);  extern projid_t crgetprojid(const cred_t *);  extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/ctf_api.h b/usr/src/uts/common/sys/ctf_api.h index 073cc4f0d6..190720246f 100644 --- a/usr/src/uts/common/sys/ctf_api.h +++ b/usr/src/uts/common/sys/ctf_api.h @@ -24,7 +24,7 @@   * Use is subject to license terms.   */  /* - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc.   */  /* @@ -116,7 +116,7 @@ enum {  	ECTF_ZLIB,		/* zlib library failure */  	ECTF_CONVBKERR,		/* CTF conversion backend error */  	ECTF_CONVNOCSRC,	/* No C source to convert from */ -	ECTF_NOCONVBKEND	/* No applicable conversion backend */ +	ECTF_CONVNODEBUG,	/* No debug info to convert into CTF */  };  /* diff --git a/usr/src/uts/common/sys/cyclic.h b/usr/src/uts/common/sys/cyclic.h index 5f28543f9f..270a09449f 100644 --- a/usr/src/uts/common/sys/cyclic.h +++ b/usr/src/uts/common/sys/cyclic.h @@ -23,6 +23,7 @@   * Use is subject to license terms.   *   * Copyright 2017 RackTop Systems. + * Copyright 2018 Joyent, Inc.   
*/  #ifndef _SYS_CYCLIC_H @@ -81,6 +82,7 @@ extern cyclic_id_t cyclic_add_omni(cyc_omni_handler_t *);  extern void cyclic_remove(cyclic_id_t);  extern void cyclic_bind(cyclic_id_t, cpu_t *, cpupart_t *);  extern int cyclic_reprogram(cyclic_id_t, hrtime_t); +extern void cyclic_move_here(cyclic_id_t);  extern hrtime_t cyclic_getres();  extern int cyclic_offline(cpu_t *cpu); diff --git a/usr/src/uts/common/sys/disp.h b/usr/src/uts/common/sys/disp.h index b324f4d323..cb3711edcd 100644 --- a/usr/src/uts/common/sys/disp.h +++ b/usr/src/uts/common/sys/disp.h @@ -23,6 +23,8 @@   * Use is subject to license terms.   *   * Copyright 2013 Nexenta Systems, Inc.  All rights reserved. + * + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -63,11 +65,11 @@ typedef struct _disp {  	/*  	 * Priorities:  	 *	disp_maxrunpri is the maximum run priority of runnable threads -	 * 	on this queue.  It is -1 if nothing is runnable. +	 *	on this queue.  It is -1 if nothing is runnable.  	 *  	 *	disp_max_unbound_pri is the maximum run priority of threads on  	 *	this dispatch queue but runnable by any CPU.  This may be left -	 * 	artificially high, then corrected when some CPU tries to take +	 *	artificially high, then corrected when some CPU tries to take  	 *	an unbound thread.  It is -1 if nothing is runnable.  	 
*/  	pri_t		disp_maxrunpri;	/* maximum run priority */ @@ -151,8 +153,7 @@ extern void		dq_srundec(kthread_t *);  extern void		cpu_rechoose(kthread_t *);  extern void		cpu_surrender(kthread_t *);  extern void		kpreempt(int); -extern struct cpu	*disp_lowpri_cpu(struct cpu *, struct lgrp_ld *, pri_t, -			    struct cpu *); +extern struct cpu	*disp_lowpri_cpu(struct cpu *, kthread_t *, pri_t);  extern int		disp_bound_threads(struct cpu *, int);  extern int		disp_bound_anythreads(struct cpu *, int);  extern int		disp_bound_partition(struct cpu *, int); @@ -167,6 +168,8 @@ extern void		resume_from_zombie(kthread_t *)  extern void		disp_swapped_enq(kthread_t *);  extern int		disp_anywork(void); +extern struct cpu	*disp_choose_best_cpu(void); +  #define	KPREEMPT_SYNC		(-1)  #define	kpreempt_disable()				\  	{						\ @@ -183,6 +186,8 @@ extern int		disp_anywork(void);  #endif	/* _KERNEL */ +#define	CPU_IDLE_PRI (-1) +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct	dadk {  	kstat_t		*dad_errstats;	/* error stats			*/  	kmutex_t	dad_cmd_mutex;  	int		dad_cmd_count; +	uint32_t	dad_err_cnt;	/* number of recent errors	*/ +	hrtime_t	dad_last_log;	/* time of last error log	*/  };  #define	DAD_SECSIZ	dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 6449f39a35..5be223ce93 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -192,6 +192,7 @@ typedef struct dld_ioc_rename {  	datalink_id_t	dir_linkid1;  	datalink_id_t	dir_linkid2;  	char		dir_link[MAXLINKNAMELEN]; +	boolean_t	dir_zoneinit;  } dld_ioc_rename_t;  /* @@ -204,6 +205,7 @@ typedef struct dld_ioc_rename {  typedef struct dld_ioc_zid {  	zoneid_t	diz_zid;  	datalink_id_t	diz_linkid; +	boolean_t	diz_transient;  } dld_ioc_zid_t;  /* @@ -356,6 
+358,7 @@ typedef struct dld_ioc_led {  #define	DLD_CAPAB_POLL		0x00000002  #define	DLD_CAPAB_PERIM		0x00000003  #define	DLD_CAPAB_LSO		0x00000004 +#define	DLD_CAPAB_IPCHECK	0x00000005  #define	DLD_ENABLE		0x00000001  #define	DLD_DISABLE		0x00000002 @@ -382,6 +385,7 @@ typedef struct dld_ioc_led {   */  typedef	int	(*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define	DI_DIRECT_RAW	0x1  /*   * Direct Tx/Rx capability.   */ @@ -406,8 +410,16 @@ typedef struct dld_capab_direct_s {  	/* flow control "can I put on a ring" callback */  	uintptr_t	di_tx_fctl_df; /* canput-like callback */  	void		*di_tx_fctl_dh; + +	/* flags that control our behavior */ +	uint_t		di_flags;  } dld_capab_direct_t; +typedef struct dld_capab_ipcheck_s { +	uintptr_t	ipc_allowed_df; +	void		*ipc_allowed_dh; +} dld_capab_ipcheck_t; +  /*   * Polling/softring capability.   */ diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 035eea893a..336fa9cb67 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum {  typedef enum {  	DLD_UNINITIALIZED,  	DLD_PASSIVE, -	DLD_ACTIVE +	DLD_ACTIVE, +	DLD_EXCLUSIVE  } dld_passivestate_t;  /* @@ -256,6 +257,8 @@ extern void		dld_str_rx_unitdata(void *, mac_resource_handle_t,  extern void		dld_str_notify_ind(dld_str_t *);  extern mac_tx_cookie_t	str_mdata_fastpath_put(dld_str_t *, mblk_t *,      uintptr_t, uint16_t); +extern mac_tx_cookie_t	str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, +    uintptr_t, uint16_t);  extern int		dld_flow_ctl_callb(dld_str_t *, uint64_t,      int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   
*/  #ifndef	_SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" {  #define	IPTUN_IOC	0x454A  #define	BRIDGE_IOC	0xB81D  #define	IBPART_IOC	0x6171 +#define	OVERLAY_IOC	0x2005  /* GLDv3 modules use these macros to generate unique ioctl commands */  #define	DLDIOC(cmdid)		DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" {  #define	IPTUNIOC(cmdid)		DLD_IOC_CMD(IPTUN_IOC, (cmdid))  #define	BRIDGEIOC(cmdid)	DLD_IOC_CMD(BRIDGE_IOC, (cmdid))  #define	IBPARTIOC(cmdid)	DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define	OVERLAYIOC(cmdid)	DLD_IOC_CMD(OVERLAY_IOC, (cmdid))  #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..d76daffeb7 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -107,6 +108,7 @@ typedef struct dl_ipnetinfo {  #define	DL_PASSIVE_REQ		0x114	/* Allow access to aggregated link */  #define	DL_INTR_MODE_REQ	0x115	/* Request Rx processing in INTR mode */  #define	DL_NOTIFY_CONF		0x116	/* Notification from upstream */ +#define	DL_EXCLUSIVE_REQ	0x117	/* Make bind active */  /*   * Primitives used for Connectionless Service @@ -388,6 +390,8 @@ typedef struct dl_ipnetinfo {  #define	DL_PROMISC_PHYS		0x01	/* promiscuous mode at phys level */  #define	DL_PROMISC_SAP		0x02	/* promiscuous mode at sap level */  #define	DL_PROMISC_MULTI	0x03	/* promiscuous mode for multicast */ +#define	DL_PROMISC_RX_ONLY	0x04	/* above only enabled for rx */ +#define	DL_PROMISC_FIXUPS	0x05	/* above will be fixed up */  /*   * DLPI notification codes for DL_NOTIFY_REQ primitives. 
@@ -673,11 +677,11 @@ typedef struct {  #define	HCKSUM_ENABLE		0x01	/* Set to enable hardware checksum */  					/* capability */  #define	HCKSUM_INET_PARTIAL	0x02	/* Partial 1's complement checksum */ -					/* ability */ +					/* ability for TCP/UDP packets. */  #define	HCKSUM_INET_FULL_V4	0x04	/* Full 1's complement checksum */ -					/* ability for IPv4 packets. */ +					/* ability for IPv4 TCP/UDP packets. */  #define	HCKSUM_INET_FULL_V6	0x08	/* Full 1's complement checksum */ -					/* ability for IPv6 packets. */ +					/* ability for IPv6 TCP/UDP packets. */  #define	HCKSUM_IPHDRCKSUM	0x10	/* IPv4 Header checksum offload */  					/* capability */  #ifdef _KERNEL @@ -1107,6 +1111,13 @@ typedef struct {  } dl_intr_mode_req_t;  /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { +	t_uscalar_t	dl_primitive; +} dl_exclusive_req_t; + +/*   *	CONNECTION-ORIENTED SERVICE PRIMITIVES   */ @@ -1528,6 +1539,7 @@ union DL_primitives {  	dl_control_ack_t	control_ack;  	dl_passive_req_t	passive_req;  	dl_intr_mode_req_t	intr_mode_req; +	dl_exclusive_req_t	exclusive_req;  };  #define	DL_INFO_REQ_SIZE	sizeof (dl_info_req_t) @@ -1596,6 +1608,7 @@ union DL_primitives {  #define	DL_CONTROL_ACK_SIZE	sizeof (dl_control_ack_t)  #define	DL_PASSIVE_REQ_SIZE	sizeof (dl_passive_req_t)  #define	DL_INTR_MODE_REQ_SIZE	sizeof (dl_intr_mode_req_t) +#define	DL_EXCLUSIVE_REQ_SIZE	sizeof (dl_exclusive_req_t)  #ifdef	_KERNEL  /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..81f9e2abac 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   
*/  #ifndef	_SYS_DLS_H @@ -85,6 +86,8 @@ typedef struct dls_link_s	dls_link_t;  #define	DLS_PROMISC_SAP		0x00000001  #define	DLS_PROMISC_MULTI	0x00000002  #define	DLS_PROMISC_PHYS	0x00000004 +#define	DLS_PROMISC_RX_ONLY	0x00000008 +#define	DLS_PROMISC_FIXUPS	0x00000010  extern int	dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *);  extern void	dls_close(dld_str_t *); @@ -106,11 +109,13 @@ extern void	str_notify(void *, mac_notify_type_t);  extern int		dls_devnet_open(const char *,  			    dls_dl_handle_t *, dev_t *); +extern int		dls_devnet_open_in_zone(const char *, +			    dls_dl_handle_t *, dev_t *, zoneid_t);  extern void		dls_devnet_close(dls_dl_handle_t);  extern boolean_t	dls_devnet_rebuild();  extern int		dls_devnet_rename(datalink_id_t, datalink_id_t, -			    const char *); +			    const char *, boolean_t);  extern int		dls_devnet_create(mac_handle_t, datalink_id_t,  			    zoneid_t);  extern int		dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -122,12 +127,13 @@ extern int		dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *);  extern void		dls_devnet_rele(dls_dl_handle_t);  extern void		dls_devnet_prop_task_wait(dls_dl_handle_t); +extern const char	*dls_devnet_link(dls_dl_handle_t);  extern const char	*dls_devnet_mac(dls_dl_handle_t);  extern uint16_t		dls_devnet_vid(dls_dl_handle_t);  extern datalink_id_t	dls_devnet_linkid(dls_dl_handle_t);  extern int		dls_devnet_dev2linkid(dev_t, datalink_id_t *);  extern int		dls_devnet_phydev(datalink_id_t, dev_t *); -extern int		dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int		dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t);  extern zoneid_t		dls_devnet_getzid(dls_dl_handle_t);  extern zoneid_t		dls_devnet_getownerzid(dls_dl_handle_t);  extern boolean_t	dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -141,6 +147,8 @@ extern int		dls_mgmt_update(const char *, uint32_t, boolean_t,  extern int		dls_mgmt_get_linkinfo(datalink_id_t, char *,  			    datalink_class_t *, uint32_t *, uint32_t *); 
 extern int		dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int		dls_mgmt_get_linkid_in_zone(const char *, +    datalink_id_t *, zoneid_t);  extern datalink_id_t	dls_mgmt_get_next(datalink_id_t, datalink_class_t,  			    datalink_media_t, uint32_t);  extern int		dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..329f8dd08e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   */  #ifndef	_SYS_DLS_IMPL_H @@ -46,11 +47,12 @@ typedef struct dls_multicst_addr_s {  } dls_multicst_addr_t;  struct dls_link_s {				/* Protected by */ -	char			dl_name[MAXNAMELEN];	/* SL */ +	char			dl_name[MAXNAMELEN];	/* RO */  	uint_t			dl_ddi_instance;	/* SL */  	mac_handle_t		dl_mh;			/* SL */  	mac_client_handle_t	dl_mch;			/* SL */  	mac_unicast_handle_t	dl_mah;			/* SL */ +	mac_notify_handle_t	dl_mnh;			/* SL */  	const mac_info_t	*dl_mip;		/* SL */  	uint_t			dl_ref;			/* SL */  	mod_hash_t		*dl_str_hash;		/* SL, modhash lock */ @@ -61,6 +63,7 @@ struct dls_link_s {				/* Protected by */  	uint_t			dl_zone_ref;  	link_tagmode_t		dl_tagmode;		/* atomic */  	uint_t			dl_nonip_cnt;		/* SL */ +	uint_t			dl_exclusive;		/* SL */  };  typedef struct dls_head_s { @@ -96,13 +99,16 @@ extern void		dls_create_str_kstats(dld_str_t *);  extern int		dls_stat_update(kstat_t *, dls_link_t *, int);  extern int		dls_stat_create(const char *, int, const char *,  			    zoneid_t, int (*)(struct kstat *, int), void *, -			    kstat_t **); +			    kstat_t **, zoneid_t); +extern void	dls_stat_delete(kstat_t *);  extern int		dls_devnet_open_by_dev(dev_t, dls_link_t **,  			    dls_dl_handle_t *);  extern int		dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *,  			    dls_link_t **);  extern void		
dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *); +extern int		dls_devnet_hold_tmp_by_link(dls_link_t *, +			    dls_dl_handle_t *);  extern void		dls_init(void);  extern int		dls_fini(void); @@ -126,6 +132,7 @@ extern void		dls_mgmt_init(void);  extern void		dls_mgmt_fini(void);  extern int		dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int		dls_exclusive_set(dld_str_t *, boolean_t);  #ifdef	__cplusplus  } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..6fec277991 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  #ifndef	_DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum {  	DATALINK_CLASS_SIMNET		= 0x20,  	DATALINK_CLASS_BRIDGE		= 0x40,  	DATALINK_CLASS_IPTUN		= 0x80, -	DATALINK_CLASS_PART		= 0x100 +	DATALINK_CLASS_PART		= 0x100, +	DATALINK_CLASS_OVERLAY		= 0x200  } datalink_class_t;  #define	DATALINK_CLASS_ALL	(DATALINK_CLASS_PHYS |	\  	DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \  	DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ -	DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) +	DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ +	DATALINK_CLASS_OVERLAY)  /*   * A combination of flags and media. @@ -111,10 +114,14 @@ typedef uint64_t	datalink_media_t;  #define	DLMGMT_CMD_BASE			128  /* - * Indicate the link mapping is active or persistent + * Indicate if the link mapping is active, persistent, or transient. A + * transient link is an active link with a twist -- it is an active + * link which is destroyed along with the zone rather than reassigned + * to the GZ.   
*/  #define	DLMGMT_ACTIVE		0x01  #define	DLMGMT_PERSIST		0x02 +#define	DLMGMT_TRANSIENT	0x04  /* upcall argument */  typedef struct dlmgmt_door_arg { @@ -165,6 +172,7 @@ typedef struct dlmgmt_door_getname {  typedef struct dlmgmt_door_getlinkid {  	int			ld_cmd;  	char			ld_link[MAXLINKNAMELEN]; +	zoneid_t		ld_zoneid;  } dlmgmt_door_getlinkid_t;  typedef struct dlmgmt_door_getnext_s { @@ -225,6 +233,7 @@ typedef struct dlmgmt_getattr_retval_s {  	char			lr_attrval[MAXLINKATTRVALLEN];  } dlmgmt_getattr_retval_t; +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/efi_partition.h b/usr/src/uts/common/sys/efi_partition.h index 5fa101cbb7..065f65f802 100644 --- a/usr/src/uts/common/sys/efi_partition.h +++ b/usr/src/uts/common/sys/efi_partition.h @@ -22,12 +22,14 @@   * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.   * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.   * Copyright 2014 Toomas Soome <tsoome@me.com> + * Copyright (c) 2019, Joyent, Inc.   */  #ifndef	_SYS_EFI_PARTITION_H  #define	_SYS_EFI_PARTITION_H  #include <sys/uuid.h> +#include <sys/stddef.h>  #ifdef	__cplusplus  extern "C" { @@ -46,6 +48,16 @@ extern "C" {  #define	EFI_SIGNATURE	0x5452415020494645ULL +/* + * Although the EFI spec is clear that sizeof (efi_gpt_t) is a valid value + * (512), at least one EFI system (AMI v4.6.4.1) incorrectly expects this to be + * exactly the size of the structure defined in the spec, that is, 92. + * + * As the reserved section is never used, the modified value works fine + * everywhere else. 
+ */ +#define	EFI_HEADER_SIZE (offsetof(efi_gpt_t, efi_gpt_Reserved2)) +  /* EFI Guid Partition Table Header -- little endian on-disk format */  typedef struct efi_gpt {  	uint64_t	efi_gpt_Signature; @@ -222,7 +234,7 @@ typedef struct dk_efi {  	diskaddr_t	 dki_lba;	/* starting block */  	len_t		 dki_length;	/* length in bytes */  	union { -		efi_gpt_t 	*_dki_data; +		efi_gpt_t	*_dki_data;  		uint64_t	_dki_data_64;  	} dki_un;  #define	dki_data	dki_un._dki_data diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index 4bd884e9c2..1a2ca397ef 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -500,6 +500,11 @@ typedef struct {  #define	PT_GNU_STACK	0x6474e551	/* Indicates stack executability */  #define	PT_GNU_RELRO	0x6474e552	/* Read-only after relocation */ +/* + * Linux specific program headers not even used by Linux (!!) + */ +#define	PT_PAX_FLAGS	0x65041580	/* PaX flags (see below) */ +  #define	PT_LOSUNW	0x6ffffffa  #define	PT_SUNWBSS	0x6ffffffa	/* Sun Specific segment (unused) */  #define	PT_SUNWSTACK	0x6ffffffb	/* describes the stack segment */ @@ -515,6 +520,45 @@ typedef struct {  #define	PF_W		0x2  #define	PF_X		0x1 +/* + * PaX is a regrettable series of never-integrated Linux patches for a + * facility to provide additional protections on memory pages for purposes of + * increasing security, and for allowing binaries to demand (or refuse) those + * protections via the PT_PAX_FLAGS program header.  (Portents of its + * rudderless existence, "PaX" is a term of indefinite origin written by an + * unknown group of people.)  This facility is unfortunate in any number of + * ways, and was largely obviated by the broad adoption of non-executable + * stacks at any rate -- but it lives on in binaries that continue to mark + * themselves to explicitly refuse the (never-integrated, now-obviated) + * facility.  
One might cringe that PaX overloads the meaning of the p_flags + * to specify protections, but that is the least of its transgressions: + * instead of using one p_type constant to explicitly enable a series of + * protections and another to explicitly disable others, it insists on + * conflating both actions into PT_PAX_FLAGS.  The resulting doubling of + * constant definitions (two constant definitions for every protection instead + * of merely one) assures that the values can't even fit in the eight + * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for + * such filthy functionality.  And were all of this not enough, there is one + * final nomenclature insult to be added to this semantic injury:  the + * constants for the p_flags don't even embed "_PAX_" in their name -- despite + * the fact that this is their only purpose!  We resist the temptation to + * right this final wrong here; we grit our teeth and provide exactly the + * Linux definitions -- or rather, what would have been the Linux definitions + * had this belching jalopy ever been permitted to crash itself into mainline. 
+ */ +#define	PF_PAGEEXEC	0x00000010	/* PaX: enable PAGEEXEC */ +#define	PF_NOPAGEEXEC	0x00000020	/* PaX: disable PAGEEXEC */ +#define	PF_SEGMEXEC	0x00000040	/* PaX: enable SEGMEXEC */ +#define	PF_NOSEGMEXEC	0x00000080	/* PaX: disable SEGMEXEC */ +#define	PF_MPROTECT	0x00000100	/* PaX: enable MPROTECT */ +#define	PF_NOMPROTECT	0x00000200	/* PaX: disable MPROTECT */ +#define	PF_RANDEXEC	0x00000400	/* PaX: enable RANDEXEC */ +#define	PF_NORANDEXEC	0x00000800	/* PaX: disable RANDEXEC */ +#define	PF_EMUTRAMP	0x00001000	/* PaX: enable EMUTRAMP */ +#define	PF_NOEMUTRAMP	0x00002000	/* PaX: disable EMUTRAMP */ +#define	PF_RANDMMAP	0x00004000	/* PaX: enable RANDMMAP */ +#define	PF_NORANDMMAP	0x00008000	/* PaX: disable RANDMMAP */ +  #define	PF_MASKOS	0x0ff00000	/* OS specific values */  #define	PF_MASKPROC	0xf0000000	/* processor specific values */ diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h index 1b0d961b0b..b64a101348 100644 --- a/usr/src/uts/common/sys/eventfd.h +++ b/usr/src/uts/common/sys/eventfd.h @@ -10,7 +10,7 @@   */  /* - * Copyright (c) 2015 Joyent, Inc.  All rights reserved. + * Copyright (c) 2017, Joyent, Inc.   */  /* @@ -47,6 +47,13 @@ typedef uint64_t eventfd_t;  #define	EVENTFDIOC		(('e' << 24) | ('f' << 16) | ('d' << 8))  #define	EVENTFDIOC_SEMAPHORE	(EVENTFDIOC | 1)	/* toggle sem state */ +/* + * Kernel-internal method to write to eventfd while bypassing overflow limits, + * therefore avoiding potential to block as well.  This is used to fulfill AIO + * behavior in LX related to eventfd notification. 
+ */ +#define	EVENTFDIOC_POST		(EVENTFDIOC | 2) +  #ifndef _KERNEL  extern int eventfd(unsigned int, int); @@ -58,6 +65,7 @@ extern int eventfd_write(int, eventfd_t);  #define	EVENTFDMNRN_EVENTFD	0  #define	EVENTFDMNRN_CLONE	1  #define	EVENTFD_VALMAX		(ULLONG_MAX - 1ULL) +#define	EVENTFD_VALOVERFLOW	ULLONG_MAX  #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index 8056f9a8e8..12115b7e27 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -26,6 +26,10 @@  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/  /*	  All Rights Reserved	*/ +/* + * Copyright 2019 Joyent, Inc. + */ +  #ifndef _SYS_EXEC_H  #define	_SYS_EXEC_H @@ -79,7 +83,7 @@ typedef struct uarg {  	ssize_t arglen;  	char	*fname;  	char	*pathname; -	ssize_t	auxsize; +	size_t	auxsize;  	caddr_t	stackend;  	size_t	stk_align;  	size_t	stk_size; @@ -102,10 +106,13 @@ typedef struct uarg {  	vnode_t	*ex_vp;  	char	*emulator;  	char	*brandname; +	const char *brand_nroot;  	char	*auxp_auxflags; /* addr of auxflags auxv on the user stack */  	char	*auxp_brand; /* address of first brand auxv on user stack */  	cred_t	*pfcred;  	boolean_t scrubenv; +	uintptr_t maxstack; +	boolean_t stk_prot_override;  	uintptr_t commpage;  } uarg_t; @@ -175,8 +182,8 @@ struct execsw {  	int	exec_maglen;  	int	(*exec_func)(struct vnode *vp, struct execa *uap,  		    struct uarg *args, struct intpdata *idata, int level, -		    long *execsz, int setid, caddr_t exec_file, -		    struct cred *cred, int brand_action); +		    size_t *execsz, int setid, caddr_t exec_file, +		    struct cred *cred, int *brand_action);  	int	(*exec_core)(struct vnode *vp, struct proc *p,  		    struct cred *cred, rlim64_t rlimit, int sig,  		    core_content_t content); @@ -213,8 +220,8 @@ extern int exece(const char *fname, const char **argp, const char **envp);  extern int exec_common(const char *fname, const char **argp,      const char **envp, int brand_action);  extern int 
gexec(vnode_t **vp, struct execa *uap, struct uarg *args, -    struct intpdata *idata, int level, long *execsz, caddr_t exec_file, -    struct cred *cred, int brand_action); +    struct intpdata *idata, int level, size_t *execsz, caddr_t exec_file, +    struct cred *cred, int *brand_action);  extern struct execsw *allocate_execsw(char *name, char *magic,      size_t magic_size);  extern struct execsw *findexecsw(char *magic); @@ -239,26 +246,32 @@ extern void exec_set_sp(size_t);   * when compiling the 32-bit compatability elf code in the elfexec module.   */  extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, -    long *, int, caddr_t, cred_t *, int); +    size_t *, int, caddr_t, cred_t *, int *);  extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, -    intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); +    intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, +    uintptr_t *, uintptr_t *); +extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, uint_t *, caddr_t *, +    size_t *);  #endif /* !_ELF32_COMPAT */  #if defined(_LP64)  extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, -    long *, int, caddr_t, cred_t *, int); +    size_t *, int, caddr_t, cred_t *, int *);  extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, -    intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); +    intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, +    uintptr_t *, uintptr_t *); +extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, uint_t *, caddr_t *, +    size_t *);  #endif  /* _LP64 */  /*   * Utility functions for exec module core routines:   */ -extern int core_seg(proc_t *, vnode_t *, offset_t, caddr_t, -    size_t, rlim64_t, cred_t *); +extern int core_seg(proc_t *, vnode_t *, u_offset_t, caddr_t, size_t, +    rlim64_t, cred_t *); -extern int core_write(vnode_t *, enum uio_seg, offset_t, -    const void *, size_t, rlim64_t, cred_t 
*); +extern int core_write(vnode_t *, enum uio_seg, u_offset_t, const void *, +    size_t, rlim64_t, cred_t *);  /* a.out stuff */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index ec0741fe08..556a7ab2a1 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -27,13 +27,13 @@  /*	  All Rights Reserved	*/  /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ -/* Copyright 2015 Joyent, Inc. */ +/* Copyright 2017 Joyent, Inc. */  #ifndef _SYS_FILE_H  #define	_SYS_FILE_H  #include <sys/t_lock.h> -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL)  #include <sys/model.h>  #include <sys/user.h>  #endif @@ -122,11 +122,6 @@ typedef struct fpollinfo {  #if defined(_KERNEL) || defined(_FAKE_KERNEL)  /* - * This is a flag that is set on f_flag2, but is never user-visible - */ -#define	FEPOLLED	0x8000 - -/*   * Fake flags for driver ioctl calls to inform them of the originating   * process' model.  See <sys/model.h>   * @@ -200,6 +195,7 @@ struct vattr;  struct uf_info;  extern file_t *getf(int); +extern file_t *getf_gen(int, uf_entry_gen_t *);  extern void releasef(int);  extern void areleasef(int, struct uf_info *);  #ifndef	_BOOT @@ -226,6 +222,7 @@ extern void fcnt_add(struct uf_info *, int);  extern void close_exec(struct uf_info *);  extern void clear_stale_fd(void);  extern void clear_active_fd(int); +extern void set_active_fd(int);  extern void free_afd(afd_t *afd);  extern int fgetstartvp(int, char *, struct vnode **);  extern int fsetattrat(int, char *, int, struct vattr *); diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc.  All rights reserved. + */ + +#ifndef _SYS_FRAMEIO_H +#define	_SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif	/* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. + */ +typedef struct framevec { +	void	*fv_buf;	/* Buffer with data */ +	size_t	fv_buflen;	/* Size of the buffer */ +	size_t	fv_actlen;	/* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { +	uint_t	fio_version;	/* Should always be FRAMEIO_CURRENT_VERSION */ +	uint_t	fio_nvpf;	/* How many vectors make up one frame */ +	uint_t	fio_nvecs;	/* The total number of vectors */ +	framevec_t fio_vecs[];	/* C99 VLA */ +} frameio_t; + + +#define	FRAMEIO_VERSION_ONE	1 +#define	FRAMEIO_CURRENT_VERSION	FRAMEIO_VERSION_ONE + +#define	FRAMEIO_NVECS_MAX	32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { +	caddr32_t fv_buf; +	size32_t fv_buflen; +	size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { +	uint_t fio_version; +	uint_t fio_vecspframe; +	uint_t fio_nvecs; +	framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. 
+ */ +typedef enum frameio_write_mblk_map { +	MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, +    int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/fifonode.h b/usr/src/uts/common/sys/fs/fifonode.h index d8b158ce3c..1ea8563e1c 100644 --- a/usr/src/uts/common/sys/fs/fifonode.h +++ b/usr/src/uts/common/sys/fs/fifonode.h @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -83,6 +84,7 @@ struct fifonode {  	struct msgb	*fn_tail;	/* last message to read */  	fifolock_t	*fn_lock;	/* pointer to per fifo lock */  	uint_t		fn_count;	/* Number of bytes on fn_mp */ +	uint_t		fn_hiwat;	/* pipe (fifofast) high water */  	kcondvar_t	fn_wait_cv;	/* fifo conditional variable */  	ushort_t	fn_wcnt;	/* number of writers */  	ushort_t	fn_rcnt;	/* number of readers */ @@ -135,6 +137,8 @@ typedef struct fifodata {  #define	FIFOPOLLRBAND	0x20000  #define	FIFOSTAYFAST	0x40000	/* don't turn into stream mode */  #define	FIFOWAITMODE	0x80000	/* waiting for the possibility to change mode */ +/* Data on loan, block reads. Use in conjunction with FIFOSTAYFAST. 
*/ +#define	FIFORDBLOCK	0x100000  #define	FIFOHIWAT	(16 * 1024)  #define	FIFOLOWAT	(0) @@ -147,16 +151,6 @@ typedef struct fifodata {  #if defined(_KERNEL) -/* - * Fifohiwat defined as a variable is to allow tuning of the high - * water mark if needed. It is not meant to be released. - */ -#if FIFODEBUG -extern int Fifohiwat; -#else /* FIFODEBUG */ -#define	Fifohiwat	FIFOHIWAT -#endif /* FIFODEBUG */ -  extern struct vnodeops *fifo_vnodeops;  extern const struct fs_operation_def fifo_vnodeops_template[];  extern struct kmem_cache *fnode_cache; @@ -181,6 +175,8 @@ extern void	fifo_fastoff(fifonode_t *);  extern struct streamtab *fifo_getinfo();  extern void	fifo_wakereader(fifonode_t *, fifolock_t *);  extern void	fifo_wakewriter(fifonode_t *, fifolock_t *); +extern boolean_t fifo_stayfast_enter(fifonode_t *); +extern void	fifo_stayfast_exit(fifonode_t *);  #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc.  
All rights reserved. + */ + +#ifndef	_SYS_FS_HYPRLOFS_H +#define	_SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. + */ +#define	HYPRLOFS_IOC	('H' << 8) + +#define	HYPRLOFS_ADD_ENTRIES	(HYPRLOFS_IOC | 1) +#define	HYPRLOFS_RM_ENTRIES	(HYPRLOFS_IOC | 2) +#define	HYPRLOFS_RM_ALL		(HYPRLOFS_IOC | 3) +#define	HYPRLOFS_GET_ENTRIES	(HYPRLOFS_IOC | 4) + +typedef struct { +	char	*hle_path; +	uint_t	hle_plen; +	char	*hle_name; +	uint_t	hle_nlen; +} hyprlofs_entry_t; + +typedef struct { +	hyprlofs_entry_t	*hle_entries; +	uint_t			hle_len; +} hyprlofs_entries_t; + +typedef struct { +	char		hce_path[MAXPATHLEN]; +	char		hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { +	hyprlofs_curr_entry_t	*hce_entries; +	uint_t			hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { +	caddr32_t	hle_path; +	uint_t		hle_plen; +	caddr32_t	hle_name; +	uint_t		hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { +	caddr32_t	hle_entries; +	uint_t		hle_len; +} hyprlofs_entries32_t; + +typedef struct { +	caddr32_t	hce_entries; +	uint_t		hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..38389f77d9 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef	_SYS_FS_HYPRLOFS_INFO_H +#define	_SYS_FS_HYPRLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + *	hln_rwlock protects access of the directory list at hln_dir + *	as well as synchronizing read/writes to directory hlnodes. + *	hln_tlock protects updates to hln_mode and hln_nlink. + *	hln_tlock doesn't require any hlnode locks. + */ +typedef struct hlnode { +	struct hlnode	*hln_back;		/* linked list of hlnodes */ +	struct hlnode	*hln_forw;		/* linked list of hlnodes */ +	union { +		struct { +			struct hldirent	*un_dirlist; /* dirent list */ +			uint_t	un_dirents;	/* number of dirents */ +		} un_dirstruct; +		vnode_t	*un_realvp;		/* real vnode */ +	} un_hlnode; +	vnode_t 	*hln_vnode;		/* vnode for this hlnode */ +	int 		hln_gen;		/* pseudo gen num for hlfid */ +	int 		hln_looped;		/* flag indicating loopback */ +	vattr_t		hln_attr;		/* attributes */ +	krwlock_t	hln_rwlock;		/* rw - serialize mods and */ +						/* directory updates */ +	kmutex_t	hln_tlock;		/* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. 
+ */ +typedef struct { +	vfs_t		*hlm_vfsp;	/* filesystem's vfs struct */ +	hlnode_t	*hlm_rootnode;	/* root hlnode */ +	char 		*hlm_mntpath;	/* name of hyprlofs mount point */ +	dev_t		hlm_dev;	/* unique dev # of mounted `device' */ +	uint_t		hlm_gen;	/* pseudo generation number for files */ +	kmutex_t	hlm_contents;	/* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes.  File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { +	hlnode_t	*hld_hlnode;		/* hlnode for this file */ +	struct hldirent	*hld_next;		/* next directory entry */ +	struct hldirent	*hld_prev;		/* prev directory entry */ +	uint_t		hld_offset;		/* "offset" of dir entry */ +	uint_t		hld_hash;		/* a hash of td_name */ +	struct hldirent	*hld_link;		/* linked via the hash table */ +	hlnode_t	*hld_parent;		/* parent, dir we are in */ +	char		*hld_name;		/* must be null terminated */ +						/* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { +	uint16_t hlfid_len; +	ino32_t	hlfid_ino; +	int32_t	hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define	VFSTOHLM(vfsp)		((hlfsmount_t *)(vfsp)->vfs_data) +#define	VTOHLM(vp)		((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define	VTOHLN(vp)		((hlnode_t *)(vp)->v_data) +#define	HLNTOV(tp)		((tp)->hln_vnode) +#define	REALVP(vp)		((vnode_t *)VTOHLN(vp)->hln_realvp) +#define	hlnode_hold(tp)		VN_HOLD(HLNTOV(tp)) +#define	hlnode_rele(tp)		VN_RELE(HLNTOV(tp)) + +#define	hln_dir		un_hlnode.un_dirstruct.un_dirlist +#define	hln_dirents	un_hlnode.un_dirstruct.un_dirents +#define	hln_realvp	un_hlnode.un_realvp + +/* + * Attributes + */ +#define	hln_mask	hln_attr.va_mask +#define	hln_type	hln_attr.va_type +#define	hln_mode	hln_attr.va_mode +#define	hln_uid		hln_attr.va_uid +#define	hln_gid		hln_attr.va_gid +#define	
hln_fsid	hln_attr.va_fsid +#define	hln_nodeid	hln_attr.va_nodeid +#define	hln_nlink	hln_attr.va_nlink +#define	hln_size	hln_attr.va_size +#define	hln_atime	hln_attr.va_atime +#define	hln_mtime	hln_attr.va_mtime +#define	hln_ctime	hln_attr.va_ctime +#define	hln_rdev	hln_attr.va_rdev +#define	hln_blksize	hln_attr.va_blksize +#define	hln_nblocks	hln_attr.va_nblocks +#define	hln_seq		hln_attr.va_seq + +/* + * enums + */ +enum de_op	{ DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op	{ DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages.  Since hyprlofs doesn't actually use much + * memory, its unlikely this ever needs to be patched. + */ +#define		HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t  hyprlofs_minfree;		/* Anonymous memory in pages */ + +extern	void	hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, +		    cred_t *); +extern	int	hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern	int	hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, +		    cred_t *); +extern	void	hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern	void	hyprlofs_dirtrunc(hlnode_t *); +extern	int	hyprlofs_taccess(void *, int, cred_t *); +extern	int	hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, +		    vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index 9f9ce5c8c1..d1c5f674f1 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -37,6 +37,7 @@ extern "C" {  
#include <sys/vfs_opreg.h>  #include <sys/list.h>  #include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h>  #include <sys/sunddi.h>  /* @@ -129,6 +130,21 @@ typedef struct sdev_local_data {  	struct sdev_dprof sdev_lprof;	/* profile for multi-inst */  } sdev_local_data_t; +/* sdev_flags */ +typedef enum sdev_flags { +	SDEV_BUILD =		0x0001,	/* directory cache out-of-date */ +	SDEV_GLOBAL =		0x0002,	/* global /dev nodes */ +	SDEV_PERSIST =		0x0004,	/* backing store persisted node */ +	SDEV_NO_NCACHE = 	0x0008,	/* do not include in neg. cache */ +	SDEV_DYNAMIC =		0x0010,	/* special-purpose vnode ops */ +					/* (ex: pts) */ +	SDEV_VTOR =		0x0020,	/* validate sdev_nodes during search */ +	SDEV_ATTR_INVALID =	0x0040,	/* invalid node attributes, */ +					/* need update */ +	SDEV_SUBDIR =		0x0080,	/* match all subdirs under here */ +	SDEV_ZONED =		0x0100	/* zoned subdir */ +} sdev_flags_t; +  /*   * /dev filesystem sdev_node defines   */ @@ -151,7 +167,7 @@ typedef struct sdev_node {  	ino64_t		sdev_ino;	/* inode */  	uint_t		sdev_nlink;	/* link count */  	int		sdev_state;	/* state of this node */ -	int		sdev_flags;	/* flags bit */ +	sdev_flags_t	sdev_flags;	/* flags bit */  	kmutex_t	sdev_lookup_lock; /* node creation synch lock */  	kcondvar_t	sdev_lookup_cv;	/* node creation sync cv */ @@ -162,7 +178,7 @@ typedef struct sdev_node {  		struct sdev_global_data	sdev_globaldata;  		struct sdev_local_data	sdev_localdata;  	} sdev_instance_data; - +	list_node_t	sdev_plist;	/* link on plugin list */  	void		*sdev_private;  } sdev_node_t; @@ -193,29 +209,11 @@ typedef enum {  	SDEV_READY  } sdev_node_state_t; -/* sdev_flags */ -#define	SDEV_BUILD		0x0001	/* directory cache out-of-date */ -#define	SDEV_GLOBAL		0x0002	/* global /dev nodes */ -#define	SDEV_PERSIST		0x0004	/* backing store persisted node */ -#define	SDEV_NO_NCACHE		0x0008	/* do not include in neg. 
cache */ -#define	SDEV_DYNAMIC		0x0010	/* special-purpose vnode ops */ -					/* (ex: pts) */ -#define	SDEV_VTOR		0x0020	/* validate sdev_nodes during search */ -#define	SDEV_ATTR_INVALID	0x0040	/* invalid node attributes, */ -					/* need update */ -#define	SDEV_SUBDIR		0x0080	/* match all subdirs under here */ -#define	SDEV_ZONED		0x0100  /* zoned subdir */ -  /* sdev_lookup_flags */  #define	SDEV_LOOKUP	0x0001	/* node creation in progress */  #define	SDEV_READDIR	0x0002	/* VDIR readdir in progress */  #define	SDEV_LGWAITING	0x0004	/* waiting for devfsadm completion */ -#define	SDEV_VTOR_INVALID	-1 -#define	SDEV_VTOR_SKIP		0 -#define	SDEV_VTOR_VALID		1 -#define	SDEV_VTOR_STALE		2 -  /* convenient macros */  #define	SDEV_IS_GLOBAL(dv)	\  	(dv->sdev_flags & SDEV_GLOBAL) @@ -368,8 +366,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *,  extern int devname_profile_update(char *, size_t);  extern struct sdev_data *sdev_find_mntinfo(char *);  void sdev_mntinfo_rele(struct sdev_data *); +typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *); +void sdev_mnt_walk(sdev_mnt_walk_f, void *);  extern struct vnodeops *devpts_getvnodeops(void);  extern struct vnodeops *devvt_getvnodeops(void); +extern void sdev_plugin_nodeready(struct sdev_node *); +extern int sdev_plugin_init(void); +extern int sdev_plugin_fini(void);  /*   * boot states - warning, the ordering here is significant @@ -515,6 +518,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *);  extern void sdev_modctl_dump_files(void);  /* + * plugin and legacy vtab stuff + */ +/* directory dependent vop table */ +typedef struct sdev_vop_table { +	char *vt_name;				/* subdirectory name */ +	const fs_operation_def_t *vt_service;	/* vnodeops table */ +	struct vnodeops **vt_global_vops;	/* global container for vop */ +	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */ +	int vt_flags; +} sdev_vop_table_t; + +extern struct sdev_vop_table vtab[]; +extern struct vnodeops 
*sdev_get_vop(struct sdev_node *); +extern void sdev_set_no_negcache(struct sdev_node *); +extern void *sdev_get_vtor(struct sdev_node *dv); + +/*   * globals   */  extern kmutex_t sdev_lock; @@ -527,6 +547,7 @@ extern struct vnodeops		*devipnet_vnodeops;  extern struct vnodeops		*devvt_vnodeops;  extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */  extern struct vnodeops		*devzvol_vnodeops; +extern int			sdev_vnodeops_tbl_size;  extern const fs_operation_def_t	sdev_vnodeops_tbl[];  extern const fs_operation_def_t	devpts_vnodeops_tbl[]; diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h new file mode 100644 index 0000000000..f4ed813c1e --- /dev/null +++ b/usr/src/uts/common/sys/fs/sdev_plugin.h @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#ifndef _SYS_SDEV_PLUGIN_H +#define	_SYS_SDEV_PLUGIN_H + +/* + * Kernel sdev plugin interface + */ + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/vnode.h> + +#endif	/* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef uintptr_t sdev_plugin_hdl_t; +typedef uintptr_t sdev_ctx_t; + +/* + * Valid return values for sdev_plugin_validate_t. 
+ */ +typedef enum sdev_plugin_validate { +	SDEV_VTOR_INVALID = -1, +	SDEV_VTOR_SKIP = 0, +	SDEV_VTOR_VALID	= 1, +	SDEV_VTOR_STALE	= 2 +} sdev_plugin_validate_t; + +/* + * Valid flags + */ +typedef enum sdev_plugin_flags { +	SDEV_PLUGIN_NO_NCACHE = 0x1, +	SDEV_PLUGIN_SUBDIR = 0x2 +} sdev_plugin_flags_t; + +#define	SDEV_PLUGIN_FLAGS_MASK	0x3 + +/* + * Functions a module must implement + */ +typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t); +typedef int (*sp_filldir_f)(sdev_ctx_t); +typedef void (*sp_inactive_f)(sdev_ctx_t); + +#define	SDEV_PLUGIN_VERSION	1 + +typedef struct sdev_plugin_ops { +	int spo_version; +	sdev_plugin_flags_t spo_flags; +	sp_valid_f spo_validate; +	sp_filldir_f spo_filldir; +	sp_inactive_f spo_inactive; +} sdev_plugin_ops_t; + +extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *, +    int *); +extern int sdev_plugin_unregister(sdev_plugin_hdl_t); + +typedef enum sdev_ctx_flags { +	SDEV_CTX_GLOBAL = 0x2	/* node belongs to the GZ */ +} sdev_ctx_flags_t; + +/* + * Context helper functions + */ +extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t); +extern const char *sdev_ctx_name(sdev_ctx_t); +extern const char *sdev_ctx_path(sdev_ctx_t); +extern int sdev_ctx_minor(sdev_ctx_t, minor_t *); +extern enum vtype sdev_ctx_vtype(sdev_ctx_t); + +/* + * Callbacks to manipulate nodes + */ +extern int sdev_plugin_mkdir(sdev_ctx_t, char *); +extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t); + +#endif	/* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDEV_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index fb07de6588..f4cee09244 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -23,7 +23,7 @@   * All rights reserved.  Use is subject to license terms.   */  /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc.   
*/  #ifndef	_SYS_FS_TMP_H @@ -43,8 +43,10 @@ struct tmount {  	struct vfs	*tm_vfsp;	/* filesystem's vfs struct */  	struct tmpnode	*tm_rootnode;	/* root tmpnode */  	char 		*tm_mntpath;	/* name of tmpfs mount point */ -	ulong_t		tm_anonmax;	/* file system max anon reservation */ -	pgcnt_t		tm_anonmem;	/* pages of reserved anon memory */ +	size_t		tm_anonmax;	/* file system max anon reservation */ +	size_t		tm_anonmem;	/* bytes of reserved anon memory */ +					/* and allocated kmem for the fs */ +	size_t		tm_allocmem;	/* bytes alloced from tmp_kmem_ funcs */  	dev_t		tm_dev;		/* unique dev # of mounted `device' */  	uint_t		tm_gen;		/* pseudo generation number for files */  	kmutex_t	tm_contents;	/* lock for tmount structure */ @@ -58,6 +60,7 @@ struct tmount {  #define	VTOTM(vp)		((struct tmount *)(vp)->v_vfsp->vfs_data)  #define	VTOTN(vp)		((struct tmpnode *)(vp)->v_data)  #define	TNTOV(tp)		((tp)->tn_vnode) +#define	TNTOTM(tp)		(VTOTM(TNTOV(tp)))  #define	tmpnode_hold(tp)	VN_HOLD(TNTOV(tp))  #define	tmpnode_rele(tp)	VN_RELE(TNTOV(tp)) @@ -69,41 +72,39 @@ enum dr_op	{ DR_REMOVE, DR_RMDIR, DR_RENAME };	/* dirremove ops */  /*   * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system.  E.g. in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree.  The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. - * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system.  
In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become a + * factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration.  As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB.  Care + * should be exercised in changing this:  tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will result in tmpfs consumption alone inducing application-level + * memory allocation failure.   */ -#define	TMPMINFREE	2 * 1024 * 1024	/* 2 Megabytes */ +#define	TMPMINFREE	16 * 1024 * 1024	/* 16 Megabytes */  extern size_t	tmpfs_minfree;		/* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem. 
- */ -#define	TMPMAXFRACKMEM	25	/* 1/25 of physical memory */ - -extern size_t 	tmp_kmemspace; -extern size_t	tmpfs_maxkmem;	/* Allocatable kernel memory in bytes */ -  extern	void	tmpnode_init(struct tmount *, struct tmpnode *,  	struct vattr *, struct cred *); +extern	void	tmpnode_cleanup(struct tmpnode *tp);  extern	int	tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t);  extern	void	tmpnode_growmap(struct tmpnode *, ulong_t);  extern	int	tdirlookup(struct tmpnode *, char *, struct tmpnode **,      struct cred *);  extern	int	tdirdelete(struct tmpnode *, struct tmpnode *, char *,  	enum dr_op, struct cred *); -extern	void	tdirinit(struct tmpnode *, struct tmpnode *); +extern	int	tdirinit(struct tmpnode *, struct tmpnode *);  extern	void	tdirtrunc(struct tmpnode *); -extern	void	*tmp_memalloc(size_t, int); -extern	void	tmp_memfree(void *, size_t);  extern	int	tmp_resv(struct tmount *, struct tmpnode *, size_t, int);  extern	int	tmp_taccess(void *, int, struct cred *);  extern	int	tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, @@ -114,6 +115,9 @@ extern	int	tdirenter(struct tmount *, struct tmpnode *, char *,  	enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *,  	struct tmpnode **, struct cred *, caller_context_t *); +extern void	*tmp_kmem_zalloc(struct tmount *, size_t, int); +extern void	tmp_kmem_free(struct tmount *, void *, size_t); +  #define	TMP_MUSTHAVE	0x01  #ifdef	__cplusplus diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 2d4e1aa7fb..4a48af52a1 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -21,13 +21,12 @@  /*   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   
*/  #ifndef _SYS_FX_H  #define	_SYS_FX_H -#pragma ident	"%Z%%M%	%I%	%E% SMI" -  #include <sys/types.h>  #include <sys/thread.h>  #include <sys/ddi.h> @@ -145,7 +144,14 @@ typedef struct	fxkparms {  	uint_t	fx_cflags;  } fxkparms_t; +/* + * control flags (kparms->fx_cflags). + */ +#define	FX_DOUPRILIM	0x01	/* change user priority limit */ +#define	FX_DOUPRI	0x02	/* change user priority */ +#define	FX_DOTQ		0x04	/* change FX time quantum */ +#define	FXMAXUPRI	60	/* maximum user priority setting */  /*   * Interface for partner private code. This is not a public interface. diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..91ab46fc44 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,59 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_GSQUEUE_H +#define	_SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP). 
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define	GSQUEUE_FILL	0x0001 +#define	GSQUEUE_NODRAIN	0x0002 +#define	GSQUEUE_PROCESS	0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, +    int, uint8_t); + +#define	GSQUEUE_DEFAULT_PRIORITY	MAXCLSYSPRI + +#endif	/* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */ diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018, Joyent, Inc.   
*/  /* @@ -171,7 +172,7 @@ typedef struct hook_family_int {  	cvwaitlock_t			hfi_lock;  	SLIST_ENTRY(hook_family_int)	hfi_entry;  	hook_event_int_head_t		hfi_head; -	hook_family_t 			hfi_family; +	hook_family_t			hfi_family;  	kstat_t				*hfi_kstat;  	struct hook_stack		*hfi_stack;  	hook_notify_head_t		hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t;  #define	Hn_ARP	"arp"  #define	Hn_IPV4	"inet"  #define	Hn_IPV6	"inet6" +#define	Hn_VIONA "viona_inet"  extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t);  extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h index d56fcceb5a..46d25f207f 100644 --- a/usr/src/uts/common/sys/id_space.h +++ b/usr/src/uts/common/sys/id_space.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc.  All Rights reserved.   */  #ifndef	_ID_SPACE_H @@ -34,8 +35,6 @@ extern "C" {  #include <sys/mutex.h>  #include <sys/vmem.h> -#ifdef _KERNEL -  typedef vmem_t id_space_t;  id_space_t *id_space_create(const char *, id_t, id_t); @@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *);  id_t id_alloc_specific_nosleep(id_space_t *, id_t);  void id_free(id_space_t *, id_t); -#endif /* _KERNEL */ -  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc.  All rights reserved. + */ + +/* + * Header file to support the inotify facility.  Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define	_SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. + */ +#define	IN_ACCESS		0x00000001 +#define	IN_MODIFY		0x00000002 +#define	IN_ATTRIB		0x00000004 +#define	IN_CLOSE_WRITE		0x00000008 +#define	IN_CLOSE_NOWRITE	0x00000010 +#define	IN_OPEN			0x00000020 +#define	IN_MOVED_FROM		0x00000040 +#define	IN_MOVED_TO		0x00000080 +#define	IN_CREATE		0x00000100 +#define	IN_DELETE		0x00000200 +#define	IN_DELETE_SELF		0x00000400 +#define	IN_MOVE_SELF		0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define	IN_UNMOUNT		0x00002000 +#define	IN_Q_OVERFLOW		0x00004000 +#define	IN_IGNORED		0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define	IN_ONLYDIR		0x01000000 +#define	IN_DONT_FOLLOW		0x02000000 +#define	IN_EXCL_UNLINK		0x04000000 +#define	IN_MASK_ADD		0x20000000 +#define	IN_ISDIR		0x40000000 +#define	IN_ONESHOT		0x80000000 + +/* + * Helpful constants. 
+ */ +#define	IN_CLOSE		(IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define	IN_MOVE			(IN_MOVED_FROM | IN_MOVED_TO) +#define	IN_ALL_EVENTS		\ +	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ +	IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ +	IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define	IN_CHILD_EVENTS		\ +	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ +	IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define	IN_CLOEXEC		02000000		/* LX_O_CLOEXEC */ +#define	IN_NONBLOCK		04000			/* LX_O_NONBLOCK */ + +struct inotify_event { +	int32_t		wd;		/* watch descriptor */ +	uint32_t	mask;		/* mask of events */ +	uint32_t	cookie;		/* event association cookie, if any */ +	uint32_t	len;		/* size of name field */ +	char		name[];		/* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
+ */ +#define	INOTIFYIOC		(('i' << 24) | ('n' << 16) | ('y' << 8)) +#define	INOTIFYIOC_ADD_WATCH	(INOTIFYIOC | 1)	/* add watch */ +#define	INOTIFYIOC_RM_WATCH	(INOTIFYIOC | 2)	/* remove watch */ +#define	INOTIFYIOC_ADD_CHILD	(INOTIFYIOC | 3)	/* add child watch */ +#define	INOTIFYIOC_ACTIVATE	(INOTIFYIOC | 4)	/* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define	INOTIFY_PTR(type, name)	uint32_t name##pad; type *name +#else +#define	INOTIFY_PTR(type, name)	type *name; uint32_t name##pad +#endif +#else +#define	INOTIFY_PTR(type, name)	type *name +#endif + +typedef struct inotify_addwatch { +	int inaw_fd;			/* open fd for object */ +	uint32_t inaw_mask;		/* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { +	INOTIFY_PTR(char, inac_name);	/* pointer to name */ +	int inac_fd;			/* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define	IN_UNMASKABLE \ +	(IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define	IN_MODIFIERS \ +	(IN_EXCL_UNLINK | IN_ONESHOT) + +#define	IN_FLAGS \ +	(IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define	IN_REMOVAL		(1ULL << 32) +#define	INOTIFYMNRN_INOTIFY	0 +#define	INOTIFYMNRN_CLONE	1 + +#endif /* _KERNEL */ + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SYS_INOTIFY_H */ diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h index 0569c3e967..d7dc365c09 100644 --- a/usr/src/uts/common/sys/ipc_impl.h +++ b/usr/src/uts/common/sys/ipc_impl.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc.   
*/  #ifndef	_IPC_IMPL_H @@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *);  kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *);  void ipc_cleanup(ipc_service_t *, kipc_perm_t *); +void ipc_rmsvc(ipc_service_t *, kipc_perm_t *);  int ipc_rmid(ipc_service_t *, int, cred_t *);  int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h index bad74f8b81..f21c3fb5af 100644 --- a/usr/src/uts/common/sys/ipd.h +++ b/usr/src/uts/common/sys/ipd.h @@ -20,7 +20,7 @@   */  /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved.   */  /* @@ -35,7 +35,7 @@ extern "C" {  #endif  #define	IPD_DEV_PATH	"/dev/ipd" -#define	IPD_MAX_DELAY	10000		/* 10 ms in us */ +#define	IPD_MAX_DELAY	1000000		/* 1 second in microseconds */  typedef struct ipd_ioc_perturb {  	zoneid_t	ipip_zoneid; diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index bf89ef0d33..0a76ee19a7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015, Joyent, Inc.   
*/  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -95,7 +96,7 @@ extern "C" {  /* insert new signals here, and move _SIGRTM* appropriately */  #define	_SIGRTMIN 42	/* first (highest-priority) realtime signal */ -#define	_SIGRTMAX 73	/* last (lowest-priority) realtime signal */ +#define	_SIGRTMAX 74	/* last (lowest-priority) realtime signal */  extern long _sysconf(int);	/* System Private interface to sysconf() */  #define	SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN))	/* first realtime signal */  #define	SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX))	/* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..0ea1a396b9 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@   */  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  #ifndef	_SYS_KLWP_H @@ -191,7 +191,14 @@ typedef struct _klwp {  	struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */  	struct contract	*lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ -	void	*lwp_brand;		/* per-lwp brand data */ +	/* +	 * Branding: +	 * lwp_brand			- per-lwp brand data +	 * lwp_brand_syscall		- brand syscall interposer +	 */ +	void	*lwp_brand; +	int	(*lwp_brand_syscall)(void); +  	struct psinfo *lwp_spymaster;	/* if an agent LWP, our spymaster */  } klwp_t; diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h index 2396ef4625..d52a54f6b7 100644 --- a/usr/src/uts/common/sys/kobj.h +++ b/usr/src/uts/common/sys/kobj.h @@ -24,6 +24,9 @@   *   * Copyright 2017 RackTop Systems.   */ +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */  #ifndef _SYS_KOBJ_H  #define	_SYS_KOBJ_H @@ -47,6 +50,12 @@ struct module_list {  	struct module *mp;  }; +typedef struct hotinline_desc { +	char	*hid_symname;		/* symbol name */ +	uintptr_t hid_instr_offset;	/* offset of call in text */ +	struct hotinline_desc *hid_next;	/* next hotinline */ +} hotinline_desc_t; +  typedef unsigned short	symid_t;		/* symbol table index */  typedef unsigned char	*reloc_dest_t; @@ -99,6 +108,8 @@ struct module {  	caddr_t textwin;  	caddr_t textwin_base; +	hotinline_desc_t *hi_calls; +  	sdt_probedesc_t *sdt_probes;  	size_t sdt_nprobes;  	char *sdt_tab; @@ -187,6 +198,7 @@ extern int kobj_read_file(struct _buf *, char *, unsigned, unsigned);  extern int kobj_get_filesize(struct _buf *, uint64_t *size);  extern uintptr_t kobj_getelfsym(char *, void *, int *);  extern void kobj_set_ctf(struct module *, caddr_t data, size_t size); +extern void do_hotinlines(struct module *);  extern int kobj_filbuf(struct _buf *);  extern void kobj_sync(void); diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index 5d8827f1ae..d720caa631 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -21,6 +21,7 @@  /*   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc.   
*/  #ifndef _SYS_KSOCKET_H_ @@ -122,6 +123,11 @@ extern int 	ksocket_close(ksocket_t, struct cred *);  extern void	ksocket_hold(ksocket_t);  extern void	ksocket_rele(ksocket_t); +typedef boolean_t (*ksocket_krecv_f)(ksocket_t, struct msgb *, size_t, int, +		    void *); +extern int	ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *); +extern void	ksocket_krecv_unblock(ksocket_t); +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h new file mode 100644 index 0000000000..88625d1829 --- /dev/null +++ b/usr/src/uts/common/sys/limits.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc.  All rights reserved. + */ + +#ifndef _SYS_LIMITS_H +#define	_SYS_LIMITS_H + +#ifdef	__cplusplus +extern "C" { +#endif + +#define	IOV_MAX	1024 + +#ifdef _KERNEL +#define	IOV_MAX_STACK	16	/* max. IOV on-stack allocation */ +#endif /* _KERNEL */ + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SYS_LIMITS_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 0907d6deff..afe554ba03 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@  /*   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc.   * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org>   */ @@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s {  } mac_propval_uint32_range_t;  /* + * Defines ranges which are a series of C style strings. 
+ */ +typedef struct mac_propval_str_range_s { +	uint32_t mpur_nextbyte; +	char mpur_data[1]; +} mac_propval_str_range_t; + +/*   * Data type of property values.   */  typedef enum { @@ -120,6 +128,7 @@ typedef struct mac_propval_range_s {  	mac_propval_type_t mpr_type;		/* type of value */  	union {  		mac_propval_uint32_range_t mpr_uint32[1]; +		mac_propval_str_range_t mpr_str;  	} u;  } mac_propval_range_t; @@ -614,6 +623,36 @@ typedef struct mactype_register_s {  } mactype_register_t;  /* + * Flags to describe the hardware emulation desired from a client when + * calling mac_hw_emul(). + * + * MAC_HWCKSUM_EMUL + * + *	If an mblk is marked with HCK_* flags, then calculate those + *	checksums and update the checksum flags. + * + * MAC_IPCKSUM_EMUL + * + *	Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header + *	checksum. We still update both the IPv4 and ULP checksum + *	flags. + * + * MAC_LSO_EMUL + * + *	If an mblk is marked with HW_LSO, then segment the LSO mblk + *	into a new chain of mblks which reference the original data + *	block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the + *	caller needs both then it must set both. + */ +typedef enum mac_emul { +	MAC_HWCKSUM_EMUL = (1 << 0), +	MAC_IPCKSUM_EMUL = (1 << 1), +	MAC_LSO_EMUL = (1 << 2) +} mac_emul_t; + +#define	MAC_HWCKSUM_EMULS	(MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL) + +/*   * Driver interface functions.   */  extern int			mac_open_by_linkid(datalink_id_t, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 0fc4939503..3290db92e6 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright 2013 Joyent, Inc.  All rights reserved. + * Copyright 2018 Joyent, Inc.  All rights reserved.   
*/  /* @@ -88,6 +88,7 @@ typedef enum {  } mac_client_promisc_type_t;  /* flags passed to mac_unicast_add() */ +  #define	MAC_UNICAST_NODUPCHECK			0x0001  #define	MAC_UNICAST_PRIMARY			0x0002  #define	MAC_UNICAST_HW				0x0004 @@ -115,6 +116,7 @@ typedef enum {  #define	MAC_PROMISC_FLAGS_NO_PHYS		0x0002  #define	MAC_PROMISC_FLAGS_VLAN_TAG_STRIP	0x0004  #define	MAC_PROMISC_FLAGS_NO_COPY		0x0008 +#define	MAC_PROMISC_FLAGS_DO_FIXUPS		0x0010  /* flags passed to mac_tx() */  #define	MAC_DROP_ON_NO_DESC	0x01 /* freemsg() if no tx descs */ @@ -198,6 +200,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *);  extern void mac_client_set_rings(mac_client_handle_t, int, int); +extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t); +  #endif	/* _KERNEL */  #ifdef	__cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 9b3b4fe369..21e8620121 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -24,7 +24,7 @@   * Copyright (c) 2012, Joyent, Inc.  All rights reserved.   */  /* - * Copyright (c) 2013, Joyent, Inc.  All rights reserved. + * Copyright 2018 Joyent, Inc.   
*/  #ifndef	_SYS_MAC_CLIENT_IMPL_H @@ -57,7 +57,7 @@ typedef struct mac_unicast_impl_s {			/* Protected by */  	uint16_t			mui_vid;	/* SL */  } mac_unicast_impl_t; -#define	MAC_CLIENT_FLAGS_PRIMARY		0X0001 +#define	MAC_CLIENT_FLAGS_PRIMARY		0x0001  #define	MAC_CLIENT_FLAGS_VNIC_PRIMARY		0x0002  #define	MAC_CLIENT_FLAGS_MULTI_PRIMARY		0x0004  #define	MAC_CLIENT_FLAGS_PASSIVE_PRIMARY	0x0008 @@ -83,6 +83,7 @@ typedef struct mac_promisc_impl_s {			/* Protected by */  	boolean_t			mpi_no_phys;	/* WO */  	boolean_t			mpi_strip_vlan_tag;	/* WO */  	boolean_t			mpi_no_copy;	/* WO */ +	boolean_t			mpi_do_fixups;	/* WO */  } mac_promisc_impl_t;  typedef union mac_tx_percpu_s { @@ -131,12 +132,17 @@ struct mac_client_impl_s {			/* Protected by */  	uint32_t		mci_flags;		/* SL */  	krwlock_t		mci_rw_lock;  	mac_unicast_impl_t	*mci_unicast_list;	/* mci_rw_lock */ +  	/*  	 * The mac_client_impl_t may be shared by multiple clients, i.e  	 * multiple VLANs sharing the same MAC client. In this case the -	 * address/vid tubles differ and are each associated with their +	 * address/vid tuples differ and are each associated with their  	 * own flow entry, but the rest underlying components SRS, etc,  	 * are common. +	 * +	 * This is only needed to support sun4v vsw. There are several +	 * places in MAC we could simplify the code if we removed +	 * sun4v support.  	 */  	flow_entry_t		*mci_flent_list;	/* mci_rw_lock */  	uint_t			mci_nflents;		/* mci_rw_lock */ @@ -313,6 +319,74 @@ extern	int	mac_tx_percpu_cnt;  	(((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 &&		\  	(mcip)->mci_nvids == 1)						\ +/* + * MAC Client Implementation State (mci_state_flags) + * + * MCIS_IS_VNIC + * + *	The client is a VNIC. + * + * MCIS_EXCLUSIVE + * + *	The client has exclusive control over the MAC, such that it is + *	the sole client of the MAC. + * + * MCIS_TAG_DISABLE + * + *	MAC will not add VLAN tags to outgoing traffic. 
If this flag + *	is set it is up to the client to add the correct VLAN tag. + * + * MCIS_STRIP_DISABLE + * + *	MAC will not strip the VLAN tags on incoming traffic before + *	passing it to mci_rx_fn. This only applies to non-bypass + *	traffic. + * + * MCIS_IS_AGGR_PORT + * + *	The client represents a port on an aggr. + * + * MCIS_CLIENT_POLL_CAPABLE + * + *	The client is capable of polling the Rx TCP/UDP softrings. + * + * MCIS_DESC_LOGGED + * + *	This flag is set when the client's link info has been logged + *	by the mac_log_linkinfo() timer. This ensures that the + *	client's link info is only logged once. + * + * MCIS_SHARE_BOUND + * + *	This client has an HIO share bound to it. + * + * MCIS_DISABLE_TX_VID_CHECK + * + *	MAC will not check the VID of the client's Tx traffic. + * + * MCIS_USE_DATALINK_NAME + * + *	The client is using the same name as its underlying MAC. This + *	happens when dlmgmtd is unreachable during client creation. + * + * MCIS_UNICAST_HW + * + *	The client requires MAC address hardware classification. This + *	is only used by sun4v vsw. + * + * MCIS_IS_AGGR_CLIENT + * + *	The client sits atop an aggr. + * + * MCIS_RX_BYPASS_DISABLE + * + *	Do not allow the client to enable DLS bypass. + * + * MCIS_NO_UNICAST_ADDR + * + *	This client has no MAC unicast address associated with it. 
+ * + */  /* MCI state flags */  #define	MCIS_IS_VNIC			0x0001  #define	MCIS_EXCLUSIVE			0x0002 @@ -325,7 +399,7 @@ extern	int	mac_tx_percpu_cnt;  #define	MCIS_DISABLE_TX_VID_CHECK	0x0100  #define	MCIS_USE_DATALINK_NAME		0x0200  #define	MCIS_UNICAST_HW			0x0400 -#define	MCIS_IS_AGGR			0x0800 +#define	MCIS_IS_AGGR_CLIENT		0x0800  #define	MCIS_RX_BYPASS_DISABLE		0x1000  #define	MCIS_NO_UNICAST_ADDR		0x2000 @@ -337,8 +411,7 @@ extern	int	mac_tx_percpu_cnt;  extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);  extern void mac_client_init(void);  extern void mac_client_fini(void); -extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, -    mac_client_impl_t *); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *);  extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 6b409513a6..97b3fd685a 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright 2013 Joyent, Inc.  All rights reserved. + * Copyright 2018 Joyent, Inc.   
*/  /* @@ -58,6 +58,9 @@ extern const mac_info_t *mac_info(mac_handle_t);  extern boolean_t mac_info_get(const char *, mac_info_t *);  extern boolean_t mac_promisc_get(mac_handle_t); +extern boolean_t mac_protect_check_addr(mac_client_handle_t, boolean_t, +    in6_addr_t *); +  extern int mac_start(mac_handle_t);  extern void mac_stop(mac_handle_t); @@ -121,9 +124,17 @@ extern void mac_tx_client_quiesce(mac_client_handle_t);  extern void mac_tx_client_condemn(mac_client_handle_t);  extern void mac_tx_client_restart(mac_client_handle_t);  extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern uint_t mac_hwrings_idx_get(mac_handle_t, uint_t, mac_group_handle_t *, +    mac_ring_handle_t *, mac_ring_type_t);  extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *,      mac_ring_handle_t *, mac_ring_type_t);  extern uint_t mac_hwring_getinfo(mac_ring_handle_t); +extern void mac_hwring_set_passthru(mac_ring_handle_t, mac_rx_t, void *, +    mac_resource_handle_t); +extern void mac_hwring_clear_passthru(mac_ring_handle_t); +extern void mac_client_set_flow_cb(mac_client_handle_t, mac_rx_t, void *); +extern void mac_client_clear_flow_cb(mac_client_handle_t); +  extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t,      mac_ring_handle_t);  extern void mac_hwring_teardown(mac_ring_handle_t); @@ -131,6 +142,8 @@ extern int mac_hwring_disable_intr(mac_ring_handle_t);  extern int mac_hwring_enable_intr(mac_ring_handle_t);  extern int mac_hwring_start(mac_ring_handle_t);  extern void mac_hwring_stop(mac_ring_handle_t); +extern int mac_hwring_activate(mac_ring_handle_t); +extern void mac_hwring_quiesce(mac_ring_handle_t);  extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int);  extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *);  extern int mac_hwring_getstat(mac_ring_handle_t, uint_t, uint64_t *); @@ -144,6 +157,13 @@ extern void mac_hwring_set_default(mac_handle_t, mac_ring_handle_t);  extern int 
mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *);  extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *); +extern int mac_hwgroup_addvlan(mac_group_handle_t, uint16_t); +extern int mac_hwgroup_remvlan(mac_group_handle_t, uint16_t); + +extern boolean_t mac_has_hw_vlan(mac_handle_t); + +extern uint_t mac_get_num_rx_groups(mac_handle_t); +extern int mac_set_promisc(mac_handle_t, boolean_t);  extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t,      mac_resource_props_t *); @@ -171,6 +191,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t);  extern void *mac_get_devinfo(mac_handle_t);  extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t);  extern uint32_t mac_no_notification(mac_handle_t);  extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h index e290ba7dbe..d37752ec23 100644 --- a/usr/src/uts/common/sys/mac_flow.h +++ b/usr/src/uts/common/sys/mac_flow.h @@ -22,7 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright 2013 Joyent, Inc.  All rights reserved. + * Copyright 2017 Joyent, Inc.  All rights reserved.   
*/  #ifndef	_MAC_FLOW_H @@ -155,6 +155,14 @@ typedef enum {  #define	MPT_MAXIPADDR		MPT_MAXCNT  #define	MPT_MAXCID		MPT_MAXCNT  #define	MPT_MAXCIDLEN		256 +#define	MPT_FALSE		0x00000000 +#define	MPT_TRUE		0x00000001 + +/* Dynamic address detection types */ +#define	MPT_DYN_DHCPV4		0x00000001 +#define	MPT_DYN_DHCPV6		0x00000002 +#define	MPT_DYN_SLAAC		0x00000004 +#define	MPT_DYN_ALL		0x00000007  typedef struct mac_ipaddr_s {  	uint32_t	ip_version; @@ -175,11 +183,13 @@ typedef struct mac_dhcpcid_s {  } mac_dhcpcid_t;  typedef struct mac_protect_s { -	uint32_t	mp_types; -	uint32_t	mp_ipaddrcnt; -	mac_ipaddr_t	mp_ipaddrs[MPT_MAXIPADDR]; -	uint32_t	mp_cidcnt; -	mac_dhcpcid_t	mp_cids[MPT_MAXCID]; +	uint32_t	mp_types;	/* Enabled protection types */ +	uint32_t	mp_ipaddrcnt;	/* Count of allowed IPs */ +	mac_ipaddr_t	mp_ipaddrs[MPT_MAXIPADDR]; /* Allowed IPs */ +	uint32_t	mp_cidcnt;	/* Count of allowed DHCP CIDs */ +	mac_dhcpcid_t	mp_cids[MPT_MAXCID]; /* Allowed DHCP CIDs */ +	uint32_t	mp_allcids;	/* Whether to allow all CIDs through */ +	uint32_t	mp_dynamic;	/* Enabled dynamic address methods */  } mac_protect_t;  /* The default priority for links */ diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 774c4fad9a..593322b990 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,7 +20,7 @@   */  /*   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc.   
*/  #ifndef	_SYS_MAC_IMPL_H @@ -208,9 +208,18 @@ struct mac_ring_s {  	mac_ring_t		*mr_next;	/* next ring in the chain */  	mac_group_handle_t	mr_gh;		/* reference to group */ -	mac_classify_type_t	mr_classify_type;	/* HW vs SW */ +	mac_classify_type_t	mr_classify_type;  	struct mac_soft_ring_set_s *mr_srs;	/* associated SRS */ -	mac_ring_handle_t	mr_prh;		/* associated pseudo ring hdl */ +	mac_ring_handle_t	mr_prh;	/* associated pseudo ring hdl */ + +	/* +	 * Ring passthru callback and arguments. See the +	 * MAC_PASSTHRU_CLASSIFIER comment in mac_provider.h. +	 */ +	mac_rx_t		mr_pt_fn; +	void			*mr_pt_arg1; +	mac_resource_handle_t	mr_pt_arg2; +  	uint_t			mr_refcnt;	/* Ring references */  	/* ring generation no. to guard against drivers using stale rings */  	uint64_t		mr_gen_num; @@ -255,8 +264,8 @@ struct mac_ring_s {  }  /* - * Per mac client flow information associated with a RX group. - * The entire structure is SL protected. + * Used to attach MAC clients to an Rx group. The members are SL + * protected.   */  typedef struct mac_grp_client {  	struct mac_grp_client		*mgc_next; @@ -270,15 +279,20 @@ typedef struct mac_grp_client {  	((g)->mrg_clients->mgc_next == NULL)) ?		\  	(g)->mrg_clients->mgc_client : NULL) +#define	MAC_GROUP_HW_VLAN(g)				\ +	(((g) != NULL) &&				\ +	((g)->mrg_info.mgi_addvlan != NULL) &&		\ +	((g)->mrg_info.mgi_remvlan != NULL)) +  /*   * Common ring group data structure for ring control and management. - * The entire structure is SL protected + * The entire structure is SL protected.   
*/  struct mac_group_s {  	int			mrg_index;	/* index in the list */  	mac_ring_type_t		mrg_type;	/* ring type */  	mac_group_state_t	mrg_state;	/* state of the group */ -	mac_group_t		*mrg_next;	/* next ring in the chain */ +	mac_group_t		*mrg_next;	/* next group in the chain */  	mac_handle_t		mrg_mh;		/* reference to MAC */  	mac_ring_t		*mrg_rings;	/* grouped rings */  	uint_t			mrg_cur_count;	/* actual size of group */ @@ -360,17 +374,23 @@ typedef struct mac_mcast_addrs_s {  } mac_mcast_addrs_t;  typedef enum { -	MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1,	/* hardware steering */ +	MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1,	/* HW classification */  	MAC_ADDRESS_TYPE_UNICAST_PROMISC		/* promiscuous mode */  } mac_address_type_t; +typedef struct mac_vlan_s { +	struct mac_vlan_s	*mv_next; +	uint16_t		mv_vid; +} mac_vlan_t; +  typedef struct mac_address_s {  	mac_address_type_t	ma_type;		/* address type */ -	int			ma_nusers;		/* number of users */ -							/* of that address */ +	int			ma_nusers;		/* num users of addr */  	struct mac_address_s	*ma_next;		/* next address */  	uint8_t			ma_addr[MAXMACADDRLEN];	/* address value */  	size_t			ma_len;			/* address length */ +	mac_vlan_t		*ma_vlans;		/* VLANs on this addr */ +	boolean_t		ma_untagged;		/* accept untagged? */  	mac_group_t		*ma_group;		/* asscociated group */  	mac_impl_t		*ma_mip;		/* MAC handle */  } mac_address_t; @@ -487,7 +507,7 @@ struct mac_impl_s {  	mac_capab_led_t		mi_led;  	/* -	 * MAC address list. SL protected. +	 * MAC address and VLAN lists. SL protected.  	 
*/  	mac_address_t		*mi_addresses; @@ -654,6 +674,7 @@ struct mac_impl_s {  #define	MIS_LEGACY		0x0040  #define	MIS_NO_ACTIVE		0x0080  #define	MIS_POLL_DISABLE	0x0100 +#define	MIS_IS_OVERLAY		0x0200  #define	mi_getstat	mi_callbacks->mc_getstat  #define	mi_start	mi_callbacks->mc_start @@ -722,12 +743,23 @@ typedef struct mac_client_impl_s mac_client_impl_t;  extern void	mac_init(void);  extern int	mac_fini(void); +/* + * MAC packet/chain drop functions to aggregate all dropped-packet + * debugging to a single surface. + */ +/*PRINTFLIKE2*/ +extern void	mac_drop_pkt(mblk_t *, const char *, ...) +    __KPRINTFLIKE(2); + +/*PRINTFLIKE2*/ +extern void	mac_drop_chain(mblk_t *, const char *, ...) +    __KPRINTFLIKE(2); +  extern void	mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);  extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *,      uint8_t *, ip6_frag_t **);  extern mblk_t *mac_copymsgchain_cksum(mblk_t *); -extern mblk_t *mac_fix_cksum(mblk_t *);  extern void mac_packet_print(mac_handle_t, mblk_t *);  extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *,      mac_header_info_t *); @@ -759,6 +791,8 @@ extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t,   */  extern int mac_group_addmac(mac_group_t *, const uint8_t *);  extern int mac_group_remmac(mac_group_t *, const uint8_t *); +extern int mac_group_addvlan(mac_group_t *, uint16_t); +extern int mac_group_remvlan(mac_group_t *, uint16_t);  extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *,      mac_group_t *);  extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); @@ -779,6 +813,7 @@ extern void mac_rx_switch_grp_to_sw(mac_group_t *);   * MAC address functions are used internally by MAC layer.   
*/  extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *); +extern mac_address_t *mac_find_macaddr_vlan(mac_impl_t *, uint8_t *, uint16_t);  extern boolean_t mac_check_macaddr_shared(mac_address_t *);  extern int mac_update_macaddr(mac_address_t *, uint8_t *);  extern void mac_freshen_macaddr(mac_address_t *, uint8_t *); @@ -829,7 +864,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *);  extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t);  extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t);  extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); -extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t);  extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *);  extern void i_mac_share_alloc(mac_client_impl_t *); @@ -863,8 +898,9 @@ extern int mac_start_group(mac_group_t *);  extern void mac_stop_group(mac_group_t *);  extern int mac_start_ring(mac_ring_t *);  extern void mac_stop_ring(mac_ring_t *); -extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *, boolean_t); -extern int mac_remove_macaddr(mac_address_t *); +extern int mac_add_macaddr_vlan(mac_impl_t *, mac_group_t *, uint8_t *, +    uint16_t, boolean_t); +extern int mac_remove_macaddr_vlan(mac_address_t *, uint16_t);  extern void mac_set_group_state(mac_group_t *, mac_group_state_t);  extern void mac_group_add_client(mac_group_t *, mac_client_impl_t *); diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 4c91c03967..2dea3a4758 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -21,7 +21,7 @@  /*   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc.   
*/  #ifndef	_SYS_MAC_PROVIDER_H @@ -108,6 +108,7 @@ typedef enum {  	MAC_CAPAB_NO_ZCOPY	= 0x00100000, /* boolean only, no data */  	MAC_CAPAB_LEGACY	= 0x00200000, /* data is mac_capab_legacy_t */  	MAC_CAPAB_VRRP		= 0x00400000, /* data is mac_capab_vrrp_t */ +	MAC_CAPAB_OVERLAY	= 0x00800000, /* boolean only, no data */  	MAC_CAPAB_TRANSCEIVER	= 0x01000000, /* mac_capab_transciever_t */  	MAC_CAPAB_LED		= 0x02000000  /* data is mac_capab_led_t */  } mac_capab_t; @@ -242,16 +243,59 @@ typedef struct mac_callbacks_s {  /*   * Virtualization Capabilities   */ +  /* - * The ordering of entries below is important. MAC_HW_CLASSIFIER - * is the cutoff below which are entries which don't depend on - * H/W. MAC_HW_CLASSIFIER and entries after that are cases where - * H/W has been updated through add/modify/delete APIs. + * The type of ring classification. This is used by MAC to determine + * what, if any, processing it has to do upon receiving traffic on a + * particular Rx ring. + * + * MAC_NO_CLASSIFIER + * + *	No classification has been set. No traffic should cross an Rx + *	ring in this state. + * + * MAC_SW_CLASSIFIER + * + *	The driver delivers traffic for multiple clients to this ring. + *	All traffic must be software classified by MAC to guarantee + *	delivery to the correct client. This classification type may + *	be chosen for several reasons. + * + *	o The driver provides only one group and there are multiple + *	  clients using the MAC. + * + *	o The driver provides some hardware filtering but not enough + *	  to fully classify the traffic. E.g., a VLAN VNIC requires L2 + *	  unicast address filtering as well as VLAN filtering, but + *	  some drivers may only support the former. + * + *	o The ring belongs to the default group. The default group + *	  acts as a spillover for all clients that can't reserve an + *	  exclusive group. It also handles multicast traffic for all + *	  clients. 
For these reasons, the default group's rings are + *	  always software classified. + * + * MAC_HW_CLASSIFIER + * + *	The driver delivers traffic for a single MAC client across + *	this ring. With this guarantee, MAC can simply pass the + *	traffic up the stack or even allow polling of the ring. + * + * MAC_PASSTHRU_CLASSIFIER + * + *	The ring is in "passthru" mode. In this mode we bypass all of + *	the typical MAC processing and pass the traffic directly to + *	the mr_pt_fn callback, see mac_rx_common(). This is used in + *	cases where there is another module acting as MAC provider on + *	behalf of the driver. E.g., link aggregations use this mode to + *	take full control of the port's rings; allowing it to enforce + *	LACP protocols and aggregate rings across discrete drivers.   */  typedef enum {  	MAC_NO_CLASSIFIER = 0,  	MAC_SW_CLASSIFIER, -	MAC_HW_CLASSIFIER +	MAC_HW_CLASSIFIER, +	MAC_PASSTHRU_CLASSIFIER  } mac_classify_type_t;  typedef	void	(*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *, @@ -281,6 +325,28 @@ typedef enum {  } mac_ring_type_t;  /* + * The value VLAN_ID_NONE (VID 0) means a client does not have + * membership to any VLAN. However, this statement is true for both + * untagged packets and priority tagged packets leading to confusion + * over what semantic is intended. To the provider, VID 0 is a valid + * VID when priority tagging is in play. To MAC and everything above + * VLAN_ID_NONE almost universally implies untagged traffic. Thus, we + * convert VLAN_ID_NONE to a sentinel value (MAC_VLAN_UNTAGGED) at the + * border between MAC and MAC provider. This informs the provider that + * the client is interested in untagged traffic and the provider + * should set any relevant bits to receive such traffic. + * + * Currently, the API between MAC and the provider passes the VID as a + * uint16_t. In the future this could actually be the entire TCI mask + * (PCP, DEI, and VID). 
This current scheme is safe in that potential + * future world as well; as 0xFFFF is not a valid TCI (the 0xFFF VID + * is reserved and never transmitted across networks). + */ +#define	MAC_VLAN_UNTAGGED		UINT16_MAX +#define	MAC_VLAN_UNTAGGED_VID(vid)	\ +	(((vid) == VLAN_ID_NONE) ? MAC_VLAN_UNTAGGED : (vid)) + +/*   * Grouping type of a ring group   *   * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped. @@ -342,6 +408,7 @@ typedef struct mac_ring_info_s {  		mac_ring_poll_t	poll;  	} mrfunion;  	mac_ring_stat_t		mri_stat; +  	/*  	 * mri_flags will have some bits set to indicate some special  	 * property/feature of a ring like serialization needed for a @@ -358,6 +425,8 @@ typedef struct mac_ring_info_s {   * #defines for mri_flags. The flags are temporary flags that are provided   * only to workaround issues in specific drivers, and they will be   * removed in the future. + * + * These are consumed only by sun4v and neptune (nxge).   */  #define	MAC_RING_TX_SERIALIZE		0x1  #define	MAC_RING_RX_ENQUEUE		0x2 @@ -366,6 +435,8 @@ typedef	int	(*mac_group_start_t)(mac_group_driver_t);  typedef	void	(*mac_group_stop_t)(mac_group_driver_t);  typedef	int	(*mac_add_mac_addr_t)(void *, const uint8_t *);  typedef	int	(*mac_rem_mac_addr_t)(void *, const uint8_t *); +typedef int	(*mac_add_vlan_filter_t)(mac_group_driver_t, uint16_t); +typedef int	(*mac_rem_vlan_filter_t)(mac_group_driver_t, uint16_t);  struct mac_group_info_s {  	mac_group_driver_t	mgi_driver;	/* Driver reference */ @@ -374,9 +445,11 @@ struct mac_group_info_s {  	uint_t			mgi_count;	/* Count of rings */  	mac_intr_t		mgi_intr;	/* Optional per-group intr */ -	/* Only used for rx groups */ +	/* Only used for Rx groups */  	mac_add_mac_addr_t	mgi_addmac;	/* Add a MAC address */  	mac_rem_mac_addr_t	mgi_remmac;	/* Remove a MAC address */ +	mac_add_vlan_filter_t	mgi_addvlan;	/* Add a VLAN filter */ +	mac_rem_vlan_filter_t	mgi_remvlan;	/* Remove a VLAN filter */  };  /* @@ -558,11 +631,12 @@ extern void		
	mac_prop_info_set_range_uint32(  extern void			mac_prop_info_set_perm(mac_prop_info_handle_t,  				    uint8_t); -extern void			mac_hcksum_get(mblk_t *, uint32_t *, +extern void			mac_hcksum_get(const mblk_t *, uint32_t *,  				    uint32_t *, uint32_t *, uint32_t *,  				    uint32_t *);  extern void			mac_hcksum_set(mblk_t *, uint32_t, uint32_t,  				    uint32_t, uint32_t, uint32_t); +extern void			mac_hcksum_clone(const mblk_t *, mblk_t *);  extern void			mac_lso_get(mblk_t *, uint32_t *, uint32_t *); diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 0d49a2ff4d..65819c1209 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -340,6 +340,7 @@ struct memcntl_mha32 {  #define	MS_SYNC		0x4		/* wait for msync */  #define	MS_ASYNC	0x1		/* return immediately */  #define	MS_INVALIDATE	0x2		/* invalidate caches */ +#define	MS_INVALCURPROC	0x8		/* invalidate cache for curproc only */  #if	(_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)  /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index 88c98dc5a4..7196f7b3ac 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2012, Joyent, Inc.  All rights reserved.   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.   
*   *	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T @@ -47,6 +48,7 @@ extern "C" {  #define	MNTTYPE_PCFS	"pcfs"		/* PC (MSDOS) file system */  #define	MNTTYPE_PC	MNTTYPE_PCFS	/* Deprecated name; use MNTTYPE_PCFS */  #define	MNTTYPE_LOFS	"lofs"		/* Loop back file system */ +#define	MNTTYPE_HYPRLOFS "hyprlofs"	/* Hyperlofs file system */  #define	MNTTYPE_LO	MNTTYPE_LOFS	/* Deprecated name; use MNTTYPE_LOFS */  #define	MNTTYPE_HSFS	"hsfs"		/* High Sierra (9660) file system */  #define	MNTTYPE_SWAP	"swap"		/* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 6407534a3b..658f9f3f6b 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@   *   * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   */  #ifndef	_SYS_NETCONFIG_H diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..92bd5b897d 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc.   
*/  #ifndef _SYS_NETI_H @@ -46,6 +48,9 @@ struct msgb;	/* avoiding sys/stream.h here */  #define	NHF_INET	"NHF_INET"  #define	NHF_INET6	"NHF_INET6"  #define	NHF_ARP		"NHF_ARP" +#define	NHF_VND_INET	"NHF_VND_INET" +#define	NHF_VND_INET6	"NHF_VND_INET6" +#define	NHF_VIONA	"NHF_VIONA"  /*   * Event identification @@ -61,7 +66,7 @@ struct msgb;	/* avoiding sys/stream.h here */  /*   * Network NIC hardware checksum capability   */ -#define	NET_HCK_NONE   	0x00 +#define	NET_HCK_NONE	0x00  #define	NET_HCK_L3_FULL	0x01  #define	NET_HCK_L3_PART	0x02  #define	NET_HCK_L4_FULL	0x10 diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 7ee33318cd..b327e69fad 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -88,7 +88,8 @@ typedef id_t	netstackid_t;  #define	NS_IPSECESP	16  #define	NS_IPNET	17  #define	NS_ILB		18 -#define	NS_MAX		(NS_ILB+1) +#define	NS_VND		19 +#define	NS_MAX		(NS_VND+1)  /*   * State maintained for each module which tracks the state of diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_H +#define	_SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define	OVERLAY_IOC_CREATE	OVERLAYIOC(1) +#define	OVERLAY_IOC_DELETE	OVERLAYIOC(2) +#define	OVERLAY_IOC_PROPINFO	OVERLAYIOC(3) +#define	OVERLAY_IOC_GETPROP	OVERLAYIOC(4) +#define	OVERLAY_IOC_SETPROP	OVERLAYIOC(5) +#define	OVERLAY_IOC_NPROPS	OVERLAYIOC(6) +#define	OVERLAY_IOC_ACTIVATE	OVERLAYIOC(7) +#define	OVERLAY_IOC_STATUS	OVERLAYIOC(8) + +typedef struct overlay_ioc_create { +	datalink_id_t	oic_linkid; +	uint32_t	oic_filler; +	uint64_t	oic_vnetid; +	char		oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { +	datalink_id_t	oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { +	datalink_id_t	oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { +	datalink_id_t	oipn_linkid; +	int32_t		oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { +	datalink_id_t	oipi_linkid; +	int32_t		oipi_id; +	char		oipi_name[OVERLAY_PROP_NAMELEN]; +	uint_t		oipi_type; +	uint_t		oipi_prot; +	uint8_t		oipi_default[OVERLAY_PROP_SIZEMAX]; +	uint32_t	oipi_defsize; +	uint32_t	oipi_posssize; +	uint8_t		oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { +	datalink_id_t	oip_linkid; +	int32_t		oip_id; +	char		oip_name[OVERLAY_PROP_NAMELEN]; +	uint8_t		oip_value[OVERLAY_PROP_SIZEMAX]; +	uint32_t	oip_size; +} overlay_ioc_prop_t; + +typedef enum overlay_status { +	OVERLAY_I_OK		= 0x00, +	OVERLAY_I_DEGRADED	= 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { +	datalink_id_t	ois_linkid; +	uint_t		ois_status; +	char		ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git 
a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..d638096006 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define	_SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { +	OVERLAY_TARGET_NONE = 0x0, +	OVERLAY_TARGET_POINT, +	OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { +	OVERLAY_PLUGIN_D_INVALID	= 0x0, +	OVERLAY_PLUGIN_D_ETHERNET	= 0x1, +	OVERLAY_PLUGIN_D_IP		= 0x2, +	OVERLAY_PLUGIN_D_PORT 		= 0x4, +	OVERLAY_PLUGIN_D_MASK		= 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { +	OVERLAY_PROP_T_INT = 0x1,	/* signed int */ +	OVERLAY_PROP_T_UINT,		/* unsigned int */ +	OVERLAY_PROP_T_IP,		/* sinaddr6 */ +	OVERLAY_PROP_T_STRING		/* OVERLAY_PROP_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { +	OVERLAY_PROP_PERM_REQ	= 0x1, +	OVERLAY_PROP_PERM_READ	= 0x2, +	OVERLAY_PROP_PERM_WRITE	= 0x4, +	OVERLAY_PROP_PERM_RW 	= 0x6, +	OVERLAY_PROP_PERM_RRW	= 0x7, +	OVERLAY_PROP_PERM_MASK	= 0x7 +} overlay_prop_prot_t; + +#define	OVERLAY_PROP_NAMELEN	64 +#define	OVERLAY_PROP_SIZEMAX	256 +#define	OVERLAY_STATUS_BUFLEN	256 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..7fb8b8da1d 
--- /dev/null +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define	_SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define	OVEP_VERSION_ONE	0x1 + +typedef struct overlay_plugin { +	kmutex_t ovp_mutex; +	list_node_t ovp_link;			/* overlay_plugin_lock */ +	uint_t ovp_active;			/* ovp_mutex */ +	const char *ovp_name;			/* RO */ +	const overlay_plugin_ops_t *ovp_ops;	/* RO */ +	const char *const *ovp_props;		/* RO */ +	uint_t ovp_nprops;			/* RO */ +	uint_t ovp_id_size;			/* RO */ +	overlay_plugin_flags_t ovp_flags;	/* RO */ +	overlay_plugin_dest_t ovp_dest;		/* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { +	list_node_t		omux_lnode; +	ksocket_t		omux_ksock;	/* RO */ +	overlay_plugin_t	*omux_plugin;	/* RO: associated encap */ +	int			omux_domain;	/* RO: socket domain */ +	int			omux_family;	/* RO: socket family */ +	int			omux_protocol;	/* RO: socket protocol */ +	struct sockaddr 	*omux_addr;	/* RO: socket address */ +	socklen_t		omux_alen;	/* RO: sockaddr len */ +	kmutex_t		omux_lock;	/* Protects everything below */ +	uint_t			omux_count;	/* Active instances */ +	avl_tree_t		
omux_devices;	/* Tree of devices */ +} overlay_mux_t; + +typedef enum overlay_target_flag { +	OVERLAY_T_TEARDOWN	= 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { +	kmutex_t		ott_lock; +	kcondvar_t		ott_cond; +	overlay_target_mode_t	ott_mode;	/* RO */ +	overlay_plugin_dest_t	ott_dest;	/* RO */ +	uint64_t		ott_id;		/* RO */ +	overlay_target_flag_t	ott_flags;	/* ott_lock */ +	uint_t			ott_ocount;	/* ott_lock */ +	union {					/* ott_lock */ +		overlay_target_point_t	ott_point; +		struct overlay_target_dyn { +			refhash_t	*ott_dhash; +			avl_tree_t	ott_tree; +		} ott_dyn; +	} ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { +	OVERLAY_F_ACTIVATED	= 0x01, /* Activate ioctl completed */ +	OVERLAY_F_IN_MUX	= 0x02,	/* Currently in a mux */ +	OVERLAY_F_IN_TX		= 0x04,	/* Currently doing tx */ +	OVERLAY_F_IN_RX		= 0x08, /* Currently doing rx */ +	OVERLAY_F_IOMASK	= 0x0c,	/* A mask for rx and tx */ +	OVERLAY_F_MDDROP	= 0x10,	/* Drop traffic for metadata update */ +	OVERLAY_F_STOPMASK	= 0x1e,	/* None set when stopping */ +	OVERLAY_F_VARPD		= 0x20,	/* varpd plugin exists */ +	OVERLAY_F_DEGRADED	= 0x40,	/* device is degraded */ +	OVERLAY_F_MASK		= 0x7f	/* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { +	kmutex_t	odd_lock; +	kcondvar_t	odd_iowait; +	list_node_t	odd_link;		/* overlay_dev_lock */ +	mac_handle_t	odd_mh;			/* RO */ +	overlay_plugin_t *odd_plugin;		/* RO */ +	datalink_id_t	odd_linkid;		/* RO */ +	void		*odd_pvoid;		/* RO -- only used by plugin */ +	uint_t		odd_ref;		/* protected by odd_lock */ +	uint_t		odd_mtu;		/* protected by odd_lock */ +	overlay_dev_flag_t odd_flags;		/* protected by odd_lock */ +	uint_t		odd_rxcount;		/* protected by odd_lock */ +	uint_t		odd_txcount;		/* protected by odd_lock */ +	overlay_mux_t	*odd_mux;		/* protected by odd_lock */ +	uint64_t	odd_vid;		/* RO if active else odd_lock */ +	avl_node_t	odd_muxnode;		/* managed by mux */ +	overlay_target_t *odd_target;		/* See big theory 
statement */ +	char		odd_fmamsg[OVERLAY_STATUS_BUFLEN];	/* odd_lock */ +} overlay_dev_t; + +typedef enum overlay_target_entry_flags { +	OVERLAY_ENTRY_F_PENDING		= 0x01,	/* lookup in progress */ +	OVERLAY_ENTRY_F_VALID		= 0x02,	/* entry is currently valid */ +	OVERLAY_ENTRY_F_DROP		= 0x04,	/* always drop target */ +	OVERLAY_ENTRY_F_VALID_MASK	= 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { +	kmutex_t		ote_lock; +	refhash_link_t		ote_reflink;	/* hashtable link */ +	avl_node_t		ote_avllink;	/* iteration link */ +	list_node_t		ote_qlink; +	overlay_target_entry_flags_t ote_flags;	/* RW: state flags */ +	uint8_t			ote_addr[ETHERADDRL];	/* RO: mac addr */ +	overlay_target_t	*ote_ott;	/* RO */ +	overlay_dev_t		*ote_odd;	/* RO */ +	overlay_target_point_t	ote_dest;	/* RW: destination */ +	mblk_t			*ote_chead;	/* RW: blocked mb chain head */ +	mblk_t			*ote_ctail;	/* RW: blocked mb chain tail */ +	size_t			ote_mbsize;	/* RW: outstanding mblk size */ +	hrtime_t		ote_vtime;	/* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define	OVERLAY_CTL	"overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, +    struct sockaddr *, socklen_t, int *); +extern void 
overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + +#define	OVERLAY_TARGET_OK	0 +#define	OVERLAY_TARGET_DROP	1 +#define	OVERLAY_TARGET_ASYNC	2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, +    socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..07efaa05df --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define	_SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, then we can move to make this a more-stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. + * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), that it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If the mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY. In such cases, it should + * be returned as the return status of _fini(9E). This is quite necessary, it + * ensures that if the module is in use it doesn't get unloaded out from under + * the broader subsystem while it's still in use. A driver can use that to + * know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * information to the callback routine. Here are the routines and their + * purposes. 
The full signatures are available below. + * + *   overlay_plugin_init_t + * + * 	This interface is used to create a new instance of a plugin. An instance + * 	of a plugin will be created for each overlay device that is created. For + * 	example, if a device is created with VXLAN ID 23 and ID 42, then there + * 	will be two different calls to this function. + * + * 	This function gives the plugin a chance to create a private data + * 	structure that will be returned on subsequent calls to the system. + * + *   overlay_plugin_fini_t + * + *   	This is the opposite of overlay_plugin_init_t. It will be called when it + *   	is safe to remove any private data that is associated with this instance + *   	of the plugin. + * + *   overlay_plugin_propinfo_t + * + *   	This is called with the name of a property that is registered when the + *   	plugin is created. This function will be called with the name of the + *   	property that information is being requested about. The plugin is + *   	responsible for filling out information such as setting the name, the + *   	type of property it is, the protection of the property (can a user + *   	update it?), whether the property is required, an optional default value + *   	for the property, and an optional set of values or ranges that are + *   	allowed. + * + *   overlay_plugin_getprop_t + * + *	Return the value of the named property from the current instance of the + *	plugin. + * + *   overlay_plugin_setprop_t + * + *	Set the value of the named property to the specified value for the + *	current instance of the plugin. Note, that it is the plugin's + *	responsibility to ensure that the value of the property is valid and to + *	update state as appropriate. + * + *   overlay_plugin_socket_t + * + *   	Every overlay device has a corresponding socket that it uses to send and + *   	receive traffic. This routine is used to get the parameters that should + *   	be used to define such a socket. 
The actual socket may be multiplexed + *   	with other uses of it. + * + *   overlay_plugin_sockopt_t + * + *   	Allow a plugin to set any necessary socket options that it needs on the + *   	kernel socket that is being used by a mux. This will only be called once + *   	for a given mux, if additional devices are added to a mux, it will not + *   	be called additional times. + * + *   overlay_plugin_encap_t + * + *   	In this routine you're given a message block and information about the + *   	packet, such as the identifier and are asked to fill out a message block + *   	that represents the encapsulation header and optionally manipulate the + *   	input message if required. + * + *   overlay_plugin_decap_t + * + *   	In this routine, you're given the encapsulated message block. The + *   	requirement is to decapsulate it and determine what is the correct + *   	overlay identifier for this network and to fill in the header size so + *   	the broader system knows how much of this data should be considered + *   	consumed. + * + *   ovpo_callbacks + * + *   	This should be set to zero, it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + *   ovep_version + * + *   	Should be set to the value of the macro OVEP_VERSION. + * + *   ovep_name + * + *   	Should be set to a character string that has the name of the module. + *   	Generally this should match the name of the kernel module; however, this + *   	is the name that users will use to refer to this module when creating + *   	devices. + * + *   overlay_plugin_ops_t + * + *   	Should be set to the functions as described above. + * + *   ovep_props + * + *   	This is an array of character strings that holds the names of the + *   	properties of the encapsulation plugin. + * + * + *   ovep_id_size + * + *   	This is the size in bytes of the valid range for the identifier. 
The + *   	valid identifier range is considered a ovep_id_size byte unsigned + *   	integer, [ 0, 1 << (ovep_id_size * 8) ). + * + *   ovep_flags + * + *   	A series of flags that indicate optional features that are supported. + *   	Valid flags include: + * + *   		OVEP_F_VLAN_TAG + * + * 			The encapsulation format allows for the encapsulated + * 			packet to maintain a VLAN tag. + * + *   ovep_dest + * + *   	Describes the kind of destination that the overlay plugin supports for + *   	sending traffic. For example, vxlan uses UDP, therefore it requires both + *   	an IP address and a port; however, nvgre uses the gre header and + *   	therefore only requires an IP address. The following flags may be + *   	combined: + * + *   		OVERLAY_PLUGIN_D_ETHERNET + * + *   			Indicates that to send a packet to its destination, we + *   			require a link-layer ethernet address. + * + * 		OVERLAY_PLUGIN_D_IP + * + * 			Indicates that to send a packet to its destination, we + * 			require an IP address. Note, all IP addresses are + * 			transmitted as IPv6 addresses and for an IPv4 + * 			destination, using an IPv4-mapped IPv6 address is the + * 			expected way to transmit that. + * + * 		OVERLAY_PLUGIN_D_PORT + * + * 			Indicates that to send a packet to its destination, a + * 			port is required, this usually indicates that the + * 			protocol uses something like TCP or UDP. + * + * + * ------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * ------------------------------------------------- + * + * Every instance of a given module is independent. The kernel only guarantees + * that it will probably perform downcalls into different instances in parallel + * at some point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it will be up to it + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. 
If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make, is that calls to getprop and setprop will be done + * synchronized by a caller holding the MAC perimeter. + * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, eg. the property related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only thing that it is safe for it to use are its own + * locks. + * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context, the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define	OVEP_VERSION	0x1 + +typedef enum overlay_plugin_flags { +	OVEP_F_VLAN_TAG	= 0x01	/* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's either a 24-64 bit value. How should we future + * proof ourselves here? + */ +typedef struct ovep_encap_info { +	uint64_t	ovdi_id; +	size_t		ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise. 
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, +    ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, +    ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, +    struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, +    uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, +    uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { +	uint_t			ovpo_callbacks; +	overlay_plugin_init_t	ovpo_init; +	overlay_plugin_fini_t	ovpo_fini; +	overlay_plugin_encap_t	ovpo_encap; +	overlay_plugin_decap_t	ovpo_decap; +	overlay_plugin_socket_t ovpo_socket; +	overlay_plugin_sockopt_t ovpo_sockopt; +	overlay_plugin_getprop_t ovpo_getprop; +	overlay_plugin_setprop_t ovpo_setprop; +	overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { +	uint_t			ovep_version; +	const char		*ovep_name; +	const overlay_plugin_ops_t	*ovep_ops; +	const char 		**ovep_props; +	uint_t			ovep_id_size; +	uint_t			ovep_flags; +	uint_t			ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, 
overlay_prop_type_t); +extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, +    uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..ae92ef3532 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,293 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _OVERLAY_TARGET_H +#define	_OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { +	uint8_t		otp_mac[ETHERADDRL]; +	struct in6_addr	otp_ip; +	uint16_t	otp_port; +} overlay_target_point_t; + +#define	OVERLAY_TARG_IOCTL	(('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define	OVERLAY_TARG_INFO	(OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { +	OVERLAY_TARG_INFO_F_ACTIVE = 0x01, +	OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { +	datalink_id_t		oti_linkid; +	uint32_t		oti_needs; +	uint64_t		oti_flags; +	uint64_t		oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define	OVERLAY_TARG_ASSOCIATE	(OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { +	datalink_id_t		ota_linkid; +	uint32_t		ota_mode; +	uint64_t		ota_id; +	uint32_t		ota_provides; +	overlay_target_point_t	ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define	OVERLAY_TARG_DISASSOCIATE	(OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define	OVERLAY_TARG_DEGRADE	(OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { +	datalink_id_t	otd_linkid; +	uint32_t	otd_pad; +	char		otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. 
+ */ +#define	OVERLAY_TARG_RESTORE	(OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { +	datalink_id_t	otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that user land will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second of time before they will + * be sent back out to user land to give user land a chance to clean itself up + * or more generally, come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + *	OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + *		The overlay_targ_resp_t has the appropriate information from + *		which a reply can be generated. The information is filled into + *		an overlay_target_point_t as appropriate based on the + *		overlay_plugin_dest_t type. + * + * + *	OVERLAY_TARG_DROP - overlay_targ_resp_t + * + *		The overlay_targ_resp_t should identify a request for which to + *		drop a packet. + * + * + * 	OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * 		The overlay_targ_pkt_t injects a fully formed packet into the + * 		virtual network. It may either be identified by its data link id + * 		or by the request id. If both are specified, the + * 		datalink id will be used. Note, that an injection is not + * 		considered a reply and if this corresponds to a request, then + * 		that individual packet must still be dropped. + * + * + * 	OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * 		This ioctl can be used to copy data from a given request into a + * 		user buffer. This can be used in combination with + * 		OVERLAY_TARG_INJECT to implement services such as a proxy-arp. 
+ * + * + * 	OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * 		This ioctl is similar to the OVERLAY_TARG_INJECT, except instead + * 		of receiving it on the local mac handle, it queues it for + * 		retransmission again. This is useful if you have a packet that + * 		was originally destined for some broadcast or multicast address + * 		that you now want to send to a unicast address. + */ +#define	OVERLAY_TARG_LOOKUP	(OVERLAY_TARG_IOCTL | 0x10) +#define	OVERLAY_TARG_RESPOND	(OVERLAY_TARG_IOCTL | 0x11) +#define	OVERLAY_TARG_DROP	(OVERLAY_TARG_IOCTL | 0x12) +#define	OVERLAY_TARG_INJECT	(OVERLAY_TARG_IOCTL | 0x13) +#define	OVERLAY_TARG_PKT	(OVERLAY_TARG_IOCTL | 0x14) +#define	OVERLAY_TARG_RESEND	(OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { +	uint64_t	otl_dlid; +	uint64_t	otl_reqid; +	uint64_t	otl_varpdid; +	uint64_t	otl_vnetid; +	uint64_t	otl_hdrsize; +	uint64_t	otl_pktsize; +	uint8_t		otl_srcaddr[ETHERADDRL]; +	uint8_t		otl_dstaddr[ETHERADDRL]; +	uint32_t	otl_dsttype; +	uint32_t	otl_sap; +	int32_t		otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { +	uint64_t	otr_reqid; +	overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { +	uint64_t	otp_linkid; +	uint64_t	otp_reqid; +	uint64_t	otp_size; +	void		*otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +typedef struct overlay_targ_pkt32 { +	uint64_t	otp_linkid; +	uint64_t	otp_reqid; +	uint64_t	otp_size; +	caddr32_t	otp_buf; +} overlay_targ_pkt32_t; + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd which is an implementation of libdladm not to end up + * needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. 
+ */ + +#define	OVERLAY_TARG_LIST	(OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { +	uint32_t	otl_nents; +	uint32_t	otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * 	OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * 		The overlay_targ_cache_t should have its link identifier and + * 		the desired mac address filled in. On return, it will fill in + * 		the otc_entry.otce_dest member, if the entry exists in the table. + * + * + * 	OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * 		The cache table entry of the mac address referred to by + * 		otc_entry.otce_mac and otc_linkid will be filled in with the + * 		details provided in the otc_entry.otce_dest member. + * + * 	OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * 		Removes the cache entry identified by otc_entry.otce_mac from the + * 		table. Note that this does not stop any in-flight lookups or deal + * 		with any data that is awaiting a lookup. + * + * + * 	OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * 		Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * 		entire table identified by otc_linkid. All other parameters are + * 		ignored. + * + * + * 	OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * 		Iterates over the contents of a target cache identified by + * 		otci_linkid. Iteration is guaranteed to be exactly once for + * 		items which are in the hashtable at the beginning and end of + * 		iteration. For items which are added or removed after iteration + * 		has begun, only at most once semantics are guaranteed. Consumers + * 		should ensure that otci_marker is zeroed before starting + * 		iteration and should preserve its contents across calls. + * + * 		Before calling in, otci_count should be set to the number of + * 		entries that space has been allocated for in otci_ents. The + * 		value will be updated to indicate the total number written out. 
+ */ + +#define	OVERLAY_TARG_CACHE_GET		(OVERLAY_TARG_IOCTL | 0x30) +#define	OVERLAY_TARG_CACHE_SET		(OVERLAY_TARG_IOCTL | 0x31) +#define	OVERLAY_TARG_CACHE_REMOVE	(OVERLAY_TARG_IOCTL | 0x32) +#define	OVERLAY_TARG_CACHE_FLUSH	(OVERLAY_TARG_IOCTL | 0x33) +#define	OVERLAY_TARG_CACHE_ITER		(OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define	OVERLAY_TARGET_ITER_MAX	500 + +#define	OVERLAY_TARGET_CACHE_DROP	0x01 + +typedef struct overlay_targ_cache_entry { +	uint8_t			otce_mac[ETHERADDRL]; +	uint16_t		otce_flags; +	overlay_target_point_t	otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { +	datalink_id_t			otc_linkid; +	overlay_targ_cache_entry_t	otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { +	datalink_id_t			otci_linkid; +	uint32_t			otci_pad; +	uint64_t			otci_marker; +	uint16_t			otci_count; +	uint8_t				otci_pad2[3]; +	overlay_targ_cache_entry_t	otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index 282d84b912..66bd91f76f 100644 --- a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -116,7 +116,7 @@ extern "C" {  #define	DEFAULT_MAXPID	999999  #define	DEFAULT_JUMPPID	100000  #else -#define	DEFAULT_MAXPID	30000 +#define	DEFAULT_MAXPID	99999  #define	DEFAULT_JUMPPID	0  #endif diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index 1269aeca10..587a51f0aa 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -21,6 +21,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. 
+ * Copyright 2018 Joyent, Inc.   */  #ifndef _SYS_PATTR_H @@ -106,6 +107,25 @@ typedef struct pattr_hcksum_s {  #define	HW_LSO_FLAGS		HW_LSO	/* All LSO flags, currently only one */  /* + * The packet originates from a MAC on the same machine as the + * receiving MAC. There are two ways this can happen. + * + * 1. MAC loopback: When a packet is destined for a MAC client on the + *                  same MAC as the sender. This datapath is taken in + *                  mac_tx_send(). + * + * 2. Bridge Fwd: When a packet is destined for a MAC client on the + *                same bridge as the sender. This datapath is taken in + *                bridge_forward(). + * + * Presented with this flag, a receiver can then decide whether or not + * it needs to emulate some or all of the HW offloads that the NIC + * would have performed otherwise -- or whether it should accept the + * packet as-is. + */ +#define	HW_LOCAL_MAC		0x100 + +/*   * Structure used for zerocopy attribute.   */  typedef struct pattr_zcopy_s { diff --git a/usr/src/uts/common/sys/pci_cap.h b/usr/src/uts/common/sys/pci_cap.h index 730e10d77b..9804913241 100644 --- a/usr/src/uts/common/sys/pci_cap.h +++ b/usr/src/uts/common/sys/pci_cap.h @@ -82,12 +82,12 @@ typedef enum {  #define	PCI_CAP_GET32(h, i, b, o) ((uint32_t) \  	pci_cap_get(h, PCI_CAP_CFGSZ_32, i, b, o)) -#define	PCI_CAP_PUT8(h, i, b, o, d) ((uint8_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d)) -#define	PCI_CAP_PUT16(h, i, b, o, d) ((uint16_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d)) -#define	PCI_CAP_PUT32(h, i, b, o, d) ((uint32_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d)) +#define	PCI_CAP_PUT8(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d) +#define	PCI_CAP_PUT16(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d) +#define	PCI_CAP_PUT32(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d)  #define	PCI_XCAP_GET8(h, i, b, o) ((uint8_t) \  	pci_cap_get(h, PCI_CAP_CFGSZ_8, 
PCI_CAP_XCFG_SPC(i), b, o)) @@ -96,12 +96,12 @@ typedef enum {  #define	PCI_XCAP_GET32(h, i, b, o) ((uint32_t) \  	pci_cap_get(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o)) -#define	PCI_XCAP_PUT8(h, i, b, o, d) ((uint8_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define	PCI_XCAP_PUT16(h, i, b, o, d) ((uint16_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define	PCI_XCAP_PUT32(h, i, b, o, d) ((uint32_t) \ -	pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d)) +#define	PCI_XCAP_PUT8(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d) +#define	PCI_XCAP_PUT16(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d) +#define	PCI_XCAP_PUT32(h, i, b, o, d) \ +	pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d)  extern int pci_cap_probe(ddi_acc_handle_t h, uint16_t index, diff --git a/usr/src/uts/common/sys/pcie.h b/usr/src/uts/common/sys/pcie.h index 05b70a56fa..0e5f929164 100644 --- a/usr/src/uts/common/sys/pcie.h +++ b/usr/src/uts/common/sys/pcie.h @@ -22,6 +22,9 @@   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */  #ifndef	_SYS_PCIE_H  #define	_SYS_PCIE_H @@ -140,6 +143,8 @@ extern "C" {  #define	PCIE_DEVCAP_PLMT_SCL_1_BY_1000	0xC000000	/* 0.001x Scale */  #define	PCIE_DEVCAP_PLMT_SCL_MASK	0xC000000	/* Power Limit Scale */ +#define	PCIE_DEVCAP_FLR			0x10000000 /* Function Level Reset */ +  /*   * Device Control Register (2 bytes)   */ @@ -174,6 +179,8 @@ extern "C" {  #define	PCIE_DEVCTL_MAX_READ_REQ_MASK	0x7000	/* Max_Read_Request_Size */  #define	PCIE_DEVCTL_MAX_READ_REQ_SHIFT	0xC +#define	PCIE_DEVCTL_INITIATE_FLR	0x8000 +  /*   * Device Status Register (2 bytes)   */ @@ -187,7 +194,14 @@ extern "C" {  /*   * Link Capability Register (4 bytes)   */ -#define	PCIE_LINKCAP_MAX_SPEED_2_5	0x1	/* 2.5 Gb/s Speed */ +#define	PCIE_LINKCAP_MAX_SPEED_2_5	0x1	/* 2.5 GT/s Speed */ +/* + * In version 2 of PCI express, this indicated that both 5.0 GT/s and 2.5 GT/s + * speeds were supported. The use of this as the maximum link speed was added + * with PCIex v3. + */ +#define	PCIE_LINKCAP_MAX_SPEED_5	0x2	/* 5.0 GT/s Speed */ +#define	PCIE_LINKCAP_MAX_SPEED_8	0x3	/* 8.0 GT/s Speed */  #define	PCIE_LINKCAP_MAX_SPEED_MASK	0xF	/* Maximum Link Speed */  #define	PCIE_LINKCAP_MAX_WIDTH_X1	0x010  #define	PCIE_LINKCAP_MAX_WIDTH_X2	0x020 @@ -251,7 +265,9 @@ extern "C" {  /*   * Link Status Register (2 bytes)   */ -#define	PCIE_LINKSTS_SPEED_2_5		0x1	/* Link Speed */ +#define	PCIE_LINKSTS_SPEED_2_5		0x1	/* 2.5 GT/s Link Speed */ +#define	PCIE_LINKSTS_SPEED_5		0x2	/* 5.0 GT/s Link Speed */ +#define	PCIE_LINKSTS_SPEED_8		0x3	/* 8.0 GT/s Link Speed */  #define	PCIE_LINKSTS_SPEED_MASK		0xF	/* Link Speed */  #define	PCIE_LINKSTS_NEG_WIDTH_X1	0x010 @@ -386,6 +402,7 @@ extern "C" {  /*   * Device Control 2 Register (2 bytes)   */ +#define	PCIE_DEVCTL2_COM_TO_RANGE_MASK	0xf  #define	PCIE_DEVCTL2_COM_TO_RANGE_0	0x0  #define	PCIE_DEVCTL2_COM_TO_RANGE_1	0x1  #define	PCIE_DEVCTL2_COM_TO_RANGE_2	0x2 @@ -405,8 +422,14 @@ extern "C" {  #define	PCIE_DEVCTL2_END_END_TLP_PREFIX	0x8000 - - +/* + * Link Capability 
2 Register (4 bytes) + */ +#define	PCIE_LINKCAP2_SPEED_2_5		0x02 +#define	PCIE_LINKCAP2_SPEED_5		0x04 +#define	PCIE_LINKCAP2_SPEED_8		0x08 +#define	PCIE_LINKCAP2_SPEED_MASK	0xfe +#define	PCIE_LINKCAP2_CROSSLINK		0x100  /*   * PCI-Express Enhanced Capabilities Link Entry Bit Offsets diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index be3a0ff56e..0e37a515eb 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc.   */  #ifndef	_SYS_PCIE_IMPL_H @@ -281,6 +282,28 @@ typedef struct pf_root_fault {  typedef struct pf_data pf_data_t; +typedef enum pcie_link_width { +	PCIE_LINK_WIDTH_UNKNOWN, +	PCIE_LINK_WIDTH_X1, +	PCIE_LINK_WIDTH_X2, +	PCIE_LINK_WIDTH_X4, +	PCIE_LINK_WIDTH_X8, +	PCIE_LINK_WIDTH_X12, +	PCIE_LINK_WIDTH_X16, +	PCIE_LINK_WIDTH_X32 +} pcie_link_width_t; + +/* + * Note, this member should always be treated as a bit field, as a device may + * support multiple speeds. + */ +typedef enum pcie_link_speed { +	PCIE_LINK_SPEED_UNKNOWN = 0x00, +	PCIE_LINK_SPEED_2_5	= 0x01, +	PCIE_LINK_SPEED_5	= 0x02, +	PCIE_LINK_SPEED_8	= 0x04 +} pcie_link_speed_t; +  /*   * For hot plugged device, these data are init'ed during during probe   * For non-hotplugged device, these data are init'ed in pci_autoconfig (on x86), @@ -335,6 +358,15 @@ typedef struct pcie_bus {  	/* workaround for PCI/PCI-X devs behind PCIe2PCI Bridge */  	pcie_req_id_t   bus_pcie2pci_secbus; + +	/* +	 * Link speed specific fields. 
+	 */ +	pcie_link_width_t	bus_max_width; +	pcie_link_width_t	bus_cur_width; +	pcie_link_speed_t	bus_sup_speed; +	pcie_link_speed_t	bus_max_speed; +	pcie_link_speed_t	bus_cur_speed;  } pcie_bus_t;  /* @@ -361,6 +393,7 @@ struct pf_data {  	boolean_t		pe_lock;  	boolean_t		pe_valid;  	uint32_t		pe_severity_flags;	/* Severity of error */ +	uint32_t		pe_severity_mask;  	uint32_t		pe_orig_severity_flags; /* Original severity */  	pf_affected_dev_t	*pe_affected_dev;  	pcie_bus_t		*pe_bus_p; @@ -389,6 +422,7 @@ typedef struct pf_impl {  /* bus_fm_flags field */  #define	PF_FM_READY		(1 << 0)	/* bus_fm_lock initialized */  #define	PF_FM_IS_NH		(1 << 1)	/* known as non-hardened */ +#define	PF_FM_IS_PASSTHRU	(1 << 2)	/* device is controlled by VM */  /*   * PCIe fabric handle lookup address flags.  Used to define what type of @@ -417,11 +451,10 @@ typedef struct pf_impl {  #define	PF_ERR_MATCHED_PARENT	(1 << 5) /* Error Handled By Parent */  #define	PF_ERR_PANIC		(1 << 6) /* Error should panic system */  #define	PF_ERR_PANIC_DEADLOCK	(1 << 7) /* deadlock detected */ -#define	PF_ERR_PANIC_BAD_RESPONSE (1 << 8) /* Device no response */ +#define	PF_ERR_BAD_RESPONSE	(1 << 8) /* Device bad/no response */  #define	PF_ERR_MATCH_DOM	(1 << 9) /* Error Handled By IO domain */ -#define	PF_ERR_FATAL_FLAGS		\ -	(PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK | PF_ERR_PANIC_BAD_RESPONSE) +#define	PF_ERR_FATAL_FLAGS		(PF_ERR_PANIC | PF_ERR_PANIC_DEADLOCK)  #define	PF_HDL_FOUND		1  #define	PF_HDL_NOTFOUND		2 @@ -525,6 +558,7 @@ extern void pcie_enable_errors(dev_info_t *dip);  extern void pcie_disable_errors(dev_info_t *dip);  extern int pcie_enable_ce(dev_info_t *dip);  extern boolean_t pcie_bridge_is_link_disabled(dev_info_t *); +extern boolean_t pcie_is_pci_device(dev_info_t *dip);  extern pcie_bus_t *pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf,      uint8_t flags); @@ -583,6 +617,7 @@ extern void pf_eh_enter(pcie_bus_t *bus_p);  extern void pf_eh_exit(pcie_bus_t *bus_p);  extern int 
pf_scan_fabric(dev_info_t *rpdip, ddi_fm_error_t *derr,      pf_data_t *root_pfd_p); +extern void pf_set_passthru(dev_info_t *, boolean_t);  extern void pf_init(dev_info_t *, ddi_iblock_cookie_t, ddi_attach_cmd_t);  extern void pf_fini(dev_info_t *, ddi_detach_cmd_t);  extern int pf_hdl_lookup(dev_info_t *, uint64_t, uint32_t, uint64_t, diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index de15be4d60..816d6995cf 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *);  int secpolicy_kmdb(const cred_t *);  int secpolicy_lock_memory(const cred_t *);  int secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *);  int secpolicy_modctl(const cred_t *, int);  int secpolicy_net(const cred_t *, int, boolean_t);  int secpolicy_net_bindmlp(const cred_t *); @@ -176,6 +177,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *,      const vattr_t *, cred_t *);  int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t);  int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *);  int secpolicy_basic_exec(const cred_t *, vnode_t *);  int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index 67b47f9a1e..3e0eb3b21f 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -25,7 +25,7 @@   */  /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017 Joyent, Inc.   
*/  #ifndef _SYS_POLL_IMPL_H @@ -140,6 +140,7 @@ struct pollstate {  	pollstate_t	*ps_contend_nextp;	/* next in contender list */  	pollstate_t	**ps_contend_pnextp;	/* pointer-to-previous-next */  	int		ps_flags;	/* state flags */ +	short		ps_implicit_ev;	/* implicit poll event interest */  };  /* pollstate flags */ @@ -225,6 +226,7 @@ struct polldat {  	int		pd_nsets;	/* num of xref sets, used by poll(2) */  	xref_t		*pd_ref;	/* ptr to xref info, 1 for each set */  	port_kevent_t	*pd_portev;	/* associated port event struct */ +	uf_entry_gen_t	pd_gen;		/* fd generation at cache time */  	uint64_t	pd_epolldata;	/* epoll data, if any */  }; @@ -256,6 +258,7 @@ struct pollcache {  /* pc_flag */  #define	PC_POLLWAKE	0x02	/* pollwakeup() occurred */ +#define	PC_EPOLL	0x04	/* pollcache is epoll-enabled */  #if defined(_KERNEL)  /* diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 712bd7cb24..7d2209132d 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -315,6 +315,7 @@ typedef struct	proc {  	size_t p_swrss;			/* resident set size before last swap */  	struct aio	*p_aio;		/* pointer to async I/O struct */  	struct itimer	**p_itimer;	/* interval timers */ +	uint_t		p_itimer_sz;	/* max allocated interval timers */  	timeout_id_t	p_alarmid;	/* alarm's timeout id */  	caddr_t		p_usrstack;	/* top of the process stack */  	uint_t		p_stkprot;	/* stack memory protection */ @@ -358,6 +359,7 @@ typedef struct	proc {  	struct zone	*p_zone;	/* zone in which process lives */  	struct vnode	*p_execdir;	/* directory that p_exec came from */  	struct brand	*p_brand;	/* process's brand  */ +  	void		*p_brand_data;	/* per-process brand state */  	psecflags_t	p_secflags;	/* per-process security flags */ @@ -374,7 +376,6 @@ typedef struct	proc {  	 */  	struct user p_user;		/* (see sys/user.h) */  } proc_t; -  #define	PROC_T				/* headers relying on proc_t are OK */  #ifdef _KERNEL @@ -640,6 +641,7 @@ extern int 
signal_is_blocked(kthread_t *, int);  extern int sigcheck(proc_t *, kthread_t *);  extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid);  extern void pid_setmin(void);  extern pid_t pid_allocate(proc_t *, pid_t, int);  extern int pid_rele(struct pid *); @@ -655,6 +657,7 @@ extern int sprtrylock_proc(proc_t *);  extern void sprwaitlock_proc(proc_t *);  extern void sprlock_proc(proc_t *);  extern void sprunlock(proc_t *); +extern void sprunprlock(proc_t *);  extern void pid_init(void);  extern proc_t *pid_entry(int);  extern int pid_slot(proc_t *); @@ -729,6 +732,10 @@ extern	kthread_t *thread_unpin(void);  extern	void	thread_init(void);  extern	void	thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern	void	thread_splitstack(void (*)(void *), void *, size_t); +extern	void	thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern	void	thread_splitstack_cleanup(void); +  extern	void	tsd_create(uint_t *, void (*)(void *));  extern	void	tsd_destroy(uint_t *);  extern	void	*tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); @@ -770,7 +777,7 @@ extern	void	pokelwps(proc_t *);  extern	void	continuelwps(proc_t *);  extern	int	exitlwps(int);  extern	void	lwp_ctmpl_copy(klwp_t *, klwp_t *); -extern	void	lwp_ctmpl_clear(klwp_t *); +extern	void	lwp_ctmpl_clear(klwp_t *, boolean_t);  extern	klwp_t	*forklwp(klwp_t *, proc_t *, id_t);  extern	void	lwp_load(klwp_t *, gregset_t, uintptr_t);  extern	void	lwp_setrval(klwp_t *, int, int); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index dfb54eaef5..99da92ab79 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -236,6 +236,7 @@ typedef struct pstatus {  #define	PR_FAULTED	6  #define	PR_SUSPENDED	7  #define	PR_CHECKPOINT	8 +#define	PR_BRAND	9  /*   * lwp ps(1) information file.  
/proc/<pid>/lwp/<lwpid>/lwpsinfo diff --git a/usr/src/uts/common/sys/prsystm.h b/usr/src/uts/common/sys/prsystm.h index 7adc920da2..75259dc421 100644 --- a/usr/src/uts/common/sys/prsystm.h +++ b/usr/src/uts/common/sys/prsystm.h @@ -28,7 +28,7 @@  /*	  All Rights Reserved	*/  /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc.   */  #ifndef _SYS_PRSYSTM_H @@ -86,7 +86,7 @@ extern void prgetcred(proc_t *, struct prcred *);  extern void prgetpriv(proc_t *, struct prpriv *);  extern size_t prgetprivsize(void);  extern void prgetsecflags(proc_t *, struct prsecflags *); -extern int  prnsegs(struct as *, int); +extern uint_t prnsegs(struct as *, int);  extern void prexit(proc_t *);  extern void prfree(proc_t *);  extern void prlwpexit(kthread_t *); diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 55987fe6d7..8b97fd7e3b 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t);  #define	DDBGP(a, b)  #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { +	boolean_t		(*ppocb_func)(ptmptsopencb_arg_t); +	ptmptsopencb_arg_t	ppocb_arg; +} ptmptsopencb_t; +  #endif /* _KERNEL */  typedef struct pt_own { @@ -157,6 +163,19 @@ typedef struct pt_own {  #define	ZONEPT		(('P'<<8)|4)	/* set zone of master/slave pair */  #define	OWNERPT		(('P'<<8)|5)	/* set owner/group for slave device */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + *	      The return value of the callback function when it's invoked + *	      with the opaque argument passed to it will indicate if the + *	      pts slave device is currently open. 
+ */ +#define	PTMPTSOPENCB	(('P'<<8)|6)	/* check if the slave is open */ + +#endif /* _KERNEL */ +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h index 2069e6d3f1..b7427a454d 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -10,11 +10,11 @@   */  /* - * Copyright 2014 Joyent, Inc.  All rights reserved. + * Copyright 2015 Joyent, Inc.   */ -#ifndef	_SYS_SCSI_ADAPTERS_MPTHASH_H -#define	_SYS_SCSI_ADAPTERS_MPTHASH_H +#ifndef	_SYS_REFHASH_H +#define	_SYS_REFHASH_H  #include <sys/types.h>  #include <sys/list.h> @@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *);  extern void *refhash_next(refhash_t *, void *);  extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); -#endif	/* _SYS_SCSI_ADAPTERS_MPTHASH_H */ +#endif	/* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 13166f378d..d65ca00f69 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@   *   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.  All rights reserved.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h index d4233aecb5..2ed7320a09 100644 --- a/usr/src/uts/common/sys/rt.h +++ b/usr/src/uts/common/sys/rt.h @@ -22,6 +22,7 @@  /*   * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.   
*/  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -75,6 +76,16 @@ typedef struct	rtkparms {  	int	rt_tqsig;	/* real-time time quantum signal */  	uint_t	rt_cflags;	/* real-time control flags */  } rtkparms_t; + +#define	RTGPPRIO0	100	/* Global priority for RT priority 0 */ + +/* + * control flags (kparms->rt_cflags). + */ +#define	RT_DOPRI	0x01	/* change priority */ +#define	RT_DOTQ		0x02	/* change RT time quantum */ +#define	RT_DOSIG	0x04	/* change RT time quantum signal */ +  #endif	/* _KERNEL */  #ifdef	__cplusplus diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h new file mode 100644 index 0000000000..afb7a94c58 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2012-2015 LSI Corp. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + *    may be used to endorse or promote products derived from this software + *    without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + *  Copyright (c) 2000-2015 LSI Corporation. + *  Copyright (c) 2013-2016 Avago Technologies + *  All rights reserved. + * + * + *           Name:  mpi2_pci.h + *          Title:  MPI PCIe Attached Devices structures and definitions. + *  Creation Date:  October 9, 2012 + * + *  mpi2_pci.h Version:  02.00.02 + * + *  NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25 + *        prefix are for use only on MPI v2.5 products, and must not be used + *        with MPI v2.0 products. Unless otherwise noted, names beginning with + *        MPI2 or Mpi2 are for use with both MPI v2.0 and MPI v2.5 products. + * + *  Version History + *  --------------- + * + *  Date      Version   Description + *  --------  --------  ------------------------------------------------------ + *  03-16-15  02.00.00  Initial version. + *  02-17-16  02.00.01  Removed AHCI support. + *                      Removed SOP support. + *  07-01-16  02.00.02  Added MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP to + *                      NVME Encapsulated Request. + *  -------------------------------------------------------------------------- + */ + +#ifndef MPI2_PCI_H +#define MPI2_PCI_H + + +/* + * Values for the PCIe DeviceInfo field used in PCIe Device Status Change Event + * data and PCIe Configuration pages. 
+ */ +#define MPI26_PCIE_DEVINFO_DIRECT_ATTACH        (0x00000010) + +#define MPI26_PCIE_DEVINFO_MASK_DEVICE_TYPE     (0x0000000F) +#define MPI26_PCIE_DEVINFO_NO_DEVICE            (0x00000000) +#define MPI26_PCIE_DEVINFO_PCI_SWITCH           (0x00000001) +#define MPI26_PCIE_DEVINFO_NVME                 (0x00000003) + + +/**************************************************************************** +*  NVMe Encapsulated message +****************************************************************************/ + +/* NVME Encapsulated Request Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_REQUEST +{ +    U16                     DevHandle;                      /* 0x00 */ +    U8                      ChainOffset;                    /* 0x02 */ +    U8                      Function;                       /* 0x03 */ +    U16                     EncapsulatedCommandLength;      /* 0x04 */ +    U8                      Reserved1;                      /* 0x06 */ +    U8                      MsgFlags;                       /* 0x07 */ +    U8                      VP_ID;                          /* 0x08 */ +    U8                      VF_ID;                          /* 0x09 */ +    U16                     Reserved2;                      /* 0x0A */ +    U32                     Reserved3;                      /* 0x0C */ +    U64                     ErrorResponseBaseAddress;       /* 0x10 */ +    U16                     ErrorResponseAllocationLength;  /* 0x18 */ +    U16                     Flags;                          /* 0x1A */ +    U32                     DataLength;                     /* 0x1C */ +    U8                      NVMe_Command[4];                /* 0x20 */ /* variable length */ + +} MPI26_NVME_ENCAPSULATED_REQUEST, MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_REQUEST, +  Mpi26NVMeEncapsulatedRequest_t, MPI2_POINTER pMpi26NVMeEncapsulatedRequest_t; + +/* defines for the Flags field */ +#define MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP       (0x0020) +/* Submission Queue 
Type*/ +#define MPI26_NVME_FLAGS_SUBMISSIONQ_MASK           (0x0010) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_IO             (0x0000) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_ADMIN          (0x0010) +/* Error Response Address Space */ +#define MPI26_NVME_FLAGS_MASK_ERROR_RSP_ADDR        (0x000C) +#define MPI26_NVME_FLAGS_SYSTEM_RSP_ADDR            (0x0000) +#define MPI26_NVME_FLAGS_IOCPLB_RSP_ADDR            (0x0008) +#define MPI26_NVME_FLAGS_IOCPLBNTA_RSP_ADDR         (0x000C) +/* Data Direction*/ +#define MPI26_NVME_FLAGS_DATADIRECTION_MASK         (0x0003) +#define MPI26_NVME_FLAGS_NODATATRANSFER             (0x0000) +#define MPI26_NVME_FLAGS_WRITE                      (0x0001) +#define MPI26_NVME_FLAGS_READ                       (0x0002) +#define MPI26_NVME_FLAGS_BIDIRECTIONAL              (0x0003) + + +/* NVMe Encapsulated Reply Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_ERROR_REPLY +{ +    U16                     DevHandle;                      /* 0x00 */ +    U8                      MsgLength;                      /* 0x02 */ +    U8                      Function;                       /* 0x03 */ +    U16                     EncapsulatedCommandLength;      /* 0x04 */ +    U8                      Reserved1;                      /* 0x06 */ +    U8                      MsgFlags;                       /* 0x07 */ +    U8                      VP_ID;                          /* 0x08 */ +    U8                      VF_ID;                          /* 0x09 */ +    U16                     Reserved2;                      /* 0x0A */ +    U16                     Reserved3;                      /* 0x0C */ +    U16                     IOCStatus;                      /* 0x0E */ +    U32                     IOCLogInfo;                     /* 0x10 */ +    U16                     ErrorResponseCount;             /* 0x14 */ +    U16                     Reserved4;                      /* 0x16 */ +} MPI26_NVME_ENCAPSULATED_ERROR_REPLY, +  MPI2_POINTER 
PTR_MPI26_NVME_ENCAPSULATED_ERROR_REPLY, +  Mpi26NVMeEncapsulatedErrorReply_t, +  MPI2_POINTER pMpi26NVMeEncapsulatedErrorReply_t; + + +#endif + + diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h index 0050c8c00f..be8bf675b8 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h @@ -58,10 +58,10 @@  #include <sys/byteorder.h>  #include <sys/queue.h> +#include <sys/refhash.h>  #include <sys/isa_defs.h>  #include <sys/sunmdi.h>  #include <sys/mdi_impldefs.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>  #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>  #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h>  #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h> diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h new file mode 100644 index 0000000000..5aba743834 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h @@ -0,0 +1,750 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#ifndef	_SMRT_H +#define	_SMRT_H + +#include <sys/types.h> +#include <sys/pci.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/conf.h> +#include <sys/map.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/scsi/scsi.h> +#include <sys/scsi/impl/spc3_types.h> +#include <sys/devops.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sdt.h> +#include <sys/policy.h> +#include <sys/ctype.h> + +#if !defined(_LITTLE_ENDIAN) || !defined(_BIT_FIELDS_LTOH) +/* + * This driver contains a number of multi-byte bit fields and other structs + * that are only correct on a system with the same ordering as x86. + */ +#error "smrt: driver works only on little endian systems" +#endif + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * Some structures are statically sized based on the expected number of logical + * drives and controllers in the system.  These definitions are used throughout + * other driver-specific header files, and must appear prior to their + * inclusion. 
+ */ +#define	SMRT_MAX_LOGDRV		64	/* Maximum number of logical drives */ +#define	SMRT_MAX_PHYSDEV	128	/* Maximum number of physical devices */ + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> +#include <sys/scsi/adapters/smrt/smrt_scsi.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +extern ddi_device_acc_attr_t smrt_dev_attributes; + +typedef enum smrt_init_level { +	SMRT_INITLEVEL_BASIC =			(0x1 << 0), +	SMRT_INITLEVEL_I2O_MAPPED =		(0x1 << 1), +	SMRT_INITLEVEL_CFGTBL_MAPPED =		(0x1 << 2), +	SMRT_INITLEVEL_PERIODIC =		(0x1 << 3), +	SMRT_INITLEVEL_INT_ALLOC =		(0x1 << 4), +	SMRT_INITLEVEL_INT_ADDED =		(0x1 << 5), +	SMRT_INITLEVEL_INT_ENABLED =		(0x1 << 6), +	SMRT_INITLEVEL_SCSA =			(0x1 << 7), +	SMRT_INITLEVEL_MUTEX =			(0x1 << 8), +	SMRT_INITLEVEL_TASKQ =			(0x1 << 9), +	SMRT_INITLEVEL_ASYNC_EVENT =		(0x1 << 10), +} smrt_init_level_t; + +/* + * Commands issued to the controller carry a (generally 32-bit, though with + * two reserved signalling bits) identifying tag number.  In order to avoid + * having the controller confuse us by double-reporting the completion of a + * particular tag, we try to reuse them as infrequently as possible.  In + * practice, this means looping through a range of values.  The minimum and + * maximum value are defined below.  A single command tag value is set aside + * for polled commands sent prior to full initialisation of the driver. + */ +#define	SMRT_PRE_TAG_NUMBER			0x00000bad +#define	SMRT_MIN_TAG_NUMBER			0x00001000 +#define	SMRT_MAX_TAG_NUMBER			0x0fffffff + +/* + * Character strings that represent the names of the iports used for both + * physical and virtual volumes. + */ +#define	SMRT_IPORT_PHYS				"p0" +#define	SMRT_IPORT_VIRT				"v0" + +/* + * Definitions to support waiting for the controller to converge on a + * particular state: ready or not ready.  These are used with + * smrt_ctlr_wait_for_state(). 
+ */ +#define	SMRT_WAIT_DELAY_SECONDS			120 +typedef enum smrt_wait_state { +	SMRT_WAIT_STATE_READY = 1, +	SMRT_WAIT_STATE_UNREADY +} smrt_wait_state_t; + +typedef enum smrt_ctlr_mode { +	SMRT_CTLR_MODE_UNKNOWN = 0, +	SMRT_CTLR_MODE_SIMPLE +} smrt_ctlr_mode_t; + +/* + * In addition to Logical Volumes, we also expose the controller at a + * pseudo target address on the SCSI bus we are essentially pretending to be. + */ +#define	SMRT_CONTROLLER_TARGET			128 + +/* + * When waiting for volume discovery to complete, we wait for a maximum + * duration (in seconds) before giving up. + */ +#define	SMRT_DISCOVER_TIMEOUT			30 + +/* + * The maintenance routine which checks for controller lockup and aborts + * commands that have passed their timeout runs periodically.  The time is + * expressed in seconds. + */ +#define	SMRT_PERIODIC_RATE			5 + +/* + * At times, we need to check if the controller is still responding.  To do + * that, we send a Nop message to the controller and make sure it completes + * successfully.  So that we don't wait forever, we set a timeout (in seconds). + */ +#define	SMRT_PING_CHECK_TIMEOUT			60 + +/* + * When detaching the device, we may need to have an asynchronous event + * cancellation be issued.  While this should be relatively smooth, we don't + * want to wait forever for it.  As such we set a timeout in seconds. + */ +#define	SMRT_ASYNC_CANCEL_TIMEOUT		60 + +/* + * HP PCI vendor ID and Generation 9 device ID. Used to identify generations of + * supported controllers. + */ +#define	SMRT_VENDOR_HP				0x103c +#define	SMRT_DEVICE_GEN9			0x3238 + +typedef enum smrt_controller_status { +	/* +	 * An attempt is being made to detach the controller instance. +	 */ +	SMRT_CTLR_STATUS_DETACHING =		(0x1 << 0), + +	/* +	 * The controller is believed to be functioning correctly.  The driver +	 * is to allow command submission, process interrupts, and perform +	 * periodic background maintenance. 
+	 */ +	SMRT_CTLR_STATUS_RUNNING =		(0x1 << 1), + +	/* +	 * The controller is currently being reset. +	 */ +	SMRT_CTLR_STATUS_RESETTING =		(0x1 << 2), + +	/* +	 * Our async event notification command is currently in need of help +	 * from the broader driver.  This will be set by smrt_event_complete() +	 * to indicate that the command is not being processed due to a +	 * controller reset or because another fatal error occurred.  The +	 * periodic will have to pick up and recover this for us.  It is only +	 * safe for the driver to manipulate the event command outside of +	 * smrt_event_complete() if this flag is set. +	 */ +	SMRT_CTLR_ASYNC_INTERVENTION =		(0x1 << 3), + +	/* +	 * See the theory statement on discovery and resets in smrt_ciss.c for +	 * an explanation of these values. +	 */ +	SMRT_CTLR_DISCOVERY_REQUESTED =		(0x1 << 4), +	SMRT_CTLR_DISCOVERY_RUNNING =		(0x1 << 5), +	SMRT_CTLR_DISCOVERY_PERIODIC =		(0x1 << 6), +	SMRT_CTLR_DISCOVERY_REQUIRED =		(0x1 << 7), +} smrt_controller_status_t; + +#define	SMRT_CTLR_DISCOVERY_MASK	(SMRT_CTLR_DISCOVERY_REQUESTED | \ +    SMRT_CTLR_DISCOVERY_RUNNING | SMRT_CTLR_DISCOVERY_PERIODIC) + +typedef struct smrt_stats { +	uint64_t smrts_tran_aborts; +	uint64_t smrts_tran_resets; +	uint64_t smrts_tran_starts; +	uint64_t smrts_ctlr_resets; +	unsigned smrts_max_inflight; +	uint64_t smrts_unclaimed_interrupts; +	uint64_t smrts_claimed_interrupts; +	uint64_t smrts_ignored_scsi_cmds; +	uint64_t smrts_events_received; +	uint64_t smrts_events_errors; +	uint64_t smrts_events_intervened; +	uint64_t smrts_discovery_tq_errors; +} smrt_stats_t; + +typedef struct smrt_versions { +	uint8_t smrtv_hardware_version; + +	/* +	 * These strings must be large enough to hold the 4 byte version string +	 * retrieved from an IDENTIFY CONTROLLER response, as well as the +	 * terminating NUL byte: +	 */ +	char smrtv_firmware_rev[5]; +	char smrtv_recovery_rev[5]; +	char smrtv_bootblock_rev[5]; +} smrt_versions_t; + +typedef struct smrt smrt_t; +typedef 
struct smrt_command smrt_command_t; +typedef struct smrt_command_internal smrt_command_internal_t; +typedef struct smrt_command_scsa smrt_command_scsa_t; +typedef struct smrt_pkt smrt_pkt_t; + +/* + * Per-Controller Structure + */ +struct smrt { +	dev_info_t *smrt_dip; +	int smrt_instance; +	smrt_controller_status_t smrt_status; +	smrt_stats_t smrt_stats; + +	/* +	 * Controller configuration discovered during initialisation. +	 */ +	uint32_t smrt_host_support; +	uint32_t smrt_bus_support; +	uint32_t smrt_maxcmds; +	uint32_t smrt_sg_cnt; +	smrt_versions_t smrt_versions; +	uint16_t smrt_pci_vendor; +	uint16_t smrt_pci_device; + +	/* +	 * iport specific data +	 */ +	dev_info_t *smrt_virt_iport; +	dev_info_t *smrt_phys_iport; +	scsi_hba_tgtmap_t *smrt_virt_tgtmap; +	scsi_hba_tgtmap_t *smrt_phys_tgtmap; + +	/* +	 * The transport mode of the controller. +	 */ +	smrt_ctlr_mode_t smrt_ctlr_mode; + +	/* +	 * The current initialisation level of the driver.  Bits in this field +	 * are set during initialisation and unset during cleanup of the +	 * allocated resources. +	 */ +	smrt_init_level_t smrt_init_level; + +	/* +	 * Essentially everything is protected by "smrt_mutex".  When the +	 * completion queue is updated, threads sleeping on "smrt_cv_finishq" +	 * are awoken. +	 */ +	kmutex_t smrt_mutex; +	kcondvar_t smrt_cv_finishq; + +	/* +	 * List of enumerated logical volumes (smrt_volume_t). +	 */ +	list_t smrt_volumes; + +	/* +	 * List of enumerated physical devices (smrt_physical_t). +	 */ +	list_t smrt_physicals; + +	/* +	 * List of attached SCSA target drivers (smrt_target_t). +	 */ +	list_t smrt_targets; + +	/* +	 * Controller Heartbeat Tracking +	 */ +	uint32_t smrt_last_heartbeat; +	hrtime_t smrt_last_heartbeat_time; + +	hrtime_t smrt_last_interrupt_claimed; +	hrtime_t smrt_last_interrupt_unclaimed; +	hrtime_t smrt_last_reset_start; +	hrtime_t smrt_last_reset_finish; + +	/* +	 * Command object tracking.  
These lists, and all commands within the +	 * lists, are protected by "smrt_mutex". +	 */ +	uint32_t smrt_next_tag; +	avl_tree_t smrt_inflight; +	list_t smrt_commands;		/* List of all commands. */ +	list_t smrt_finishq;		/* List of completed commands. */ +	list_t smrt_abortq;		/* List of commands to abort. */ + +	/* +	 * Discovery coordination +	 */ +	ddi_taskq_t *smrt_discover_taskq; +	hrtime_t smrt_last_phys_discovery; +	hrtime_t smrt_last_log_discovery; +	uint64_t smrt_discover_gen; + +	/* +	 * Controller interrupt handler registration. +	 */ +	int smrt_interrupt_type; +	int smrt_interrupt_cap; +	uint_t smrt_interrupt_pri; +	ddi_intr_handle_t smrt_interrupts[1]; +	int smrt_ninterrupts; + +	ddi_periodic_t smrt_periodic; + +	scsi_hba_tran_t *smrt_hba_tran; + +	ddi_dma_attr_t smrt_dma_attr; + +	/* +	 * Access to the I2O Registers: +	 */ +	unsigned smrt_i2o_bar; +	caddr_t smrt_i2o_space; +	ddi_acc_handle_t smrt_i2o_handle; + +	/* +	 * Access to the Configuration Table: +	 */ +	unsigned smrt_ct_bar; +	uint32_t smrt_ct_baseaddr; +	CfgTable_t *smrt_ct; +	ddi_acc_handle_t smrt_ct_handle; + +	/* +	 * Asynchronous Event State +	 */ +	uint32_t smrt_event_count; +	smrt_command_t *smrt_event_cmd; +	smrt_command_t *smrt_event_cancel_cmd; +	kcondvar_t smrt_event_queue; +}; + +/* + * Logical Volume Structure + */ +typedef enum smrt_volume_flags { +	SMRT_VOL_FLAG_WWN =			(0x1 << 0), +} smrt_volume_flags_t; + +typedef struct smrt_volume { +	LUNAddr_t smlv_addr; +	smrt_volume_flags_t smlv_flags; + +	uint8_t smlv_wwn[16]; +	uint64_t smlv_gen; + +	smrt_t *smlv_ctlr; +	list_node_t smlv_link; + +	/* +	 * List of SCSA targets currently attached to this Logical Volume: +	 */ +	list_t smlv_targets; +} smrt_volume_t; + +typedef struct smrt_physical { +	LUNAddr_t smpt_addr; +	uint64_t smpt_wwn; +	uint8_t smpt_dtype; +	uint16_t smpt_bmic; +	uint64_t smpt_gen; +	boolean_t smpt_supported; +	boolean_t smpt_visible; +	boolean_t smpt_unsup_warn; +	list_node_t smpt_link; +	list_t smpt_targets; +	
smrt_t *smpt_ctlr; +	smrt_identify_physical_drive_t *smpt_info; +} smrt_physical_t; + +/* + * Per-Target Structure + */ +typedef struct smrt_target { +	struct scsi_device *smtg_scsi_dev; + +	boolean_t smtg_physical; + +	/* +	 * This is only used when performing discovery during panic, as we need +	 * a mechanism to determine if the set of drives has shifted. +	 */ +	boolean_t smtg_gone; + +	/* +	 * Linkage back to the device that this target represents. This may be +	 * either a smrt_volume_t or a smrt_physical_t. We keep a pointer to the +	 * address, as that's the one thing we generally care about. +	 */ +	union { +		smrt_physical_t *smtg_phys; +		smrt_volume_t *smtg_vol; +	} smtg_lun; +	list_node_t smtg_link_lun; +	LUNAddr_t *smtg_addr; + +	/* +	 * Linkage back to the controller: +	 */ +	smrt_t *smtg_ctlr; +	list_node_t smtg_link_ctlr; +} smrt_target_t; + +/* + * DMA Resource Tracking Structure + */ +typedef enum smrt_dma_level { +	SMRT_DMALEVEL_HANDLE_ALLOC =		(0x1 << 0), +	SMRT_DMALEVEL_MEMORY_ALLOC =		(0x1 << 1), +	SMRT_DMALEVEL_HANDLE_BOUND =		(0x1 << 2), +} smrt_dma_level_t; + +typedef struct smrt_dma { +	smrt_dma_level_t smdma_level; +	size_t smdma_real_size; +	ddi_dma_handle_t smdma_dma_handle; +	ddi_acc_handle_t smdma_acc_handle; +	ddi_dma_cookie_t smdma_dma_cookies[1]; +	uint_t smdma_dma_ncookies; +} smrt_dma_t; + + +typedef enum smrt_command_status { +	/* +	 * When a command is submitted to the controller, it is marked USED +	 * to avoid accidental reuse of the command without reinitialising +	 * critical fields.  The submitted command is also marked INFLIGHT +	 * to reflect its inclusion in the "smrt_inflight" AVL tree.  When +	 * the command is completed by the controller, INFLIGHT is unset. +	 */ +	SMRT_CMD_STATUS_USED =			(0x1 << 0), +	SMRT_CMD_STATUS_INFLIGHT =		(0x1 << 1), + +	/* +	 * This flag is set during abort queue processing to record that this +	 * command was aborted in response to an expired timeout, and not some +	 * other cancellation. 
 If the controller is able to abort the command, +	 * we use this flag to let the SCSI framework know that the command +	 * timed out. +	 */ +	SMRT_CMD_STATUS_TIMEOUT =		(0x1 << 2), + +	/* +	 * The controller set the error bit when completing this command. +	 * Details of the particular fault may be read from the error +	 * information written by the controller. +	 */ +	SMRT_CMD_STATUS_ERROR =			(0x1 << 3), + +	/* +	 * This command has been abandoned by the original submitter.  This +	 * could happen if the command did not complete in a timely fashion. +	 * When it reaches the finish queue it will be freed without further +	 * processing. +	 */ +	SMRT_CMD_STATUS_ABANDONED =		(0x1 << 4), + +	/* +	 * This command has made it through the completion queue and had final +	 * processing performed. +	 */ +	SMRT_CMD_STATUS_COMPLETE =		(0x1 << 5), + +	/* +	 * A polled message will be ignored by the regular processing of the +	 * completion queue.  The blocking function doing the polling is +	 * responsible for watching the command on which it has set the POLLED +	 * flag.  Regular completion queue processing (which might happen in +	 * the polling function, or it might happen in the interrupt handler) +	 * will set POLL_COMPLETE once it is out of the finish queue +	 * altogether. +	 */ +	SMRT_CMD_STATUS_POLLED =		(0x1 << 6), +	SMRT_CMD_STATUS_POLL_COMPLETE =		(0x1 << 7), + +	/* +	 * An abort message has been sent to the controller in an attempt to +	 * cancel this command. +	 */ +	SMRT_CMD_STATUS_ABORT_SENT =		(0x1 << 8), + +	/* +	 * This command has been passed to our tran_start(9E) handler. +	 */ +	SMRT_CMD_STATUS_TRAN_START =		(0x1 << 9), + +	/* +	 * This command was for a SCSI command that we are explicitly avoiding +	 * sending to the controller. +	 */ +	SMRT_CMD_STATUS_TRAN_IGNORED =		(0x1 << 10), + +	/* +	 * This command has been submitted once, and subsequently passed to +	 * smrt_command_reuse(). 
+	 */ +	SMRT_CMD_STATUS_REUSED =		(0x1 << 11), + +	/* +	 * A controller reset has been issued, so a response for this command +	 * is not expected.  If one arrives before the controller reset has +	 * taken effect, it likely cannot be trusted. +	 */ +	SMRT_CMD_STATUS_RESET_SENT =		(0x1 << 12), + +	/* +	 * Certain commands related to discovery and pinging need to be run +	 * during the context after a reset has occurred, but before the +	 * controller is considered running.  Such commands can use this flag to bypass +	 * the normal smrt_submit() check. +	 */ +	SMRT_CMD_IGNORE_RUNNING =		(0x1 << 13), +} smrt_command_status_t; + +typedef enum smrt_command_type { +	SMRT_CMDTYPE_INTERNAL = 1, +	SMRT_CMDTYPE_EVENT, +	SMRT_CMDTYPE_ABORTQ, +	SMRT_CMDTYPE_SCSA, +	SMRT_CMDTYPE_PREINIT, +} smrt_command_type_t; + +struct smrt_command { +	uint32_t smcm_tag; +	smrt_command_type_t smcm_type; +	smrt_command_status_t smcm_status; + +	smrt_t *smcm_ctlr; +	smrt_target_t *smcm_target; + +	list_node_t smcm_link;		/* Linkage for allocated list. */ +	list_node_t smcm_link_finish;	/* Linkage for completion list. */ +	list_node_t smcm_link_abort;	/* Linkage for abort list. */ +	avl_node_t smcm_node;		/* Inflight AVL membership. */ + +	hrtime_t smcm_time_submit; +	hrtime_t smcm_time_complete; + +	hrtime_t smcm_expiry; + +	/* +	 * The time at which an abort message was sent to try and terminate +	 * this command, as well as the tag of the abort message itself: +	 */ +	hrtime_t smcm_abort_time; +	uint32_t smcm_abort_tag; + +	/* +	 * Ancillary data objects.  Only one of these will be allocated for any +	 * given command, but we nonetheless resist the temptation to use a +	 * union of pointers in order to make incorrect usage obvious. +	 */ +	smrt_command_scsa_t *smcm_scsa; +	smrt_command_internal_t *smcm_internal; + +	/* +	 * Physical allocation tracking for the actual command to send to the +	 * controller. 
+	 */ +	smrt_dma_t smcm_contig; + +	CommandList_t *smcm_va_cmd; +	uint32_t smcm_pa_cmd; + +	ErrorInfo_t *smcm_va_err; +	uint32_t smcm_pa_err; +}; + +/* + * Commands issued internally to the driver (as opposed to by the HBA + * framework) generally require a buffer in which to assemble the command body, + * and for receiving the response from the controller.  The following object + * tracks this (optional) extra buffer. + */ +struct smrt_command_internal { +	smrt_dma_t smcmi_contig; + +	void *smcmi_va; +	uint32_t smcmi_pa; +	size_t smcmi_len; +}; + +/* + * Commands issued via the SCSI framework have a number of additional + * properties. + */ +struct smrt_command_scsa { +	struct scsi_pkt *smcms_pkt; +	smrt_command_t *smcms_command; +}; + + +/* + * CISS transport routines. + */ +void smrt_periodic(void *); +void smrt_lockup_check(smrt_t *); +int smrt_submit(smrt_t *, smrt_command_t *); +void smrt_submit_simple(smrt_t *, smrt_command_t *); +int smrt_retrieve(smrt_t *); +void smrt_retrieve_simple(smrt_t *); +int smrt_poll_for(smrt_t *, smrt_command_t *); +int smrt_preinit_command_simple(smrt_t *, smrt_command_t *); + +/* + * Interrupt service routines. + */ +int smrt_interrupts_setup(smrt_t *); +int smrt_interrupts_enable(smrt_t *); +void smrt_interrupts_teardown(smrt_t *); +uint32_t smrt_isr_hw_simple(caddr_t, caddr_t); + +/* + * Interrupt enable/disable routines. + */ +void smrt_intr_set(smrt_t *, boolean_t); + +/* + * Controller initialisation routines. 
+ */ +int smrt_ctlr_init(smrt_t *); +void smrt_ctlr_teardown(smrt_t *); +int smrt_ctlr_reset(smrt_t *); +int smrt_ctlr_wait_for_state(smrt_t *, smrt_wait_state_t); +int smrt_ctlr_init_simple(smrt_t *); +void smrt_ctlr_teardown_simple(smrt_t *); +int smrt_cfgtbl_flush(smrt_t *); +int smrt_cfgtbl_transport_has_support(smrt_t *, int); +void smrt_cfgtbl_transport_set(smrt_t *, int); +int smrt_cfgtbl_transport_confirm(smrt_t *, int); +uint32_t smrt_ctlr_get_cmdsoutmax(smrt_t *); +uint32_t smrt_ctlr_get_maxsgelements(smrt_t *); + +/* + * Device enumeration and lookup routines. + */ +void smrt_discover_request(smrt_t *); + +int smrt_logvol_discover(smrt_t *, uint16_t, uint64_t); +void smrt_logvol_teardown(smrt_t *); +smrt_volume_t *smrt_logvol_lookup_by_id(smrt_t *, unsigned long); +void smrt_logvol_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, +    void **); +boolean_t smrt_logvol_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, +    void *, scsi_tgtmap_deact_rsn_t); + +int smrt_phys_discover(smrt_t *, uint16_t, uint64_t); +smrt_physical_t *smrt_phys_lookup_by_ua(smrt_t *, const char *); +void smrt_phys_teardown(smrt_t *); +void smrt_phys_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, +    void **); +boolean_t smrt_phys_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, +    void *, scsi_tgtmap_deact_rsn_t); + +/* + * SCSI framework routines. + */ +int smrt_ctrl_hba_setup(smrt_t *); +void smrt_ctrl_hba_teardown(smrt_t *); + +int smrt_logvol_hba_setup(smrt_t *, dev_info_t *); +void smrt_logvol_hba_teardown(smrt_t *, dev_info_t *); +int smrt_phys_hba_setup(smrt_t *, dev_info_t *); +void smrt_phys_hba_teardown(smrt_t *, dev_info_t *); + +void smrt_hba_complete(smrt_command_t *); + +void smrt_process_finishq(smrt_t *); +void smrt_process_abortq(smrt_t *); + +/* + * Command block management. 
+ */ +smrt_command_t *smrt_command_alloc(smrt_t *, smrt_command_type_t, +    int); +smrt_command_t *smrt_command_alloc_preinit(smrt_t *, size_t, int); +int smrt_command_attach_internal(smrt_t *, smrt_command_t *, size_t, +    int); +void smrt_command_free(smrt_command_t *); +smrt_command_t *smrt_lookup_inflight(smrt_t *, uint32_t); +void smrt_command_reuse(smrt_command_t *); + +/* + * Device message construction routines. + */ +void smrt_write_lun_addr_phys(LUNAddr_t *, boolean_t, unsigned, unsigned); +void smrt_write_controller_lun_addr(LUNAddr_t *); +uint16_t smrt_lun_addr_to_bmic(PhysDevAddr_t *); +void smrt_write_message_abort_one(smrt_command_t *, uint32_t); +void smrt_write_message_abort_all(smrt_command_t *, LUNAddr_t *); +void smrt_write_message_nop(smrt_command_t *, int); +void smrt_write_message_event_notify(smrt_command_t *); + +/* + * Device management routines. + */ +int smrt_device_setup(smrt_t *); +void smrt_device_teardown(smrt_t *); +uint32_t smrt_get32(smrt_t *, offset_t); +void smrt_put32(smrt_t *, offset_t, uint32_t); + +/* + * SATA related routines. + */ +int smrt_sata_determine_wwn(smrt_t *, PhysDevAddr_t *, uint64_t *, uint16_t); + +/* + * Asynchronous Event Notification + */ +int smrt_event_init(smrt_t *); +void smrt_event_fini(smrt_t *); +void smrt_event_complete(smrt_command_t *); + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SMRT_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h new file mode 100644 index 0000000000..e1f1db68b3 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h @@ -0,0 +1,345 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017, Joyent, Inc. + */ + +#ifndef	_SMRT_CISS_H +#define	_SMRT_CISS_H + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * Maximum number of Scatter/Gather List entries.  These entries are statically + * allocated for all commands. + */ +#define	CISS_MAXSGENTRIES			64 + +/* + * If the controller advertises a value of 0 for the maximum S/G list length it + * supports, the specification states that we should assume a value of 31. + */ +#define	CISS_SGCNT_FALLBACK			31 + +/* + * The CDB field in the request block is fixed at 16 bytes in length.  (See + * "3.2. Request Block" in the CISS specification.) + */ +#define	CISS_CDBLEN				16 + +/* + * Command Status Values.  These are listed in "Table 2 Command Status" in "3.3 + * Error Info" of the CISS specification. + */ +#define	CISS_CMD_SUCCESS			0x00 +#define	CISS_CMD_TARGET_STATUS			0x01 +#define	CISS_CMD_DATA_UNDERRUN			0x02 +#define	CISS_CMD_DATA_OVERRUN			0x03 +#define	CISS_CMD_INVALID			0x04 +#define	CISS_CMD_PROTOCOL_ERR			0x05 +#define	CISS_CMD_HARDWARE_ERR			0x06 +#define	CISS_CMD_CONNECTION_LOST		0x07 +#define	CISS_CMD_ABORTED			0x08 +#define	CISS_CMD_ABORT_FAILED			0x09 +#define	CISS_CMD_UNSOLICITED_ABORT		0x0a +#define	CISS_CMD_TIMEOUT			0x0b +#define	CISS_CMD_UNABORTABLE			0x0c + +/* + * Request Transfer Directions, used in "RequestBlock.Type.Direction": + */ +#define	CISS_XFER_NONE				0x00 +#define	CISS_XFER_WRITE				0x01 +#define	CISS_XFER_READ				0x02 +#define	CISS_XFER_RSVD				0x03 + +/* + * Request Attributes, used in "RequestBlock.Type.Attribute": + */ +#define	CISS_ATTR_UNTAGGED			0x00 +#define	CISS_ATTR_SIMPLE			0x04 +#define	CISS_ATTR_HEADOFQUEUE			0x05 +#define	CISS_ATTR_ORDERED			0x06 + +/* + * Request Type, used in "RequestBlock.Type.Type": + */ +#define	CISS_TYPE_CMD				0x00 +#define	CISS_TYPE_MSG				0x01 
+ +/* + * I2O Space Register Offsets + * + * The name "I2O", and these register offsets, appear to be amongst the last + * vestiges of a long-defunct attempt at standardising mainframe-style I/O + * channels in the Intel server space: the Intelligent Input/Output (I2O) + * Architecture Specification. + * + * The draft of version 1.5 of this specification, in section "4.2.1.5.1 + * Extensions for PCI", suggests that the following are memory offsets into + * "the memory region specified by the first base address configuration + * register indicating memory space (offset 10h, 14h, and so forth)".  These + * match up with the offsets of the first two BARs in a PCI configuration space + * type 0 header. + * + * The specification also calls out the Inbound Post List FIFO, write-only at + * offset 40h; the Outbound Post List FIFO, read-only at offset 44h; the + * Interrupt Status Register, at offset 30h; and the Interrupt Mask Register, + * at offset 34h. + * + * This ill-fated attempt to increase the proprietary complexity of (and + * presumably, thus, the gross margin on) computer systems is all but extinct. + * The transport layer of this storage controller is all that's left of their + * religion. + */ +#define	CISS_I2O_INBOUND_DOORBELL		0x20 +#define	CISS_I2O_INTERRUPT_STATUS		0x30 +#define	CISS_I2O_INTERRUPT_MASK			0x34 +#define	CISS_I2O_INBOUND_POST_Q			0x40 +#define	CISS_I2O_OUTBOUND_POST_Q		0x44 +#define	CISS_I2O_OUTBOUND_DOORBELL_STATUS	0x9c +#define	CISS_I2O_OUTBOUND_DOORBELL_CLEAR	0xa0 +#define	CISS_I2O_SCRATCHPAD			0xb0 +#define	CISS_I2O_CFGTBL_CFG_OFFSET		0xb4 +#define	CISS_I2O_CFGTBL_MEM_OFFSET		0xb8 + +/* + * Rather than make a lot of small mappings for each part of the address + * space we wish to access, we will make one large mapping.  If more + * offsets are added to the I2O list above, this space should be extended + * appropriately. 
+ */ +#define	CISS_I2O_MAP_BASE			0x20 +#define	CISS_I2O_MAP_LIMIT			0x100 + +/* + * The Scratchpad Register (I2O_SCRATCHPAD) is not mentioned in the CISS + * specification.  It serves at least two known functions: + *	- Signalling controller readiness + *	- Exposing a debugging code when the controller firmware locks up + */ +#define	CISS_SCRATCHPAD_INITIALISED		0xffff0000 + +/* + * Outbound Doorbell Register Values. + * + * These are read from the Outbound Doorbell Set/Status Register + * (CISS_I2O_OUTBOUND_DOORBELL_STATUS), but cleared by writing to the Clear + * Register (CISS_I2O_OUTBOUND_DOORBELL_CLEAR). + */ +#define	CISS_ODR_BIT_INTERRUPT			(1UL << 0) +#define	CISS_ODR_BIT_LOCKUP			(1UL << 1) + +/* + * Inbound Doorbell Register Values. + * + * These are written to and read from the Inbound Doorbell Register + * (CISS_I2O_INBOUND_DOORBELL). + */ +#define	CISS_IDR_BIT_CFGTBL_CHANGE		(1UL << 0) + +/* + * Interrupt Mask Register Values. + * + * These are written to and read from the Interrupt Mask Register + * (CISS_I2O_INTERRUPT_MASK).  Note that a 1 bit in this register masks or + * disables the interrupt in question; to enable the interrupt the bit must be + * set to 0. + */ +#define	CISS_IMR_BIT_SIMPLE_INTR_DISABLE	(1UL << 3) + +/* + * Interrupt Status Register Values. + * + * These are read from the Interrupt Status Register + * (CISS_I2O_INTERRUPT_STATUS). + */ +#define	CISS_ISR_BIT_SIMPLE_INTR		(1UL << 3) + +/* + * Transport Methods. + * + * These bit positions are used in the Configuration Table to detect controller + * support for a particular method, via "TransportSupport"; to request that the + * controller enable a particular method, via "TransportRequest"; and to detect + * whether the controller has acknowledged the request and enabled the desired + * method, via "TransportActive". + * + * See: "9.1 Configuration Table" in the CISS Specification. 
+ */ +#define	CISS_CFGTBL_READY_FOR_COMMANDS		(1UL << 0) +#define	CISS_CFGTBL_XPORT_SIMPLE		(1UL << 1) +#define	CISS_CFGTBL_XPORT_PERFORMANT		(1UL << 2) +#define	CISS_CFGTBL_XPORT_MEMQ			(1UL << 4) + +/* + * In the Simple Transport Method, when the appropriate interrupt status bit is + * set (CISS_ISR_BIT_SIMPLE_INTR), the Outbound Post Queue register is + * repeatedly read for notifications of the completion of commands previously + * submitted to the controller.  These macros help break up the read value into + * its component fields: the tag number, and whether or not the command + * completed in error. + */ +#define	CISS_OPQ_READ_TAG(x)			((x) >> 2) +#define	CISS_OPQ_READ_ERROR(x)			((x) & (1UL << 1)) + +/* + * Physical devices that are reported may be marked as 'masked'. A masked device + * is one that the driver can see, but must not perform any I/O to. + */ +#define	SMRT_CISS_MODE_MASKED			3 + +/* + * The following packed structures are used to ease the manipulation of + * requests and responses from the controller. 
+ */ +#pragma pack(1) + +typedef struct smrt_tag { +	uint32_t reserved:1; +	uint32_t error:1; +	uint32_t tag_value:30; +	uint32_t unused; +} smrt_tag_t; + +typedef union SCSI3Addr { +	struct { +		uint8_t Dev; +		uint8_t Bus:6; +		uint8_t Mode:2; +	} PeripDev; +	struct { +		uint8_t DevLSB; +		uint8_t DevMSB:6; +		uint8_t Mode:2; +	} LogDev; +	struct { +		uint8_t Dev:5; +		uint8_t Bus:3; +		uint8_t Targ:6; +		uint8_t Mode:2; +	} LogUnit; +} SCSI3Addr_t; + +typedef struct PhysDevAddr { +	uint32_t TargetId:24; +	uint32_t Bus:6; +	uint32_t Mode:2; +	SCSI3Addr_t Target[2]; +} PhysDevAddr_t; + +typedef struct LogDevAddr { +	uint32_t VolId:30; +	uint32_t Mode:2; +	uint8_t reserved[4]; +} LogDevAddr_t; + +typedef union LUNAddr { +	uint8_t LunAddrBytes[8]; +	SCSI3Addr_t SCSI3Lun[4]; +	PhysDevAddr_t PhysDev; +	LogDevAddr_t LogDev; +} LUNAddr_t; + +typedef struct CommandListHeader { +	uint8_t ReplyQueue; +	uint8_t SGList; +	uint16_t SGTotal; +	smrt_tag_t Tag; +	LUNAddr_t LUN; +} CommandListHeader_t; + +typedef struct RequestBlock { +	uint8_t CDBLen; +	struct { +		uint8_t Type:3; +		uint8_t Attribute:3; +		uint8_t Direction:2; +	} Type; +	uint16_t Timeout; +	uint8_t CDB[CISS_CDBLEN]; +} RequestBlock_t; + +typedef struct ErrDescriptor { +	uint64_t Addr; +	uint32_t Len; +} ErrDescriptor_t; + +typedef struct SGDescriptor { +	uint64_t Addr; +	uint32_t Len; +	uint32_t Ext; +} SGDescriptor_t; + +typedef struct CommandList { +	CommandListHeader_t Header; +	RequestBlock_t Request; +	ErrDescriptor_t ErrDesc; +	SGDescriptor_t SG[CISS_MAXSGENTRIES]; +} CommandList_t; + +typedef union MoreErrInfo { +	struct { +		uint8_t Reserved[3]; +		uint8_t Type; +		uint32_t ErrorInfo; +	} Common_Info; +	struct { +		uint8_t Reserved[2]; +		uint8_t offense_size; +		uint8_t offense_num; +		uint32_t offense_value; +	} Invalid_Cmd; +} MoreErrInfo_t; + +typedef struct ErrorInfo { +	uint8_t ScsiStatus; +	uint8_t SenseLen; +	uint16_t CommandStatus; +	uint32_t ResidualCnt; +	MoreErrInfo_t MoreErrInfo; +	uint8_t 
SenseInfo[MAX_SENSE_LENGTH]; +} ErrorInfo_t; + +typedef struct CfgTable { +	uint8_t Signature[4]; +	uint32_t SpecValence; +	uint32_t TransportSupport; +	uint32_t TransportActive; +	uint32_t TransportRequest; +	uint32_t Upper32Addr; +	uint32_t CoalIntDelay; +	uint32_t CoalIntCount; +	uint32_t CmdsOutMax; +	uint32_t BusTypes; +	uint32_t TransportMethodOffset; +	uint8_t ServerName[16]; +	uint32_t HeartBeat; +	uint32_t HostDrvrSupport; +	uint32_t MaxSGElements; +	uint32_t MaxLunSupport; +	uint32_t MaxPhyDevSupport; +	uint32_t MaxPhyDrvPerLun; +	uint32_t MaxPerfModeCmdsOutMax; +	uint32_t MaxBlockFetchCount; +} CfgTable_t; + +#pragma pack() + +#ifdef	__cplusplus +} +#endif + +#endif /* _SMRT_CISS_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h new file mode 100644 index 0000000000..47ef99b2e0 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h @@ -0,0 +1,371 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017 Joyent, Inc. + */ + +#ifndef	_SMRT_SCSI_H +#define	_SMRT_SCSI_H + +#include <sys/types.h> + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +/* CISS LUN Addressing MODEs */ +#define	PERIPHERIAL_DEV_ADDR 			0x0 +#define	LOGICAL_VOL_ADDR 			0x1 +#define	MASK_PERIPHERIAL_DEV_ADDR 		0x3 +#define	CISS_PHYS_MODE 				0x0 + +/* + * Vendor-specific SCSI Commands + * + * These command opcodes are for use in the opcode byte of the CDB in a request + * of type CISS_TYPE_CMD. 
 They are custom SCSI commands, using the + * vendor-specific part of the opcode space; i.e., 0xC0 through 0xFF. + */ +#define	CISS_SCMD_READ				0xC0 +#define	CISS_SCMD_WRITE				0xC1 +#define	CISS_SCMD_REPORT_LOGICAL_LUNS		0xC2 +#define	CISS_SCMD_REPORT_PHYSICAL_LUNS		0xC3 + +/* + * These command opcodes are _not_ in the usual vendor-specific space, but are + * nonetheless vendor-specific.  They allow BMIC commands to be written to and + * read from the controller.  If a command transfers no data, the specification + * suggests that BMIC_WRITE (0x27) is appropriate. + */ +#define	CISS_SCMD_BMIC_READ			0x26 +#define	CISS_SCMD_BMIC_WRITE			0x27 + +/* + * CISS Messages + * + * The CISS specification describes several directives that do not behave like + * SCSI commands.  They are sent in requests of type CISS_TYPE_MSG. + * + * The Abort, Reset, and Nop, messages are defined in "8. Messages" in the CISS + * Specification. + */ +#define	CISS_MSG_ABORT				0x0 +#define	CISS_ABORT_TASK				0x0 +#define	CISS_ABORT_TASKSET			0x1 + +#define	CISS_MSG_RESET				0x1 +#define	CISS_RESET_CTLR				0x0 +#define	CISS_RESET_BUS				0x1 +#define	CISS_RESET_TGT				0x3 +#define	CISS_RESET_LUN				0x4 + +#define	CISS_MSG_NOP				0x3 + +/* + * BMIC Commands + * + * These commands allow for the use of non-standard facilities specific to the + * Smart Array firmware.  They are sent to the controller through a specially + * constructed CDB with the CISS_SCMD_BMIC_READ or CISS_SCMD_BMIC_WRITE opcode. + */ +#define	CISS_BMIC_IDENTIFY_CONTROLLER		0x11 +#define	CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE	0x15 +#define	CISS_BMIC_NOTIFY_ON_EVENT		0xD0 +#define	CISS_BMIC_NOTIFY_ON_EVENT_CANCEL	0xD1 + +/* + * Device and Phy type codes.  These are used across many commands, including + * IDENTIFY PHYSICAL DEVICE and the REPORT PHYSICAL LUN extended reporting. 
+ */ +#define	SMRT_DTYPE_PSCSI			0x00 +#define	SMRT_DTYPE_SATA				0x01 +#define	SMRT_DTYPE_SAS				0x02 +#define	SMRT_DTYPE_SATA_BW			0x03 +#define	SMRT_DTYPE_SAS_BW			0x04 +#define	SMRT_DTYPE_EXPANDER			0x05 +#define	SMRT_DTYPE_SES				0x06 +#define	SMRT_DTYPE_CONTROLLER			0x07 +#define	SMRT_DTYPE_SGPIO			0x08 +#define	SMRT_DTYPE_NVME				0x09 +#define	SMRT_DTYPE_NOPHY			0xFF + +/* + * The following packed structures are used to ease the manipulation of SCSI + * and BMIC commands sent to, and status information returned from, the + * controller. + */ +#pragma pack(1) + +typedef struct smrt_report_logical_lun_ent { +	LogDevAddr_t smrle_addr; +} smrt_report_logical_lun_ent_t; + +typedef struct smrt_report_logical_lun_extent { +	LogDevAddr_t smrle_addr; +	uint8_t smrle_wwn[16]; +} smrt_report_logical_lun_extent_t; + +typedef struct smrt_report_logical_lun { +	uint32_t smrll_datasize; /* Big Endian */ +	uint8_t smrll_extflag; +	uint8_t smrll_reserved1[3]; +	union { +		smrt_report_logical_lun_ent_t ents[SMRT_MAX_LOGDRV]; +		smrt_report_logical_lun_extent_t extents[SMRT_MAX_LOGDRV]; +	} smrll_data; +} smrt_report_logical_lun_t; + +typedef struct smrt_report_logical_lun_req { +	uint8_t smrllr_opcode; +	uint8_t smrllr_extflag; +	uint8_t smrllr_reserved1[4]; +	uint32_t smrllr_datasize; /* Big Endian */ +	uint8_t smrllr_reserved2; +	uint8_t smrllr_control; +} smrt_report_logical_lun_req_t; + +typedef struct smrt_report_physical_lun_ent { +	PhysDevAddr_t srple_addr; +} smrt_report_physical_lun_ent_t; + +/* + * This structure represents the 'physical node identifier' extended option for + * REPORT PHYSICAL LUNS.  This is triggered when the extended flags is set to + * 0x1.  Note that for SAS the other structure should always be used. + */ +typedef struct smrt_report_physical_pnid { +	uint8_t srpp_node[8]; +	uint8_t srpp_port[8]; +} smrt_report_physical_pnid_t; + +/* + * This structure represents the 'other physical device info' extended option + * for report physical luns.  
This is triggered when the extended flags is set + * to 0x2. + */ +typedef struct smrt_report_physical_opdi { +	uint8_t srpo_wwid[8]; +	uint8_t srpo_dtype; +	uint8_t srpo_flags; +	uint8_t srpo_multilun; +	uint8_t srpo_paths; +	uint32_t srpo_iohdl; +} smrt_report_physical_opdi_t; + +typedef struct smrt_report_physical_lun_extent { +	PhysDevAddr_t srple_addr; +	union { +		smrt_report_physical_pnid_t srple_pnid; +		smrt_report_physical_opdi_t srple_opdi; +	} srple_extdata; +} smrt_report_physical_lun_extent_t; + +/* + * Values that can be ORed together into smrplr_extflag. smrpl_extflag indicates + * if any extended processing was done or not. + */ +#define	SMRT_REPORT_PHYSICAL_LUN_EXT_NONE	0x00 +#define	SMRT_REPORT_PHYSICAL_LUN_EXT_PNID	0x01 +#define	SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI	0x02 +#define	SMRT_REPORT_PHYSICAL_LUN_EXT_MASK	0x0f +#define	SMRT_REPORT_PHYSICAL_LUN_CTRL_ONLY	(1 << 6) +#define	SMRT_REPORT_PHYSICAL_LUN_ALL_PATHS	(1 << 7) + +typedef struct smrt_report_physical_lun { +	uint32_t smrpl_datasize; /* Big Endian */ +	uint8_t smrpl_extflag; +	uint8_t smrpl_reserved1[3]; +	union { +		smrt_report_physical_lun_ent_t ents[SMRT_MAX_PHYSDEV]; +		smrt_report_physical_lun_extent_t extents[SMRT_MAX_PHYSDEV]; +	} smrpl_data; +} smrt_report_physical_lun_t; + + +typedef struct smrt_report_physical_lun_req { +	uint8_t smrplr_opcode; +	uint8_t smrplr_extflag; +	uint8_t smrplr_reserved[1]; +	uint32_t smrplr_datasize; /* Big Endian */ +	uint8_t smrplr_reserved2; +	uint8_t smrplr_control; +} smrt_report_physical_lun_req_t; + +/* + * Request structure for the BMIC command IDENTIFY CONTROLLER.  This structure + * is written into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode.  Reserved + * fields should be filled with zeroes. 
+ */ +typedef struct smrt_identify_controller_req { +	uint8_t smicr_opcode; +	uint8_t smicr_lun; +	uint8_t smicr_reserved1[4]; +	uint8_t smicr_command; +	uint8_t smicr_reserved2[2]; +	uint8_t smicr_reserved3[1]; +	uint8_t smicr_reserved4[6]; +} smrt_identify_controller_req_t; + +/* + * Response structure for IDENTIFY CONTROLLER.  This structure is used to + * interpret the response the controller will write into the data buffer. + */ +typedef struct smrt_identify_controller { +	uint8_t smic_logical_drive_count; +	uint32_t smic_config_signature; +	uint8_t smic_firmware_rev[4]; +	uint8_t smic_recovery_rev[4]; +	uint8_t smic_hardware_version; +	uint8_t smic_bootblock_rev[4]; + +	/* +	 * These are obsolete for SAS controllers: +	 */ +	uint32_t smic_drive_present_map; +	uint32_t smic_external_drive_map; + +	uint32_t smic_board_id; +} smrt_identify_controller_t; + +/* + * Request structure for IDENTIFY PHYSICAL DEVICE.  This structure is written + * into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode.  Reserved fields + * should be filled with zeroes.  Note, the lower 8 bits of the BMIC ID are in + * index1, whereas the upper 8 bits are in index2; however, the controller may + * only support 8 bits worth of devices (and this driver does not support that + * many devices). + */ +typedef struct smrt_identify_physical_drive_req { +	uint8_t sipdr_opcode; +	uint8_t sipdr_lun; +	uint8_t	sipdr_bmic_index1; +	uint8_t sipdr_reserved1[3]; +	uint8_t sipdr_command; +	uint8_t sipdr_reserved2[2]; +	uint8_t sipdr_bmic_index2; +	uint8_t sipdr_reserved4[6]; +} smrt_identify_physical_drive_req_t; + +/* + * Relevant values for the sipd_more_flags member. + */ +#define	SMRT_MORE_FLAGS_LOGVOL	(1 << 5) +#define	SMRT_MORE_FLAGS_SPARE	(1 << 6) + +/* + * Response structure for IDENTIFY PHYSICAL DEVICE.  This structure is used to + * describe aspects of a physical drive. Note, not all fields are valid in all + * firmware revisions. 
+ */ +typedef struct smrt_identify_physical_drive { +	uint8_t		sipd_scsi_bus;	/* Invalid for SAS */ +	uint8_t		sipd_scsi_id;	/* Invalid for SAS */ +	uint16_t	sipd_lblk_size; +	uint32_t	sipd_nblocks; +	uint32_t	sipd_rsrvd_blocsk; +	uint8_t		sipd_model[40]; +	uint8_t		sipd_serial[40]; +	uint8_t		sipd_firmware[8]; +	uint8_t		sipd_scsi_inquiry; +	uint8_t		sipd_compaq_stamp; +	uint8_t		sipd_last_failure; +	uint8_t		sipd_flags; +	uint8_t		sipd_more_flags; +	uint8_t		sipd_scsi_lun;	/* Invalid for SAS */ +	uint8_t		sipd_yet_more_flags; +	uint8_t		sipd_even_more_flags; +	uint32_t	sipd_spi_speed_rules; +	uint8_t		sipd_phys_connector[2]; +	uint8_t		sipd_phys_box_on_bus; +	uint8_t		sipd_phys_bay_in_box; +	uint32_t	sipd_rpm; +	uint8_t		sipd_device_type; +	uint8_t		sipd_sata_version; +	uint64_t	sipd_big_nblocks; +	uint64_t	sipd_ris_slba; +	uint32_t	sipd_ris_size; +	uint8_t		sipd_wwid[20]; +	uint8_t		sipd_controller_phy_map[32]; +	uint16_t	sipd_phy_count; +	uint8_t		sipd_phy_connected_dev_type[256]; +	uint8_t		sipd_phy_to_drive_bay[256]; +	uint16_t	sipd_phy_to_attached_dev[256]; +	uint8_t		sipd_box_index; +	uint8_t		sipd_drive_support; +	uint16_t	sipd_extra_flags; +	uint8_t		sipd_neogiated_link_rate[256]; +	uint8_t		sipd_phy_to_phy_map[256]; +	uint8_t		sipd_pad[312]; +} smrt_identify_physical_drive_t; + +/* + * Note that this structure describes the CISS version of the command. There + * also exists a BMIC version, but it has a slightly different structure.  This + * structure is also used for the cancellation request; however, in that case, + * the senr_flags field is reserved. + */ +typedef struct smrt_event_notify_req { +	uint8_t		senr_opcode; +	uint8_t		senr_subcode; +	uint8_t		senr_reserved1[2]; +	uint32_t	senr_flags;	/* Big Endian */ +	uint32_t	senr_size;	/* Big Endian */ +	uint8_t		senr_control; +} smrt_event_notify_req_t; + +/* + * When receiving event notifications, the buffer size must be 512 bytes large. 
+ * We make sure that we always allocate a buffer of this size, even though we + * define a structure that is much shorter and only uses the fields that we end + * up caring about.  This size requirement comes from the specification. + */ +#define	SMRT_EVENT_NOTIFY_BUFLEN	512 + +#define	SMRT_EVENT_CLASS_PROTOCOL		0 +#define	SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR	1 + +#define	SMRT_EVENT_CLASS_HOTPLUG		1 +#define	SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE	0 + +#define	SMRT_EVENT_CLASS_HWERROR		2 +#define	SMRT_EVENT_CLASS_ENVIRONMENT		3 + +#define	SMRT_EVENT_CLASS_PHYS			4 +#define	SMRT_EVENT_PHYS_SUBCLASS_STATE		0 + +#define	SMRT_EVENT_CLASS_LOGVOL			5 + +typedef struct smrt_event_notify { +	uint32_t	sen_timestamp; +	uint16_t	sen_class; +	uint16_t	sen_subclass; +	uint16_t	sen_detail; +	uint8_t		sen_data[64]; +	char		sen_message[80]; +	uint32_t	sen_tag; +	uint16_t	sen_date; +	uint16_t	sen_year; +	uint32_t	sen_time; +	uint16_t	sen_pre_power_time; +	LUNAddr_t	sen_addr; +} smrt_event_notify_t; + +#pragma pack() + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SMRT_SCSI_H */ diff --git a/usr/src/uts/common/sys/scsi/generic/inquiry.h b/usr/src/uts/common/sys/scsi/generic/inquiry.h index ddfd683169..fcbf00d5dc 100644 --- a/usr/src/uts/common/sys/scsi/generic/inquiry.h +++ b/usr/src/uts/common/sys/scsi/generic/inquiry.h @@ -362,7 +362,8 @@ struct scsi_inquiry {  #define	DTYPE_NOTPRESENT	(DPQ_NEVER | DTYPE_UNKNOWN)  /* - * Defined Response Data Formats: + * Defined Versions for inquiry data. These represent the base version that a + * device supports.   
*/  #define	RDF_LEVEL0		0x00	/* no conformance claim (SCSI-1) */  #define	RDF_CCS			0x01	/* Obsolete (pseudo-spec) */ @@ -370,7 +371,8 @@ struct scsi_inquiry {  #define	RDF_SCSI_SPC		0x03	/* ANSI INCITS 301-1997 (SPC) */  #define	RDF_SCSI_SPC2		0x04	/* ANSI INCITS 351-2001 (SPC-2) */  #define	RDF_SCSI_SPC3		0x05	/* ANSI INCITS 408-2005 (SPC-3) */ -#define	RDF_SCSI_SPC4		0x06	/* t10 (SPC-4) */ +#define	RDF_SCSI_SPC4		0x06	/* ANSI INCITS 513-2015 (SPC-4) */ +#define	RDF_SCSI_SPC5		0x07	/* t10 (SPC-5) */  /*   * Defined Target Port Group Select values: @@ -436,6 +438,7 @@ struct vpd_desc {  #define	PM_CAPABLE_SPC2		RDF_SCSI_SPC2  #define	PM_CAPABLE_SPC3		RDF_SCSI_SPC3  #define	PM_CAPABLE_SPC4		RDF_SCSI_SPC4 +#define	PM_CAPABLE_SPC5		RDF_SCSI_SPC5  #define	PM_CAPABLE_LOG_MASK	0xffff0000	/* use upper 16 bit to */  						/* indicate log specifics */  #define	PM_CAPABLE_LOG_SUPPORTED	0x10000	/* Log page 0xE might be */ diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h index 06f55bd91c..30c6ae54d1 100644 --- a/usr/src/uts/common/sys/scsi/targets/sddef.h +++ b/usr/src/uts/common/sys/scsi/targets/sddef.h @@ -774,6 +774,12 @@ _NOTE(MUTEX_PROTECTS_DATA(sd_lun::un_fi_mutex,  #define	SD_FM_LOG(un)		(((struct sd_fm_internal *)\  				((un)->un_fm_private))->fm_log_level) +/* + * Version Related Macros + */ +#define	SD_SCSI_VERS_IS_GE_SPC_4(un)	\ +	(SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC4 || \ +	SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC5)  /*   * Values for un_ctype @@ -1861,6 +1867,10 @@ struct sd_fm_internal {  #define	SD_PM_CAPABLE_IS_SPC_4(pm_cap)	\  	((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) +#define	SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap)	\ +	(((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) || \ +	((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC5)) +  #define	SD_PM_CAP_LOG_SUPPORTED(pm_cap)	\  	((pm_cap & PM_CAPABLE_LOG_SUPPORTED) ? 
TRUE : FALSE) diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h index 0219fc2cf7..8f530afda2 100644 --- a/usr/src/uts/common/sys/shm.h +++ b/usr/src/uts/common/sys/shm.h @@ -21,6 +21,7 @@   */  /*   * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2016 Joyent, Inc.   *   * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. @@ -120,6 +121,10 @@ struct shmid_ds {  #define	SHM_LOCK	3	/* Lock segment in core */  #define	SHM_UNLOCK	4	/* Unlock segment */ +#if defined(_KERNEL) +#define	SHM_RMID	5	/* Private RMID for lx support */ +#endif +  #if !defined(_KERNEL)  int shmget(key_t, size_t, int);  int shmids(int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h index 4d8cdcede5..1eae2ca0a4 100644 --- a/usr/src/uts/common/sys/shm_impl.h +++ b/usr/src/uts/common/sys/shm_impl.h @@ -21,13 +21,12 @@  /*   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  #ifndef	_SYS_SHM_IMPL_H  #define	_SYS_SHM_IMPL_H -#pragma ident	"%Z%%M%	%I%	%E% SMI" -  #include <sys/ipc_impl.h>  #if defined(_KERNEL) || defined(_KMEMUSER)  #include <sys/shm.h> @@ -70,7 +69,11 @@ typedef struct kshmid {  	time_t		shm_ctime;	/* last change time */  	struct sptinfo	*shm_sptinfo;	/* info about ISM segment */  	struct seg	*shm_sptseg;	/* pointer to ISM segment */ -	long		shm_sptprot;	/* was reserved (still a "long") */ +	ulong_t		shm_opts; +					/* +					 * Composed of: sptprot (uchar_t) and +					 * RM_PENDING flag (1 bit). +					 */  } kshmid_t;  /* @@ -78,6 +81,14 @@ typedef struct kshmid {   */  #define	SHMSA_ISM	1	/* uses shared page table */ +/* + * shm_opts definitions + * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are + * used for additional options. 
+ */ +#define	SHM_PROT_MASK	0xff +#define	SHM_RM_PENDING	0x100 +  typedef struct sptinfo {  	struct as	*sptas;		/* dummy as ptr. for spt segment */  } sptinfo_t; diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index aece147bec..b12dff6034 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015, Joyent, Inc.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -158,8 +159,8 @@ struct sigaction32 {   * use of these symbols by applications is injurious   *	to binary compatibility   */ -#define	NSIG	74	/* valid signals range from 1 to NSIG-1 */ -#define	MAXSIG	73	/* size of u_signal[], NSIG-1 <= MAXSIG */ +#define	NSIG	75	/* valid signals range from 1 to NSIG-1 */ +#define	MAXSIG	74	/* size of u_signal[], NSIG-1 <= MAXSIG */  #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */  #define	MINSIGSTKSZ	2048 diff --git a/usr/src/uts/common/sys/smbios.h b/usr/src/uts/common/sys/smbios.h index d28141e668..43163a7507 100644 --- a/usr/src/uts/common/sys/smbios.h +++ b/usr/src/uts/common/sys/smbios.h @@ -526,6 +526,10 @@ typedef struct smbios_processor {  #define	SMB_PRU_LGA36471	0x36	/* LGA3647-1 */  #define	SMB_PRU_SP3		0x37	/* socket SP3 */  #define	SMB_PRU_SP3r2		0x38	/* socket SP3r2 */ +#define	SMB_PRU_LGA2066		0x39	/* Socket LGA2066 */ +#define	SMB_PRU_BGA1392		0x3A	/* Socket BGA1392 */ +#define	SMB_PRU_BGA1510		0x3B	/* Socket BGA1510 */ +#define	SMB_PRU_BGA1528		0x3C	/* Socket BGA1528 */  #define	SMB_PRC_RESERVED	0x0001	/* reserved */  #define	SMB_PRC_UNKNOWN		0x0002	/* unknown */ @@ -707,6 +711,7 @@ typedef struct smbios_processor {  #define	SMB_PRF_ZARCH		0xCC	/* z/Architecture */  #define	SMB_PRF_CORE_I5		0xCD	/* Intel Core i5 */  #define	SMB_PRF_CORE_I3		0xCE	/* Intel Core i3 */ +#define	SMB_PRF_CORE_I9		0xCF	/* Intel Core i9 */  #define	SMB_PRF_C7M		0xD2	
/* VIA C7-M */  #define	SMB_PRF_C7D		0xD3	/* VIA C7-D */  #define	SMB_PRF_C7		0xD4	/* VIA C7 */ @@ -872,6 +877,7 @@ typedef struct smbios_port {  #define	SMB_POC_BNC		0x20		/* BNC */  #define	SMB_POC_1394		0x21		/* 1394 */  #define	SMB_POC_SATA		0x22		/* SAS/SATA plug receptacle */ +#define	SMB_POC_USB_C		0x23		/* USB Type-C receptacle */  #define	SMB_POC_PC98		0xA0		/* PC-98 */  #define	SMB_POC_PC98HR		0xA1		/* PC-98Hireso */  #define	SMB_POC_PCH98		0xA2		/* PC-H98 */ @@ -913,6 +919,8 @@ typedef struct smbios_port {  #define	SMB_POT_NETWORK		0x1F		/* Network port */  #define	SMB_POT_SATA		0x20		/* SATA */  #define	SMB_POT_SAS		0x21		/* SAS */ +#define	SMB_POT_MFDP		0x22	/* MFDP (Multi-Function Display Port) */ +#define	SMB_POT_THUNDERBOLT	0x23		/* Thunderbolt */  #define	SMB_POT_8251		0xA0		/* 8251 compatible */  #define	SMB_POT_8251F		0xA1		/* 8251 FIFO compatible */  #define	SMB_POT_OTHER		0xFF		/* other */ @@ -933,6 +941,8 @@ typedef struct smbios_slot {  	uint16_t smbl_sg;		/* segment group number */  	uint8_t smbl_bus;		/* bus number */  	uint8_t smbl_df;		/* device/function number */ +	uint8_t smbl_dbw;		/* data bus width */ +	uint8_t smbl_npeers;		/* PCIe bifurcation peers */  } smbios_slot_t;  #define	SMB_SLT_OTHER		0x01	/* other */ @@ -1036,6 +1046,21 @@ typedef struct smbios_slot {  #define	SMB_SLCH2_PME		0x01	/* slot supports PME# signal */  #define	SMB_SLCH2_HOTPLUG	0x02	/* slot supports hot-plug devices */  #define	SMB_SLCH2_SMBUS		0x04	/* slot supports SMBus signal */ +#define	SMB_SLCH2_BIFUR		0x08	/* slot supports PCIe bifurcation */ + +/* + * SMBIOS 7.10.9 Slot Peer Devices + * + * This structure represents an optional peer device that may be part of an + * SMBIOS 3.2 slot. 
+ */ +typedef struct smbios_slot_peer { +	uint16_t smblp_group;		/* peer segment group number */ +	uint8_t smblp_bus;		/* peer bus number */ +	uint8_t smblp_device;		/* peer device number */ +	uint8_t smblp_function;		/* peer function number */ +	uint8_t	smblp_data_width;	/* peer data bus width */ +} smbios_slot_peer_t;  /*   * SMBIOS On-Board Device Information.  See DSP0134 Section 7.11 for more @@ -1189,6 +1214,17 @@ typedef struct smbios_memdevice {  	uint16_t smbmd_minvolt;		/* minimum voltage */  	uint16_t smbmd_maxvolt;		/* maximum voltage */  	uint16_t smbmd_confvolt;	/* configured voltage */ +	uint8_t smbmd_memtech;		/* memory technology */ +	uint32_t smbmd_opcap_flags;	/* operating mode capability */ +	const char *smbmd_firmware_rev;	/* firmware rev */ +	uint16_t smbmd_modmfg_id;	/* JEDEC module mfg id */ +	uint16_t smbmd_modprod_id;	/* JEDEC module product id */ +	uint16_t smbmd_cntrlmfg_id;	/* JEDEC controller mfg id */ +	uint16_t smbmd_cntrlprod_id;	/* JEDEC controller prod id */ +	uint64_t smbmd_nvsize;		/* non-volatile size in bytes */ +	uint64_t smbmd_volatile_size;	/* volatile size in bytes */ +	uint64_t smbmd_cache_size;	/* cache size in bytes */ +	uint64_t smbmd_logical_size;	/* logical size in bytes */  } smbios_memdevice_t;  #define	SMB_MDFF_OTHER		0x01	/* other */ @@ -1234,6 +1270,7 @@ typedef struct smbios_memdevice {  #define	SMB_MDT_LPDDR2		0x1C	/* LPDDR2 */  #define	SMB_MDT_LPDDR3		0x1D	/* LPDDR3 */  #define	SMB_MDT_LPDDR4		0x1E	/* LPDDR4 */ +#define	SMB_MDT_LOGNV		0x1F	/* Logical non-volatile device */  #define	SMB_MDF_OTHER		0x0002	/* other */  #define	SMB_MDF_UNKNOWN		0x0004	/* unknown */ @@ -1256,6 +1293,20 @@ typedef struct smbios_memdevice {  #define	SMB_MDR_QUAD		0x04	/* quad */  #define	SMB_MDR_OCTAL		0x08	/* octal */ +#define	SMB_MTECH_OTHER		0x01	/* other */ +#define	SMB_MTECH_UNKNOWN	0x02	/* unknown */ +#define	SMB_MTECH_DRAM		0x03	/* DRAM */ +#define	SMB_MTECH_NVDIMM_N	0x04	/* NVDIMM-N */ +#define	SMB_MTECH_NVDIMM_F	0x05	/* 
NVDIMM-F */ +#define	SMB_MTECH_NVDIMM_P	0x06	/* NVDIMM-P */ +#define	SMB_MTECH_INTCPM	0x07	/* Intel persistent memory */ + +#define	SMB_MOMC_OTHER		0x01	/* other */ +#define	SMB_MOMC_UNKNOWN	0x02	/* unknown */ +#define	SMB_MOMC_VOLATILE	0x04	/* Volatile memory */ +#define	SMB_MOMC_BYTE_PM	0x08	/* Byte-accessible persistent memory */ +#define	SMB_MOMC_BLOCK_PM	0x10	/* Block-accessible persistent memory */ +  /*   * SMBIOS Memory Array Mapped Address.  See DSP0134 Section 7.20 for more   * information.  We convert start/end addresses into addr/size for convenience. @@ -1626,7 +1677,8 @@ typedef struct smbios_memdevice_ext {  #define	SMB_VERSION_28	0x0208		/* SMBIOS encoding for DMTF spec 2.8 */  #define	SMB_VERSION_30	0x0300		/* SMBIOS encoding for DMTF spec 3.0 */  #define	SMB_VERSION_31	0x0301		/* SMBIOS encoding for DMTF spec 3.1 */ -#define	SMB_VERSION	SMB_VERSION_31	/* SMBIOS latest version definitions */ +#define	SMB_VERSION_32	0x0302		/* SMBIOS encoding for DMTF spec 3.2 */ +#define	SMB_VERSION	SMB_VERSION_32	/* SMBIOS latest version definitions */  #define	SMB_O_NOCKSUM	0x1		/* do not verify header checksums */  #define	SMB_O_NOVERS	0x2		/* do not verify header versions */ @@ -1686,6 +1738,10 @@ extern int smbios_info_cache(smbios_hdl_t *, id_t, smbios_cache_t *);  extern int smbios_info_port(smbios_hdl_t *, id_t, smbios_port_t *);  extern int smbios_info_extport(smbios_hdl_t *, id_t, smbios_port_ext_t *);  extern int smbios_info_slot(smbios_hdl_t *, id_t, smbios_slot_t *); +extern int smbios_info_slot_peers(smbios_hdl_t *, id_t, uint_t *, +    smbios_slot_peer_t **); +extern void smbios_info_slot_peers_free(smbios_hdl_t *, uint_t, +    smbios_slot_peer_t *);  extern int smbios_info_obdevs(smbios_hdl_t *, id_t, int, smbios_obdev_t *);  extern int smbios_info_obdevs_ext(smbios_hdl_t *, id_t, smbios_obdev_ext_t *);  extern int smbios_info_strtab(smbios_hdl_t *, id_t, int, const char *[]); @@ -1785,6 +1841,9 @@ extern const char 
*smbios_memdevice_type_desc(uint_t);  extern const char *smbios_memdevice_flag_name(uint_t);  extern const char *smbios_memdevice_flag_desc(uint_t);  extern const char *smbios_memdevice_rank_desc(uint_t); +extern const char *smbios_memdevice_memtech_desc(uint_t); +extern const char *smbios_memdevice_op_capab_name(uint_t); +extern const char *smbios_memdevice_op_capab_desc(uint_t);  extern const char *smbios_onboard_type_desc(uint_t); diff --git a/usr/src/uts/common/sys/smbios_impl.h b/usr/src/uts/common/sys/smbios_impl.h index 66edfb027a..df61892a82 100644 --- a/usr/src/uts/common/sys/smbios_impl.h +++ b/usr/src/uts/common/sys/smbios_impl.h @@ -21,7 +21,7 @@  /*   * Copyright 2015 OmniTI Computer Consulting, Inc.  All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc.   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -201,8 +201,8 @@ typedef struct smb_cache {  #define	SMB_CACHE_SIZE(s)	(((s) & 0x8000) ? \  	((uint32_t)((s) & 0x7FFF) * 64 * 1024) : ((uint32_t)(s) * 1024)) -#define	SMB_CACHE_EXT_SIZE(s)	(((s) & 0x80000000U) ? 	\ -	((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) : 	\ +#define	SMB_CACHE_EXT_SIZE(s)	(((s) & 0x80000000U) ?	\ +	((uint64_t)((s) & 0x7FFFFFFFULL) * 64ULL * 1024ULL) :	\  	((uint64_t)(s) * 1024ULL))  #define	SMB_CACHE_CFG_MODE(c)		(((c) >> 8) & 3) @@ -226,6 +226,13 @@ typedef struct smb_port {  /*   * SMBIOS implementation structure for SMB_TYPE_SLOT.   
*/ +typedef struct smb_slot_peer { +	uint16_t smbspb_group_no;	/* segment group number */ +	uint8_t smbspb_bus;		/* bus number */ +	uint8_t smbspb_df;		/* device/function number */ +	uint8_t smbspb_width;		/* electrical width */ +} smb_slot_peer_t; +  typedef struct smb_slot {  	smb_header_t smbsl_hdr;		/* structure header */  	uint8_t smbsl_name;		/* reference designation */ @@ -239,6 +246,10 @@ typedef struct smb_slot {  	uint16_t smbsl_sg;		/* segment group number */  	uint8_t smbsl_bus;		/* bus number */  	uint8_t smbsl_df;		/* device/function number */ +	/* Added in SMBIOS 3.2+ */ +	uint8_t	smbsl_dbw;		/* Data bus width */ +	uint8_t	smbsl_npeers;		/* Peer bdf groups */ +	smb_slot_peer_t smbsl_peers[];	/* bifurcation peers */  } smb_slot_t;  /* @@ -343,6 +354,18 @@ typedef struct smb_memdevice {  	uint16_t smbmdev_minvolt;	/* minimum voltage */  	uint16_t smbmdev_maxvolt;	/* maximum voltage */  	uint16_t smbmdev_confvolt;	/* configured voltage */ +	/* Added in SMBIOS 3.2 */ +	uint8_t smbmdev_memtech;	/* memory technology */ +	uint16_t smbmdev_opmode;	/* memory operating mode capability */ +	uint8_t smbmdev_fwver;		/* firmware version */ +	uint16_t smbmdev_modulemfgid;	/* module manufacturer ID */ +	uint16_t smbmdev_moduleprodid;	/* module product ID */ +	uint16_t smbmdev_memsysmfgid;	/* memory controller manufacturer id */ +	uint16_t smbmdev_memsysprodid;	/* memory controller product id */ +	uint64_t smbmdev_nvsize;	/* non-volatile memory size */ +	uint64_t smbmdev_volsize;	/* volatile memory size */ +	uint64_t smbmdev_cachesize;	/* cache size */ +	uint64_t smbmdev_logicalsize;	/* logical size */  } smb_memdevice_t;  #define	SMB_MDS_KBYTES		0x8000	/* size in specified in kilobytes */ @@ -627,7 +650,7 @@ typedef struct smb_struct {  	const smb_header_t *smbst_hdr;	/* address of raw structure data */  	const uchar_t *smbst_str;	/* address of string data (if any) */  	const uchar_t *smbst_end;	/* address of 0x0000 ending tag */ -	struct smb_struct *smbst_next; 	/* 
next structure in hash chain */ +	struct smb_struct *smbst_next;	/* next structure in hash chain */  	uint16_t *smbst_strtab;		/* string index -> offset table */  	uint_t smbst_strtablen;		/* length of smbst_strtab */  } smb_struct_t; @@ -788,6 +811,20 @@ typedef struct smb_base_cache {  	uint8_t smbba_flags;		/* cache flags (SMB_CAF_*) */  } smb_base_cache_t; +typedef struct smb_base_slot { +	const char *smbbl_name;		/* reference designation */ +	uint8_t smbbl_type;		/* slot type */ +	uint8_t smbbl_width;		/* slot data bus width */ +	uint8_t smbbl_usage;		/* current usage */ +	uint8_t smbbl_length;		/* slot length */ +	uint16_t smbbl_id;		/* slot ID */ +	uint8_t smbbl_ch1;		/* slot characteristics 1 */ +	uint8_t smbbl_ch2;		/* slot characteristics 2 */ +	uint16_t smbbl_sg;		/* segment group number */ +	uint8_t smbbl_bus;		/* bus number */ +	uint8_t smbbl_df;		/* device/function number */ +} smb_base_slot_t; +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 93b0af97e8..d6e13d4823 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -22,6 +22,7 @@   * Copyright 2014 Garrett D'Amore <garrett@damore.org>   *   * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved.   */  /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/ @@ -39,6 +40,9 @@  /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */  #ifndef	_SYS_SOCKET_H  #define	_SYS_SOCKET_H @@ -204,6 +208,7 @@ struct so_snd_bufinfo {  #define	SO_SRCADDR	0x2001		/* Internal: AF_UNIX source address */  #define	SO_FILEP	0x2002		/* Internal: AF_UNIX file pointer */  #define	SO_UNIX_CLOSE	0x2003		/* Internal: AF_UNIX peer closed */ +#define	SO_REUSEPORT	0x2004		/* allow simultaneous port reuse */  #endif	/* _KERNEL */  /* @@ -303,8 +308,9 @@ struct	linger {  #define	AF_INET_OFFLOAD	30		/* Sun private; do not use */  #define	AF_TRILL	31		/* TRILL interface */  #define	AF_PACKET	32		/* PF_PACKET Linux socket interface */ +#define	AF_LX_NETLINK	33		/* Linux-compatible netlink */ -#define	AF_MAX		32 +#define	AF_MAX		33  /*   * Protocol families, same as address families for now. @@ -344,6 +350,7 @@ struct	linger {  #define	PF_INET_OFFLOAD	AF_INET_OFFLOAD	/* Sun private; do not use */  #define	PF_TRILL	AF_TRILL  #define	PF_PACKET	AF_PACKET +#define	PF_LX_NETLINK	AF_LX_NETLINK  #define	PF_MAX		AF_MAX @@ -429,6 +436,7 @@ struct msghdr32 {  					/* with left over data */  #define	MSG_XPG4_2	0x8000		/* Private: XPG4.2 flag */ +/* Obsolete but kept for compilation compatability. Use IOV_MAX. */  #define	MSG_MAXIOVLEN	16  #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index ac07bad909..6794b5687b 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -21,6 +21,7 @@  /*   * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   
*/  /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/ @@ -102,6 +103,7 @@ struct sockaddr_ux {  typedef struct sonodeops sonodeops_t;  typedef struct sonode sonode_t; +typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *);  struct sodirect_s; @@ -244,6 +246,10 @@ struct sonode {  	struct sof_instance	*so_filter_top;		/* top of stack */  	struct sof_instance	*so_filter_bottom;	/* bottom of stack */  	clock_t			so_filter_defertime;	/* time when deferred */ + +	/* Kernel direct receive callbacks */ +	so_krecv_f		so_krecv_cb;		/* recv callback */ +	void			*so_krecv_arg;		/* recv cb arg */  };  #define	SO_HAVE_DATA(so)						\ @@ -297,15 +303,16 @@ struct sonode {  #define	SS_OOBPEND		0x00002000 /* OOB pending or present - poll */  #define	SS_HAVEOOBDATA		0x00004000 /* OOB data present */  #define	SS_HADOOBDATA		0x00008000 /* OOB data consumed */ -#define	SS_CLOSING		0x00010000 /* in process of closing */ +#define	SS_CLOSING		0x00010000 /* in process of closing */  #define	SS_FIL_DEFER		0x00020000 /* filter deferred notification */  #define	SS_FILOP_OK		0x00040000 /* socket can attach filters */  #define	SS_FIL_RCV_FLOWCTRL	0x00080000 /* filter asserted rcv flow ctrl */ +  #define	SS_FIL_SND_FLOWCTRL	0x00100000 /* filter asserted snd flow ctrl */  #define	SS_FIL_STOP		0x00200000 /* no more filter actions */ -  #define	SS_SODIRECT		0x00400000 /* transport supports sodirect */ +#define	SS_FILOP_UNSF		0x00800000 /* block attaching unsafe filters */  #define	SS_SENTLASTREADSIG	0x01000000 /* last rx signal has been sent */  #define	SS_SENTLASTWRITESIG	0x02000000 /* last tx signal has been sent */ @@ -321,7 +328,8 @@ struct sonode {  /*   * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment.   
*/  #define	SO_BLOCK_FALLBACK(so, fn)				\  	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\ @@ -337,6 +345,24 @@ struct sonode {  		}						\  	} +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. Safe filters can still attach after + * we execute the function in which this macro is used. + */ +#define	SO_BLOCK_FALLBACK_SAFE(so, fn)				\ +	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\ +	rw_enter(&(so)->so_fallback_rwlock, RW_READER);		\ +	if ((so)->so_state & SS_FALLBACK_COMP) {		\ +		rw_exit(&(so)->so_fallback_rwlock);		\ +		return (fn);					\ +	} else if (((so)->so_state & SS_FILOP_UNSF) == 0) {	\ +		mutex_enter(&(so)->so_lock);			\ +		(so)->so_state |= SS_FILOP_UNSF;		\ +		mutex_exit(&(so)->so_lock);			\ +	} +  #define	SO_UNBLOCK_FALLBACK(so)	{			\  	rw_exit(&(so)->so_fallback_rwlock);		\  } @@ -368,6 +394,7 @@ struct sonode {  /* The modes below are only for non-streams sockets */  #define	SM_ACCEPTSUPP		0x400	/* can handle accept() */  #define	SM_SENDFILESUPP		0x800	/* Private: proto supp sendfile  */ +#define	SM_DEFERERR		0x1000	/* Private: defer so_error delivery */  /*   * Socket versions. Used by the socket library when calling _so_socket(). @@ -946,6 +973,15 @@ extern struct sonode	*socreate(struct sockparams *, int, int, int, int,  extern int	so_copyin(const void *, void *, size_t, int);  extern int	so_copyout(const void *, void *, size_t, int); +/* + * Functions to manipulate the use of direct receive callbacks. This should not + * be used outside of sockfs and ksocket. These are generally considered a use + * once interface for a socket and will cause all outstanding data on the socket + * to be flushed. 
+ */ +extern int	so_krecv_set(sonode_t *, so_krecv_f, void *); +extern void	so_krecv_unblock(sonode_t *); +  #endif  /* diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   */  #ifndef	_SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops {  #define	SOF_VERSION	1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * so may not be attached after any socket operation. However, a safe filter + * can still be attached after one of the above calls. This makes attaching + * the filter less dependent on the initial socket setup order. + */ +#define	SOF_ATT_SAFE	0x1 +  extern int	sof_register(int, const char *, const sof_ops_t *, int);  extern int	sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index f1bd429815..89b355970e 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  #ifndef	_SYS_SQUEUE_H @@ -29,6 +30,17 @@  extern "C" {  #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it which makes it + * unsuitable for other consumers. 
To enable them, we created another interface, + * but opted not to port all of the functionality that IP uses in the form of + * ip_squeue.c As other consumers need the functionality that IP has in squeues, + * then we'll come up with more genericized methods and add that functionality + * to <sys/gsqueue.h>. Please do not continue to use this header. + */ +  #include <sys/types.h>  #include <sys/processor.h>  #include <sys/stream.h> @@ -76,16 +88,17 @@ typedef enum {  struct ip_recv_attr_s;  extern void squeue_init(void); -extern squeue_t *squeue_create(clock_t, pri_t); +extern squeue_t *squeue_create(pri_t, boolean_t);  extern void squeue_bind(squeue_t *, processorid_t);  extern void squeue_unbind(squeue_t *);  extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,      uint32_t, struct ip_recv_attr_s *, int, uint8_t);  extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *);  struct conn_s;  extern int squeue_synch_enter(struct conn_s *, mblk_t *); -extern void squeue_synch_exit(struct conn_s *); +extern void squeue_synch_exit(struct conn_s *, int);  #ifdef	__cplusplus  } diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 22550886eb..2bb717fb52 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   
*/  #ifndef	_SYS_SQUEUE_IMPL_H @@ -84,7 +85,6 @@ typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,  	    struct ip_recv_attr_s *, int, uint8_t);  typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t); -extern void squeue_worker_wakeup(squeue_t *);  extern int ip_squeue_flag;  struct squeue_s { @@ -99,14 +99,11 @@ struct squeue_s {  	ill_rx_ring_t	*sq_rx_ring;	/* The Rx ring tied to this sq */  	ill_t		*sq_ill;	/* The ill this squeue is tied to */ -	clock_t		sq_curr_time;	/* Current tick (lbolt) */ +	hrtime_t	sq_awoken;	/* time of worker wake req */  	kcondvar_t	sq_worker_cv;	/* cond var. worker thread blocks on */  	kcondvar_t	sq_poll_cv;	/* cond variable poll_thr waits on */  	kcondvar_t	sq_synch_cv;	/* cond var. synch thread waits on */  	kcondvar_t	sq_ctrlop_done_cv; /* cond variable for ctrl ops */ -	clock_t		sq_wait;	/* lbolts to wait after a fill() */ -	timeout_id_t	sq_tid;		/* timer id of pending timeout() */ -	clock_t		sq_awaken;	/* time async thread was awakened */  	processorid_t	sq_bind;	/* processor to bind to */  	kthread_t	*sq_worker;	/* kernel thread id */ @@ -117,6 +114,7 @@ struct squeue_s {  	squeue_set_t	*sq_set;	/* managed by squeue creator */  	pri_t		sq_priority;	/* squeue thread priority */ +	boolean_t	sq_isip;	/* use IP-centric features */  	/* Keep the debug-only fields at the end of the structure */  #ifdef DEBUG @@ -140,7 +138,6 @@ struct squeue_s {  #define	SQS_USER	0x00000010	/* A non interrupt user */  #define	SQS_BOUND	0x00000020	/* Worker thread is bound */  #define	SQS_REENTER	0x00000040	/* Re entered thread */ -#define	SQS_TMO_PROG	0x00000080	/* Timeout is being set */  #define	SQS_POLL_CAPAB	0x00000100	/* Squeue can control interrupts */  #define	SQS_ILL_BOUND	0x00000200	/* Squeue bound to an ill */ @@ -165,6 +162,7 @@ struct squeue_s {  #define	SQS_POLL_RESTART_DONE	0x01000000  #define	SQS_POLL_THR_QUIESCE	0x02000000  #define	SQS_PAUSE		0x04000000 /* The squeue has been paused */ +#define	SQS_EXIT		
0x08000000 /* squeue is being torn down */  #define	SQS_WORKER_THR_CONTROL          \  	(SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 4be8d794fc..7488d3dee8 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2015 Joyent, Inc.  All rights reserved.   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.   */ @@ -644,16 +645,13 @@ struct stroptions {  /*   * Structure for rw (read/write) procedure calls. A pointer   * to a struiod_t is passed as a parameter to the rwnext() call. - * - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - *	 as there isn't a formal definition of IOV_MAX ???   */  #define	DEF_IOV_MAX	16  struct struiod {  	mblk_t		*d_mp;		/* pointer to mblk (chain) */  	uio_t		d_uio;		/* uio info */ -	iovec_t d_iov[DEF_IOV_MAX];	/* iov referenced by uio */ +	iovec_t		*d_iov;		/* iov referenced by uio */  };  /* diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index ce86badfc1..f3bc1ed407 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -25,6 +25,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2018 Joyent, Inc.   */  #ifndef _SYS_STRSUBR_H @@ -1239,10 +1240,17 @@ extern void strsignal_nolock(stdata_t *, int, uchar_t);  struct multidata_s;  struct pdesc_s; + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. 
+ */  extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s  *,      uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int);  extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *,      uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); +  extern void lso_info_set(mblk_t *, uint32_t, uint32_t);  extern void lso_info_cleanup(mblk_t *);  extern unsigned int bcksum(uchar_t *, int, unsigned int); diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index 1d94c8fd2c..b260971a89 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -1585,8 +1585,14 @@ int  ddi_ffs(long mask);  int +ddi_ffsll(long long mask); + +int  ddi_fls(long mask); +int +ddi_flsll(long long mask); +  /*   * The ddi_soft_state* routines comprise generic storage management utilities   * for driver soft state structures.  Two types of soft_state indexes are diff --git a/usr/src/uts/common/sys/sysconfig.h b/usr/src/uts/common/sys/sysconfig.h index 3a68d76ebe..d5b65ef78c 100644 --- a/usr/src/uts/common/sys/sysconfig.h +++ b/usr/src/uts/common/sys/sysconfig.h @@ -25,6 +25,7 @@  /*   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2016 Joyent, Inc.   */  #ifndef _SYS_SYSCONFIG_H @@ -101,6 +102,8 @@ extern int	mach_sysconfig(int);  #define	_CONFIG_EPHID_MAX	47	/* maximum ephemeral uid */ +#define	_CONFIG_NPROC_NCPU	48	/* NCPU (sometimes > NPROC_MAX) */ +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/sys/sysevent.h b/usr/src/uts/common/sys/sysevent.h index 304745ed08..c2be00ad27 100644 --- a/usr/src/uts/common/sys/sysevent.h +++ b/usr/src/uts/common/sys/sysevent.h @@ -21,6 +21,7 @@  /*   * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.   
*/  #ifndef	_SYS_SYSEVENT_H @@ -67,10 +68,12 @@ extern "C" {  #define	SE_KERN_PID	0  #define	SUNW_VENDOR	"SUNW" +#define	ILLUMOS_VENDOR	"ILLUMOS"  #define	SE_USR_PUB	"usr:"  #define	SE_KERN_PUB	"kern:"  #define	SUNW_KERN_PUB	SUNW_VENDOR ":" SE_KERN_PUB  #define	SUNW_USR_PUB	SUNW_VENDOR ":" SE_USR_PUB +#define	ILLUMOS_KERN_PUB	ILLUMOS_VENDOR ":" SE_KERN_PUB  /*   * Event header and attribute value limits diff --git a/usr/src/uts/common/sys/sysevent/datalink.h b/usr/src/uts/common/sys/sysevent/datalink.h new file mode 100644 index 0000000000..592ef5bdde --- /dev/null +++ b/usr/src/uts/common/sys/sysevent/datalink.h @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_SYSEVENT_DATALINK_H +#define	_SYS_SYSEVENT_DATALINK_H + +/* + * Datalink System Event payloads + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DATALINK_LINK_STATE + * + * 	Event Class	- EC_DATALINK + * 	Event Sub-Class	- EC_DATALINK_LINK_STATE + * + * 	Attribute Name	- DATALINK_EV_LINK_NAME + * 	Attribute Type	- SE_DATA_TYPE_STRING + * 	Attribute Value	- [Name of the datalink] + * + * 	Attribute Name	- DATALINK_EV_LINK_ID + * 	Attribute Type	- SE_DATA_TYPE_INT32 + * 	Attribute Value	- [datalink_id_t for the device] + * + * 	Attribute Name	- DATALINK_EV_ZONE_ID + * 	Attribute Type	- SE_DATA_TYPE_INT32 + * 	Attribute Value	- [zoneid_t of the zone the datalink is in] + */ + +#define	DATALINK_EV_LINK_NAME		"link" +#define	DATALINK_EV_LINK_ID		"linkid" +#define	DATALINK_EV_ZONE_ID		"zone" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DATALINK_H */ diff --git a/usr/src/uts/common/sys/sysevent/eventdefs.h b/usr/src/uts/common/sys/sysevent/eventdefs.h index cf6e040ee9..8995ba4aa0 100644 --- a/usr/src/uts/common/sys/sysevent/eventdefs.h +++ b/usr/src/uts/common/sys/sysevent/eventdefs.h @@ -212,9 +212,11 @@ extern "C" {  #define	ESC_ZFS_HISTORY_EVENT		"ESC_ZFS_history_event"  /* - * datalink subclass definitions. + * datalink subclass definitions. Supporting attributes for datalink state found + * in sys/sysevent/datalink.h.   */  #define	ESC_DATALINK_PHYS_ADD	"ESC_datalink_phys_add"	/* new physical link */ +#define	ESC_DATALINK_LINK_STATE	"ESC_datalink_link_state"	/* link state */  /*   * VRRP subclass definitions. Supporting attributes (name/value paris) are diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@  /*   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2014 Joyent, Inc.  
All rights reserved.   */  #ifndef _SYS_SYSTRACE_H  #define	_SYS_SYSTRACE_H -#pragma ident	"%Z%%M%	%I%	%E% SMI" -  #include <sys/dtrace.h>  #ifdef	__cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent;  extern systrace_sysent_t *systrace_sysent32;  extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, -    uintptr_t, uintptr_t, uintptr_t, uintptr_t); +    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, -    uintptr_t, uintptr_t, uintptr_t, uintptr_t); +    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);  extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, -    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); +    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, +    uintptr_t arg6, uintptr_t arg7);  #ifdef _SYSCALL32_IMPL  extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, -    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); +    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, +    uintptr_t arg6, uintptr_t arg7);  #endif  #endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 0c07623ce6..b955e5f3f2 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -363,6 +363,24 @@ extern pid_t tcgetsid(int);  #define	TCSETSF		(_TIOC|16)  /* + * linux terminal ioctls we need to be aware of + */ +#define	TIOCSETLD	(_TIOC|123)	/* set line discipline parms */ +#define	TIOCGETLD	(_TIOC|124)	/* get line discipline parms */ + +/* + * The VMIN and VTIME and solaris overlap with VEOF and VEOL - This is + * perfectly legal except, linux expects them to be separate. So we keep + * them separately. 
+ */ +struct lx_cc { +	unsigned char veof;	/* veof value */ +	unsigned char veol;	/* veol value */ +	unsigned char vmin;	/* vmin value */ +	unsigned char vtime;	/* vtime value */ +}; + +/*   * NTP PPS ioctls   */  #define	TIOCGPPS	(_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index f9f1d6462b..6cc474f864 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -71,7 +71,10 @@ typedef struct ctxop {  	void	(*exit_op)(void *);	/* invoked during {thread,lwp}_exit() */  	void	(*free_op)(void *, int); /* function which frees the context */  	void	*arg;		/* argument to above functions, ctx pointer */ -	struct ctxop *next;	/* next context ops */ +	struct ctxop *next;		/* next context ops */ +	struct ctxop *prev;		/* previous context ops */ +	hrtime_t save_ts;		/* timestamp of last save */ +	hrtime_t restore_ts;		/* timestamp of last restore */  } ctxop_t;  /* @@ -351,6 +354,8 @@ typedef struct _kthread {  	kmutex_t	t_wait_mutex;	/* used in CV wait functions */  	char		*t_name;	/* thread name */ + +	uint64_t	t_unsafe;	/* unsafe to run with HT VCPU thread */  } kthread_t;  /* @@ -372,7 +377,7 @@ typedef struct _kthread {  #define	T_WOULDBLOCK	0x0020	/* for lockfs */  #define	T_DONTBLOCK	0x0040	/* for lockfs */  #define	T_DONTPEND	0x0080	/* for lockfs */ -#define	T_SYS_PROF	0x0100	/* profiling on for duration of system call */ +#define	T_SPLITSTK	0x0100	/* kernel stack is currently split */  #define	T_WAITCVSEM	0x0200	/* waiting for a lwp_cv or lwp_sema on sleepq */  #define	T_WATCHPT	0x0400	/* thread undergoing a watchpoint emulation */  #define	T_PANIC		0x0800	/* thread initiated a system panic */ @@ -401,6 +406,7 @@ typedef struct _kthread {  #define	TP_CHANGEBIND	0x1000	/* thread has a new cpu/cpupart binding */  #define	TP_ZTHREAD	0x2000	/* this is a kernel thread for a zone */  #define	TP_WATCHSTOP	0x4000	/* thread is stopping via holdwatch() */ +#define	TP_KTHREAD	0x8000	/* in-kernel worker 
thread for a process */  /*   * Thread scheduler flag (t_schedflag) definitions. @@ -413,6 +419,7 @@ typedef struct _kthread {  #define	TS_SIGNALLED	0x0010	/* thread was awakened by cv_signal() */  #define	TS_PROJWAITQ	0x0020	/* thread is on its project's waitq */  #define	TS_ZONEWAITQ	0x0040	/* thread is on its zone's waitq */ +#define	TS_VCPU		0x0080	/* thread will enter guest context */  #define	TS_CSTART	0x0100	/* setrun() by continuelwps() */  #define	TS_UNPAUSE	0x0200	/* setrun() by unpauselwps() */  #define	TS_XSTART	0x0400	/* setrun() by SIGCONT */ @@ -420,8 +427,9 @@ typedef struct _kthread {  #define	TS_RESUME	0x1000	/* setrun() by CPR resume process */  #define	TS_CREATE	0x2000	/* setrun() by syslwp_create() */  #define	TS_RUNQMATCH	0x4000	/* exact run queue balancing by setbackdq() */ +#define	TS_BSTART	0x8000	/* setrun() by brand */  #define	TS_ALLSTART	\ -	(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) +	(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART)  #define	TS_ANYWAITQ	(TS_PROJWAITQ|TS_ZONEWAITQ)  /* @@ -449,6 +457,10 @@ typedef struct _kthread {  #define	ISTOPPED(t) ((t)->t_state == TS_STOPPED && \  			!((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define	BSTOPPED(t)	((t)->t_state == TS_STOPPED && \ +			    !((t)->t_schedflag & TS_BSTART)) +  /* True if thread is asleep and wakeable */  #define	ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \  			((t)->t_flag & T_WAKEABLE))) @@ -599,6 +611,7 @@ int thread_setname(kthread_t *, const char *);  int thread_vsetname(kthread_t *, const char *, ...);  extern int default_binding_mode; +extern int default_stksize;  #endif	/* _KERNEL */ diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 81b4753049..a69bf4dd63 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -15,10 +15,11 @@   * Use is subject to license terms.   *   * Copyright 2013 Nexenta Systems, Inc.  
All rights reserved. + * Copyright 2016 Joyent, Inc.   */  /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved.   */  #ifndef _SYS_TIME_H @@ -247,8 +248,8 @@ struct itimerval32 {  #define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))  #define	NSEC2MSEC(n)	((n) / (NANOSEC / MILLISEC)) -#define	USEC2NSEC(m)    ((hrtime_t)(m) * (NANOSEC / MICROSEC)) -#define	NSEC2USEC(n)    ((n) / (NANOSEC / MICROSEC)) +#define	USEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define	NSEC2USEC(n)	((n) / (NANOSEC / MICROSEC))  #define	NSEC2SEC(n)	((n) / (NANOSEC / SEC))  #define	SEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / SEC)) @@ -264,6 +265,14 @@ typedef	longlong_t	hrtime_t;  #if defined(_KERNEL) || defined(_FAKE_KERNEL) +/* + * Unsigned counterpart to hrtime_t + */ +typedef	u_longlong_t	uhrtime_t; + +#define	HRTIME_MAX	LLONG_MAX +#define	UHRTIME_MAX	ULLONG_MAX +  #include <sys/time_impl.h>  #include <sys/mutex.h> diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h index ec349c962f..748e0c0627 100644 --- a/usr/src/uts/common/sys/timer.h +++ b/usr/src/uts/common/sys/timer.h @@ -25,7 +25,7 @@   */  /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc.   */  #ifndef	_SYS_TIMER_H @@ -34,6 +34,9 @@  #include <sys/types.h>  #include <sys/proc.h>  #include <sys/thread.h> +#include <sys/param.h> +#include <sys/siginfo.h> +#include <sys/port.h>  #ifdef	__cplusplus  extern "C" { @@ -42,7 +45,13 @@ extern "C" {  #ifdef	_KERNEL  #define	_TIMER_MAX	32 -extern	int	timer_max;		/* patchable via /etc/system */ +/* + * Max timers per process.  This is patchable via /etc/system and can be + * updated via kmdb.  Sticking to positive powers of 2 is recommended. + */ +extern	int	timer_max; + +#define	_TIMER_ALLOC_INIT	8	/* initial size for p_itimer array */  /*   * Bit values for the it_lock field. 
@@ -56,6 +65,7 @@ extern	int	timer_max;		/* patchable via /etc/system */   */  #define	IT_SIGNAL		0x01  #define	IT_PORT			0x02	/* use event port notification */ +#define	IT_CALLBACK		0x04	/* custom callback function */  struct clock_backend; @@ -83,14 +93,27 @@ struct itimer {  	struct clock_backend *it_backend;  	void		(*it_fire)(itimer_t *);  	kmutex_t	it_mutex; -	void		*it_portev;	/* port_kevent_t pointer */ -	void		*it_portsrc;	/* port_source_t pointer */ -	int		it_portfd;	/* port file descriptor */ +	union { +		struct { +			void	*_it_portev;	/* port_kevent_t pointer */ +			void	*_it_portsrc;	/* port_source_t pointer */ +			int	_it_portfd;	/* port file descriptor */ +		} _it_ev_port; +		struct { +			void		(*_it_cb_func)(itimer_t *); +			uintptr_t	_it_cb_data[2]; +		} _it_ev_cb; +	} _it_ev_data;  };  #define	it_sigq		__data.__proc.__it_sigq  #define	it_lwp		__data.__proc.__it_lwp  #define	it_frontend	__data.__it_frontend +#define	it_portev	_it_ev_data._it_ev_port._it_portev +#define	it_portsrc	_it_ev_data._it_ev_port._it_portsrc +#define	it_portfd	_it_ev_data._it_ev_port._it_portfd +#define	it_cb_func	_it_ev_data._it_ev_cb._it_cb_func +#define	it_cb_data	_it_ev_data._it_ev_cb._it_cb_data  typedef struct clock_backend {  	struct sigevent clk_default; @@ -107,7 +130,11 @@ typedef struct clock_backend {  extern void clock_add_backend(clockid_t clock, clock_backend_t *backend);  extern clock_backend_t *clock_get_backend(clockid_t clock); +extern void timer_release(struct proc *, itimer_t *); +extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it);  extern void timer_lwpbind(); +extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *, +    itimer_t **, timer_t *);  extern	void	timer_func(sigqueue_t *);  extern	void	timer_exit(void); diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index 904b52cac4..75d000b831 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ 
-23,6 +23,7 @@   *   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. + * Copyright 2011 Joyent, Inc.  All rights reserved.   */  /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/ @@ -159,7 +160,7 @@ extern kmutex_t ualock;  extern void mdboot(int, int, char *, boolean_t);  extern void mdpreboot(int, int, char *);  extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t);  #endif  extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index bca1ed1fa3..9584be559f 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -145,7 +145,8 @@ typedef struct uioa_s {   */  typedef enum xuio_type {  	UIOTYPE_ASYNCIO, -	UIOTYPE_ZEROCOPY +	UIOTYPE_ZEROCOPY, +	UIOTYPE_PEEKSIZE  } xuio_type_t;  typedef struct xuio { @@ -175,6 +176,15 @@ typedef struct xuio {  			int xu_zc_rw;	/* read or write buffer */  			void *xu_zc_priv;	/* fs specific */  		} xu_zc; + +		/* +		 * Peek Size Support -- facilitate peeking at the size of a +		 * waiting message on a socket. +		 */ +		struct { +			ssize_t xu_ps_size;	/* size of waiting msg */ +			boolean_t xu_ps_set;	/* was size calculated? */ +		} xu_ps;  	} xu_ext;  } xuio_t; diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h index c96f914a70..f1b209faad 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h @@ -20,7 +20,7 @@   */  /*   * Copyright 2008 Sun Microsystems, Inc.  All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   */  #ifndef _SYS_USB_HIDMINOR_H @@ -44,21 +44,28 @@ extern "C" {   * transparent.   
*   * So we change minor node numbering scheme to be: - *	external node minor num == instance << 1 - *	internal node minor num == instance << 1 | 0x1 + *	external node minor num == instance << 9 + *	internal node minor num == instance << 9 | 0x100   * (There are only internal nodes for keyboard/mouse now.) + * + * The 8 bits of the LSB are used for ugen minor numbering (hence the use + * of the first bit of the next byte for the "internal" flag)   */ -#define	HID_MINOR_BITS_MASK		0x1 +#define	HID_MINOR_BITS_MASK		0x1ff +#define	HID_MINOR_UGEN_BITS_MASK	0xff  #define	HID_MINOR_INSTANCE_MASK		~HID_MINOR_BITS_MASK -#define	HID_MINOR_INSTANCE_SHIFT	1 +#define	HID_MINOR_INSTANCE_SHIFT	9 -#define	HID_MINOR_INTERNAL		0x1 +#define	HID_MINOR_INTERNAL		0x100  #define	HID_MINOR_MAKE_INTERNAL(minor) \  		((minor) | HID_MINOR_INTERNAL)  #define	HID_IS_INTERNAL_OPEN(minor) \  		(((minor) & HID_MINOR_INTERNAL)) +#define	HID_IS_UGEN_OPEN(minor) \ +		(((minor) & HID_MINOR_UGEN_BITS_MASK)) +  #define	HID_MINOR_TO_INSTANCE(minor) \  		(((minor) & HID_MINOR_INSTANCE_MASK) >> \  		HID_MINOR_INSTANCE_SHIFT) diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h index e9a25ea894..ee68f0088a 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h @@ -21,7 +21,7 @@  /*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.   
*/  #ifndef _SYS_USB_HIDVAR_H @@ -33,6 +33,7 @@ extern "C" {  #endif  #include <sys/usb/usba/usbai_private.h> +#include <sys/usb/usba/usba_ugen.h>  /*   * HID : This header file contains the internal structures @@ -222,6 +223,8 @@ typedef struct hid_state {  	queue_t			*hid_inuse_rq;  	int			hid_internal_flag;	/* see below */  	int			hid_external_flag;	/* see below */ + +	usb_ugen_hdl_t		hid_ugen_hdl;		/* ugen support */  } hid_state_t;  /* warlock directives, stable data */ diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index 0b997c518c..15b4d0b247 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -82,6 +82,21 @@ extern "C" {  #endif  /* + * File Descriptor assignment generation. + * + * Certain file descriptor consumers (namely epoll) need to be able to detect + * when the resource underlying an fd changes due to (re)assignment.  Checks + * comparing old and new file_t pointers work OK, but could easily be fooled by + * an entry freed-to and reused-from the cache.  To better detect such + * assignments, a generation number is kept in the uf_entry.  Whenever a + * non-NULL file_t is assigned to the entry, the generation is incremented, + * indicating the change.  There is a minute possibility that a rollover of the + * value could cause assignments to evade detection by consumers, but it is + * considered acceptably small. + */ +typedef uint_t uf_entry_gen_t; + +/*   * Entry in the per-process list of open files.   * Note: only certain fields are copied in flist_grow() and flist_fork().   * This is indicated in brackets in the structure member comments. 
@@ -96,11 +111,13 @@ typedef struct uf_entry {  	short		uf_busy;	/* file is allocated [grow, fork] */  	kcondvar_t	uf_wanted_cv;	/* waiting for setf() [never copied] */  	kcondvar_t	uf_closing_cv;	/* waiting for close() [never copied] */ -	struct portfd 	*uf_portfd;	/* associated with port [grow] */ +	struct portfd	*uf_portfd;	/* associated with port [grow] */ +	uf_entry_gen_t	uf_gen;		/* assigned fd generation [grow,fork] */  	/* Avoid false sharing - pad to coherency granularity (64 bytes) */  	char		uf_pad[64 - sizeof (kmutex_t) - 2 * sizeof (void*) -  		2 * sizeof (int) - 2 * sizeof (short) - -		2 * sizeof (kcondvar_t) - sizeof (struct portfd *)]; +		2 * sizeof (kcondvar_t) - sizeof (struct portfd *) - +		sizeof (uf_entry_gen_t)];  } uf_entry_t;  /* @@ -185,9 +202,9 @@ typedef struct {		/* kernel syscall set type */   * This value should not be changed in a patch.   */  #if defined(__sparc) -#define	__KERN_NAUXV_IMPL 20 +#define	__KERN_NAUXV_IMPL 24  #elif defined(__i386) || defined(__amd64) -#define	__KERN_NAUXV_IMPL 25 +#define	__KERN_NAUXV_IMPL 28  #endif  struct execsw; diff --git a/usr/src/uts/common/sys/vm.h b/usr/src/uts/common/sys/vm.h index a8ca2ad377..0f7dfa9fd0 100644 --- a/usr/src/uts/common/sys/vm.h +++ b/usr/src/uts/common/sys/vm.h @@ -20,6 +20,7 @@   */  /*   * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc.   */  /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/ @@ -57,6 +58,8 @@ int	queue_io_request(struct vnode *, u_offset_t);  extern	kmutex_t	memavail_lock;  extern	kcondvar_t	memavail_cv; +#define	WAKE_PAGEOUT_SCANNER()	cv_broadcast(&proc_pageout->p_cv) +  #endif	/* defined(_KERNEL) */  #ifdef	__cplusplus diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..afbf438eff 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@  /*   * Copyright 2008 Sun Microsystems, Inc.  
All rights reserved.   * Use is subject to license terms. + * Copyright 2017 Joyent, Inc.  All rights reserved.   */  #ifndef	_SYS_VM_USAGE_H @@ -79,8 +80,12 @@ extern "C" {  					/* zoneid */  #define	VMUSAGE_COL_EUSERS	0x2000	/* same as VMUSAGE_COL_RUSERS, but by */  					/* euser */ +#define	VMUSAGE_A_ZONE		0x4000	/* rss/swap for a specified zone */ -#define	VMUSAGE_MASK		0x3fff  /* all valid flags for getvmusage() */ +#define	VMUSAGE_MASK		0x7fff  /* all valid flags for getvmusage() */ + +#define	VMUSAGE_ZONE_FLAGS	(VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | \ +				VMUSAGE_A_ZONE)  typedef struct vmusage {  	id_t	vmu_zoneid;		/* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index c274bae805..2292310bda 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@   * CDDL HEADER END   */  /* + * Copyright (c) 2017, Joyent, Inc. All rights reserved. + */ +/*   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   
*/ @@ -58,6 +61,9 @@ extern pgcnt_t	desscan;	/* desired pages scanned per second */  extern pgcnt_t	slowscan;  extern pgcnt_t	fastscan;  extern pgcnt_t	pushes;		/* number of pages pushed to swap device */ +extern uint64_t	low_mem_scan;	/* num times page scan due to low memory */ +extern uint64_t	zone_cap_scan;	/* num times page scan due to zone cap */ +extern uint64_t	n_throttle;	/* num times page create throttled */  /* writable copies of tunables */  extern pgcnt_t	maxpgio;	/* max paging i/o per sec before start swaps */ @@ -159,6 +165,8 @@ extern	void	*boot_virt_alloc(void *addr, size_t size);  extern	size_t	exec_get_spslew(void); +extern	caddr_t	map_userlimit(proc_t *pp, struct as *as, int flags); +  #endif	/* _KERNEL */  #ifdef	__cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_H +#define	_SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We distinguish between normal ioctls and private ioctls we issue to our + * streams version. Streams ioctls have the upper bit set in the lowest byte. + * Note that there are no STREAMs ioctls for userland and all definitions + * related to them are not present in this file. 
+ */ +#define	VND_IOC		(('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note that + * unlike the other ioctls, this is passed directly as opposed to every other + * function which is passed as a pointer to the value. + */ +#define	VND_IOC_ATTACH		(VND_IOC | 0x1) + +#define	VND_NAMELEN	32 + +typedef struct vnd_ioc_attach { +	char		via_name[VND_NAMELEN]; +	zoneid_t	via_zoneid; +	uint32_t	via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * At this time, a given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in. + */ +#define	VND_IOC_LINK		(VND_IOC | 0x2) + +typedef struct vnd_ioc_link { +	char		vil_name[VND_NAMELEN]; +	uint32_t	vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may use + * this to unlink an extant entry in /dev; however, they will not be able to + * link it in again. + */ +#define	VND_IOC_UNLINK		(VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { +	uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current receive buffer size. 
+ */ +typedef struct vnd_ioc_buf { +	uint64_t	vib_size; +	uint32_t	vib_filler; +	uint32_t	vib_errno; +} vnd_ioc_buf_t; + +#define	VND_IOC_GETRXBUF	(VND_IOC | 0x04) +#define	VND_IOC_SETRXBUF	(VND_IOC | 0x05) +#define	VND_IOC_GETMAXBUF	(VND_IOC | 0x06) +#define	VND_IOC_GETTXBUF	(VND_IOC | 0x07) +#define	VND_IOC_SETTXBUF	(VND_IOC | 0x08) +#define	VND_IOC_GETMINTU	(VND_IOC | 0x09) +#define	VND_IOC_GETMAXTU	(VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances. vl_actents is + * always updated to the number of active instances and vl_nents is the number + * of vnd_ioc_info_t elements allocated in vl_ents. + */ +typedef struct vnd_ioc_info { +	uint32_t vii_version; +	zoneid_t vii_zone; +	char vii_name[VND_NAMELEN]; +	char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { +	uint_t vl_nents; +	uint_t vl_actents; +	vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { +	uint_t vl_nents; +	uint_t vl_actents; +	caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif	/* _KERNEL */ + +#define	VND_IOC_LIST		(VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending further stability in the form of + * specific system work. + */ +#define	VND_IOC_FRAMEIO_READ	(VND_IOC | 0x30) +#define	VND_IOC_FRAMEIO_WRITE	(VND_IOC | 0x31) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc.  All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define	_SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vnd_errno { +	VND_E_SUCCESS = 0,		/* no error */ +	VND_E_NOMEM,			/* no memory */ +	VND_E_NODATALINK,		/* no such datalink */ +	VND_E_NOTETHER,			/* not DL_ETHER */ +	VND_E_DLPIINVAL,		/* Unknown DLPI failures */ +	VND_E_ATTACHFAIL,		/* DL_ATTACH_REQ failed */ +	VND_E_BINDFAIL,			/* DL_BIND_REQ failed */ +	VND_E_PROMISCFAIL,		/* DL_PROMISCON_REQ failed */ +	VND_E_DIRECTFAIL,		/* DLD_CAPAB_DIRECT enable failed */ +	VND_E_CAPACKINVAL,		/* bad dl_capability_ack_t */ +	VND_E_SUBCAPINVAL,		/* bad dl_capability_sub_t */ +	VND_E_DLDBADVERS,		/* bad dld version */ +	VND_E_KSTATCREATE,		/* failed to create kstats */ +	VND_E_NODEV,			/* no such vnd link */ +	VND_E_NONETSTACK,		/* netstack doesn't exist */ +	VND_E_ASSOCIATED,		/* device already associated */ +	VND_E_ATTACHED,			/* device already attached */ +	VND_E_LINKED,			/* device already linked */ +	VND_E_BADNAME,			/* invalid name */ +	VND_E_PERM,			/* can't touch this */ +	VND_E_NOZONE,			/* no such zone */ +	VND_E_STRINIT,		/* failed to initialize vnd stream module */ +	VND_E_NOTATTACHED,		/* device not attached */ +	VND_E_NOTLINKED,		/* device not linked */ +	VND_E_LINKEXISTS,	/* another device has the same link name */ +	VND_E_MINORNODE,		/* failed to create minor node */ +	VND_E_BUFTOOBIG,		/* requested buffer size is too large */ +	VND_E_BUFTOOSMALL,		/* requested buffer size is too small */ +	VND_E_DLEXCL,			/* unable to get dlpi excl access */ +	VND_E_DIRECTNOTSUP, +			/* DLD direct capability not supported over data link */ +	VND_E_BADPROPSIZE,		/* invalid property size */ +	
VND_E_BADPROP,			/* invalid property */ +	VND_E_PROPRDONLY,		/* property is read only */ +	VND_E_SYS,			/* unexpected system error */ +	VND_E_CAPABPASS, +			/* capabilities invalid, pass-through module detected */ +	VND_E_UNKNOWN			/* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 7e50091347..4c8d49c621 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@  /*   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms. - * Copyright 2014 Joyent, Inc.  All rights reserved. + * Copyright 2018 Joyent, Inc.   */  #ifndef	_SYS_VNIC_IMPL_H @@ -64,7 +64,9 @@ typedef struct vnic_s {  	mac_notify_handle_t	vn_mnh;  	uint32_t		vn_hcksum_txflags; +	mac_capab_lso_t		vn_cap_lso;  	uint32_t		vn_mtu; +	link_state_t		vn_ls;  } vnic_t;  #define	vn_mch	vn_mc_handles[0] diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 51b4f7af18..b527558895 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@  /*   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc.   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.   * Copyright 2017 RackTop Systems.   
*/ @@ -805,12 +805,14 @@ typedef enum vnevent	{  	VE_RMDIR	= 4,	/* Remove of directory vnode's name */  	VE_CREATE	= 5,	/* Create with vnode's name which exists */  	VE_LINK		= 6, 	/* Link with vnode's name as source */ -	VE_RENAME_DEST_DIR	= 7, 	/* Rename with vnode as target dir */ +	VE_RENAME_DEST_DIR = 7,	/* Rename with vnode as target dir */  	VE_MOUNTEDOVER	= 8, 	/* File or Filesystem got mounted over vnode */  	VE_TRUNCATE = 9,	/* Truncate */  	VE_PRE_RENAME_SRC = 10,	/* Pre-rename, with vnode as source */  	VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */ -	VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */ +	VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */ +	VE_RENAME_SRC_DIR = 13,	/* Rename with vnode as source dir */ +	VE_RESIZE	= 14	/* Resize/truncate to non-zero offset */  } vnevent_t;  /* @@ -1370,7 +1372,8 @@ void	vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *);  void	vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *);  void	vnevent_create(vnode_t *, caller_context_t *);  void	vnevent_link(vnode_t *, caller_context_t *); -void	vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void	vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, +    caller_context_t *ct);  void	vnevent_mountedover(vnode_t *, caller_context_t *);  void	vnevent_truncate(vnode_t *, caller_context_t *);  int	vnevent_support(vnode_t *, caller_context_t *); @@ -1380,6 +1383,7 @@ void	vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *,  	    caller_context_t *);  void	vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *,  	    caller_context_t *); +void	vnevent_resize(vnode_t *, caller_context_t *);  /* Vnode specific data */  void vsd_create(uint_t *, void (*)(void *)); @@ -1482,6 +1486,7 @@ extern struct vnode kvps[];  typedef enum {  	KV_KVP,		/* vnode for all segkmem pages */  	KV_ZVP,		/* vnode for all ZFS pages */ +	KV_VVP,		/* vnode for all VMM pages */  #if 
defined(__sparc)  	KV_MPVP,	/* vnode for all page_t meta-pages */  	KV_PROMVP,	/* vnode for all PROM pages */ diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h new file mode 100644 index 0000000000..d87786b507 --- /dev/null +++ b/usr/src/uts/common/sys/vxlan.h @@ -0,0 +1,47 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_VXLAN_H +#define	_SYS_VXLAN_H + +/* + * Common VXLAN information + */ + +#include <sys/inttypes.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Sizes in bytes */ +#define	VXLAN_HDR_LEN	8 +#define	VXLAN_ID_LEN	3 + +#define	VXLAN_F_VDI	0x08000000 +#define	VXLAN_ID_SHIFT	8 + +#pragma pack(1) +typedef struct vxlan_hdr { +	uint32_t vxlan_flags; +	uint32_t vxlan_id; +} vxlan_hdr_t; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VXLAN_H */ diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..e08d75ecba --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,78 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.  A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_ZFD_H +#define	_SYS_ZFD_H + +#include <sys/types.h> + +#ifdef	__cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define	ZFD_MASTER_NAME	"master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define	ZFD_SLAVE_NAME	"slave" + +#define	ZFD_NAME_LEN	16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define	ZFD_IOC		(('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define	ZFD_MAKETTY		(ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define	ZFD_EOF			(ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define	ZFD_HAS_SLAVE		(ZFD_IOC | 2) + +/* + * This ioctl links two streams into a multiplexer configuration for in-zone + * logging. + */ +#define	ZFD_MUX			(ZFD_IOC | 3) + +/* + * This ioctl controls the flow control setting for the log multiplexer stream + * (1 = true, 0 = false). The default is false which implies teeing into the + * log stream is "best-effort" but data will be discarded if the stream + * becomes full. If set and the log stream begins to fill up, the primary + * stream will stop flowing. + */ +#define	ZFD_MUX_FLOWCON		(ZFD_IOC | 4) + +#ifdef	__cplusplus +} +#endif + +#endif	/* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 56fa4b8d87..a4ec347ce4 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -20,9 +20,9 @@   */  /*   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc.   * Copyright 2014 Nexenta Systems, Inc. All rights reserved.   * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2018, Joyent, Inc.   
*/  #ifndef _SYS_ZONE_H @@ -43,6 +43,7 @@  #include <sys/secflags.h>  #include <netinet/in.h>  #include <sys/cpu_uarray.h> +#include <sys/nvpair.h>  #ifdef	__cplusplus  extern "C" { @@ -52,15 +53,27 @@ extern "C" {   * NOTE   *   * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice.   * Applications and drivers using these interfaces may fail to   * run on future releases.   */  /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define	MAX_ZONEID	9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
+ */ +#define	MAX_ZONES	8192 +#define	MAX_ZONEID	(MAX_ZONES - 1)  #define	MIN_USERZONEID	1	/* lowest user-creatable zone ID */  #define	MIN_ZONEID	0	/* minimum zone ID on system */  #define	GLOBAL_ZONEID	0 @@ -99,14 +112,18 @@ extern "C" {  #define	ZONE_ATTR_INITNAME	9  #define	ZONE_ATTR_BOOTARGS	10  #define	ZONE_ATTR_BRAND		11 -#define	ZONE_ATTR_PHYS_MCAP	12 -#define	ZONE_ATTR_SCHED_CLASS	13 -#define	ZONE_ATTR_FLAGS		14 -#define	ZONE_ATTR_HOSTID	15 -#define	ZONE_ATTR_FS_ALLOWED	16 -#define	ZONE_ATTR_NETWORK	17 -#define	ZONE_ATTR_INITNORESTART	20 +#define	ZONE_ATTR_SCHED_CLASS	12 +#define	ZONE_ATTR_FLAGS		13 +#define	ZONE_ATTR_HOSTID	14 +#define	ZONE_ATTR_FS_ALLOWED	15 +#define	ZONE_ATTR_NETWORK	16 +#define	ZONE_ATTR_DID		17 +#define	ZONE_ATTR_INITNORESTART	18 +#define	ZONE_ATTR_APP_SVC_CT	19 +#define	ZONE_ATTR_SCHED_FIXEDHI	20  #define	ZONE_ATTR_SECFLAGS	21 +#define	ZONE_ATTR_INITRESTART0	22 +#define	ZONE_ATTR_INITREBOOT	23  /* Start of the brand-specific attribute namespace */  #define	ZONE_ATTR_BRAND_ATTRS	32768 @@ -122,13 +139,18 @@ extern "C" {  #define	ZONE_EVENT_READY		"ready"  #define	ZONE_EVENT_RUNNING		"running"  #define	ZONE_EVENT_SHUTTING_DOWN	"shutting_down" +#define	ZONE_EVENT_FREE			"free"  #define	ZONE_CB_NAME		"zonename"  #define	ZONE_CB_NEWSTATE	"newstate"  #define	ZONE_CB_OLDSTATE	"oldstate" +#define	ZONE_CB_RESTARTS	"restarts"  #define	ZONE_CB_TIMESTAMP	"when"  #define	ZONE_CB_ZONEID		"zoneid" +#define	ZONE_EVENT_INIT_CLASS		"init" +#define	ZONE_EVENT_INIT_RESTART_SC	"restart" +  /*   * Exit values that may be returned by scripts or programs invoked by various   * zone commands. 
@@ -187,6 +209,7 @@ typedef struct {  	uint32_t doi;			/* DOI for label */  	caddr32_t label;		/* label associated with zone */  	int flags; +	zoneid_t zoneid;		/* requested zoneid */  } zone_def32;  #endif  typedef struct { @@ -203,6 +226,7 @@ typedef struct {  	uint32_t doi;			/* DOI for label */  	const bslabel_t *label;		/* label associated with zone */  	int flags; +	zoneid_t zoneid;		/* requested zoneid */  } zone_def;  /* extended error information */ @@ -227,7 +251,8 @@ typedef enum {  	ZONE_IS_EMPTY,  	ZONE_IS_DOWN,  	ZONE_IS_DYING, -	ZONE_IS_DEAD +	ZONE_IS_DEAD, +	ZONE_IS_FREE		/* transient state for zone sysevent */  } zone_status_t;  #define	ZONE_MIN_STATE		ZONE_IS_UNINITIALIZED  #define	ZONE_MAX_STATE		ZONE_IS_DEAD @@ -247,9 +272,12 @@ typedef enum zone_cmd {  typedef struct zone_cmd_arg {  	uint64_t	uniqid;		/* unique "generation number" */  	zone_cmd_t	cmd;		/* requested action */ -	uint32_t	_pad;		/* need consistent 32/64 bit alignmt */ +	int		status;		/* init status on shutdown */ +	uint32_t	debug;		/* enable brand hook debug */  	char locale[MAXPATHLEN];	/* locale in which to render messages */  	char bootbuf[BOOTARGS_MAX];	/* arguments passed to zone_boot() */ +	/* Needed for 32/64 zoneadm -> zoneadmd door arg size check. 
*/ +	int		pad;  } zone_cmd_arg_t;  /* @@ -384,7 +412,7 @@ typedef struct zone_dataset {  } zone_dataset_t;  /* - * structure for zone kstats + * structure for rctl zone kstats   */  typedef struct zone_kstat {  	kstat_named_t zk_zonename; @@ -395,12 +423,57 @@ typedef struct zone_kstat {  struct cpucap;  typedef struct { +	hrtime_t	cycle_start; +	uint_t		cycle_cnt; +	hrtime_t	zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { +	kstat_named_t	zv_zonename; +	kstat_named_t	zv_nread; +	kstat_named_t	zv_reads; +	kstat_named_t	zv_rtime; +	kstat_named_t	zv_rlentime; +	kstat_named_t	zv_rcnt; +	kstat_named_t	zv_nwritten; +	kstat_named_t	zv_writes; +	kstat_named_t	zv_wtime; +	kstat_named_t	zv_wlentime; +	kstat_named_t	zv_wcnt; +	kstat_named_t	zv_10ms_ops; +	kstat_named_t	zv_100ms_ops; +	kstat_named_t	zv_1s_ops; +	kstat_named_t	zv_10s_ops; +	kstat_named_t	zv_delay_cnt; +	kstat_named_t	zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { +	kstat_named_t	zz_zonename; +	kstat_named_t	zz_nread; +	kstat_named_t	zz_reads; +	kstat_named_t	zz_rtime; +	kstat_named_t	zz_rlentime; +	kstat_named_t	zz_nwritten; +	kstat_named_t	zz_writes; +	kstat_named_t	zz_waittime; +} zone_zfs_kstat_t; + +typedef struct {  	kstat_named_t	zm_zonename; +	kstat_named_t	zm_rss; +	kstat_named_t	zm_phys_cap; +	kstat_named_t	zm_swap; +	kstat_named_t	zm_swap_cap; +	kstat_named_t	zm_nover; +	kstat_named_t	zm_pagedout;  	kstat_named_t	zm_pgpgin;  	kstat_named_t	zm_anonpgin;  	kstat_named_t	zm_execpgin;  	kstat_named_t	zm_fspgin;  	kstat_named_t	zm_anon_alloc_fail; +	kstat_named_t	zm_pf_throttle; +	kstat_named_t	zm_pf_throttle_usec;  } zone_mcap_kstat_t;  typedef struct { @@ -415,8 +488,10 @@ typedef struct {  	kstat_named_t	zm_ffnoproc;  	kstat_named_t	zm_ffnomem;  	kstat_named_t	zm_ffmisc; +	kstat_named_t	zm_mfseglim;  	kstat_named_t	zm_nested_intp;  	kstat_named_t	zm_init_pid; +	kstat_named_t	zm_init_restarts;  	kstat_named_t	zm_boot_time;  } zone_misc_kstat_t; @@ -459,6 +534,7 @@ typedef struct zone {  	 
*/  	list_node_t	zone_linkage;  	zoneid_t	zone_id;	/* ID of zone */ +	zoneid_t	zone_did;	/* persistent debug ID of zone */  	uint_t		zone_ref;	/* count of zone_hold()s on zone */  	uint_t		zone_cred_ref;	/* count of zone_hold_cred()s on zone */  	/* @@ -511,10 +587,11 @@ typedef struct zone {  	kcondvar_t	zone_cv;	/* used to signal state changes */  	struct proc	*zone_zsched;	/* Dummy kernel "zsched" process */  	pid_t		zone_proc_initpid; /* pid of "init" for this zone */ -	char		*zone_initname;	/* fs path to 'init' */ +	uint_t		zone_proc_init_restarts; /* times init restarted */ +	char		*zone_initname;		/* fs path to 'init' */ +	int		zone_init_status;	/* init's exit status */  	int		zone_boot_err;  /* for zone_boot() if boot fails */  	char		*zone_bootargs;	/* arguments passed via zone_boot() */ -	uint64_t	zone_phys_mcap;	/* physical memory cap */  	/*  	 * zone_kthreads is protected by zone_status_lock.  	 */ @@ -552,9 +629,13 @@ typedef struct zone {  	tsol_mlp_list_t zone_mlps;	/* MLPs on zone-private addresses */  	boolean_t	zone_restart_init;	/* Restart init if it dies? */ +	boolean_t	zone_reboot_on_init_exit; /* Reboot if init dies? */ +	boolean_t	zone_restart_init_0;	/* Restart only if it exits 0 */ +	boolean_t	zone_setup_app_contract; /* setup contract? */  	struct brand	*zone_brand;		/* zone's brand */  	void		*zone_brand_data;	/* store brand specific data */  	id_t		zone_defaultcid;	/* dflt scheduling class id */ +	boolean_t	zone_fixed_hipri;	/* fixed sched. hi prio */  	kstat_t		*zone_swapresv_kstat;  	kstat_t		*zone_lockedmem_kstat;  	/* @@ -563,8 +644,24 @@ typedef struct zone {  	list_t		zone_dl_list;  	netstack_t	*zone_netstack;  	struct cpucap	*zone_cpucap;	/* CPU caps data */ + +	/* +	 * kstats and counters for VFS ops and bytes. +	 */ +	kmutex_t	zone_vfs_lock;		/* protects VFS statistics */ +	kstat_t		*zone_vfs_ksp; +	kstat_io_t	zone_vfs_rwstats; +	zone_vfs_kstat_t *zone_vfs_stats; + +	/* +	 * kstats for ZFS I/O ops and bytes. 
+	 */ +	kmutex_t	zone_zfs_lock;		/* protects ZFS statistics */ +	kstat_t		*zone_zfs_ksp; +	zone_zfs_kstat_t *zone_zfs_stats; +  	/* -	 * Solaris Auditing per-zone audit context +	 * illumos Auditing per-zone audit context  	 */  	struct au_kcontext	*zone_audit_kctxt;  	/* @@ -581,7 +678,11 @@ typedef struct zone {  						/* zone_rctls->rcs_lock */  	kstat_t		*zone_nprocs_kstat; -	kmutex_t	zone_mcap_lock;	/* protects mcap statistics */ +	/* +	 * kstats and counters for physical memory capping. +	 */ +	kstat_t		*zone_physmem_kstat; +	kmutex_t	zone_mcap_lock;		/* protects mcap statistics */  	kstat_t		*zone_mcap_ksp;  	zone_mcap_kstat_t *zone_mcap_stats;  	uint64_t	zone_pgpgin;		/* pages paged in */ @@ -606,6 +707,8 @@ typedef struct zone {  	uint32_t	zone_ffnomem;		/* as_dup/memory error */  	uint32_t	zone_ffmisc;		/* misc. other error */ +	uint32_t	zone_mfseglim;		/* map failure (# segs limit) */ +  	uint32_t	zone_nested_intp;	/* nested interp. kstat */  	struct loadavg_s zone_loadavg;		/* loadavg for this zone */ @@ -633,6 +736,53 @@ typedef struct zone {  } zone_t;  /* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { +	uint16_t	zpers_zfs_io_pri;	/* ZFS IO priority - 16k max */ +	uint_t		zpers_zfs_queued[2];	/* sync I/O enqueued count */ +	sys_zio_cntr_t	zpers_rd_ops;		/* Counters for ZFS reads, */ +	sys_zio_cntr_t	zpers_wr_ops;		/* writes, and */ +	sys_zio_cntr_t	zpers_lwr_ops;		/* logical writes. */ +	kstat_io_t	zpers_zfs_rwstats; +	uint64_t	zpers_io_util;		/* IO utilization metric */ +	uint64_t	zpers_zfs_rd_waittime; +	uint8_t		zpers_io_delay;		/* IO delay on logical r/w */ +	uint8_t		zpers_zfs_weight;	/* used to prevent starvation */ +	uint8_t		zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed independently of the zone_t. 
+ */ +typedef struct zone_persist { +	kmutex_t	zpers_zfs_lock;	/* Protects zpers_zfsp references */ +	zone_zfs_io_t	*zpers_zfsp;	/* ZFS fair-share IO data */ +	uint8_t		zpers_over;	/* currently over cap */ +	uint32_t	zpers_pg_cnt;	/* current RSS in pages */ +	uint32_t	zpers_pg_limit;	/* current RSS limit in pages */ +	uint32_t	zpers_nover;	/* # of times over phys. cap */ +#ifndef DEBUG +	uint64_t	zpers_pg_out;	/* # pages flushed */ +#else +	/* +	 * To conserve memory, some detailed kstats are only kept for DEBUG +	 * builds. +	 */ +	uint64_t	zpers_zfs_rd_waittime; + +	uint64_t	zpers_pg_anon;		/* # clean anon pages flushed */ +	uint64_t	zpers_pg_anondirty;	/* # dirty anon pages flushed */ +	uint64_t	zpers_pg_fs;		/* # clean fs pages flushed */ +	uint64_t	zpers_pg_fsdirty;	/* # dirty fs pages flushed */ +#endif +} zone_persist_t; + +typedef enum zone_pageout_op { +	ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY +} zone_pageout_op_t; + +/*   * Special value of zone_psetid to indicate that pools are disabled.   */  #define	ZONE_PS_INVAL	PS_MYID @@ -662,6 +812,7 @@ extern zone_t *zone_find_by_name(char *);  extern zone_t *zone_find_by_any_path(const char *, boolean_t);  extern zone_t *zone_find_by_path(const char *);  extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void);  extern zone_t *zone_find_by_id_nolock(zoneid_t);  extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *);  extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -842,6 +993,7 @@ extern int zone_ncpus_online_get(zone_t *);   * Returns true if the named pool/dataset is visible in the current zone.   
*/  extern int zone_dataset_visible(const char *, int *); +extern int zone_dataset_visible_inzone(zone_t *, const char *, int *);  /*   * zone version of kadmin() @@ -854,10 +1006,25 @@ extern void mount_completed(zone_t *);  extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); +extern void zone_pageout_stat(int, zone_pageout_op_t); +extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern zone_persist_t zone_pdata[MAX_ZONES]; +  extern rctl_hndl_t rc_zone_locked_mem;  extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem;  extern rctl_hndl_t rc_zone_max_lofi; +/* For publishing sysevents related to a particular zone */ +extern void zone_sysevent_publish(zone_t *, const char *, const char *, +    nvlist_t *); +  #endif	/* _KERNEL */  #ifdef	__cplusplus | 
